├── evar ├── __init__.py ├── utils │ ├── m2d_add_norm_stats.py │ ├── calculations.py │ ├── download_voxforge.py │ ├── download_cremad.py │ └── __init__.py ├── ar_openl3.py ├── ar_trill.py ├── ar_msclap.py ├── ar_spec.py ├── ar_dasheng.py ├── ar_ced.py ├── ar_atst_frame.py ├── ar_cnn14.py ├── ar_data2vec.py ├── ar_byola2.py ├── ar_wavlm.py ├── ar_hubert.py ├── ar_vggish.py ├── ar_esresnext_fbsp.py ├── ar_htsat.py ├── ar_beats.py ├── ar_wavcaps.py ├── ar_coala.py ├── ar_ast.py ├── ds_tasks.py ├── ar_byola.py ├── ar_wav2vec2.py ├── common.py ├── model_utils.py ├── ar_laionclap.py └── ar_opera.py ├── external ├── coala_scaler_top_1000_plus_clip.pkl ├── opera.patch ├── coala.patch ├── wavcaps.patch └── ast_models.patch ├── requirements.txt ├── plugin ├── MARBLE │ ├── benchmark │ │ └── models │ │ │ └── evar │ │ │ ├── evar.yaml │ │ │ └── extract_evar_features.py │ ├── configs │ │ └── evar │ │ │ ├── MTT.yaml │ │ │ ├── GTZAN.yaml │ │ │ ├── NSynthI.yaml │ │ │ ├── NSynthP.yaml │ │ │ ├── GS.yaml │ │ │ ├── VocalSetS.yaml │ │ │ ├── VocalSetT.yaml │ │ │ ├── MTGMood.yaml │ │ │ ├── MTGGenre.yaml │ │ │ ├── MTGTop50.yaml │ │ │ ├── MTGInstrument.yaml │ │ │ └── EMO.yaml │ ├── evar_marble.sh │ ├── REAEDME_MARBLE.md │ └── evar_marble_diff.patch └── OPERA │ ├── evar_openl3env.sh │ ├── evar_atst_clip.sh │ ├── evar_m2d.sh │ ├── evar_beats.sh │ ├── evar_ast.sh │ ├── evar_htsat.sh │ ├── evar_byola.sh │ ├── evar_hubert.sh │ ├── evar_wavlm.sh │ ├── evar_atst_frame.sh │ ├── evar_beats_plus.sh │ ├── evar_wav2vec2.sh │ └── evar_m2d_layers.sh ├── config ├── coala.yaml ├── esresnextfbsp.yaml ├── linspec.yaml ├── melspec.yaml ├── openl3env.yaml ├── openl3mus.yaml ├── wav2vec2feature.yaml ├── dasheng.yaml ├── wavlm.yaml ├── hubert.yaml ├── trill.yaml ├── data2vec.yaml ├── wav2vec2logit.yaml ├── wav2vec2context.yaml ├── laionclap.yaml ├── atst.yaml ├── atst_frame.yaml ├── byola2.yaml ├── cnn14.yaml ├── byola.yaml ├── byolax.yaml ├── opera.yaml ├── ast.yaml ├── vggish.yaml ├── vggish_4k.yaml ├── msclap.yaml ├── wavcaps.yaml ├── ced.yaml ├── htsat.yaml ├── beats.yaml ├── m2d_clap.yaml ├── m2d_clap_32k.yaml ├── beats_plus.yaml ├── m2d.yaml └── m2d_32k.yaml ├── app ├── bmdhs │ ├── ev_ast.sh │ ├── ev_beats.sh │ ├── ev_byola.sh │ ├── ev_m2d.sh │ ├── make_metadata.py │ └── README_BMDHS.md ├── circor │ ├── ev_ast.sh │ ├── ev_beats.sh │ ├── ev_byola.sh │ ├── ev_m2d.sh │ ├── patch-heart-murmur-detection.diff │ ├── README_CirCor.md │ └── rearrange_data.py ├── icbhi_sprs │ ├── ev_icbhi_ast.sh │ ├── ev_icbhi_m2d.sh │ ├── ev_icbhi_mlp_m2d.sh │ ├── ev_icbhi_opera.sh │ ├── ev_sprs_ast.sh │ ├── ev_icbhi_beats.sh │ ├── ev_sprs_m2d.sh │ ├── ev_sprs_opera.sh │ ├── ev_sprs_byola.sh │ ├── ev_icbhi_byola.sh │ ├── ev_sprs_beats.sh │ └── README_ICBHI_SPRS.md └── README.md ├── run ├── all_byola2.sh ├── all_ced.sh ├── all_atst.sh ├── all_beats.sh ├── all_htsat.sh ├── all_m2d.sh ├── all_atst_frame.sh ├── all_beats_plus.sh ├── all_msclap.sh ├── all_laionclap.sh └── all_wavcaps.sh ├── 2pass_lineareval.py ├── prepare_wav.py ├── .gitignore ├── Evaluation-examples.md ├── summarize.py └── Preparing-models.md /evar/__init__.py: -------------------------------------------------------------------------------- 1 | # EVAR 2 | -------------------------------------------------------------------------------- /external/coala_scaler_top_1000_plus_clip.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nttcslab/eval-audio-repr/HEAD/external/coala_scaler_top_1000_plus_clip.pkl 
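A quick orientation before the individual files: each evar/ar_*.py module wraps one audio representation, each config/*.yaml carries that model's settings, 2pass_lineareval.py runs the two-pass linear evaluation, and summarize.py aggregates the results for a given model name or weight file; the run/all_*.sh scripts later in this listing chain these calls per model. A minimal sketch of a single run, assuming the BYOL-A v2 weights referenced in config/byola2.yaml are already placed under external/ (see Preparing-models.md):

# Hedged example mirroring run/all_byola2.sh: linear evaluation of BYOL-A v2 on ESC-50, then summarize.
# Swap the config, task name, and --lr to evaluate other models; batch_size can be overridden as in the run scripts.
python 2pass_lineareval.py config/byola2.yaml esc50 --lr=0.001
python summarize.py external/byol_a/v2/AudioNTT2022-BYOLA-64x96d2048.pth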
-------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | numpy 3 | torch 4 | torchvision 5 | torchaudio 6 | easydict 7 | fire 8 | tqdm 9 | scikit-learn 10 | nnAudio 11 | torchlibrosa 12 | torchopenl3 13 | pyyaml -------------------------------------------------------------------------------- /plugin/MARBLE/benchmark/models/evar/evar.yaml: -------------------------------------------------------------------------------- 1 | name: evar 2 | target_sr: 16000 3 | num_features: 4 | pretrain_folder: benchmark/models/evar 5 | evar_config: m2d.yaml 6 | weight: m2d_vit_base-80x608p16x16-221006-mr7/checkpoint-300.pth 7 | options: -------------------------------------------------------------------------------- /config/coala.yaml: -------------------------------------------------------------------------------- 1 | # AR parameters such as FFT parameters. 2 | name: AR_COALA 3 | audio_repr: ar_coala.AR_COALA 4 | feature_d: 1152 5 | sample_rate: 22000 # COALA Special 6 | temporal_pooling_type: mean_max 7 | 8 | # Training parameters. 9 | batch_size: 64 10 | lr_lineareval: 0.0003 11 | report_per_epochs: 20 12 | early_stop_epochs: 20 13 | -------------------------------------------------------------------------------- /config/esresnextfbsp.yaml: -------------------------------------------------------------------------------- 1 | # AR parameters such as FFT parameters. 2 | audio_repr: ar_esresnext_fbsp.AR_ESResNeXtFBSP 3 | feature_d: 2048 4 | sample_rate: 44100 5 | # temporal_pooling_type: -> not using common temporal pooling. 6 | 7 | # Training parameters. 8 | batch_size: 64 9 | lr_lineareval: 0.0003 10 | report_per_epochs: 20 11 | early_stop_epochs: 20 12 | -------------------------------------------------------------------------------- /config/linspec.yaml: -------------------------------------------------------------------------------- 1 | # AR parameters such as FFT parameters. 2 | audio_repr: ar_spec.AR_LinSpec 3 | feature_d: 513 4 | sample_rate: 16000 5 | n_fft: 1024 6 | window_size: 1024 7 | hop_size: 160 8 | #n_mels: 64 9 | f_min: 60 10 | f_max: 7800 11 | temporal_pooling_type: mean_max 12 | 13 | # Training parameters. 14 | batch_size: 256 15 | lr_lineareval: 0.0003 16 | report_per_epochs: 20 17 | early_stop_epochs: 20 18 | -------------------------------------------------------------------------------- /config/melspec.yaml: -------------------------------------------------------------------------------- 1 | # AR parameters such as FFT parameters. 2 | audio_repr: ar_spec.AR_MelSpec 3 | feature_d: 64 4 | sample_rate: 16000 5 | n_fft: 1024 6 | window_size: 1024 7 | hop_size: 160 8 | n_mels: 64 9 | f_min: 60 10 | f_max: 7800 11 | temporal_pooling_type: mean_max 12 | 13 | # Training parameters. 14 | batch_size: 256 15 | lr_lineareval: 0.0003 16 | report_per_epochs: 20 17 | early_stop_epochs: 20 18 | -------------------------------------------------------------------------------- /config/openl3env.yaml: -------------------------------------------------------------------------------- 1 | # AR parameters such as FFT parameters. 2 | name: AR_OpenL3Env 3 | audio_repr: ar_openl3.AR_OpenL3 4 | feature_d: 6144 5 | sample_rate: 48000 6 | temporal_pooling_type: mean_max 7 | 8 | # Training parameters. 9 | batch_size: 64 10 | lr_lineareval: 0.0003 11 | report_per_epochs: 20 12 | early_stop_epochs: 20 13 | 14 | # Model specific parameters. 
15 | openl3_input_repr: mel256 16 | openl3_content_type: env 17 | -------------------------------------------------------------------------------- /config/openl3mus.yaml: -------------------------------------------------------------------------------- 1 | # AR parameters such as FFT parameters. 2 | name: AR_OpenL3Mus 3 | audio_repr: ar_openl3.AR_OpenL3 4 | feature_d: 6144 5 | sample_rate: 48000 6 | temporal_pooling_type: mean_max 7 | 8 | # Training parameters. 9 | batch_size: 64 10 | lr_lineareval: 0.0003 11 | report_per_epochs: 20 12 | early_stop_epochs: 20 13 | 14 | # Model specific parameters. 15 | openl3_input_repr: mel256 16 | openl3_content_type: music 17 | -------------------------------------------------------------------------------- /config/wav2vec2feature.yaml: -------------------------------------------------------------------------------- 1 | # AR parameters such as FFT parameters. 2 | audio_repr: ar_wav2vec2.AR_Wav2Vec2Feature 3 | feature_d: 512 4 | sample_rate: 16000 5 | temporal_pooling_type: mean_max 6 | 7 | # Training parameters. 8 | batch_size: 256 9 | lr_lineareval: 0.0003 10 | report_per_epochs: 20 11 | early_stop_epochs: 20 12 | 13 | # Model specific parameters. 14 | wav2vec_model: facebook/wav2vec2-base-960h 15 | # wav2vec_model: facebook/wav2vec2-large-960h-lv60 16 | -------------------------------------------------------------------------------- /config/dasheng.yaml: -------------------------------------------------------------------------------- 1 | # AR parameters such as FFT parameters. 2 | audio_repr: ar_dasheng.AR_Dasheng 3 | feature_d: 768 4 | sample_rate: 16000 5 | # temporal_pooling_type: mean_max 6 | 7 | # Training parameters. 8 | batch_size: 256 9 | lr_lineareval: 0.0003 10 | report_per_epochs: 20 11 | early_stop_epochs: 20 12 | 13 | # Model specific parameters. 14 | # model_name: mispeech/dasheng-base 15 | # model_name: mispeech/dasheng-0.6B 16 | model_name: mispeech/dasheng-1.2B 17 | -------------------------------------------------------------------------------- /config/wavlm.yaml: -------------------------------------------------------------------------------- 1 | # AR parameters such as FFT parameters. 2 | audio_repr: ar_wavlm.AR_WavLM 3 | feature_d: 768 4 | sample_rate: 16000 5 | temporal_pooling_type: mean 6 | 7 | # Training parameters. 8 | batch_size: 64 9 | lr_lineareval: 0.0003 10 | report_per_epochs: 20 11 | early_stop_epochs: 20 12 | 13 | # Model specific parameters. 14 | pretrained_model: microsoft/wavlm-base 15 | # pretrained_model: microsoft/wavlm-large 16 | output_layers: [-1] # list of layers to stack 17 | -------------------------------------------------------------------------------- /config/hubert.yaml: -------------------------------------------------------------------------------- 1 | # AR parameters such as FFT parameters. 2 | audio_repr: ar_hubert.AR_Hubert 3 | feature_d: 768 4 | sample_rate: 16000 5 | temporal_pooling_type: mean 6 | 7 | # Training parameters. 8 | batch_size: 64 9 | lr_lineareval: 0.0003 10 | report_per_epochs: 20 11 | early_stop_epochs: 20 12 | 13 | # Model specific parameters. 14 | pretrained_model: facebook/hubert-base-ls960 15 | # pretrained_model: facebook/hubert-large-ls960-ft 16 | output_layers: [-1] # list of layers to stack 17 | -------------------------------------------------------------------------------- /config/trill.yaml: -------------------------------------------------------------------------------- 1 | # AR parameters such as FFT parameters. 
2 | name: AR_TRILL 3 | audio_repr: ar_trill.AR_TRILL 4 | feature_d: 12288 5 | sample_rate: 16000 6 | temporal_pooling_type: mean_max 7 | 8 | # Training parameters. 9 | batch_size: 64 10 | lr_lineareval: 0.0003 11 | report_per_epochs: 20 12 | early_stop_epochs: 20 13 | 14 | # Model specific parameters. 15 | trill_emb_type: layer19 16 | # trill_emb_type: embedding 17 | trill_url: https://tfhub.dev/google/nonsemantic-speech-benchmark/trill/2 18 | -------------------------------------------------------------------------------- /config/data2vec.yaml: -------------------------------------------------------------------------------- 1 | # AR parameters such as FFT parameters. 2 | audio_repr: ar_data2vec.AR_Data2Vec 3 | feature_d: 768 4 | sample_rate: 16000 5 | temporal_pooling_type: mean 6 | 7 | # Training parameters. 8 | batch_size: 64 9 | lr_lineareval: 0.0003 10 | report_per_epochs: 20 11 | early_stop_epochs: 20 12 | 13 | # Model specific parameters. 14 | # pretrained_model: facebook/data2vec-audio-base-960h 15 | pretrained_model: facebook/data2vec-audio-large-960h 16 | output_layers: [-1] # list of layers to stack 17 | -------------------------------------------------------------------------------- /external/opera.patch: -------------------------------------------------------------------------------- 1 | diff -ur src_org/model/models_cola.py src/model/models_cola.py 2 | --- src_org/model/models_cola.py 2024-12-13 22:35:31.594477687 +0900 3 | +++ src/model/models_cola.py 2024-12-14 00:05:54.756510677 +0900 4 | @@ -1,6 +1,6 @@ 5 | import pytorch_lightning as pl 6 | import torch 7 | -from efficientnet_pytorch import EfficientNet 8 | +#from efficientnet_pytorch import EfficientNet 9 | from torch.nn import functional as F 10 | import numpy as np 11 | from src.model.htsat.htsat import HTSATWrapper 12 | -------------------------------------------------------------------------------- /app/bmdhs/ev_ast.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | split=$1 4 | n_iter=$2 5 | seed=$3 6 | lr_prm=$4 7 | bs=64 8 | epochs=30 9 | gpu=0 10 | 11 | echo Repeating $n_iter times... 12 | 13 | for i in $(seq $n_iter); do 14 | seed=$((seed + 1)) 15 | cmdline="CUDA_VISIBLE_DEVICES=$gpu python -m app.bmdhs.solve_bmdhs config/ast.yaml bmdhs$split --lr=$lr_prm --freq_mask 0 --time_mask 0 --training_mask 0.0 --mixup 0.0 --rrc False --epochs $epochs --warmup_epochs 0 --seed $seed --batch_size $bs" 16 | echo $cmdline 17 | eval $cmdline 18 | done 19 | -------------------------------------------------------------------------------- /app/bmdhs/ev_beats.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | split=$1 4 | n_iter=$2 5 | seed=$3 6 | lr_prm=$4 7 | bs=128 8 | epochs=30 9 | gpu=0 10 | 11 | echo Repeating $n_iter times... 
12 | 13 | for i in $(seq $n_iter); do 14 | seed=$((seed + 1)) 15 | cmdline="CUDA_VISIBLE_DEVICES=$gpu python -m app.bmdhs.solve_bmdhs config/beats.yaml bmdhs$split --lr=$lr_prm --freq_mask 0 --time_mask 0 --training_mask 0.0 --mixup 0.0 --rrc False --epochs $epochs --warmup_epochs 0 --seed $seed --batch_size $bs" 16 | echo $cmdline 17 | eval $cmdline 18 | done 19 | -------------------------------------------------------------------------------- /app/bmdhs/ev_byola.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | split=$1 4 | n_iter=$2 5 | seed=$3 6 | lr_prm=$4 7 | bs=128 8 | epochs=30 9 | gpu=0 10 | 11 | echo Repeating $n_iter times... 12 | 13 | for i in $(seq $n_iter); do 14 | seed=$((seed + 1)) 15 | cmdline="CUDA_VISIBLE_DEVICES=$gpu python -m app.bmdhs.solve_bmdhs config/byola.yaml bmdhs$split --lr=$lr_prm --freq_mask 0 --time_mask 0 --training_mask 0.0 --mixup 0.0 --rrc False --epochs $epochs --warmup_epochs 0 --seed $seed --batch_size $bs" 16 | echo $cmdline 17 | eval $cmdline 18 | done 19 | -------------------------------------------------------------------------------- /app/circor/ev_ast.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | split=$1 4 | n_iter=$2 5 | seed=$3 6 | lr_prm=$4 7 | bs=1024 8 | gpu=0 9 | hidden='\(128,\)' 10 | 11 | echo Repeating $n_iter times... 12 | 13 | for i in $(seq $n_iter); do 14 | seed=$((seed + 1)) 15 | cmdline="CUDA_VISIBLE_DEVICES=$gpu python -m app.circor.solve_circor config/ast.yaml circor$split --lr=$lr_prm --freq_mask 0 --time_mask 0 --training_mask 0.0 --mixup 0.0 --rrc False --epochs 50 --warmup_epochs 0 --seed $seed --batch_size $bs --hidden $hidden" 16 | echo $cmdline 17 | eval $cmdline 18 | done 19 | -------------------------------------------------------------------------------- /config/wav2vec2logit.yaml: -------------------------------------------------------------------------------- 1 | # AR parameters such as FFT parameters. 2 | audio_repr: ar_wav2vec2.AR_Wav2Vec2Logit 3 | feature_d: 32 4 | sample_rate: 16000 5 | temporal_pooling_type: mean # `mean+max` severely degrades performances on some tasks such as VC1. We set the `mean` for wav2vec2. 6 | 7 | # Training parameters. 8 | batch_size: 256 9 | lr_lineareval: 0.0003 10 | report_per_epochs: 20 11 | early_stop_epochs: 20 12 | 13 | # Model specific parameters. 14 | wav2vec_model: facebook/wav2vec2-base-960h 15 | # wav2vec_model: facebook/wav2vec2-large-960h-lv60 16 | -------------------------------------------------------------------------------- /app/circor/ev_beats.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | split=$1 4 | n_iter=$2 5 | seed=$3 6 | lr_prm=$4 7 | bs=1024 8 | gpu=0 9 | hidden='\(128,\)' 10 | 11 | echo Repeating $n_iter times... 
12 | 13 | for i in $(seq $n_iter); do 14 | seed=$((seed + 1)) 15 | cmdline="CUDA_VISIBLE_DEVICES=$gpu python -m app.circor.solve_circor config/beats.yaml circor$split --lr=$lr_prm --freq_mask 0 --time_mask 0 --training_mask 0.0 --mixup 0.0 --rrc False --epochs 50 --warmup_epochs 0 --seed $seed --batch_size $bs --hidden $hidden" 16 | echo $cmdline 17 | eval $cmdline 18 | done 19 | -------------------------------------------------------------------------------- /app/circor/ev_byola.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | split=$1 4 | n_iter=$2 5 | seed=$3 6 | lr_prm=$4 7 | bs=1024 8 | gpu=0 9 | hidden='\(128,\)' 10 | 11 | echo Repeating $n_iter times... 12 | 13 | for i in $(seq $n_iter); do 14 | seed=$((seed + 1)) 15 | cmdline="CUDA_VISIBLE_DEVICES=$gpu python -m app.circor.solve_circor config/byola.yaml circor$split --lr=$lr_prm --freq_mask 0 --time_mask 0 --training_mask 0.0 --mixup 0.0 --rrc False --epochs 50 --warmup_epochs 0 --seed $seed --batch_size $bs --hidden $hidden" 16 | echo $cmdline 17 | eval $cmdline 18 | done 19 | -------------------------------------------------------------------------------- /app/icbhi_sprs/ev_icbhi_ast.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ $# -lt 1 ]; then 4 | n_iter=3 5 | else 6 | n_iter=$1 7 | fi 8 | 9 | if [ $# -lt 2 ]; then 10 | lr_prm=0.00003 11 | else 12 | lr_prm=$2 13 | fi 14 | bs=256 15 | spl=1 16 | head=tfm 17 | extra=--freeze_body 18 | # --freeze_embed 19 | 20 | echo Repeating $n_iter times... 21 | 22 | for i in $(seq $n_iter); do 23 | cmdline="CUDA_VISIBLE_DEVICES=0 python solve.py ../../config/ast.yaml --epochs 150 --bs $bs --lr $lr_prm --head $head $extra --split_iter $spl" 24 | echo $cmdline 25 | eval $cmdline 26 | done 27 | -------------------------------------------------------------------------------- /app/bmdhs/ev_m2d.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | weight=$1 4 | split=$2 5 | n_iter=$3 6 | seed=$4 7 | lr_prm=$5 8 | bs=128 9 | epochs=100 10 | gpu=0 11 | 12 | echo Repeating $n_iter times... 13 | 14 | for i in $(seq $n_iter); do 15 | seed=$((seed + 1)) 16 | cmdline="CUDA_VISIBLE_DEVICES=$gpu python -m app.bmdhs.solve_bmdhs config/m2d.yaml bmdhs$split weight_file=$weight,encoder_only=True --lr=$lr_prm --freq_mask 0 --time_mask 0 --training_mask 0.0 --mixup 0.0 --rrc False --epochs $epochs --warmup_epochs 0 --seed $seed --batch_size $bs" 17 | echo $cmdline 18 | eval $cmdline 19 | done 20 | -------------------------------------------------------------------------------- /app/icbhi_sprs/ev_icbhi_m2d.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ $# -lt 2 ]; then 4 | n_iter=3 5 | else 6 | n_iter=$2 7 | fi 8 | 9 | if [ $# -lt 3 ]; then 10 | lr_prm=0.00003 11 | else 12 | lr_prm=$3 13 | fi 14 | bs=256 15 | spl=1 16 | head=tfm 17 | extra=--freeze_body 18 | # --freeze_embed 19 | 20 | echo Repeating $n_iter times... 
21 | 22 | for i in $(seq $n_iter); do 23 | cmdline="CUDA_VISIBLE_DEVICES=0 python solve.py ../../config/m2d.yaml --epochs 150 --bs $bs --lr $lr_prm --weightspath $1 --head $head $extra --split_iter $spl" 24 | echo $cmdline 25 | eval $cmdline 26 | done 27 | -------------------------------------------------------------------------------- /app/icbhi_sprs/ev_icbhi_mlp_m2d.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ $# -lt 2 ]; then 4 | n_iter=3 5 | else 6 | n_iter=$2 7 | fi 8 | 9 | if [ $# -lt 3 ]; then 10 | lr_prm=0.0003 11 | else 12 | lr_prm=$3 13 | fi 14 | bs=256 15 | spl=1 16 | head=mlp 17 | extra=--freeze_body 18 | # --freeze_embed 19 | 20 | echo Repeating $n_iter times... 21 | 22 | for i in $(seq $n_iter); do 23 | cmdline="CUDA_VISIBLE_DEVICES=0 python solve.py ../../config/m2d.yaml --epochs 150 --bs $bs --lr $lr_prm --weightspath $1 --head $head $extra --split_iter $spl" 24 | echo $cmdline 25 | eval $cmdline 26 | done 27 | -------------------------------------------------------------------------------- /app/icbhi_sprs/ev_icbhi_opera.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ $# -lt 1 ]; then 4 | n_iter=3 5 | else 6 | n_iter=$1 7 | fi 8 | 9 | if [ $# -lt 2 ]; then 10 | lr_prm=0.003 11 | else 12 | lr_prm=$2 13 | fi 14 | bs=256 15 | spl=1 16 | head=tfm 17 | extra=--freeze_body 18 | 19 | echo Repeating $n_iter times... 20 | 21 | for i in $(seq $n_iter); do 22 | cmdline="CUDA_VISIBLE_DEVICES=0 python solve.py ../../config/opera.yaml --epochs 150 --weightspath ../../external/OPERA/encoder-operaCT.ckpt --bs $bs --lr $lr_prm --head $head $extra --split_iter $spl" 23 | echo $cmdline 24 | eval $cmdline 25 | done 26 | -------------------------------------------------------------------------------- /app/icbhi_sprs/ev_sprs_ast.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ $# -lt 1 ]; then 4 | n_iter=3 5 | else 6 | n_iter=$1 7 | fi 8 | 9 | if [ $# -lt 2 ]; then 10 | lr_prm=0.00003 11 | else 12 | lr_prm=$2 13 | fi 14 | bs=256 15 | spl=1 16 | head=tfm 17 | extra=--freeze_body 18 | # --freeze_embed 19 | 20 | echo Repeating $n_iter times... 21 | 22 | for i in $(seq $n_iter); do 23 | cmdline="CUDA_VISIBLE_DEVICES=0 python solve.py ../../config/ast.yaml --dataset SPRS --datapath data/SPRS --epochs 50 --bs $bs --lr $lr_prm --head $head $extra --split_iter $spl" 24 | echo $cmdline 25 | eval $cmdline 26 | done 27 | -------------------------------------------------------------------------------- /config/wav2vec2context.yaml: -------------------------------------------------------------------------------- 1 | # AR parameters such as FFT parameters. 2 | audio_repr: ar_wav2vec2.AR_Wav2Vec2Context 3 | feature_d: 1024 4 | sample_rate: 16000 5 | temporal_pooling_type: mean # `mean+max` severely degrades performances on some tasks such as VC1. We set the `mean` for wav2vec2. 6 | 7 | # Training parameters. 8 | batch_size: 64 9 | lr_lineareval: 0.0003 10 | report_per_epochs: 20 11 | early_stop_epochs: 20 12 | 13 | # Model specific parameters. 
14 | wav2vec_model: facebook/wav2vec2-base-960h 15 | # wav2vec_model: facebook/wav2vec2-large-960h-lv60 16 | output_layers: [-1] # list of layers to stack 17 | -------------------------------------------------------------------------------- /app/icbhi_sprs/ev_icbhi_beats.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ $# -lt 1 ]; then 4 | n_iter=3 5 | else 6 | n_iter=$1 7 | fi 8 | 9 | if [ $# -lt 2 ]; then 10 | lr_prm=0.00003 11 | else 12 | lr_prm=$2 13 | fi 14 | bs=256 15 | spl=1 16 | head=tfm 17 | extra=--freeze_body 18 | # --freeze_embed 19 | 20 | echo Repeating $n_iter times... 21 | 22 | for i in $(seq $n_iter); do 23 | cmdline="CUDA_VISIBLE_DEVICES=0 python solve.py ../../config/beats.yaml --epochs 150 --weightspath ../../external/BEATs_iter3.pt --bs $bs --lr $lr_prm --head $head $extra --split_iter $spl" 24 | echo $cmdline 25 | eval $cmdline 26 | done 27 | -------------------------------------------------------------------------------- /app/icbhi_sprs/ev_sprs_m2d.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ $# -lt 2 ]; then 4 | n_iter=3 5 | else 6 | n_iter=$2 7 | fi 8 | 9 | if [ $# -lt 3 ]; then 10 | lr_prm=0.00003 11 | else 12 | lr_prm=$3 13 | fi 14 | bs=256 15 | spl=1 16 | head=tfm 17 | extra=--freeze_body 18 | # --freeze_embed 19 | 20 | echo Repeating $n_iter times... 21 | 22 | for i in $(seq $n_iter); do 23 | cmdline="CUDA_VISIBLE_DEVICES=0 python solve.py --dataset SPRS --datapath data/SPRS ../../config/m2d.yaml --epochs 50 --bs $bs --lr $lr_prm --weightspath $1 --head $head $extra --split_iter $spl" 24 | echo $cmdline 25 | eval $cmdline 26 | done 27 | -------------------------------------------------------------------------------- /app/icbhi_sprs/ev_sprs_opera.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ $# -lt 1 ]; then 4 | n_iter=3 5 | else 6 | n_iter=$1 7 | fi 8 | 9 | if [ $# -lt 2 ]; then 10 | lr_prm=0.0003 11 | else 12 | lr_prm=$2 13 | fi 14 | bs=256 15 | spl=1 16 | head=tfm 17 | extra=--freeze_body 18 | 19 | echo Repeating $n_iter times... 20 | 21 | for i in $(seq $n_iter); do 22 | cmdline="CUDA_VISIBLE_DEVICES=0 python solve.py ../../config/opera.yaml --dataset SPRS --datapath data/SPRS --epochs 50 --weightspath ../../external/OPERA/encoder-operaCT.ckpt --bs $bs --lr $lr_prm --head $head $extra --split_iter $spl" 23 | echo $cmdline 24 | eval $cmdline 25 | done 26 | -------------------------------------------------------------------------------- /app/bmdhs/make_metadata.py: -------------------------------------------------------------------------------- 1 | # Generate EVAR metadata csv files. 
2 | import pandas as pd 3 | 4 | 5 | splits = [pd.read_csv(f) for f in ['split1.csv', 'split2.csv', 'split3.csv']] 6 | 7 | for df_index, spl_df in enumerate(splits): 8 | d = pd.DataFrame() 9 | for index, row in spl_df.iterrows(): 10 | labels, split = row['label'], row['split'] 11 | for filestem in row[[f'recording_{i}' for i in range(1, 8+1)]].values: 12 | d = pd.concat([d, pd.DataFrame({'file_name': [f'train/{filestem}.wav'], 'label': [labels], 'split': [split]})]) 13 | d.to_csv(f'../../evar/metadata/bmdhs{df_index + 1}.csv') 14 | print(d[:3]) 15 | -------------------------------------------------------------------------------- /app/circor/ev_m2d.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | weight=$1 4 | split=$2 5 | n_iter=$3 6 | seed=$4 7 | lr_prm=$5 8 | bs=1024 9 | gpu=0 10 | hidden='\(128,\)' 11 | reweight=True 12 | 13 | echo Repeating $n_iter times... 14 | 15 | for i in $(seq $n_iter); do 16 | seed=$((seed + 1)) 17 | cmdline="CUDA_VISIBLE_DEVICES=$gpu python -m app.circor.solve_circor config/m2d.yaml circor$split weight_file=$weight,encoder_only=True --lr=$lr_prm --freq_mask 0 --time_mask 0 --training_mask 0.0 --mixup 0.0 --rrc False --epochs 50 --warmup_epochs 0 --seed $seed --batch_size $bs --hidden $hidden --reweight $reweight" 18 | echo $cmdline 19 | eval $cmdline 20 | done 21 | -------------------------------------------------------------------------------- /app/icbhi_sprs/ev_sprs_byola.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ $# -lt 1 ]; then 4 | n_iter=3 5 | else 6 | n_iter=$1 7 | fi 8 | 9 | if [ $# -lt 2 ]; then 10 | lr_prm=0.00003 11 | else 12 | lr_prm=$2 13 | fi 14 | bs=256 15 | spl=1 16 | head=tfm 17 | extra=--freeze_body 18 | # --freeze_embed 19 | 20 | echo Repeating $n_iter times... 21 | 22 | for i in $(seq $n_iter); do 23 | cmdline="CUDA_VISIBLE_DEVICES=0 python solve.py ../../config/byola.yaml --dataset SPRS --datapath data/SPRS --epochs 50 --weightspath ../../external/byol_a/pretrained_weights/AudioNTT2020-BYOLA-64x96d2048.pth --bs $bs --lr $lr_prm --head $head $extra --split_iter $spl" 24 | echo $cmdline 25 | eval $cmdline 26 | done 27 | -------------------------------------------------------------------------------- /app/icbhi_sprs/ev_icbhi_byola.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ $# -lt 1 ]; then 4 | n_iter=3 5 | else 6 | n_iter=$1 7 | fi 8 | 9 | if [ $# -lt 2 ]; then 10 | lr_prm=0.00003 11 | #5e-5 @bs64 12 | else 13 | lr_prm=$2 14 | fi 15 | bs=256 16 | spl=1 17 | head=tfm 18 | extra=--freeze_body 19 | # --freeze_embed 20 | 21 | echo Repeating $n_iter times... 22 | 23 | for i in $(seq $n_iter); do 24 | cmdline="CUDA_VISIBLE_DEVICES=0 python solve.py ../../config/byola.yaml --epochs 150 --weightspath ../../external/byol_a/pretrained_weights/AudioNTT2020-BYOLA-64x96d2048.pth --bs $bs --lr $lr_prm --head $head $extra --split_iter $spl" 25 | echo $cmdline 26 | eval $cmdline 27 | done 28 | -------------------------------------------------------------------------------- /config/laionclap.yaml: -------------------------------------------------------------------------------- 1 | # Zero-shot-ready 2 | 3 | # AR parameters such as FFT parameters. 4 | audio_repr: ar_laionclap.AR_LAIONCLAP 5 | weight_file: 6 | feature_d: 768 7 | sample_rate: 48000 8 | 9 | # Model specific parameters. 10 | 11 | # Linear evaluation/Fine-tuning common parameters. 12 | 13 | # Linear evaluation parameters. 
14 | batch_size: 128 15 | lr_lineareval: 0.0003 16 | report_per_epochs: 50 17 | early_stop_epochs: 20 18 | 19 | # Fine-tuning parameters. 20 | warmup_epochs: 5 21 | mixup: 0.5 22 | ft_bs: 128 23 | ft_lr: 2.0 24 | ft_early_stop_epochs: -1 # -1: no early stopping 25 | ft_epochs: 200 26 | ft_freq_mask: 8 27 | ft_time_mask: 64 28 | ft_noise: 0.0 29 | ft_rrc: True 30 | -------------------------------------------------------------------------------- /run/all_byola2.sh: -------------------------------------------------------------------------------- 1 | python 2pass_lineareval.py config/byola2.yaml esc50 --lr=0.001 2 | python 2pass_lineareval.py config/byola2.yaml us8k --lr=0.00003 3 | python 2pass_lineareval.py config/byola2.yaml spcv2 --lr=0.00003 4 | python 2pass_lineareval.py config/byola2.yaml nsynth --lr=0.001 5 | python 2pass_lineareval.py config/byola2.yaml vc1 --lr=0.00004 6 | python 2pass_lineareval.py config/byola2.yaml voxforge --lr=0.0001 7 | python 2pass_lineareval.py config/byola2.yaml cremad 8 | python 2pass_lineareval.py config/byola2.yaml surge --lr=0.00003 9 | python 2pass_lineareval.py config/byola2.yaml gtzan batch_size=64 --lr=0.001 10 | python summarize.py external/byol_a/v2/AudioNTT2022-BYOLA-64x96d2048.pth 11 | -------------------------------------------------------------------------------- /app/icbhi_sprs/ev_sprs_beats.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ $# -lt 1 ]; then 4 | n_iter=3 5 | else 6 | n_iter=$1 7 | fi 8 | 9 | if [ $# -lt 2 ]; then 10 | lr_prm=0.00003 11 | #5e-5 @bs64 12 | else 13 | lr_prm=$2 14 | fi 15 | bs=256 16 | spl=1 17 | head=tfm 18 | extra=--freeze_body 19 | # --freeze_embed 20 | 21 | echo Repeating $n_iter times... 22 | 23 | for i in $(seq $n_iter); do 24 | cmdline="CUDA_VISIBLE_DEVICES=0 python solve.py ../../config/beats.yaml --dataset SPRS --datapath data/SPRS --epochs 50 --weightspath ../../external/BEATs_iter3.pt --bs $bs --lr $lr_prm --head $head $extra --split_iter $spl" 25 | echo $cmdline 26 | eval $cmdline 27 | done 28 | -------------------------------------------------------------------------------- /run/all_ced.sh: -------------------------------------------------------------------------------- 1 | NAME=CED 2 | python 2pass_lineareval.py config/ced.yaml cremad batch_size=16,name=$NAME 3 | python 2pass_lineareval.py config/ced.yaml gtzan batch_size=16,name=$NAME 4 | python 2pass_lineareval.py config/ced.yaml spcv2 batch_size=64,name=$NAME 5 | python 2pass_lineareval.py config/ced.yaml esc50 batch_size=64,name=$NAME 6 | python 2pass_lineareval.py config/ced.yaml us8k batch_size=64,name=$NAME 7 | python 2pass_lineareval.py config/ced.yaml vc1 batch_size=64,name=$NAME 8 | python 2pass_lineareval.py config/ced.yaml voxforge batch_size=64,name=$NAME 9 | python 2pass_lineareval.py config/ced.yaml nsynth batch_size=64,name=$NAME 10 | python 2pass_lineareval.py config/ced.yaml surge batch_size=64,name=$NAME 11 | python summarize.py $NAME 12 | -------------------------------------------------------------------------------- /run/all_atst.sh: -------------------------------------------------------------------------------- 1 | NAME=ATST 2 | python 2pass_lineareval.py config/atst.yaml cremad batch_size=16,name=$NAME 3 | python 2pass_lineareval.py config/atst.yaml gtzan batch_size=16,name=$NAME 4 | python 2pass_lineareval.py config/atst.yaml spcv2 batch_size=64,name=$NAME 5 | python 2pass_lineareval.py config/atst.yaml esc50 
batch_size=64,name=$NAME 6 | python 2pass_lineareval.py config/atst.yaml us8k batch_size=64,name=$NAME 7 | python 2pass_lineareval.py config/atst.yaml vc1 batch_size=64,name=$NAME 8 | python 2pass_lineareval.py config/atst.yaml voxforge batch_size=64,name=$NAME 9 | python 2pass_lineareval.py config/atst.yaml nsynth batch_size=64,name=$NAME 10 | python 2pass_lineareval.py config/atst.yaml surge batch_size=64,name=$NAME 11 | python summarize.py $NAME 12 | -------------------------------------------------------------------------------- /config/atst.yaml: -------------------------------------------------------------------------------- 1 | # AR parameters such as FFT parameters. 2 | audio_repr: ar_atst.AR_ATST 3 | weight_file: external/atst_base.ckpt 4 | feature_d: 1536 5 | sample_rate: 16000 6 | n_fft: 1024 7 | window_size: 1024 8 | hop_size: 160 9 | n_mels: 64 10 | window: hanning 11 | 12 | n_blocks: 1 13 | 14 | # Training parameters. 15 | batch_size: 128 16 | lr_lineareval: 0.0003 17 | # not ready lr_finetune_frozen: 0.001 18 | # not ready lr_finetune_finetune: 0.00003 19 | report_per_epochs: 20 20 | early_stop_epochs: 20 21 | 22 | # Fine-tuning parameters. 23 | warmup_epochs: 5 24 | mixup: 0.5 25 | ft_bs: 64 26 | ft_lr: 2.0 27 | ft_early_stop_epochs: -1 # -1: no early stopping 28 | ft_epochs: 200 29 | ft_freq_mask: 30 30 | ft_time_mask: 192 31 | ft_noise: 0.0 32 | ft_rrc: True 33 | -------------------------------------------------------------------------------- /run/all_beats.sh: -------------------------------------------------------------------------------- 1 | NAME=BEATs 2 | python 2pass_lineareval.py config/beats.yaml cremad batch_size=16,name=$NAME 3 | python 2pass_lineareval.py config/beats.yaml gtzan batch_size=16,name=$NAME 4 | python 2pass_lineareval.py config/beats.yaml spcv2 batch_size=64,name=$NAME 5 | python 2pass_lineareval.py config/beats.yaml esc50 batch_size=64,name=$NAME 6 | python 2pass_lineareval.py config/beats.yaml us8k batch_size=64,name=$NAME 7 | python 2pass_lineareval.py config/beats.yaml vc1 batch_size=64,name=$NAME 8 | python 2pass_lineareval.py config/beats.yaml voxforge batch_size=64,name=$NAME 9 | python 2pass_lineareval.py config/beats.yaml nsynth batch_size=64,name=$NAME 10 | python 2pass_lineareval.py config/beats.yaml surge batch_size=64,name=$NAME 11 | python summarize.py $NAME 12 | -------------------------------------------------------------------------------- /run/all_htsat.sh: -------------------------------------------------------------------------------- 1 | name=HTSAT 2 | python 2pass_lineareval.py config/htsat.yaml cremad batch_size=16,name=$name 3 | python 2pass_lineareval.py config/htsat.yaml gtzan batch_size=16,name=$name 4 | python 2pass_lineareval.py config/htsat.yaml spcv2 batch_size=64,name=$name 5 | python 2pass_lineareval.py config/htsat.yaml esc50 batch_size=64,name=$name 6 | python 2pass_lineareval.py config/htsat.yaml us8k batch_size=64,name=$name 7 | python 2pass_lineareval.py config/htsat.yaml vc1 batch_size=64,name=$name 8 | python 2pass_lineareval.py config/htsat.yaml voxforge batch_size=64,name=$name 9 | python 2pass_lineareval.py config/htsat.yaml nsynth batch_size=64,name=$name 10 | python 2pass_lineareval.py config/htsat.yaml surge batch_size=64,name=$name 11 | python summarize.py $name 12 | -------------------------------------------------------------------------------- /run/all_m2d.sh: -------------------------------------------------------------------------------- 1 | python 2pass_lineareval.py config/m2d.yaml cremad 
batch_size=16,weight_file=$1 2 | python 2pass_lineareval.py config/m2d.yaml gtzan batch_size=16,weight_file=$1 3 | python 2pass_lineareval.py config/m2d.yaml spcv2 batch_size=64,weight_file=$1 4 | python 2pass_lineareval.py config/m2d.yaml esc50 batch_size=64,weight_file=$1 5 | python 2pass_lineareval.py config/m2d.yaml us8k batch_size=64,weight_file=$1 6 | python 2pass_lineareval.py config/m2d.yaml vc1 batch_size=64,weight_file=$1 7 | python 2pass_lineareval.py config/m2d.yaml voxforge batch_size=64,weight_file=$1 8 | python 2pass_lineareval.py config/m2d.yaml nsynth batch_size=64,weight_file=$1 9 | python 2pass_lineareval.py config/m2d.yaml surge batch_size=64,weight_file=$1 10 | 11 | python summarize.py $1 12 | -------------------------------------------------------------------------------- /config/atst_frame.yaml: -------------------------------------------------------------------------------- 1 | # AR parameters such as FFT parameters. 2 | audio_repr: ar_atst_frame.AR_ATST_Frame 3 | weight_file: external/atstframe_base.ckpt 4 | feature_d: 9216 5 | sample_rate: 16000 6 | n_fft: 1024 7 | window_size: 1024 8 | hop_size: 160 9 | n_mels: 64 10 | window: hanning 11 | 12 | n_blocks: 1 13 | 14 | # Training parameters. 15 | batch_size: 128 16 | lr_lineareval: 0.00003 17 | # not ready lr_finetune_frozen: 0.001 18 | # not ready lr_finetune_finetune: 0.00003 19 | report_per_epochs: 20 20 | early_stop_epochs: 20 21 | 22 | # Fine-tuning parameters. 23 | warmup_epochs: 5 24 | mixup: 0.5 25 | ft_bs: 64 26 | ft_lr: 2.0 27 | ft_early_stop_epochs: -1 # -1: no early stopping 28 | ft_epochs: 200 29 | ft_freq_mask: 30 30 | ft_time_mask: 192 31 | ft_noise: 0.0 32 | ft_rrc: True 33 | -------------------------------------------------------------------------------- /config/byola2.yaml: -------------------------------------------------------------------------------- 1 | # AR parameters such as FFT parameters. 2 | audio_repr: ar_byola2.AR_BYOLA2 3 | weight_file: external/byol_a/v2/AudioNTT2022-BYOLA-64x96d2048.pth 4 | feature_d: 3072 5 | sample_rate: 16000 6 | n_fft: 1024 7 | window_size: 1024 8 | hop_size: 160 9 | n_mels: 64 10 | f_min: 60 11 | f_max: 7800 12 | temporal_pooling_type: mean_max 13 | 14 | # Training parameters. 15 | batch_size: 256 16 | lr_lineareval: 0.0003 17 | report_per_epochs: 20 18 | early_stop_epochs: 20 19 | 20 | # Fine-tuning parameters. 21 | ## CAUTION: The following parameters not confirmed to work. ## 22 | warmup_epochs: 5 23 | mixup: 0.5 24 | ft_bs: 256 25 | ft_lr: 0.001 26 | ft_early_stop_epochs: -1 # -1: no early stopping 27 | ft_epochs: 200 28 | ft_freq_mask: 30 29 | ft_time_mask: 100 30 | ft_rrc: True 31 | -------------------------------------------------------------------------------- /config/cnn14.yaml: -------------------------------------------------------------------------------- 1 | # AR parameters such as FFT parameters. 2 | audio_repr: ar_cnn14.AR_Cnn14 3 | weight_file: external/Cnn14_16k_mAP=0.438.pth 4 | feature_d: 2048 5 | sample_rate: 16000 6 | n_fft: 512 7 | window_size: 512 8 | hop_size: 160 9 | n_mels: 64 10 | f_min: 50 11 | f_max: 8000 12 | # temporal_pooling_type: -> not using common temporal pooling. 13 | 14 | # Training parameters. 15 | batch_size: 256 16 | lr_lineareval: 0.0003 17 | report_per_epochs: 20 18 | early_stop_epochs: 20 19 | 20 | # Fine-tuning parameters. 21 | ## CAUTION: The following parameters not confirmed to work. 
## 22 | warmup_epochs: 5 23 | mixup: 0.5 24 | ft_bs: 256 25 | ft_lr: 0.001 26 | ft_early_stop_epochs: -1 # -1: no early stopping 27 | ft_epochs: 200 28 | ft_freq_mask: 30 29 | ft_time_mask: 100 30 | ft_rrc: False 31 | -------------------------------------------------------------------------------- /config/byola.yaml: -------------------------------------------------------------------------------- 1 | # AR parameters such as FFT parameters. 2 | audio_repr: ar_byola.AR_BYOLA 3 | weight_file: external/byol_a/pretrained_weights/AudioNTT2020-BYOLA-64x96d2048.pth 4 | feature_d: 2048 5 | sample_rate: 16000 6 | n_fft: 1024 7 | window_size: 1024 8 | hop_size: 160 9 | n_mels: 64 10 | f_min: 60 11 | f_max: 7800 12 | temporal_pooling_type: mean_max 13 | 14 | # Training parameters. 15 | batch_size: 256 16 | lr_lineareval: 0.0003 17 | report_per_epochs: 20 18 | early_stop_epochs: 20 19 | 20 | # Fine-tuning parameters. 21 | ## CAUTION: The following parameters not confirmed to work. ## 22 | warmup_epochs: 5 23 | mixup: 0.5 24 | ft_bs: 256 25 | ft_lr: 0.001 26 | ft_early_stop_epochs: -1 # -1: no early stopping 27 | ft_epochs: 200 28 | ft_freq_mask: 30 29 | ft_time_mask: 100 30 | ft_rrc: True 31 | -------------------------------------------------------------------------------- /config/byolax.yaml: -------------------------------------------------------------------------------- 1 | # AR parameters such as FFT parameters. 2 | audio_repr: ar_byola.AR_BYOLAX 3 | weight_file: external/byol_a/pretrained_weights/AudioNTT2020-BYOLA-64x96d2048.pth 4 | feature_d: 2048 5 | sample_rate: 16000 6 | n_fft: 1024 7 | window_size: 1024 8 | hop_size: 160 9 | n_mels: 64 10 | f_min: 60 11 | f_max: 7800 12 | temporal_pooling_type: mean_max 13 | 14 | # Training parameters. 15 | batch_size: 256 16 | lr_lineareval: 0.0003 17 | report_per_epochs: 20 18 | early_stop_epochs: 20 19 | 20 | # Fine-tuning parameters. 21 | ## CAUTION: The following parameters not confirmed to work. ## 22 | warmup_epochs: 5 23 | mixup: 0.5 24 | ft_bs: 256 25 | ft_lr: 0.001 26 | ft_early_stop_epochs: -1 # -1: no early stopping 27 | ft_epochs: 200 28 | ft_freq_mask: 30 29 | ft_time_mask: 100 30 | ft_rrc: True 31 | -------------------------------------------------------------------------------- /config/opera.yaml: -------------------------------------------------------------------------------- 1 | # AR parameters such as FFT parameters. 2 | audio_repr: ar_opera.AR_OPERA_CT 3 | weight_file: external/OPERA/encoder-operaCT.ckpt 4 | feature_d: 768 5 | sample_rate: 16000 6 | n_fft: 1024 7 | # window_size: 1024 8 | hop_size: 512 9 | n_mels: 64 10 | f_min: 50 11 | f_max: 8000 12 | # window: hanning 13 | 14 | # Linear evaluation/Fine-tuning common parameters. 15 | training_mask: 0.0 16 | 17 | # Linear evaluaition parameters. 18 | batch_size: 128 19 | lr_lineareval: 0.0003 20 | report_per_epochs: 50 21 | early_stop_epochs: 20 22 | 23 | # Fine-tuning parameters. 
24 | warmup_epochs: 5 25 | mixup: 0.5 26 | ft_bs: 128 27 | ft_lr: 2.0 28 | ft_early_stop_epochs: -1 # -1: no early stopping 29 | ft_epochs: 200 30 | ft_freq_mask: 8 31 | ft_time_mask: 64 32 | ft_noise: 0.0 33 | ft_rrc: True 34 | -------------------------------------------------------------------------------- /external/coala.patch: -------------------------------------------------------------------------------- 1 | diff --git a/encode.py b/encode.py 2 | index d8d892f..3646540 100755 3 | --- a/encode.py 4 | +++ b/encode.py 5 | @@ -12,12 +12,12 @@ from pathlib import Path 6 | from tqdm import tqdm 7 | import librosa 8 | 9 | -from utils import compute_spectrogram 10 | -from models_t1000 import AudioEncoder, TagEncoder, CNN 11 | +from .utils import compute_spectrogram 12 | +from .models_t1000 import AudioEncoder, TagEncoder, CNN 13 | 14 | 15 | -scaler = pickle.load(open('./scaler_top_1000.pkl', 'rb')) 16 | -id2tag = json.load(open('./json/id2token_top_1000.json', 'rb')) 17 | +scaler = pickle.load(open('external/coala_scaler_top_1000_plus_clip.pkl', 'rb')) 18 | +id2tag = json.load(open('external/coala/json/id2token_top_1000.json', 'rb')) 19 | tag2id = {tag: id for id, tag in id2tag.items()} 20 | 21 | -------------------------------------------------------------------------------- /config/ast.yaml: -------------------------------------------------------------------------------- 1 | # AR parameters such as FFT parameters. 2 | audio_repr: ar_ast.AR_AST 3 | weight_file: external/ast/pretrained_models/ast_audioset.pth 4 | feature_d: 768 5 | sample_rate: 16000 6 | n_fft: 400 7 | window_size: 400 8 | hop_size: 160 9 | n_mels: 128 10 | window: hanning # paper is typo (https://github.com/YuanGongND/ast/issues/13) 11 | 12 | # Training parameters. 13 | batch_size: 128 14 | lr_lineareval: 0.0003 15 | report_per_epochs: 20 16 | early_stop_epochs: 20 17 | 18 | # Fine-tuning parameters. 19 | ## CAUTION: The following parameters not confirmed to work. ## 20 | warmup_epochs: 4 21 | mixup: 0.5 22 | ft_bs: 64 23 | ft_lr: 2.0 24 | ft_early_stop_epochs: -1 # -1: no early stopping 25 | ft_epochs: 200 26 | ft_freq_mask: 24 # for ESC-50 in this case 27 | ft_time_mask: 96 # for ESC-50 28 | ft_rrc: True 29 | -------------------------------------------------------------------------------- /config/vggish.yaml: -------------------------------------------------------------------------------- 1 | # AR parameters such as FFT parameters. 2 | audio_repr: ar_vggish.AR_VGGish 3 | feature_d: 128 4 | sample_rate: 16000 5 | n_fft: 400 6 | window_size: 400 7 | hop_size: 160 8 | n_mels: 64 9 | f_min: 125 10 | f_max: 7500 11 | temporal_pooling_type: mean_max 12 | 13 | # Training parameters. 14 | batch_size: 256 15 | lr_lineareval: 0.0003 16 | report_per_epochs: 20 17 | early_stop_epochs: 20 18 | 19 | # SAMPLE_RATE = 16000 20 | # STFT_WINDOW_LENGTH_SECONDS = 0.025 -> 400 21 | # STFT_HOP_LENGTH_SECONDS = 0.010 -> 160 22 | # NUM_MEL_BINS = NUM_BANDS 23 | # MEL_MIN_HZ = 125 24 | # MEL_MAX_HZ = 7500 25 | # LOG_OFFSET = 0.01 # Offset used for stabilized log of input mel-spectrogram. 26 | # EXAMPLE_WINDOW_SECONDS = 0.96 # Each example contains 96 10ms frames 27 | # EXAMPLE_HOP_SECONDS = 0.96 # with zero overlap. 
28 | -------------------------------------------------------------------------------- /run/all_atst_frame.sh: -------------------------------------------------------------------------------- 1 | NAME=ATSTFrame 2 | python 2pass_lineareval.py config/atst_frame.yaml cremad batch_size=16,name=$NAME 3 | python 2pass_lineareval.py config/atst_frame.yaml gtzan batch_size=16,name=$NAME 4 | python 2pass_lineareval.py config/atst_frame.yaml spcv2 batch_size=64,name=$NAME 5 | python 2pass_lineareval.py config/atst_frame.yaml esc50 batch_size=64,name=$NAME 6 | python 2pass_lineareval.py config/atst_frame.yaml us8k batch_size=64,name=$NAME 7 | python 2pass_lineareval.py config/atst_frame.yaml vc1 batch_size=64,name=$NAME 8 | python 2pass_lineareval.py config/atst_frame.yaml voxforge batch_size=64,name=$NAME 9 | python 2pass_lineareval.py config/atst_frame.yaml nsynth batch_size=64,name=$NAME 10 | python 2pass_lineareval.py config/atst_frame.yaml surge batch_size=64,name=$NAME 11 | python summarize.py $NAME 12 | -------------------------------------------------------------------------------- /run/all_beats_plus.sh: -------------------------------------------------------------------------------- 1 | NAME=BEATsPlus 2 | python 2pass_lineareval.py config/beats_plus.yaml cremad batch_size=16,name=$NAME 3 | python 2pass_lineareval.py config/beats_plus.yaml gtzan batch_size=16,name=$NAME 4 | python 2pass_lineareval.py config/beats_plus.yaml spcv2 batch_size=64,name=$NAME 5 | python 2pass_lineareval.py config/beats_plus.yaml esc50 batch_size=64,name=$NAME 6 | python 2pass_lineareval.py config/beats_plus.yaml us8k batch_size=64,name=$NAME 7 | python 2pass_lineareval.py config/beats_plus.yaml vc1 batch_size=64,name=$NAME 8 | python 2pass_lineareval.py config/beats_plus.yaml voxforge batch_size=64,name=$NAME 9 | python 2pass_lineareval.py config/beats_plus.yaml nsynth batch_size=64,name=$NAME 10 | python 2pass_lineareval.py config/beats_plus.yaml surge batch_size=64,name=$NAME 11 | python summarize.py $NAME 12 | -------------------------------------------------------------------------------- /config/vggish_4k.yaml: -------------------------------------------------------------------------------- 1 | # AR parameters such as FFT parameters. 2 | audio_repr: ar_vggish.AR_VGGish_4K 3 | feature_d: 4096 4 | sample_rate: 16000 5 | n_fft: 400 6 | window_size: 400 7 | hop_size: 160 8 | n_mels: 64 9 | f_min: 125 10 | f_max: 7500 11 | temporal_pooling_type: mean_max 12 | 13 | # Training parameters. 14 | batch_size: 256 15 | lr_lineareval: 0.0003 16 | report_per_epochs: 20 17 | early_stop_epochs: 20 18 | 19 | # SAMPLE_RATE = 16000 20 | # STFT_WINDOW_LENGTH_SECONDS = 0.025 -> 400 21 | # STFT_HOP_LENGTH_SECONDS = 0.010 -> 160 22 | # NUM_MEL_BINS = NUM_BANDS 23 | # MEL_MIN_HZ = 125 24 | # MEL_MAX_HZ = 7500 25 | # LOG_OFFSET = 0.01 # Offset used for stabilized log of input mel-spectrogram. 26 | # EXAMPLE_WINDOW_SECONDS = 0.96 # Each example contains 96 10ms frames 27 | # EXAMPLE_HOP_SECONDS = 0.96 # with zero overlap. 
28 | -------------------------------------------------------------------------------- /plugin/MARBLE/configs/evar/MTT.yaml: -------------------------------------------------------------------------------- 1 | _import: 2 | - !include benchmark/tasks/MTT/MTT_base_config.yaml 3 | 4 | dataset: 5 | pre_extract: 6 | accelerator: gpu 7 | audio_dir: data/MTT/mp3 8 | output_dir: 9 | keep_folder_structure: true 10 | overwrite: true 11 | 12 | feature_extractor: 13 | pretrain: 14 | !include benchmark/models/evar/evar.yaml 15 | 16 | dataset: MTT 17 | input_type: feature # [audio, feature] 18 | input_dir: 19 | metadata_dir: data/MTT 20 | 21 | model: 22 | downstream_structure: 23 | components: 24 | - name: feature_selector 25 | layer: 0 # not a transformer model, so layer is set to 0 26 | normalized_weight_sum: false 27 | - name: mlp 28 | hidden_layer_sizes: [512] 29 | dropout_p: 0.2 30 | num_outputs: 50 31 | -------------------------------------------------------------------------------- /config/msclap.yaml: -------------------------------------------------------------------------------- 1 | # Zero-shot-ready 2 | 3 | # AR parameters such as FFT parameters. 4 | audio_repr: ar_msclap.AR_MSCLAP 5 | weight_file: 2023 6 | feature_d: 1024 7 | sample_rate: 32000 8 | n_fft: 1024 9 | window_size: 1024 10 | hop_size: 320 11 | n_mels: 64 12 | f_min: 50 13 | f_max: 14000 14 | window: hanning 15 | 16 | # Model specific parameters. 17 | return_filename: True 18 | 19 | # Linear evaluation/Fine-tuning common parameters. 20 | 21 | # Linear evaluaition parameters. 22 | batch_size: 128 23 | lr_lineareval: 0.0003 24 | report_per_epochs: 50 25 | early_stop_epochs: 20 26 | 27 | # Fine-tuning parameters. 28 | warmup_epochs: 5 29 | mixup: 0.5 30 | ft_bs: 128 31 | ft_lr: 2.0 32 | ft_early_stop_epochs: -1 # -1: no early stopping 33 | ft_epochs: 200 34 | ft_freq_mask: 8 35 | ft_time_mask: 64 36 | ft_noise: 0.0 37 | ft_rrc: True 38 | -------------------------------------------------------------------------------- /config/wavcaps.yaml: -------------------------------------------------------------------------------- 1 | # Zero-shot-ready 2 | 3 | # AR parameters such as FFT parameters. 4 | audio_repr: ar_wavcaps.AR_WavCaps 5 | weight_file: external/WavCaps/HTSAT-BERT-PT.pt 6 | feature_d: 768 7 | sample_rate: 32000 8 | n_fft: 1024 9 | window_size: 1024 10 | hop_size: 320 11 | n_mels: 64 12 | f_min: 50 13 | f_max: 14000 14 | window: hanning 15 | 16 | # Model specific parameters. 17 | 18 | # Linear evaluation/Fine-tuning common parameters. 19 | 20 | # Linear evaluaition parameters. 21 | batch_size: 128 22 | lr_lineareval: 0.0003 23 | report_per_epochs: 50 24 | early_stop_epochs: 20 25 | 26 | # Fine-tuning parameters. 
27 | warmup_epochs: 5 28 | mixup: 0.5 29 | ft_bs: 128 30 | ft_lr: 2.0 31 | ft_early_stop_epochs: -1 # -1: no early stopping 32 | ft_epochs: 200 33 | ft_freq_mask: 8 34 | ft_time_mask: 64 35 | ft_noise: 0.0 36 | ft_rrc: True 37 | -------------------------------------------------------------------------------- /plugin/MARBLE/configs/evar/GTZAN.yaml: -------------------------------------------------------------------------------- 1 | _import: 2 | - !include benchmark/tasks/GTZAN/GTZAN_base_config.yaml 3 | 4 | dataset: 5 | pre_extract: 6 | accelerator: gpu 7 | audio_dir: data/GTZAN/genres 8 | output_dir: 9 | keep_folder_structure: true 10 | overwrite: true 11 | 12 | feature_extractor: 13 | pretrain: 14 | !include benchmark/models/evar/evar.yaml 15 | 16 | dataset: GTZAN 17 | input_type: feature # [audio, feature] 18 | input_dir: 19 | metadata_dir: data/GTZAN 20 | 21 | model: 22 | downstream_structure: 23 | components: 24 | - name: feature_selector 25 | layer: 0 # not a transformer model, so layer is set to 0 26 | normalized_weight_sum: false 27 | - name: mlp 28 | hidden_layer_sizes: [512] 29 | dropout_p: 0.2 30 | num_outputs: 10 31 | -------------------------------------------------------------------------------- /plugin/MARBLE/configs/evar/NSynthI.yaml: -------------------------------------------------------------------------------- 1 | _import: 2 | - !include benchmark/tasks/NSynth/NSynthI_base_config.yaml 3 | 4 | dataset: 5 | pre_extract: 6 | accelerator: gpu 7 | audio_dir: data/NSynth 8 | output_dir: 9 | keep_folder_structure: true 10 | overwrite: true 11 | 12 | feature_extractor: 13 | pretrain: 14 | !include benchmark/models/evar/evar.yaml 15 | 16 | dataset: NSynthI 17 | input_type: feature # [audio, feature] 18 | input_dir: 19 | metadata_dir: data/NSynth 20 | 21 | model: 22 | downstream_structure: 23 | components: 24 | - name: feature_selector 25 | layer: 0 # not a transformer model, so layer is set to 0 26 | normalized_weight_sum: false 27 | - name: mlp 28 | hidden_layer_sizes: [512] 29 | dropout_p: 0.2 30 | num_outputs: 11 31 | -------------------------------------------------------------------------------- /plugin/MARBLE/configs/evar/NSynthP.yaml: -------------------------------------------------------------------------------- 1 | _import: 2 | - !include benchmark/tasks/NSynth/NSynthP_base_config.yaml 3 | 4 | dataset: 5 | pre_extract: 6 | accelerator: gpu 7 | audio_dir: data/NSynth 8 | output_dir: 9 | keep_folder_structure: true 10 | overwrite: true 11 | 12 | feature_extractor: 13 | pretrain: 14 | !include benchmark/models/evar/evar.yaml 15 | 16 | dataset: NSynthP 17 | input_type: feature # [audio, feature] 18 | input_dir: 19 | metadata_dir: data/NSynth 20 | 21 | model: 22 | downstream_structure: 23 | components: 24 | - name: feature_selector 25 | layer: 0 # not a transformer model, so layer is set to 0 26 | normalized_weight_sum: false 27 | - name: mlp 28 | hidden_layer_sizes: [512] 29 | dropout_p: 0.2 30 | num_outputs: 128 31 | -------------------------------------------------------------------------------- /plugin/MARBLE/configs/evar/GS.yaml: -------------------------------------------------------------------------------- 1 | _import: 2 | - !include benchmark/tasks/GS/GS_base_config.yaml 3 | 4 | dataset: 5 | pre_extract: 6 | accelerator: gpu 7 | audio_dir: data/GS/giantsteps_clips/wav 8 | output_dir: 9 | keep_folder_structure: true 10 | overwrite: true 11 | 12 | feature_extractor: 13 | pretrain: 14 | !include benchmark/models/evar/evar.yaml 15 | 16 | dataset: GS 17 | input_type: 
feature # [audio, feature] 18 | input_dir: 19 | metadata_dir: data/GS/giantsteps_clips 20 | 21 | model: 22 | downstream_structure: 23 | components: 24 | - name: feature_selector 25 | layer: 0 # not a transformer model, so layer is set to 0 26 | normalized_weight_sum: false 27 | - name: mlp 28 | hidden_layer_sizes: [512] 29 | dropout_p: 0.2 30 | num_outputs: 24 31 | -------------------------------------------------------------------------------- /plugin/MARBLE/configs/evar/VocalSetS.yaml: -------------------------------------------------------------------------------- 1 | _import: 2 | - !include benchmark/tasks/VocalSet/VocalSetS_base_config.yaml 3 | 4 | dataset: 5 | pre_extract: 6 | accelerator: gpu 7 | audio_dir: data/VocalSet/audio 8 | output_dir: 9 | keep_folder_structure: true 10 | overwrite: true 11 | 12 | feature_extractor: 13 | pretrain: 14 | !include benchmark/models/evar/evar.yaml 15 | 16 | dataset: VocalSetS 17 | input_type: feature # [audio, feature] 18 | input_dir: 19 | metadata_dir: data/VocalSet 20 | 21 | model: 22 | downstream_structure: 23 | components: 24 | - name: feature_selector 25 | layer: 0 # not a transformer model, so layer is set to 0 26 | normalized_weight_sum: false 27 | - name: mlp 28 | hidden_layer_sizes: [512] 29 | dropout_p: 0.2 30 | num_outputs: 20 31 | -------------------------------------------------------------------------------- /plugin/MARBLE/configs/evar/VocalSetT.yaml: -------------------------------------------------------------------------------- 1 | _import: 2 | - !include benchmark/tasks/VocalSet/VocalSetT_base_config.yaml 3 | 4 | dataset: 5 | pre_extract: 6 | accelerator: gpu 7 | audio_dir: data/VocalSet/audio 8 | output_dir: 9 | keep_folder_structure: true 10 | overwrite: true 11 | 12 | feature_extractor: 13 | pretrain: 14 | !include benchmark/models/evar/evar.yaml 15 | 16 | dataset: VocalSetT 17 | input_type: feature # [audio, feature] 18 | input_dir: 19 | metadata_dir: data/VocalSet 20 | 21 | model: 22 | downstream_structure: 23 | components: 24 | - name: feature_selector 25 | layer: 0 # not a transformer model, so layer is set to 0 26 | normalized_weight_sum: false 27 | - name: mlp 28 | hidden_layer_sizes: [512] 29 | dropout_p: 0.2 30 | num_outputs: 10 31 | -------------------------------------------------------------------------------- /plugin/MARBLE/configs/evar/MTGMood.yaml: -------------------------------------------------------------------------------- 1 | _import: 2 | - !include benchmark/tasks/MTG/MTGMood_base_config.yaml 3 | 4 | dataset: 5 | pre_extract: 6 | accelerator: gpu 7 | audio_dir: data/MTG/audio-low 8 | output_dir: 9 | keep_folder_structure: true 10 | overwrite: true 11 | 12 | feature_extractor: 13 | pretrain: 14 | !include benchmark/models/evar/evar.yaml 15 | 16 | dataset: MTGMood 17 | input_type: feature # [audio, feature] 18 | input_dir: 19 | metadata_dir: data/MTG/mtg-jamendo-dataset 20 | 21 | model: 22 | downstream_structure: 23 | components: 24 | - name: feature_selector 25 | layer: 0 # not a transformer model, so layer is set to 0 26 | normalized_weight_sum: false 27 | - name: mlp 28 | hidden_layer_sizes: [512] 29 | dropout_p: 0.2 30 | num_outputs: 56 31 | -------------------------------------------------------------------------------- /plugin/MARBLE/configs/evar/MTGGenre.yaml: -------------------------------------------------------------------------------- 1 | _import: 2 | - !include benchmark/tasks/MTG/MTGGenre_base_config.yaml 3 | 4 | dataset: 5 | pre_extract: 6 | accelerator: gpu 7 | audio_dir: data/MTG/audio-low 8 
| output_dir: 9 | keep_folder_structure: true 10 | overwrite: true 11 | 12 | feature_extractor: 13 | pretrain: 14 | !include benchmark/models/evar/evar.yaml 15 | 16 | dataset: MTGGenre 17 | input_type: feature # [audio, feature] 18 | input_dir: 19 | metadata_dir: data/MTG/mtg-jamendo-dataset 20 | 21 | model: 22 | downstream_structure: 23 | components: 24 | - name: feature_selector 25 | layer: 0 # not a transformer model, so layer is set to 0 26 | normalized_weight_sum: false 27 | - name: mlp 28 | hidden_layer_sizes: [512] 29 | dropout_p: 0.2 30 | num_outputs: 87 31 | -------------------------------------------------------------------------------- /plugin/MARBLE/configs/evar/MTGTop50.yaml: -------------------------------------------------------------------------------- 1 | _import: 2 | - !include benchmark/tasks/MTG/MTGTop50_base_config.yaml 3 | 4 | dataset: 5 | pre_extract: 6 | accelerator: gpu 7 | audio_dir: data/MTG/audio-low 8 | output_dir: 9 | keep_folder_structure: true 10 | overwrite: true 11 | 12 | feature_extractor: 13 | pretrain: 14 | !include benchmark/models/evar/evar.yaml 15 | 16 | dataset: MTGTop50 17 | input_type: feature # [audio, feature] 18 | input_dir: 19 | metadata_dir: data/MTG/mtg-jamendo-dataset 20 | 21 | model: 22 | downstream_structure: 23 | components: 24 | - name: feature_selector 25 | layer: 0 # not a transformer model, so layer is set to 0 26 | normalized_weight_sum: false 27 | - name: mlp 28 | hidden_layer_sizes: [512] 29 | dropout_p: 0.2 30 | num_outputs: 50 31 | -------------------------------------------------------------------------------- /plugin/MARBLE/configs/evar/MTGInstrument.yaml: -------------------------------------------------------------------------------- 1 | _import: 2 | - !include benchmark/tasks/MTG/MTGInstrument_base_config.yaml 3 | 4 | dataset: 5 | pre_extract: 6 | accelerator: gpu 7 | audio_dir: data/MTG/audio-low 8 | output_dir: 9 | keep_folder_structure: true 10 | overwrite: true 11 | 12 | feature_extractor: 13 | pretrain: 14 | !include benchmark/models/evar/evar.yaml 15 | 16 | dataset: MTGInstrument 17 | input_type: feature # [audio, feature] 18 | input_dir: 19 | metadata_dir: data/MTG/mtg-jamendo-dataset 20 | 21 | model: 22 | downstream_structure: 23 | components: 24 | - name: feature_selector 25 | layer: 0 # not a transformer model, so layer is set to 0 26 | normalized_weight_sum: false 27 | - name: mlp 28 | hidden_layer_sizes: [512] 29 | dropout_p: 0.2 30 | num_outputs: 40 31 | -------------------------------------------------------------------------------- /plugin/MARBLE/configs/evar/EMO.yaml: -------------------------------------------------------------------------------- 1 | _import: 2 | - !include benchmark/tasks/EMO/EMO_base_config.yaml 3 | 4 | dataset: 5 | pre_extract: 6 | accelerator: gpu 7 | audio_dir: data/EMO/emomusic/wav 8 | output_dir: 9 | keep_folder_structure: true 10 | overwrite: true 11 | 12 | feature_extractor: 13 | pretrain: 14 | !include benchmark/models/evar/evar.yaml 15 | 16 | dataset: EMO 17 | input_type: feature # [audio, feature] 18 | input_dir: 19 | metadata_dir: data/EMO/emomusic 20 | 21 | model: 22 | downstream_structure: 23 | components: 24 | - name: feature_selector 25 | layer: 0 # [all, 0, 1, 2, ..., $n_tranformer_layer] 26 | # weighted sum is only effective when layer is set to all 27 | normalized_weight_sum: false 28 | 29 | - name: mlp 30 | hidden_layer_sizes: [512] 31 | dropout_p: 0.2 32 | num_outputs: 2 33 | -------------------------------------------------------------------------------- 
/evar/utils/m2d_add_norm_stats.py: -------------------------------------------------------------------------------- 1 | """A small utility for M2D fine-tuned by EVAR. 2 | This utility adds a parameter "module.ar.runtime.backbone.norm_stats" to a checkpoint file with constant normalization statistic values [-7.1, 4.2]. 3 | These values are the dataset average and standard deviation when pre-trained on AudioSet with M2D. 4 | 5 | Usage: python [this script] [source checkpoint file] [output checkpoint file] 6 | """ 7 | 8 | import torch 9 | import sys 10 | 11 | src_file = sys.argv[1] 12 | dest_file = sys.argv[2] 13 | 14 | checkpoint = torch.load(src_file, map_location='cpu') 15 | if 'module.ar.runtime.backbone.cls_token' not in checkpoint: 16 | print(f'{src_file} is not a fine-tuned checkpoint; no "module.ar.runtime.backbone.cls_token".') 17 | exit(1) 18 | 19 | checkpoint['module.ar.runtime.backbone.norm_stats'] = torch.tensor([-7.1, 4.2]) 20 | torch.save(checkpoint, dest_file) 21 | print(f'Saved {dest_file} with an additional parameter "module.ar.runtime.backbone.norm_stats".') 22 | -------------------------------------------------------------------------------- /config/ced.yaml: -------------------------------------------------------------------------------- 1 | # AR parameters such as FFT parameters. 2 | audio_repr: ar_ced.AR_CED 3 | weight_file: mispeech/ced-base 4 | feature_d: 768 5 | sample_rate: 16000 6 | n_fft: 512 7 | window_size: 512 8 | hop_size: 160 9 | n_mels: 64 10 | f_min: 0 11 | f_max: 8000 12 | # window: hanning 13 | 14 | # Model specific parameters. 15 | cls_token: False # Use CLS token 16 | output_layers: [-1] # List of layers to stack 17 | encoder_only: False 18 | dur_frames: # None for no desired number of frames 19 | freeze_embed: # Set True if freezing PatchEmbed during fine-tuning [2211.09359] How to Fine-Tune Vision Models with SGD 20 | 21 | # Linear evaluation/Fine-tuning common parameters. 22 | training_mask: 0.0 23 | flat_features: False # 768-d if True else 3840-d 24 | 25 | # Linear evaluaition parameters. 26 | batch_size: 128 27 | lr_lineareval: 0.0003 28 | report_per_epochs: 50 29 | early_stop_epochs: 20 30 | 31 | # Fine-tuning parameters. 
32 | warmup_epochs: 5 33 | mixup: 0.5 34 | ft_bs: 128 35 | ft_lr: 2.0 36 | ft_early_stop_epochs: -1 # -1: no early stopping 37 | ft_epochs: 200 38 | ft_freq_mask: 30 39 | ft_time_mask: 192 40 | ft_noise: 0.0 41 | ft_rrc: True 42 | -------------------------------------------------------------------------------- /run/all_msclap.sh: -------------------------------------------------------------------------------- 1 | NAME=MSCLAP 2 | python 2pass_lineareval.py config/msclap.yaml cremad batch_size=16,name=$NAME 3 | python 2pass_lineareval.py config/msclap.yaml gtzan batch_size=16,name=$NAME 4 | python 2pass_lineareval.py config/msclap.yaml spcv2 batch_size=64,name=$NAME 5 | python 2pass_lineareval.py config/msclap.yaml esc50 batch_size=64,name=$NAME 6 | python 2pass_lineareval.py config/msclap.yaml us8k batch_size=64,name=$NAME 7 | python 2pass_lineareval.py config/msclap.yaml vc1 batch_size=64,name=$NAME 8 | python 2pass_lineareval.py config/msclap.yaml voxforge batch_size=64,name=$NAME 9 | python 2pass_lineareval.py config/msclap.yaml nsynth batch_size=64,name=$NAME 10 | python 2pass_lineareval.py config/msclap.yaml surge batch_size=64,name=$NAME 11 | 12 | python zeroshot.py config/msclap.yaml cremad batch_size=16,name=$NAME 13 | python zeroshot.py config/msclap.yaml gtzan batch_size=16,name=$NAME 14 | python zeroshot.py config/msclap.yaml nsynth batch_size=64,name=$NAME 15 | python zeroshot.py config/msclap.yaml esc50 batch_size=64,name=$NAME 16 | python zeroshot.py config/msclap.yaml us8k batch_size=64,name=$NAME 17 | 18 | python summarize.py $NAME 19 | -------------------------------------------------------------------------------- /config/htsat.yaml: -------------------------------------------------------------------------------- 1 | # AR parameters such as FFT parameters. 2 | audio_repr: ar_htsat.AR_HTSAT 3 | weight_file: external/HTSAT_AudioSet_Saved_1.ckpt 4 | feature_d: 768 5 | sample_rate: 32000 6 | n_fft: 1024 7 | window_size: 1024 8 | hop_size: 320 9 | n_mels: 64 10 | f_min: 50 11 | f_max: 14000 12 | window: hanning 13 | 14 | # Model specific parameters. 15 | cls_token: False # Use CLS token 16 | output_layers: [-1] # List of layers to stack 17 | encoder_only: False 18 | dur_frames: # None for no desired number of frames 19 | freeze_embed: # Set True if freezing PatchEmbed during fine-tuning [2211.09359] How to Fine-Tune Vision Models with SGD 20 | 21 | # Linear evaluation/Fine-tuning common parameters. 22 | training_mask: 0.0 23 | flat_features: False # 768-d if True else 3840-d 24 | 25 | # Linear evaluaition parameters. 26 | batch_size: 128 27 | lr_lineareval: 0.0003 28 | report_per_epochs: 50 29 | early_stop_epochs: 20 30 | 31 | # Fine-tuning parameters. 32 | warmup_epochs: 5 33 | mixup: 0.5 34 | ft_bs: 128 35 | ft_lr: 2.0 36 | ft_early_stop_epochs: -1 # -1: no early stopping 37 | ft_epochs: 200 38 | ft_freq_mask: 8 39 | ft_time_mask: 64 40 | ft_noise: 0.0 41 | ft_rrc: True 42 | -------------------------------------------------------------------------------- /config/beats.yaml: -------------------------------------------------------------------------------- 1 | # AR parameters such as FFT parameters. 2 | audio_repr: ar_beats.AR_BEATs 3 | name: BEATs_iter3 4 | weight_file: external/BEATs_iter3.pt 5 | feature_d: 768 6 | sample_rate: 16000 7 | n_fft: 400 8 | window_size: 400 9 | hop_size: 160 10 | n_mels: 80 11 | f_min: 50 12 | f_max: 8000 13 | window: hanning 14 | 15 | # Model specific parameters. 
16 | cls_token: False # Use CLS token 17 | output_layers: [-1] # List of layers to stack 18 | encoder_only: False 19 | dur_frames: # None for no desired number of frames 20 | freeze_embed: # Set True if freezing PatchEmbed during fine-tuning [2211.09359] How to Fine-Tune Vision Models with SGD 21 | 22 | # Linear evaluation/Fine-tuning common parameters. 23 | training_mask: 0.0 24 | flat_features: False # 768-d if True else 3840-d 25 | 26 | # Linear evaluaition parameters. 27 | batch_size: 128 28 | lr_lineareval: 0.0003 29 | report_per_epochs: 50 30 | early_stop_epochs: 20 31 | 32 | # Fine-tuning parameters. 33 | warmup_epochs: 5 34 | mixup: 0.5 35 | ft_bs: 128 36 | ft_lr: 2.0 37 | ft_early_stop_epochs: -1 # -1: no early stopping 38 | ft_epochs: 200 39 | ft_freq_mask: 30 40 | ft_time_mask: 192 41 | ft_noise: 0.0 42 | ft_rrc: True 43 | -------------------------------------------------------------------------------- /config/m2d_clap.yaml: -------------------------------------------------------------------------------- 1 | # Zero-shot-ready 2 | 3 | # AR parameters such as FFT parameters. 4 | audio_repr: ar_m2d.AR_M2D_CLAP 5 | weight_file: m2d_clap_vit_base-80x208p16x16p16k-random/random 6 | feature_d: 3840 7 | sample_rate: 16000 8 | n_fft: 400 9 | window_size: 400 10 | hop_size: 160 11 | n_mels: 80 12 | f_min: 50 13 | f_max: 8000 14 | window: hanning 15 | 16 | # Model specific parameters. 17 | output_layers: [-1] # List of layers to stack 18 | encoder_only: True 19 | dur_frames: # None for no desired number of frames 20 | freeze_embed: # Set True if freezing PatchEmbed during fine-tuning [2211.09359] How to Fine-Tune Vision Models with SGD 21 | 22 | # Linear evaluation/Fine-tuning common parameters. 23 | flat_features: True # 768-d if True else 3840-d 24 | 25 | # Linear evaluaition parameters. 26 | batch_size: 128 27 | lr_lineareval: 0.00003 28 | report_per_epochs: 50 29 | early_stop_epochs: 20 30 | 31 | # Fine-tuning parameters. 32 | training_mask: 0.0 33 | warmup_epochs: 5 34 | mixup: 0.5 35 | ft_bs: 128 36 | ft_lr: 2.0 37 | ft_early_stop_epochs: -1 # -1: no early stopping 38 | ft_epochs: 200 39 | ft_freq_mask: 30 40 | ft_time_mask: 192 41 | ft_noise: 0.0 42 | ft_rrc: True 43 | -------------------------------------------------------------------------------- /config/m2d_clap_32k.yaml: -------------------------------------------------------------------------------- 1 | # Zero-shot-ready 2 | 3 | # AR parameters such as FFT parameters. 4 | audio_repr: ar_m2d.AR_M2D_CLAP 5 | weight_file: m2d_clap_vit_base-80x208p16x16p32k-random/random 6 | feature_d: 3840 7 | sample_rate: 32000 8 | n_fft: 800 9 | window_size: 800 10 | hop_size: 320 11 | n_mels: 80 12 | f_min: 50 13 | f_max: 16000 14 | window: hanning 15 | 16 | # Model specific parameters. 17 | output_layers: [-1] # List of layers to stack 18 | encoder_only: False 19 | dur_frames: # None for no desired number of frames 20 | freeze_embed: # Set True if freezing PatchEmbed during fine-tuning [2211.09359] How to Fine-Tune Vision Models with SGD 21 | 22 | # Linear evaluation/Fine-tuning common parameters. 23 | flat_features: True # 768-d if True else 3840-d 24 | 25 | # Linear evaluaition parameters. 26 | batch_size: 128 27 | lr_lineareval: 0.00003 28 | report_per_epochs: 50 29 | early_stop_epochs: 20 30 | 31 | # Fine-tuning parameters. 
32 | training_mask: 0.0 33 | warmup_epochs: 5 34 | mixup: 0.5 35 | ft_bs: 128 36 | ft_lr: 2.0 37 | ft_early_stop_epochs: -1 # -1: no early stopping 38 | ft_epochs: 200 39 | ft_freq_mask: 30 40 | ft_time_mask: 192 41 | ft_noise: 0.0 42 | ft_rrc: True 43 | -------------------------------------------------------------------------------- /run/all_laionclap.sh: -------------------------------------------------------------------------------- 1 | NAME=LAIONCLAP 2 | python 2pass_lineareval.py config/laionclap.yaml cremad batch_size=16,name=$NAME 3 | python 2pass_lineareval.py config/laionclap.yaml gtzan batch_size=16,name=$NAME 4 | python 2pass_lineareval.py config/laionclap.yaml spcv2 batch_size=64,name=$NAME 5 | python 2pass_lineareval.py config/laionclap.yaml esc50 batch_size=64,name=$NAME 6 | python 2pass_lineareval.py config/laionclap.yaml us8k batch_size=64,name=$NAME 7 | python 2pass_lineareval.py config/laionclap.yaml vc1 batch_size=64,name=$NAME 8 | python 2pass_lineareval.py config/laionclap.yaml voxforge batch_size=64,name=$NAME 9 | python 2pass_lineareval.py config/laionclap.yaml nsynth batch_size=64,name=$NAME 10 | python 2pass_lineareval.py config/laionclap.yaml surge batch_size=64,name=$NAME 11 | 12 | python zeroshot.py config/laionclap.yaml cremad batch_size=16,name=$NAME 13 | python zeroshot.py config/laionclap.yaml gtzan batch_size=16,name=$NAME 14 | python zeroshot.py config/laionclap.yaml nsynth batch_size=64,name=$NAME 15 | python zeroshot.py config/laionclap.yaml esc50 batch_size=64,name=$NAME 16 | python zeroshot.py config/laionclap.yaml us8k batch_size=64,name=$NAME 17 | 18 | python summarize.py $NAME 19 | -------------------------------------------------------------------------------- /config/beats_plus.yaml: -------------------------------------------------------------------------------- 1 | # AR parameters such as FFT parameters. 2 | audio_repr: ar_beats.AR_BEATs 3 | name: BEATs_iter3_plus_AS2M 4 | weight_file: external/BEATs_iter3_plus_AS2M.pt 5 | feature_d: 768 6 | sample_rate: 16000 7 | n_fft: 400 8 | window_size: 400 9 | hop_size: 160 10 | n_mels: 80 11 | f_min: 50 12 | f_max: 8000 13 | window: hanning 14 | 15 | # Model specific parameters. 16 | cls_token: False # Use CLS token 17 | output_layers: [-1] # List of layers to stack 18 | encoder_only: False 19 | dur_frames: # None for no desired number of frames 20 | freeze_embed: # Set True if freezing PatchEmbed during fine-tuning [2211.09359] How to Fine-Tune Vision Models with SGD 21 | 22 | # Linear evaluation/Fine-tuning common parameters. 23 | training_mask: 0.0 24 | flat_features: False # 768-d if True else 3840-d 25 | 26 | # Linear evaluaition parameters. 27 | batch_size: 128 28 | lr_lineareval: 0.0003 29 | report_per_epochs: 50 30 | early_stop_epochs: 20 31 | 32 | # Fine-tuning parameters. 33 | warmup_epochs: 5 34 | mixup: 0.5 35 | ft_bs: 128 36 | ft_lr: 2.0 37 | ft_early_stop_epochs: -1 # -1: no early stopping 38 | ft_epochs: 200 39 | ft_freq_mask: 30 40 | ft_time_mask: 192 41 | ft_noise: 0.0 42 | ft_rrc: True 43 | -------------------------------------------------------------------------------- /config/m2d.yaml: -------------------------------------------------------------------------------- 1 | # AR parameters such as FFT parameters. 
2 | audio_repr: ar_m2d.AR_M2D 3 | weight_file: m2d_vit_base-80x208p16x16p16k-random/random 4 | feature_d: 3840 5 | sample_rate: 16000 6 | n_fft: 400 7 | window_size: 400 8 | hop_size: 160 9 | n_mels: 80 10 | f_min: 50 11 | f_max: 8000 12 | window: hanning 13 | 14 | # Statistics for normalization: average and standard deviation 15 | mean: -7.1 16 | std: 4.2 17 | 18 | # Model specific parameters. 19 | output_layers: [-1] # List of layers to stack 20 | encoder_only: True 21 | dur_frames: # None for no desired number of frames 22 | freeze_embed: # Set True if freezing PatchEmbed during fine-tuning [2211.09359] How to Fine-Tune Vision Models with SGD 23 | 24 | # Linear evaluation/Fine-tuning common parameters. 25 | flat_features: False # 768-d if True else 3840-d 26 | 27 | # Linear evaluaition parameters. 28 | batch_size: 128 29 | lr_lineareval: 0.00003 30 | report_per_epochs: 50 31 | early_stop_epochs: 20 32 | 33 | # Fine-tuning parameters. 34 | training_mask: 0.0 35 | warmup_epochs: 5 36 | mixup: 0.5 37 | ft_bs: 128 38 | ft_lr: 2.0 39 | ft_early_stop_epochs: -1 # -1: no early stopping 40 | ft_epochs: 200 41 | ft_freq_mask: 30 42 | ft_time_mask: 192 43 | ft_noise: 0.0 44 | ft_rrc: True 45 | -------------------------------------------------------------------------------- /config/m2d_32k.yaml: -------------------------------------------------------------------------------- 1 | # AR parameters such as FFT parameters. 2 | audio_repr: ar_m2d.AR_M2D 3 | weight_file: m2d_vit_base-80x208p16x16p32k-random/random 4 | feature_d: 3840 5 | sample_rate: 32000 6 | n_fft: 800 7 | window_size: 800 8 | hop_size: 320 9 | n_mels: 80 10 | f_min: 50 11 | f_max: 16000 12 | window: hanning 13 | 14 | # Statistics for normalization: average and standard deviation 15 | mean: -7.1 16 | std: 4.2 17 | 18 | # Model specific parameters. 19 | output_layers: [-1] # List of layers to stack 20 | encoder_only: False 21 | dur_frames: # None for no desired number of frames 22 | freeze_embed: # Set True if freezing PatchEmbed during fine-tuning [2211.09359] How to Fine-Tune Vision Models with SGD 23 | 24 | # Linear evaluation/Fine-tuning common parameters. 25 | flat_features: False # 768-d if True else 3840-d 26 | 27 | # Linear evaluaition parameters. 28 | batch_size: 128 29 | lr_lineareval: 0.00003 30 | report_per_epochs: 50 31 | early_stop_epochs: 20 32 | 33 | # Fine-tuning parameters. 
34 | training_mask: 0.0 35 | warmup_epochs: 5 36 | mixup: 0.5 37 | ft_bs: 128 38 | ft_lr: 2.0 39 | ft_early_stop_epochs: -1 # -1: no early stopping 40 | ft_epochs: 200 41 | ft_freq_mask: 30 42 | ft_time_mask: 192 43 | ft_noise: 0.0 44 | ft_rrc: True 45 | -------------------------------------------------------------------------------- /evar/ar_openl3.py: -------------------------------------------------------------------------------- 1 | """Wrapper code for: 2 | 3 | Look, Listen and Learn More: Design Choices for Deep Audio Embeddings 4 | 5 | ## Reference 6 | - [1] https://arxiv.org/abs/2104.11587 7 | - [2] https://github.com/marl/openl3 8 | - [3] https://github.com/torchopenl3/torchopenl3 9 | """ 10 | 11 | from evar.ar_base import (BaseAudioRepr, temporal_pooling) 12 | import torch 13 | import logging 14 | try: 15 | import torchopenl3 16 | from torchopenl3.utils import preprocess_audio_batch 17 | except: 18 | pass # logging.error('Install torchopenl3.\n>>> pip install torchopenl3') 19 | 20 | 21 | class AR_OpenL3(BaseAudioRepr): 22 | 23 | def __init__(self, cfg): 24 | super().__init__(cfg=cfg) 25 | self.openl3_model = torchopenl3.models.load_audio_embedding_model( 26 | cfg.openl3_input_repr, cfg.openl3_content_type, cfg.feature_d) 27 | 28 | def encode_frames(self, batch_audio): 29 | frame_embeddings, ts_list = torchopenl3.get_audio_embedding(batch_audio, 30 | self.cfg.sample_rate, model=self.openl3_model) # -> [B, T, D] 31 | return frame_embeddings.transpose(1, 2) # -> [B, D, T] 32 | 33 | def forward(self, batch_audio): 34 | frame_embeddings = self.encode_frames(batch_audio) 35 | return temporal_pooling(self, frame_embeddings) 36 | -------------------------------------------------------------------------------- /app/README.md: -------------------------------------------------------------------------------- 1 | # Application-specific evaluation 2 | 3 | Some applications use their own evaluation protocols, including specialized metrics, and their benchmarking code typically restricts the models that can be evaluated. To address this limitation and enable evaluation code to work with models available on EVAR (with wrapper implementations), we modify these applications to support a broader range of models. This subproject outlines the precise steps and code required to integrate EVAR into each application. 4 | 5 | ## Assessing the Utility of Audio Foundation Models for Heart and Respiratory Sound Analysis 6 | 7 | For our paper: 8 | 9 | *[D. Niizumi, D. Takeuchi, M. Yasuda, B. T. Nguyen, Y. Ohishi, and N. Harada, "Assessing the Utility of Audio Foundation Models for Heart and Respiratory Sound Analysis," to appear at IEEE EMBC, 2025](https://arxiv.org/abs/2504.18004).* 10 | 11 | We provide code to reproduce experiments for the following tasks: 12 | 13 | - Heart sound task: CirCor 👉 [circor](circor/README_CirCor.md). 14 | - Heart sound task: BMD-HS 👉 [bmdhs](bmdhs/README_BMDHS.md). 15 | - Respiratory sound task: SPRSound (SPRS) 👉 [icbhi_sprs](icbhi_sprs/README_ICBHI_SPRS.md) 16 | - Respiratory sound task: ICBHI2017 👉 [icbhi_sprs](icbhi_sprs/README_ICBHI_SPRS.md) 17 | 18 | Please follow the instructions in each folder.
19 | -------------------------------------------------------------------------------- /evar/ar_trill.py: -------------------------------------------------------------------------------- 1 | """Wrapper code for: 2 | 3 | Towards Learning a Universal Non-Semantic Representation of Speech 4 | 5 | ## Reference 6 | - [1] http://arxiv.org/abs/2002.12764 7 | - [2] https://aihub.cloud.google.com/u/0/p/products%2F41239b97-c960-479a-be50-ae7a23ae1561 8 | """ 9 | 10 | from evar.ar_base import (BaseAudioRepr, temporal_pooling) 11 | import torch 12 | import logging 13 | try: 14 | import tensorflow.compat.v2 as tf 15 | tf.enable_v2_behavior() 16 | assert tf.executing_eagerly() 17 | import tensorflow_hub as hub 18 | except: 19 | pass # logging.error('Install tensorflow and tensorflow_hub.\n>>> pip install tensorflow tensorflow_hub') 20 | 21 | 22 | class AR_TRILL(BaseAudioRepr): 23 | def __init__(self, cfg): 24 | super().__init__(cfg=cfg) 25 | self.model = hub.load(cfg.trill_url) 26 | self.emb_type = cfg.trill_emb_type 27 | 28 | def encode_frames(self, batch_audio): 29 | device = batch_audio.device 30 | x = self.model(samples=tf.convert_to_tensor(batch_audio.cpu().numpy()), sample_rate=16000)[self.emb_type].numpy() 31 | x = torch.tensor(x.transpose(0, 2, 1)).float().to(device) # transpose: [B,T,D] -> [B,D,T] 32 | return x 33 | 34 | def forward(self, batch_audio): 35 | x = self.encode_frames(batch_audio) 36 | x = temporal_pooling(self, x) 37 | return x 38 | -------------------------------------------------------------------------------- /evar/ar_msclap.py: -------------------------------------------------------------------------------- 1 | """Wrapper code for: 2 | 3 | Natural Language Supervision for General-Purpose Audio Representations 4 | 5 | ## Reference 6 | - [1] https://arxiv.org/abs/2309.05767 7 | - [2] https://github.com/microsoft/CLAP 8 | """ 9 | 10 | from evar.ar_base import BaseCLAP 11 | try: 12 | from msclap import CLAP 13 | except: 14 | pass # please install: pip install msclap 15 | 16 | 17 | class AR_MSCLAP(BaseCLAP): 18 | 19 | def __init__(self, cfg): 20 | super().__init__(cfg=cfg) 21 | # MS CLAP accepts file name as audio input. 22 | self.filename_mode = True 23 | 24 | self.backbone = CLAP(version=str(cfg.weight_file), use_cuda=True) 25 | 26 | def encode_frames(self, batch_audio): 27 | assert False, 'encode_frames for MS CLAP is not supported for now' 28 | 29 | def forward(self, batch_audio): 30 | audio_embeddings = self.backbone.get_audio_embeddings(batch_audio) 31 | return audio_embeddings 32 | 33 | def encode_audio(self, batch_audio): 34 | audio_embeddings = self.forward(batch_audio) 35 | return audio_embeddings 36 | 37 | def encode_text(self, batch_text): 38 | text_embeddings = self.backbone.get_text_embeddings(batch_text) 39 | return text_embeddings 40 | 41 | def compute_similarity(self, text_embs, audio_embs): 42 | similarity = self.backbone.compute_similarity(audio_embs, text_embs) 43 | return similarity.T 44 | -------------------------------------------------------------------------------- /evar/ar_spec.py: -------------------------------------------------------------------------------- 1 | """Wrapper code for: 2 | 3 | Mel-spectrogram and linear spectrogram. 
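Both classes compute a log-scaled spectrogram (via ToLogMelSpec / ToLogLinSpec below) and normalize it with dataset statistics gathered in precompute().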
4 | """ 5 | 6 | from evar.ar_base import (BaseAudioRepr, ToLogMelSpec, 7 | calculate_norm_stats, normalize_spectrogram, temporal_pooling) 8 | import nnAudio.features 9 | 10 | 11 | class AR_MelSpec(BaseAudioRepr): 12 | def __init__(self, cfg): 13 | super().__init__(cfg=cfg) 14 | self.to_feature = ToLogMelSpec(cfg) 15 | 16 | def precompute(self, device, data_loader): 17 | self.norm_stats = calculate_norm_stats(device, data_loader, self.to_feature) 18 | 19 | def encode_frames(self, batch_audio): 20 | x = self.to_feature(batch_audio) 21 | return normalize_spectrogram(self.norm_stats, x) 22 | 23 | def forward(self, batch_audio): 24 | x = self.encode_frames(batch_audio) 25 | return temporal_pooling(self, x) 26 | 27 | 28 | class ToLogLinSpec(ToLogMelSpec): 29 | def __init__(self, cfg): 30 | super().__init__(cfg) 31 | self.to_spec = nnAudio.features.STFT(n_fft=cfg.n_fft, win_length=cfg.window_size, 32 | freq_bins=None, hop_length=cfg.hop_size, 33 | center=True, sr=cfg.sample_rate, 34 | output_format="Magnitude", 35 | verbose=False, 36 | ) 37 | 38 | 39 | class AR_LinSpec(AR_MelSpec): 40 | def __init__(self, cfg): 41 | cfg.n_mels = 64 # dummy for making reuse of AR_MelSpec easy 42 | super().__init__(cfg=cfg) 43 | self.to_feature = ToLogLinSpec(cfg) 44 | -------------------------------------------------------------------------------- /evar/ar_dasheng.py: -------------------------------------------------------------------------------- 1 | """Wrapper code for: 2 | 3 | Scaling up masked audio encoder learning for general audio classification 4 | 5 | ## Reference 6 | - [1] https://www.isca-archive.org/interspeech_2024/dinkel24b_interspeech.html 7 | - [2] https://huggingface.co/mispeech/dasheng-base 8 | """ 9 | 10 | from evar.ar_base import BaseAudioRepr 11 | import torch 12 | import logging 13 | try: 14 | from dasheng_model.feature_extraction_dasheng import DashengFeatureExtractor 15 | from dasheng_model.modeling_dasheng import DashengModel 16 | except: 17 | logging.error('Install as follows.\n>>> pip install git+https://github.com/jimbozhang/hf_transformers_custom_model_dasheng.git') 18 | 19 | 20 | class AR_Dasheng(BaseAudioRepr): 21 | def __init__(self, cfg): 22 | super().__init__(cfg=cfg) 23 | 24 | self.preprocessor = DashengFeatureExtractor.from_pretrained(cfg.model_name) 25 | self.backbone = DashengModel.from_pretrained(cfg.model_name, outputdim=None) 26 | 27 | def encode_frames(self, batch_audio): 28 | preprocessed = self.preprocessor(batch_audio.cpu(), sampling_rate=16000, return_tensors="pt") 29 | preprocessed = preprocessed.to(batch_audio.device) 30 | hidden_states = self.backbone(**preprocessed).hidden_states # [B, T, D] 31 | return hidden_states.transpose(1, 2) # [B, D, T] 32 | 33 | def forward(self, batch_audio): 34 | preprocessed = self.preprocessor(batch_audio.cpu(), sampling_rate=16000, return_tensors="pt") 35 | preprocessed = preprocessed.to(batch_audio.device) 36 | return self.backbone(**preprocessed).logits 37 | -------------------------------------------------------------------------------- /evar/ar_ced.py: -------------------------------------------------------------------------------- 1 | """Wrapper code for: 2 | 3 | CED: Consistent ensemble distillation for audio tagging 4 | 5 | ## Reference 6 | - [1] https://arxiv.org/abs/2308.11957 7 | - [2] https://github.com/RicherMans/ced 8 | """ 9 | 10 | from evar.ar_base import BaseAudioRepr, temporal_pooling 11 | import sys 12 | import logging 13 | import torch 14 | try: 15 | sys.path.append('../../external/hf_transformers_custom_model_ced') 16 |
sys.path.append('external/hf_transformers_custom_model_ced') 17 | from ced_model.feature_extraction_ced import CedFeatureExtractor 18 | from ced_model.modeling_ced import CedForAudioClassification 19 | from transformers.modeling_outputs import SequenceClassifierOutput 20 | except: 21 | pass # please install CED 22 | 23 | 24 | class AR_CED(BaseAudioRepr): 25 | 26 | def __init__(self, cfg): 27 | super().__init__(cfg=cfg) 28 | 29 | model_path = cfg.weight_file 30 | self.feature_extractor = CedFeatureExtractor.from_pretrained(model_path) 31 | self.backbone = CedForAudioClassification.from_pretrained(model_path) 32 | 33 | logging.info(f' Using weight from Hugging Face: {cfg.weight_file}') 34 | 35 | def encode_frames(self, batch_audio): 36 | inputs = self.feature_extractor(batch_audio.to('cpu'), sampling_rate=16000, return_tensors="pt") 37 | inputs['input_values'] = inputs['input_values'].to('cuda') 38 | features = self.backbone(**inputs).hidden_states 39 | return features.transpose(1, 2) # [B, D, T] 40 | 41 | def forward(self, batch_audio): 42 | features = self.encode_frames(batch_audio) 43 | return features.mean(-1) 44 | 45 | -------------------------------------------------------------------------------- /evar/ar_atst_frame.py: -------------------------------------------------------------------------------- 1 | """Wrapper code for: 2 | 3 | Self-supervised Audio Teacher-Student Transformer for Both Clip-level and Frame-level Tasks 4 | 5 | ## Reference 6 | - [1] https://arxiv.org/abs/2306.04186 7 | - [2] https://github.com/Audio-WestlakeU/audiossl/blob/main/audiossl/methods/atstframe 8 | """ 9 | 10 | from evar.ar_base import BaseAudioRepr 11 | import logging 12 | import sys 13 | from einops import rearrange 14 | try: 15 | import os 16 | evar_home = os.getenv('EVAR', '') 17 | sys.path.append(os.path.join(evar_home, 'external/audiossl')) 18 | sys.path.append('../../external/audiossl') 19 | from audiossl.methods.atstframe.embedding import load_model, get_scene_embedding, get_timestamp_embedding 20 | except Exception as e: 21 | pass # Please clone audiossl 22 | 23 | 24 | class AR_ATST_Frame(BaseAudioRepr): 25 | def __init__(self, cfg): 26 | super().__init__(cfg=cfg) 27 | 28 | self.backbone = load_model(cfg.weight_file) 29 | logging.info(f' Using weight file: {cfg.weight_file}') 30 | 31 | def encode_frames(self, batch_audio): 32 | batch_audio = batch_audio.unsqueeze(1) # [B, L] -> [B, 1, L] as described in the README 33 | x, _ = get_timestamp_embedding(batch_audio, self.backbone) # -> [B,T,N_BLOCKS*emb_size] 34 | # no need x = rearrange(x, 'B 1 T N D -> B (N * D) T') 35 | return x 36 | 37 | def forward(self, batch_audio): 38 | #import pdb; pdb.set_trace() 39 | batch_audio = batch_audio.unsqueeze(1) # [B, L] -> [B, 1, L] as described in the README 40 | x = get_scene_embedding(batch_audio, self.backbone) # [B,N_BLOCKS*emb_size] 41 | return x 42 | -------------------------------------------------------------------------------- /app/circor/patch-heart-murmur-detection.diff: -------------------------------------------------------------------------------- 1 | --- org/heart-murmur-detection/ModelEvaluation/evaluate_model.py 2024-01-12 15:29:10.126397375 +0900 2 | +++ /heart-murmur-detection/ModelEvaluation/evaluate_model.py 2023-11-15 16:47:47.351524689 +0900 3 | @@ -59,6 +59,10 @@ 4 | murmur_weighted_accuracy = compute_weighted_accuracy( 5 | murmur_labels, output_labels, murmur_classes 6 | ) # This is the murmur scoring metric. 
7 | + 8 | + # UAR 9 | + murmur_uar = murmur_accuracy_classes.mean() 10 | + 11 | murmur_scores = ( 12 | murmur_classes, 13 | murmur_auroc, 14 | @@ -70,6 +74,7 @@ 15 | murmur_accuracy, 16 | murmur_accuracy_classes, 17 | murmur_weighted_accuracy, 18 | + murmur_uar, 19 | ) 20 | 21 | ( 22 | @@ -83,11 +88,12 @@ 23 | accuracy, 24 | accuracy_classes, 25 | weighted_accuracy, 26 | + uar, 27 | ) = murmur_scores 28 | murmur_output_string = ( 29 | - "AUROC,AUPRC,F-measure,Accuracy,Weighted Accuracy" 30 | - "\n{:.3f},{:.3f},{:.3f},{:.3f},{:.3f}\n".format( 31 | - auroc, auprc, f_measure, accuracy, weighted_accuracy 32 | + "AUROC,AUPRC,F-measure,Accuracy,Weighted Accuracy,UAR" 33 | + "\n{:.3f},{:.3f},{:.3f},{:.3f},{:.3f},{:.3f}\n".format( 34 | + auroc, auprc, f_measure, accuracy, weighted_accuracy, uar 35 | ) 36 | ) 37 | murmur_class_output_string = ( 38 | @@ -109,8 +115,10 @@ 39 | + murmur_class_output_string 40 | ) 41 | 42 | + print(output_string) 43 | + 44 | # Return the results. 45 | - return output_string 46 | + return murmur_scores 47 | 48 | 49 | # Find Challenge files. -------------------------------------------------------------------------------- /evar/ar_cnn14.py: -------------------------------------------------------------------------------- 1 | """Wrapper code for: 2 | 3 | PANNs: Large-Scale Pretrained Audio Neural Networks for Audio Pattern Recognition 4 | 5 | ## Reference 6 | - [1] https://arxiv.org/abs/1912.10211 7 | - [2] https://github.com/qiuqiangkong/audioset_tagging_cnn 8 | """ 9 | 10 | from evar.ar_base import BaseAudioRepr 11 | from evar.model_utils import ensure_weights, load_pretrained_weights 12 | import logging 13 | try: 14 | from evar.cnn14_decoupled import AudioFeatureExtractor, Cnn14_Decoupled 15 | except: 16 | logging.info('** Install torchlibrosa if you use Cnn14 **') 17 | 18 | 19 | class AR_Cnn14(BaseAudioRepr): 20 | def __init__(self, cfg): 21 | super().__init__(cfg=cfg) 22 | self.feature_extractor = AudioFeatureExtractor(n_fft=cfg.n_fft, hop_length=cfg.hop_size, win_length=cfg.window_size, 23 | sample_rate=cfg.sample_rate, n_mels=cfg.n_mels, f_min=cfg.f_min, f_max=cfg.f_max) 24 | self.body = Cnn14_Decoupled() 25 | weight_file = 'external/Cnn14_16k_mAP=0.438.pth' if cfg.weight_file is None else cfg.weight_file 26 | ensure_weights(weight_file, 'https://zenodo.org/record/3987831/files/Cnn14_16k_mAP%3D0.438.pth') 27 | load_pretrained_weights(self.body, weight_file) 28 | 29 | def encode_frames(self, batch_audio): 30 | x = self.feature_extractor(batch_audio) # (B, 1, T, F(mel_bins)) 31 | x = self.augment_if_training(x.transpose(-2, -1)).transpose(-2, -1) # (..., T, F) -> (..., F, T) -augment-> (..., T, F) 32 | return self.body.encode(x) # (B, D, T) 33 | 34 | def forward(self, batch_audio): 35 | frame_embeddings = self.encode_frames(batch_audio) # (B, D, T) 36 | return self.body.temporal_pooling(frame_embeddings) # (B, D) 37 | -------------------------------------------------------------------------------- /evar/ar_data2vec.py: -------------------------------------------------------------------------------- 1 | """Wrapper code for: 2 | 3 | Data2vec: A General Framework for Self-supervised Learning in Speech, Vision and Language 4 | 5 | ## Reference 6 | - [1] https://ai.facebook.com/research/data2vec-a-general-framework-for-self-supervised-learning-in-speech-vision-and-language/ 7 | - [2] https://huggingface.co/facebook/data2vec-audio-large-960h 8 | """ 9 | 10 | from evar.ar_base import BaseAudioRepr, temporal_pooling 11 | import logging 12 | import torch 13 | try: 14 | from 
transformers import Data2VecAudioModel, Wav2Vec2Processor 15 | except: 16 | logging.error('Install transformers.\n>>> pip install transformers') 17 | 18 | 19 | class AR_Data2Vec(BaseAudioRepr): 20 | 21 | def __init__(self, cfg): 22 | super().__init__(cfg=cfg) 23 | 24 | self.processor = Wav2Vec2Processor.from_pretrained(cfg.pretrained_model) 25 | self.backbone = Data2VecAudioModel.from_pretrained(cfg.pretrained_model) 26 | 27 | def encode_frames(self, batch_audio): 28 | device = batch_audio.device 29 | preprocessed = self.processor(batch_audio.cpu().numpy(), return_tensors="pt", sampling_rate=16000).input_values 30 | preprocessed = preprocessed[0].to(device) # [1, B, raw wave length] -> [B, raw wave length] 31 | hidden_states = self.backbone(preprocessed, output_hidden_states=True).hidden_states # [B, T, D] 32 | # stack layer outputs 33 | states_to_stack = [hidden_states[index] for index in self.cfg.output_layers] if self.cfg.output_layers else hidden_states 34 | features = torch.cat(states_to_stack, axis=-1) 35 | return features.transpose(1, 2) # [B, D, T] 36 | 37 | def forward(self, batch_audio): 38 | return temporal_pooling(self, self.encode_frames(batch_audio)) 39 | -------------------------------------------------------------------------------- /evar/ar_byola2.py: -------------------------------------------------------------------------------- 1 | """Wrapper code for: 2 | 3 | BYOL for Audio: Exploring Pre-trained General-purpose Audio Representations 4 | 5 | ## Reference 6 | - [1] https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=9944865 7 | - [2] https://arxiv.org/abs/2204.07402 8 | """ 9 | 10 | from evar.ar_base import (BaseAudioRepr, ToLogMelSpec, calculate_norm_stats, normalize_spectrogram, temporal_pooling) 11 | from evar.model_utils import load_pretrained_weights 12 | import logging 13 | try: 14 | from external.byol_a.v2.byol_a2.models import AudioNTT2022Encoder 15 | except Exception as e: 16 | pass # logging.info(f'Make your copy of BYOL-A under external folder. Check Preparing-models.md for the details.') 17 | 18 | 19 | class AR_BYOLA2(BaseAudioRepr): 20 | def __init__(self, cfg): 21 | super().__init__(cfg=cfg) 22 | self.to_feature = ToLogMelSpec(cfg) 23 | 24 | self.body = AudioNTT2022Encoder(n_mels=cfg.n_mels, d=cfg.feature_d) 25 | if cfg.weight_file is not None and cfg.weight_file != '': 26 | load_pretrained_weights(self.body, cfg.weight_file, model_key='body') 27 | 28 | def precompute(self, device, data_loader): 29 | self.norm_stats = calculate_norm_stats(device, data_loader, self.to_feature) 30 | 31 | def encode_frames(self, batch_audio): 32 | x = self.to_feature(batch_audio) 33 | x = normalize_spectrogram(self.norm_stats, x) # B,F,T 34 | x = self.augment_if_training(x) 35 | x = x.unsqueeze(1) # -> B,1,F,T 36 | x = self.body(x) # -> B,T,D=C*F 37 | x = x.transpose(1, 2) # -> B,D,T 38 | return x 39 | 40 | def forward(self, batch_audio): 41 | x = self.encode_frames(batch_audio) 42 | x = temporal_pooling(self, x) 43 | return x 44 | 45 | -------------------------------------------------------------------------------- /2pass_lineareval.py: -------------------------------------------------------------------------------- 1 | """2-pass linear evaluation runner. 2 | 3 | This program is a wrapper for lineareval.py that enables: 4 | - Multiple runs of lineareval.py with a cache of embeddings. 5 | - Evaluating TensorFlow models.
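Example (from run/all_msclap.sh): python 2pass_lineareval.py config/msclap.yaml cremad batch_size=16,name=MSCLAP. The third positional argument is forwarded to lineareval.py as --options, a comma-separated list of config overrides (here, batch_size and name).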
6 | 7 | ## Evaluation flow 8 | 9 | This will run lineareval.py twice or more so that we can decouple inference and linear evaluation phase, 10 | making it possible to use the TF model in the inference phase. 11 | 12 | 1. Run lineareval.py with `--step=2pass_1_precompute_only`. 13 | Conduct inference by any model (whichever TF or torch) to convert raw audio into embeddings, and store embedding in a cache. 14 | 2. Run lineareval.py with `--step=2pass_2_train_test`. Conduct linear evaluation by using embeddings from the cache using torch. 15 | 3. (if repeat > 1) Repeat the step 2 with incremented random seed. 16 | """ 17 | 18 | from evar.utils import run_command 19 | import fire 20 | 21 | 22 | def lineareval_two_pass(config_file, task, options='', lr=None, hidden=(), standard_scaler=True, mixup=False, 23 | early_stop_epochs=None, step=None, repeat=3, seed=None): 24 | 25 | seed = seed or 42 26 | command_line = [ 27 | 'python', 28 | 'lineareval.py', 29 | config_file, 30 | task, 31 | f'--options={options}', 32 | f'--lr={lr}', 33 | f'--hidden={hidden}', 34 | f'--standard_scaler={standard_scaler}', 35 | f'--mixup={mixup}', 36 | f'--early_stop_epochs={early_stop_epochs}' 37 | ] 38 | 39 | run_command(command_line + [f'--seed={seed}', '--step=2pass_1_precompute_only']) 40 | for i in range(repeat): 41 | run_command(command_line + [f'--seed={seed + i}', '--step=2pass_2_train_test']) 42 | 43 | 44 | if __name__ == '__main__': 45 | fire.Fire(lineareval_two_pass) 46 | -------------------------------------------------------------------------------- /plugin/OPERA/evar_openl3env.sh: -------------------------------------------------------------------------------- 1 | base=$EVAR 2 | dim=6144 3 | name=OpenL3 4 | GPU=0 5 | # filename=$(basename $weight) 6 | # "$(basename "$(dirname "$weight")")_${filename%.*}" 7 | 8 | python -m src.benchmark.processing.copd_processing --pretrain evar:$base:config/openl3env.yaml:$name 9 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task copd --pretrain $name --dim $dim 10 | python -m src.benchmark.processing.icbhi_processing --pretrain evar:$base:config/openl3env.yaml:$name 11 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task icbhidisease --pretrain $name --dim $dim 12 | python -m src.benchmark.processing.kauh_processing --pretrain evar:$base:config/openl3env.yaml:$name 13 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task kauh --pretrain $name --dim $dim 14 | 15 | python -m src.benchmark.processing.coughvid_processing --pretrain evar:$base:config/openl3env.yaml:$name --label covid 16 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task coughvidcovid --pretrain $name --dim $dim 17 | python -m src.benchmark.processing.coughvid_processing --pretrain evar:$base:config/openl3env.yaml:$name --label gender 18 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task coughvidsex --pretrain $name --dim $dim 19 | 20 | python -m src.benchmark.processing.coswara_processing --pretrain evar:$base:config/openl3env.yaml:$name --label smoker 21 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task coswarasmoker --pretrain $name --dim $dim --modality cough-shallow 22 | python -m src.benchmark.processing.coswara_processing --pretrain evar:$base:config/openl3env.yaml:$name --label sex 23 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task coswarasex --pretrain $name --dim $dim --modality cough-shallow 24 | 25 | 
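# Note: a sketch of the --pretrain string convention shared by these OPERA plugin scripts, inferred from the invocations above: evar:<EVAR root>:<EVAR config yaml>:<feature name>[:<option>=<value>]
# Here it expands to evar:$EVAR:config/openl3env.yaml:OpenL3; other scripts append options such as weight_file=<path> or output_layers=[<layer>].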
-------------------------------------------------------------------------------- /plugin/OPERA/evar_atst_clip.sh: -------------------------------------------------------------------------------- 1 | base=$EVAR 2 | weight=${base}/external/atst_base.ckpt 3 | dim=1536 4 | name=ATST-CLIP 5 | GPU=0 6 | 7 | python -m src.benchmark.processing.copd_processing --pretrain evar:$base:config/atst.yaml:$name:weight_file=$weight 8 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task copd --pretrain $name --dim $dim 9 | python -m src.benchmark.processing.icbhi_processing --pretrain evar:$base:config/atst.yaml:$name:weight_file=$weight 10 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task icbhidisease --pretrain $name --dim $dim 11 | python -m src.benchmark.processing.kauh_processing --pretrain evar:$base:config/atst.yaml:$name:weight_file=$weight 12 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task kauh --pretrain $name --dim $dim 13 | 14 | python -m src.benchmark.processing.coughvid_processing --pretrain evar:$base:config/atst.yaml:$name:weight_file=$weight --label covid 15 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task coughvidcovid --pretrain $name --dim $dim 16 | python -m src.benchmark.processing.coughvid_processing --pretrain evar:$base:config/atst.yaml:$name:weight_file=$weight --label gender 17 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task coughvidsex --pretrain $name --dim $dim 18 | 19 | python -m src.benchmark.processing.coswara_processing --pretrain evar:$base:config/atst.yaml:$name:weight_file=$weight --label smoker 20 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task coswarasmoker --pretrain $name --dim $dim --modality cough-shallow 21 | python -m src.benchmark.processing.coswara_processing --pretrain evar:$base:config/atst.yaml:$name:weight_file=$weight --label sex 22 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task coswarasex --pretrain $name --dim $dim --modality cough-shallow 23 | 24 | -------------------------------------------------------------------------------- /plugin/OPERA/evar_m2d.sh: -------------------------------------------------------------------------------- 1 | base=$EVAR 2 | weight=$1 3 | dim=$2 4 | filename=$(basename $weight) 5 | name="$(basename "$(dirname "$weight")")_${filename%.*}" 6 | 7 | python -m src.benchmark.processing.copd_processing --pretrain evar:$base:config/m2d.yaml:$name:weight_file=$weight 8 | CUDA_VISIBLE_DEVICES=0 python -m src.benchmark.linear_eval --task copd --pretrain $name --dim $dim 9 | python -m src.benchmark.processing.icbhi_processing --pretrain evar:$base:config/m2d.yaml:$name:weight_file=$weight 10 | CUDA_VISIBLE_DEVICES=0 python -m src.benchmark.linear_eval --task icbhidisease --pretrain $name --dim $dim 11 | python -m src.benchmark.processing.kauh_processing --pretrain evar:$base:config/m2d.yaml:$name:weight_file=$weight 12 | CUDA_VISIBLE_DEVICES=0 python -m src.benchmark.linear_eval --task kauh --pretrain $name --dim $dim 13 | 14 | python -m src.benchmark.processing.coughvid_processing --pretrain evar:$base:config/m2d.yaml:$name:weight_file=$weight --label covid 15 | CUDA_VISIBLE_DEVICES=0 python -m src.benchmark.linear_eval --task coughvidcovid --pretrain $name --dim $dim 16 | python -m src.benchmark.processing.coughvid_processing --pretrain evar:$base:config/m2d.yaml:$name:weight_file=$weight --label gender 17 | CUDA_VISIBLE_DEVICES=0 python -m src.benchmark.linear_eval --task coughvidsex 
--pretrain $name --dim $dim 18 | 19 | python -m src.benchmark.processing.coswara_processing --pretrain evar:$base:config/m2d.yaml:$name:weight_file=$weight --label smoker 20 | CUDA_VISIBLE_DEVICES=0 python -m src.benchmark.linear_eval --task coswarasmoker --pretrain $name --dim $dim --modality cough-shallow 21 | python -m src.benchmark.processing.coswara_processing --pretrain evar:$base:config/m2d.yaml:$name:weight_file=$weight --label sex 22 | CUDA_VISIBLE_DEVICES=0 python -m src.benchmark.linear_eval --task coswarasex --pretrain $name --dim $dim --modality cough-shallow 23 | 24 | -------------------------------------------------------------------------------- /run/all_wavcaps.sh: -------------------------------------------------------------------------------- 1 | NAME=WavCapsZS 2 | python 2pass_lineareval.py config/wavcaps.yaml cremad batch_size=16,name=$NAME,weight_file=external/WavCaps/HTSAT_BERT_zero_shot.pt 3 | python 2pass_lineareval.py config/wavcaps.yaml gtzan batch_size=16,name=$NAME,weight_file=external/WavCaps/HTSAT_BERT_zero_shot.pt 4 | python 2pass_lineareval.py config/wavcaps.yaml spcv2 batch_size=64,name=$NAME,weight_file=external/WavCaps/HTSAT_BERT_zero_shot.pt 5 | python 2pass_lineareval.py config/wavcaps.yaml esc50 batch_size=64,name=$NAME,weight_file=external/WavCaps/HTSAT_BERT_zero_shot.pt 6 | python 2pass_lineareval.py config/wavcaps.yaml us8k batch_size=64,name=$NAME,weight_file=external/WavCaps/HTSAT_BERT_zero_shot.pt 7 | python 2pass_lineareval.py config/wavcaps.yaml vc1 batch_size=64,name=$NAME,weight_file=external/WavCaps/HTSAT_BERT_zero_shot.pt 8 | python 2pass_lineareval.py config/wavcaps.yaml voxforge batch_size=64,name=$NAME,weight_file=external/WavCaps/HTSAT_BERT_zero_shot.pt 9 | python 2pass_lineareval.py config/wavcaps.yaml nsynth batch_size=64,name=$NAME,weight_file=external/WavCaps/HTSAT_BERT_zero_shot.pt 10 | python 2pass_lineareval.py config/wavcaps.yaml surge batch_size=64,name=$NAME,weight_file=external/WavCaps/HTSAT_BERT_zero_shot.pt 11 | 12 | python zeroshot.py config/wavcaps.yaml cremad batch_size=16,name=$NAME,weight_file=external/WavCaps/HTSAT_BERT_zero_shot.pt 13 | python zeroshot.py config/wavcaps.yaml gtzan batch_size=16,name=$NAME,weight_file=external/WavCaps/HTSAT_BERT_zero_shot.pt 14 | python zeroshot.py config/wavcaps.yaml nsynth batch_size=64,name=$NAME,weight_file=external/WavCaps/HTSAT_BERT_zero_shot.pt 15 | python zeroshot.py config/wavcaps.yaml esc50 batch_size=64,name=$NAME,weight_file=external/WavCaps/HTSAT_BERT_zero_shot.pt 16 | python zeroshot.py config/wavcaps.yaml us8k batch_size=64,name=$NAME,weight_file=external/WavCaps/HTSAT_BERT_zero_shot.pt 17 | 18 | python summarize.py $NAME 19 | -------------------------------------------------------------------------------- /app/bmdhs/README_BMDHS.md: -------------------------------------------------------------------------------- 1 | # BMD-HS evaluation 2 | 3 | We provide code to evaluate BMD-HS with various models. 4 | In addition, the exact stratified data splits used in the paper are provided for reproducibility. 5 | 6 | **NOTE: The code freezes the audio representation model weights.** 7 | 8 | Prepare data and metadata files before your evaluation.
9 | 10 | In this folder `app/bmdhs`, download the dataset and fix one file name: 11 | 12 | ```sh 13 | git clone https://github.com/mHealthBuet/BMD-HS-Dataset 14 | mv BMD-HS-Dataset/train/MD_085_sit_Tri6_06.wav BMD-HS-Dataset/train/MD_085_sit_Tri.wav 15 | ``` 16 | 17 | Then, the following will resample/copy data files from `BMD-HS-Dataset` to `../../work/16k/bmdhs`. 18 | 19 | ```sh 20 | python ../../prepare_wav.py BMD-HS-Dataset/ ../../work/16k/bmdhs 16000 21 | ``` 22 | 23 | In addition, the following will create metadata files as `../../evar/metadata/bmdhs[1-3].csv`. 24 | 25 | ```sh 26 | python make_metadata.py 27 | ``` 28 | 29 | ## Run evaluations 30 | 31 | In the **root folder of EVAR**, run the scripts `ev_*.sh`. The following is the complete set of command lines for the paper. 32 | 33 | The results will be recorded in `results/bmdhs-scores.csv`. 34 | 35 | ```sh 36 | bash app/bmdhs/ev_ast.sh 1 5 42 0.1 37 | bash app/bmdhs/ev_ast.sh 2 5 42 0.1 38 | bash app/bmdhs/ev_ast.sh 3 5 42 0.1 39 | 40 | bash app/bmdhs/ev_beats.sh 1 5 42 0.1 41 | bash app/bmdhs/ev_beats.sh 2 5 42 0.1 42 | bash app/bmdhs/ev_beats.sh 3 5 42 0.1 43 | 44 | bash app/bmdhs/ev_byola.sh 1 5 42 0.1 45 | bash app/bmdhs/ev_byola.sh 2 5 42 0.1 46 | bash app/bmdhs/ev_byola.sh 3 5 42 0.1 47 | 48 | bash app/bmdhs/ev_m2d.sh m2d_vit_base-80x608p16x16-221006-mr7/checkpoint-300.pth 1 5 42 0.1 49 | bash app/bmdhs/ev_m2d.sh m2d_vit_base-80x608p16x16-221006-mr7/checkpoint-300.pth 2 5 42 0.1 50 | bash app/bmdhs/ev_m2d.sh m2d_vit_base-80x608p16x16-221006-mr7/checkpoint-300.pth 3 5 42 0.1 51 | ``` 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /evar/utils/calculations.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class RunningMean: 5 | """Running mean calculator for arbitrary axis configuration. 6 | Thanks to https://math.stackexchange.com/questions/106700/incremental-averageing 7 | """ 8 | 9 | def __init__(self, axis): 10 | self.n = 0 11 | self.axis = axis 12 | 13 | def put(self, x): 14 | if self.n == 0: 15 | self.mu = x.mean(self.axis, keepdims=True) 16 | else: 17 | self.mu += (x.mean(self.axis, keepdims=True) - self.mu) / self.n 18 | self.n += 1 19 | 20 | def __call__(self): 21 | return self.mu 22 | 23 | def __len__(self): 24 | return self.n 25 | 26 | 27 | class RunningVariance: 28 | """Calculate mean/variance of tensors online. 
29 | Thanks to https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance 30 | """ 31 | 32 | def __init__(self, axis, mean): 33 | self.update_mean(mean) 34 | self.s2 = RunningMean(axis) 35 | 36 | def update_mean(self, mean): 37 | self.mean = mean 38 | 39 | def put(self, x): 40 | self.s2.put((x - self.mean) **2) 41 | 42 | def __call__(self): 43 | return self.s2() 44 | 45 | def std(self): 46 | return np.sqrt(self()) 47 | 48 | 49 | class RunningStats: 50 | def __init__(self, axis=None): 51 | self.axis = axis 52 | self.mean = self.var = None 53 | 54 | def put(self, x): 55 | assert type(x) 56 | if self.mean is None: 57 | if self.axis is None: 58 | self.axis = list(range(len(x.shape))) 59 | self.mean = RunningMean(self.axis) 60 | self.var = RunningVariance(self.axis, 0) 61 | self.mean.put(x) 62 | self.var.update_mean(self.mean()) 63 | self.var.put(x) 64 | 65 | def __call__(self): 66 | return self.mean(), self.var.std() 67 | -------------------------------------------------------------------------------- /plugin/OPERA/evar_beats.sh: -------------------------------------------------------------------------------- 1 | base=$EVAR 2 | weight=${base}/external/BEATs_iter3.pt 3 | dim=768 4 | name=BEATs 5 | GPU=0 6 | # filename=$(basename $weight) 7 | # "$(basename "$(dirname "$weight")")_${filename%.*}" 8 | 9 | python -m src.benchmark.processing.copd_processing --pretrain evar:$base:config/beats.yaml:$name:weight_file=$weight 10 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task copd --pretrain $name --dim $dim 11 | python -m src.benchmark.processing.icbhi_processing --pretrain evar:$base:config/beats.yaml:$name:weight_file=$weight 12 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task icbhidisease --pretrain $name --dim $dim 13 | python -m src.benchmark.processing.kauh_processing --pretrain evar:$base:config/beats.yaml:$name:weight_file=$weight 14 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task kauh --pretrain $name --dim $dim 15 | 16 | python -m src.benchmark.processing.coughvid_processing --pretrain evar:$base:config/beats.yaml:$name:weight_file=$weight --label covid 17 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task coughvidcovid --pretrain $name --dim $dim 18 | python -m src.benchmark.processing.coughvid_processing --pretrain evar:$base:config/beats.yaml:$name:weight_file=$weight --label gender 19 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task coughvidsex --pretrain $name --dim $dim 20 | 21 | python -m src.benchmark.processing.coswara_processing --pretrain evar:$base:config/beats.yaml:$name:weight_file=$weight --label smoker 22 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task coswarasmoker --pretrain $name --dim $dim --modality cough-shallow 23 | python -m src.benchmark.processing.coswara_processing --pretrain evar:$base:config/beats.yaml:$name:weight_file=$weight --label sex 24 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task coswarasex --pretrain $name --dim $dim --modality cough-shallow 25 | 26 | -------------------------------------------------------------------------------- /plugin/OPERA/evar_ast.sh: -------------------------------------------------------------------------------- 1 | base=$EVAR 2 | weight=${base}/external/ast/pretrained_models/ast_audioset.pth 3 | dim=768 4 | name=AST 5 | GPU=0 6 | # filename=$(basename $weight) 7 | # "$(basename "$(dirname "$weight")")_${filename%.*}" 8 | 9 | python -m 
src.benchmark.processing.copd_processing --pretrain evar:$base:config/ast.yaml:$name:weight_file=$weight 10 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task copd --pretrain $name --dim $dim 11 | python -m src.benchmark.processing.icbhi_processing --pretrain evar:$base:config/ast.yaml:$name:weight_file=$weight 12 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task icbhidisease --pretrain $name --dim $dim 13 | python -m src.benchmark.processing.kauh_processing --pretrain evar:$base:config/ast.yaml:$name:weight_file=$weight 14 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task kauh --pretrain $name --dim $dim 15 | 16 | python -m src.benchmark.processing.coughvid_processing --pretrain evar:$base:config/ast.yaml:$name:weight_file=$weight --label covid 17 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task coughvidcovid --pretrain $name --dim $dim 18 | python -m src.benchmark.processing.coughvid_processing --pretrain evar:$base:config/ast.yaml:$name:weight_file=$weight --label gender 19 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task coughvidsex --pretrain $name --dim $dim 20 | 21 | python -m src.benchmark.processing.coswara_processing --pretrain evar:$base:config/ast.yaml:$name:weight_file=$weight --label smoker 22 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task coswarasmoker --pretrain $name --dim $dim --modality cough-shallow 23 | python -m src.benchmark.processing.coswara_processing --pretrain evar:$base:config/ast.yaml:$name:weight_file=$weight --label sex 24 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task coswarasex --pretrain $name --dim $dim --modality cough-shallow 25 | 26 | -------------------------------------------------------------------------------- /plugin/OPERA/evar_htsat.sh: -------------------------------------------------------------------------------- 1 | base=$EVAR 2 | weight=${base}/external/HTSAT_AudioSet_Saved_1.ckpt 3 | dim=768 4 | name=HTS-AT 5 | GPU=0 6 | # filename=$(basename $weight) 7 | # "$(basename "$(dirname "$weight")")_${filename%.*}" 8 | 9 | python -m src.benchmark.processing.copd_processing --pretrain evar:$base:config/htsat.yaml:$name:weight_file=$weight 10 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task copd --pretrain $name --dim $dim 11 | python -m src.benchmark.processing.icbhi_processing --pretrain evar:$base:config/htsat.yaml:$name:weight_file=$weight 12 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task icbhidisease --pretrain $name --dim $dim 13 | python -m src.benchmark.processing.kauh_processing --pretrain evar:$base:config/htsat.yaml:$name:weight_file=$weight 14 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task kauh --pretrain $name --dim $dim 15 | 16 | python -m src.benchmark.processing.coughvid_processing --pretrain evar:$base:config/htsat.yaml:$name:weight_file=$weight --label covid 17 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task coughvidcovid --pretrain $name --dim $dim 18 | python -m src.benchmark.processing.coughvid_processing --pretrain evar:$base:config/htsat.yaml:$name:weight_file=$weight --label gender 19 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task coughvidsex --pretrain $name --dim $dim 20 | 21 | python -m src.benchmark.processing.coswara_processing --pretrain evar:$base:config/htsat.yaml:$name:weight_file=$weight --label smoker 22 | CUDA_VISIBLE_DEVICES=$GPU python -m 
src.benchmark.linear_eval --task coswarasmoker --pretrain $name --dim $dim --modality cough-shallow 23 | python -m src.benchmark.processing.coswara_processing --pretrain evar:$base:config/htsat.yaml:$name:weight_file=$weight --label sex 24 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task coswarasex --pretrain $name --dim $dim --modality cough-shallow 25 | 26 | -------------------------------------------------------------------------------- /plugin/OPERA/evar_byola.sh: -------------------------------------------------------------------------------- 1 | base=$EVAR 2 | weight=${base}/external/byol_a/pretrained_weights/AudioNTT2020-BYOLA-64x96d2048.pth 3 | dim=2048 4 | name=BYOL-A 5 | # filename=$(basename $weight) 6 | # "$(basename "$(dirname "$weight")")_${filename%.*}" 7 | 8 | python -m src.benchmark.processing.copd_processing --pretrain evar:$base:config/byola.yaml:$name:weight_file=$weight 9 | CUDA_VISIBLE_DEVICES=0 python -m src.benchmark.linear_eval --task copd --pretrain $name --dim $dim 10 | python -m src.benchmark.processing.icbhi_processing --pretrain evar:$base:config/byola.yaml:$name:weight_file=$weight 11 | CUDA_VISIBLE_DEVICES=0 python -m src.benchmark.linear_eval --task icbhidisease --pretrain $name --dim $dim 12 | python -m src.benchmark.processing.kauh_processing --pretrain evar:$base:config/byola.yaml:$name:weight_file=$weight 13 | CUDA_VISIBLE_DEVICES=0 python -m src.benchmark.linear_eval --task kauh --pretrain $name --dim $dim 14 | 15 | python -m src.benchmark.processing.coughvid_processing --pretrain evar:$base:config/byola.yaml:$name:weight_file=$weight --label covid 16 | CUDA_VISIBLE_DEVICES=0 python -m src.benchmark.linear_eval --task coughvidcovid --pretrain $name --dim $dim 17 | python -m src.benchmark.processing.coughvid_processing --pretrain evar:$base:config/byola.yaml:$name:weight_file=$weight --label gender 18 | CUDA_VISIBLE_DEVICES=0 python -m src.benchmark.linear_eval --task coughvidsex --pretrain $name --dim $dim 19 | 20 | python -m src.benchmark.processing.coswara_processing --pretrain evar:$base:config/byola.yaml:$name:weight_file=$weight --label smoker 21 | CUDA_VISIBLE_DEVICES=0 python -m src.benchmark.linear_eval --task coswarasmoker --pretrain $name --dim $dim --modality cough-shallow 22 | python -m src.benchmark.processing.coswara_processing --pretrain evar:$base:config/byola.yaml:$name:weight_file=$weight --label sex 23 | CUDA_VISIBLE_DEVICES=0 python -m src.benchmark.linear_eval --task coswarasex --pretrain $name --dim $dim --modality cough-shallow 24 | 25 | -------------------------------------------------------------------------------- /plugin/OPERA/evar_hubert.sh: -------------------------------------------------------------------------------- 1 | base=$EVAR 2 | dim=768 3 | GPU=0 4 | # filename=$(basename $weight) 5 | # "$(basename "$(dirname "$weight")")_${filename%.*}" 6 | 7 | for i in 1 2 3 4 5 6 7 8 9 10 11 12; do 8 | 9 | name="HuBERT_$i" 10 | python -m src.benchmark.processing.copd_processing --pretrain evar:$base:config/hubert.yaml:$name:output_layers=[$i] 11 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task copd --pretrain $name --dim $dim 12 | python -m src.benchmark.processing.icbhi_processing --pretrain evar:$base:config/hubert.yaml:$name:output_layers=[$i] 13 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task icbhidisease --pretrain $name --dim $dim 14 | python -m src.benchmark.processing.kauh_processing --pretrain evar:$base:config/hubert.yaml:$name:output_layers=[$i] 15 | 
CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task kauh --pretrain $name --dim $dim 16 | 17 | python -m src.benchmark.processing.coughvid_processing --pretrain evar:$base:config/hubert.yaml:$name:output_layers=[$i] --label covid 18 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task coughvidcovid --pretrain $name --dim $dim 19 | python -m src.benchmark.processing.coughvid_processing --pretrain evar:$base:config/hubert.yaml:$name:output_layers=[$i] --label gender 20 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task coughvidsex --pretrain $name --dim $dim 21 | 22 | python -m src.benchmark.processing.coswara_processing --pretrain evar:$base:config/hubert.yaml:$name:output_layers=[$i] --label smoker 23 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task coswarasmoker --pretrain $name --dim $dim --modality cough-shallow 24 | python -m src.benchmark.processing.coswara_processing --pretrain evar:$base:config/hubert.yaml:$name:output_layers=[$i] --label sex 25 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task coswarasex --pretrain $name --dim $dim --modality cough-shallow 26 | 27 | done 28 | -------------------------------------------------------------------------------- /plugin/OPERA/evar_wavlm.sh: -------------------------------------------------------------------------------- 1 | base=$EVAR 2 | dim=768 3 | GPU=0 4 | # filename=$(basename $weight) 5 | # "$(basename "$(dirname "$weight")")_${filename%.*}" 6 | 7 | for i in 1 2 3 4 5 6 7 8 9 10 11 12; do 8 | 9 | name="WavLM_$i" 10 | python -m src.benchmark.processing.copd_processing --pretrain evar:$base:config/wavlm.yaml:$name:+output_layers=[$i] 11 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task copd --pretrain $name --dim $dim 12 | python -m src.benchmark.processing.icbhi_processing --pretrain evar:$base:config/wavlm.yaml:$name:+output_layers=[$i] 13 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task icbhidisease --pretrain $name --dim $dim 14 | python -m src.benchmark.processing.kauh_processing --pretrain evar:$base:config/wavlm.yaml:$name:+output_layers=[$i] 15 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task kauh --pretrain $name --dim $dim 16 | 17 | python -m src.benchmark.processing.coughvid_processing --pretrain evar:$base:config/wavlm.yaml:$name:+output_layers=[$i] --label covid 18 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task coughvidcovid --pretrain $name --dim $dim 19 | python -m src.benchmark.processing.coughvid_processing --pretrain evar:$base:config/wavlm.yaml:$name:+output_layers=[$i] --label gender 20 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task coughvidsex --pretrain $name --dim $dim 21 | 22 | python -m src.benchmark.processing.coswara_processing --pretrain evar:$base:config/wavlm.yaml:$name:+output_layers=[$i] --label smoker 23 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task coswarasmoker --pretrain $name --dim $dim --modality cough-shallow 24 | python -m src.benchmark.processing.coswara_processing --pretrain evar:$base:config/wavlm.yaml:$name:+output_layers=[$i] --label sex 25 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task coswarasex --pretrain $name --dim $dim --modality cough-shallow 26 | 27 | done 28 | -------------------------------------------------------------------------------- /plugin/OPERA/evar_atst_frame.sh: 
-------------------------------------------------------------------------------- 1 | base=$EVAR 2 | weight=${base}/external/atstframe_base.ckpt 3 | dim=9216 4 | name=ATST-Frame 5 | GPU=0 6 | # filename=$(basename $weight) 7 | # "$(basename "$(dirname "$weight")")_${filename%.*}" 8 | 9 | python -m src.benchmark.processing.copd_processing --pretrain evar:$base:config/atst_frame.yaml:$name:weight_file=$weight 10 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task copd --pretrain $name --dim $dim 11 | python -m src.benchmark.processing.icbhi_processing --pretrain evar:$base:config/atst_frame.yaml:$name:weight_file=$weight 12 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task icbhidisease --pretrain $name --dim $dim 13 | python -m src.benchmark.processing.kauh_processing --pretrain evar:$base:config/atst_frame.yaml:$name:weight_file=$weight 14 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task kauh --pretrain $name --dim $dim 15 | 16 | python -m src.benchmark.processing.coughvid_processing --pretrain evar:$base:config/atst_frame.yaml:$name:weight_file=$weight --label covid 17 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task coughvidcovid --pretrain $name --dim $dim 18 | python -m src.benchmark.processing.coughvid_processing --pretrain evar:$base:config/atst_frame.yaml:$name:weight_file=$weight --label gender 19 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task coughvidsex --pretrain $name --dim $dim 20 | 21 | python -m src.benchmark.processing.coswara_processing --pretrain evar:$base:config/atst_frame.yaml:$name:weight_file=$weight --label smoker 22 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task coswarasmoker --pretrain $name --dim $dim --modality cough-shallow 23 | python -m src.benchmark.processing.coswara_processing --pretrain evar:$base:config/atst_frame.yaml:$name:weight_file=$weight --label sex 24 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task coswarasex --pretrain $name --dim $dim --modality cough-shallow 25 | 26 | -------------------------------------------------------------------------------- /plugin/OPERA/evar_beats_plus.sh: -------------------------------------------------------------------------------- 1 | base=$EVAR 2 | weight=${base}/external/BEATs_iter3_plus_AS2M.pt 3 | dim=768 4 | name=BEATs 5 | GPU=0 6 | # filename=$(basename $weight) 7 | # "$(basename "$(dirname "$weight")")_${filename%.*}" 8 | 9 | python -m src.benchmark.processing.copd_processing --pretrain evar:$base:config/beats_plus.yaml:$name:weight_file=$weight 10 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task copd --pretrain $name --dim $dim 11 | python -m src.benchmark.processing.icbhi_processing --pretrain evar:$base:config/beats_plus.yaml:$name:weight_file=$weight 12 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task icbhidisease --pretrain $name --dim $dim 13 | python -m src.benchmark.processing.kauh_processing --pretrain evar:$base:config/beats_plus.yaml:$name:weight_file=$weight 14 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task kauh --pretrain $name --dim $dim 15 | 16 | python -m src.benchmark.processing.coughvid_processing --pretrain evar:$base:config/beats_plus.yaml:$name:weight_file=$weight --label covid 17 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task coughvidcovid --pretrain $name --dim $dim 18 | python -m src.benchmark.processing.coughvid_processing --pretrain 
evar:$base:config/beats_plus.yaml:$name:weight_file=$weight --label gender 19 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task coughvidsex --pretrain $name --dim $dim 20 | 21 | python -m src.benchmark.processing.coswara_processing --pretrain evar:$base:config/beats_plus.yaml:$name:weight_file=$weight --label smoker 22 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task coswarasmoker --pretrain $name --dim $dim --modality cough-shallow 23 | python -m src.benchmark.processing.coswara_processing --pretrain evar:$base:config/beats_plus.yaml:$name:weight_file=$weight --label sex 24 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task coswarasex --pretrain $name --dim $dim --modality cough-shallow 25 | 26 | -------------------------------------------------------------------------------- /evar/ar_wavlm.py: -------------------------------------------------------------------------------- 1 | """Wrapper code for: 2 | 3 | WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing 4 | 5 | ## Reference 6 | - [1] https://arxiv.org/abs/2110.13900 7 | - [2] https://huggingface.co/microsoft/wavlm-large 8 | - [3] https://github.com/microsoft/unilm/tree/master/wavlm 9 | - [4] https://github.com/huggingface/transformers/blob/main/src/transformers/models/wavlm/modeling_wavlm.py 10 | """ 11 | 12 | from evar.ar_base import BaseAudioRepr, temporal_pooling 13 | import logging 14 | import torch 15 | try: 16 | from transformers import WavLMModel, Wav2Vec2Processor 17 | except: 18 | logging.error('Install transformers.\n>>> pip install transformers') 19 | 20 | 21 | class AR_WavLM(BaseAudioRepr): 22 | 23 | def __init__(self, cfg): 24 | super().__init__(cfg=cfg) 25 | 26 | self.processor = Wav2Vec2Processor.from_pretrained('facebook/wav2vec2-base-960h') # instead of cfg.pretrained_model because non-ft models fail. wav2vec2-base-960h should be fine for preprocessing. 27 | self.backbone = WavLMModel.from_pretrained(cfg.pretrained_model) 28 | 29 | def encode_frames(self, batch_audio): 30 | device = batch_audio.device 31 | preprocessed = self.processor(batch_audio.cpu().numpy(), return_tensors="pt", sampling_rate=16000).input_values 32 | preprocessed = preprocessed[0] if preprocessed.shape[0] == 1 else preprocessed # [1, B, raw wave length] -> [B, raw wave length] 33 | preprocessed = preprocessed.to(device) 34 | hidden_states = self.backbone(preprocessed, output_hidden_states=True).hidden_states # [B, T, D] 35 | # stack layer outputs 36 | states_to_stack = [hidden_states[index] for index in self.cfg.output_layers] if self.cfg.output_layers else hidden_states 37 | features = torch.cat(states_to_stack, axis=-1) 38 | return features.transpose(1, 2) # [B, D, T] 39 | 40 | def forward(self, batch_audio): 41 | return temporal_pooling(self, self.encode_frames(batch_audio)) 42 | -------------------------------------------------------------------------------- /plugin/MARBLE/evar_marble.sh: -------------------------------------------------------------------------------- 1 | # 2 | NAME=$1 3 | WEIGHT=$2 4 | SEED=42 5 | ITER=5 6 | FEATURES=768 7 | FEAT_NAME=$NAME 8 | 9 | if [ $# -gt 2 ]; then 10 | SEED=$3 11 | echo "Seed = $SEED." 12 | fi 13 | if [ $# -gt 3 ]; then 14 | ITER=$4 15 | echo "Number of iteration = $ITER." 16 | fi 17 | if [ $# -gt 4 ]; then 18 | FEAT_NAME=$5 19 | echo "Feature name = $FEAT_NAME." 20 | fi 21 | if [ $# -gt 5 ]; then 22 | FEATURES=$6 23 | echo "Num_features = $FEATURES." 
24 | fi 25 | 26 | OPTION="dataset.pre_extract.output_dir=outputs/feat/evar_"$FEAT_NAME"_feats,,dataset.input_dir=outputs/feat/evar_"$FEAT_NAME"_feats,,dataset.pre_extract.feature_extractor.pretrain.evar_config=$EVAR/config/$NAME.yaml,,dataset.pre_extract.feature_extractor.pretrain.weight=$WEIGHT,,dataset.pre_extract.feature_extractor.pretrain.num_features=$FEATURES" 27 | 28 | #GS 29 | TASKS="EMO GTZAN MTT" 30 | for task in $TASKS; do 31 | python . extract -c configs/evar/$task.yaml -o $OPTION",,trainer.seed=$SEED" 32 | for i in $(seq $ITER); do 33 | python . probe -c configs/evar/$task.yaml -o $OPTION",,trainer.seed=$SEED" 34 | SEED=$((SEED + 1)) 35 | done 36 | done 37 | 38 | python . extract -c configs/evar/VocalSetS.yaml -o $OPTION 39 | TASKS="VocalSetS VocalSetT" 40 | for task in $TASKS; do 41 | for i in $(seq $ITER); do 42 | python . probe -c configs/evar/$task.yaml -o $OPTION",,trainer.seed=$SEED" 43 | SEED=$((SEED + 1)) 44 | done 45 | done 46 | 47 | python . extract -c configs/evar/NSynthI.yaml -o $OPTION 48 | TASKS="NSynthI NSynthP" 49 | for task in $TASKS; do 50 | for i in $(seq $ITER); do 51 | python . probe -c configs/evar/$task.yaml -o $OPTION",,trainer.seed=$SEED" 52 | SEED=$((SEED + 1)) 53 | done 54 | done 55 | 56 | python . extract -c configs/evar/MTGGenre.yaml -o $OPTION 57 | TASKS="MTGGenre MTGInstrument MTGMood MTGTop50" 58 | for task in $TASKS; do 59 | for i in $(seq $ITER); do 60 | python . probe -c configs/evar/$task.yaml -o $OPTION",,trainer.seed=$SEED" 61 | SEED=$((SEED + 1)) 62 | done 63 | done 64 | -------------------------------------------------------------------------------- /plugin/OPERA/evar_wav2vec2.sh: -------------------------------------------------------------------------------- 1 | base=$EVAR 2 | dim=512 3 | GPU=0 4 | # filename=$(basename $weight) 5 | # "$(basename "$(dirname "$weight")")_${filename%.*}" 6 | 7 | for i in 1 2 3 4 5 6 7 8 9 10 11 12; do 8 | 9 | name="wav2vec2_$i" 10 | python -m src.benchmark.processing.copd_processing --pretrain evar:$base:config/wav2vec2feature.yaml:$name:+output_layers=[$i] 11 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task copd --pretrain $name --dim $dim 12 | python -m src.benchmark.processing.icbhi_processing --pretrain evar:$base:config/wav2vec2feature.yaml:$name:+output_layers=[$i] 13 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task icbhidisease --pretrain $name --dim $dim 14 | python -m src.benchmark.processing.kauh_processing --pretrain evar:$base:config/wav2vec2feature.yaml:$name:+output_layers=[$i] 15 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task kauh --pretrain $name --dim $dim 16 | 17 | python -m src.benchmark.processing.coughvid_processing --pretrain evar:$base:config/wav2vec2feature.yaml:$name:+output_layers=[$i] --label covid 18 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task coughvidcovid --pretrain $name --dim $dim 19 | python -m src.benchmark.processing.coughvid_processing --pretrain evar:$base:config/wav2vec2feature.yaml:$name:+output_layers=[$i] --label gender 20 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task coughvidsex --pretrain $name --dim $dim 21 | 22 | python -m src.benchmark.processing.coswara_processing --pretrain evar:$base:config/wav2vec2feature.yaml:$name:+output_layers=[$i] --label smoker 23 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task coswarasmoker --pretrain $name --dim $dim --modality cough-shallow 24 | python -m 
src.benchmark.processing.coswara_processing --pretrain evar:$base:config/wav2vec2feature.yaml:$name:+output_layers=[$i] --label sex 25 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task coswarasex --pretrain $name --dim $dim --modality cough-shallow 26 | 27 | done 28 | -------------------------------------------------------------------------------- /evar/ar_hubert.py: -------------------------------------------------------------------------------- 1 | """Wrapper code for: 2 | 3 | HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units 4 | 5 | ## Reference 6 | - [1] https://ai.facebook.com/blog/hubert-self-supervised-representation-learning-for-speech-recognition-generation-and-compression/ 7 | - [2] https://huggingface.co/facebook/hubert-large-ls960-ft 8 | - [3] https://github.com/huggingface/transformers/blob/main/src/transformers/models/hubert/modeling_hubert.py 9 | """ 10 | 11 | from evar.ar_base import BaseAudioRepr, temporal_pooling 12 | import logging 13 | import torch 14 | try: 15 | from transformers import HubertModel, Wav2Vec2Processor 16 | except: 17 | logging.error('Install transformers.\n>>> pip install transformers') 18 | 19 | 20 | class AR_Hubert(BaseAudioRepr): 21 | 22 | def __init__(self, cfg): 23 | super().__init__(cfg=cfg) 24 | 25 | self.processor = Wav2Vec2Processor.from_pretrained('facebook/wav2vec2-base-960h') 26 | # instead of cfg.pretrained_model because non-ft models fail. wav2vec2-base-960h should be fine for preprocessing. 27 | self.backbone = HubertModel.from_pretrained(cfg.pretrained_model) 28 | 29 | def encode_frames(self, batch_audio): 30 | device = batch_audio.device 31 | preprocessed = self.processor(batch_audio.cpu().numpy(), return_tensors="pt", sampling_rate=16000).input_values 32 | preprocessed = preprocessed[0] if preprocessed.shape[0] == 1 else preprocessed # [1, B, raw wave length] -> [B, raw wave length] 33 | preprocessed = preprocessed.to(device) 34 | hidden_states = self.backbone(preprocessed, output_hidden_states=True).hidden_states # [B, T, D] 35 | # stack layer outputs 36 | states_to_stack = [hidden_states[index] for index in self.cfg.output_layers] if self.cfg.output_layers else hidden_states 37 | features = torch.cat(states_to_stack, axis=-1) 38 | return features.transpose(1, 2) # [B, D, T] 39 | 40 | def forward(self, batch_audio): 41 | return temporal_pooling(self, self.encode_frames(batch_audio)) 42 | -------------------------------------------------------------------------------- /plugin/OPERA/evar_m2d_layers.sh: -------------------------------------------------------------------------------- 1 | base=$EVAR 2 | weight=$1 3 | dim=$2 4 | filename=$(basename $weight) 5 | name="$(basename "$(dirname "$weight")")_${filename%.*}" 6 | 7 | for i in 0 1 2 3 4 5 6 7 8 9 10 11; do 8 | 9 | name="M2D_$i" 10 | 11 | python -m src.benchmark.processing.copd_processing --pretrain evar:$base:config/m2d.yaml:$name:weight_file=$weight,+output_layers=[$i] 12 | CUDA_VISIBLE_DEVICES=0 python -m src.benchmark.linear_eval --task copd --pretrain $name --dim $dim 13 | python -m src.benchmark.processing.icbhi_processing --pretrain evar:$base:config/m2d.yaml:$name:weight_file=$weight,+output_layers=[$i] 14 | CUDA_VISIBLE_DEVICES=0 python -m src.benchmark.linear_eval --task icbhidisease --pretrain $name --dim $dim 15 | python -m src.benchmark.processing.kauh_processing --pretrain evar:$base:config/m2d.yaml:$name:weight_file=$weight,+output_layers=[$i] 16 | CUDA_VISIBLE_DEVICES=0 python -m src.benchmark.linear_eval 
--task kauh --pretrain $name --dim $dim 17 | 18 | python -m src.benchmark.processing.coughvid_processing --pretrain evar:$base:config/m2d.yaml:$name:weight_file=$weight,+output_layers=[$i] --label covid 19 | CUDA_VISIBLE_DEVICES=0 python -m src.benchmark.linear_eval --task coughvidcovid --pretrain $name --dim $dim 20 | python -m src.benchmark.processing.coughvid_processing --pretrain evar:$base:config/m2d.yaml:$name:weight_file=$weight,+output_layers=[$i] --label gender 21 | CUDA_VISIBLE_DEVICES=0 python -m src.benchmark.linear_eval --task coughvidsex --pretrain $name --dim $dim 22 | 23 | python -m src.benchmark.processing.coswara_processing --pretrain evar:$base:config/m2d.yaml:$name:weight_file=$weight,+output_layers=[$i] --label smoker 24 | CUDA_VISIBLE_DEVICES=0 python -m src.benchmark.linear_eval --task coswarasmoker --pretrain $name --dim $dim --modality cough-shallow 25 | python -m src.benchmark.processing.coswara_processing --pretrain evar:$base:config/m2d.yaml:$name:weight_file=$weight,+output_layers=[$i] --label sex 26 | CUDA_VISIBLE_DEVICES=0 python -m src.benchmark.linear_eval --task coswarasex --pretrain $name --dim $dim --modality cough-shallow 27 | 28 | done 29 | -------------------------------------------------------------------------------- /evar/ar_vggish.py: -------------------------------------------------------------------------------- 1 | """Wrapper code for: 2 | 3 | CNN Architectures for Large-Scale Audio Classification 4 | 5 | ## References 6 | - [1] https://research.google/pubs/pub45611/ 7 | - [2] VGGish: https://github.com/tcvrick/audioset-vggish-tensorflow-to-pytorch/blob/master/vggish.py 8 | - [3] VGG: https://github.com/pytorch/vision/blob/master/torchvision/models/vgg.py 9 | """ 10 | 11 | from evar.ar_base import (BaseAudioRepr, temporal_pooling) 12 | import torch 13 | import numpy as np 14 | import logging 15 | try: 16 | from external.tcvrick_vggish import vggish 17 | from external.tcvrick_vggish.audioset import vggish_input 18 | except: 19 | pass # logging.error('Make your copy of VGGish under external folder. Check Preparing-models.md for the details.') 20 | 21 | 22 | class AR_VGGish(BaseAudioRepr): 23 | def __init__(self, cfg, vggish_class=None): 24 | super().__init__(cfg=cfg) 25 | 26 | self.vggish = vggish.VGGish() if vggish_class is None else vggish_class() 27 | weight_file = 'external/pytorch_vggish.pth' 28 | logging.info(f' using pretrained weight: {weight_file}') 29 | self.vggish.load_state_dict(torch.load(weight_file)) 30 | 31 | def to_audio_features(self, batch_audio): 32 | # raw audio -> spectrogram 33 | device = batch_audio.device 34 | X = [vggish_input.waveform_to_examples(x.cpu().numpy(), self.cfg.sample_rate) for x in batch_audio] 35 | X = torch.tensor(np.array(X)).float().to(device) # ex.) [256, 7, 96, 64] if fsd50k. [B,Frame,T,F] 36 | return X 37 | 38 | def encode_frames(self, batch_audio): 39 | X = self.to_audio_features(batch_audio) 40 | Xs = [self.vggish(X[:, i:i+1]) for i in range(X.shape[1])] 41 | X = torch.stack(Xs, dim=2) # [B, D] x Frame -> [B, D, Frame] 42 | return X 43 | 44 | def forward(self, batch_audio): 45 | return temporal_pooling(self, self.encode_frames(batch_audio)) # [B, D] 46 | 47 | 48 | class AR_VGGish_4K(AR_VGGish): 49 | def __init__(self, cfg): 50 | super().__init__(cfg=cfg) 51 | 52 | # Remove all the layers after the first FC layer. 
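        # Assuming the tcvrick VGGish port keeps the standard head of six modules --
        # Linear(..., 4096), ReLU, Linear(4096, 4096), ReLU, Linear(4096, 128), ReLU --
        # the slice below keeps only the first Linear + ReLU pair, so this variant
        # outputs 4096-d ("4K") features instead of the 128-d VGGish embedding.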
53 | self.vggish.fc = torch.nn.Sequential(*list(self.vggish.fc.children())[:-4]) 54 | -------------------------------------------------------------------------------- /evar/ar_esresnext_fbsp.py: -------------------------------------------------------------------------------- 1 | """Wrapper code for: 2 | 3 | ESResNe(X)t-fbsp: Learning Robust Time-Frequency Transformation of Audio 4 | 5 | ## Reference 6 | - [1] https://arxiv.org/abs/2104.11587 7 | - [2] https://github.com/AndreyGuzhov/ESResNeXt-fbsp 8 | """ 9 | 10 | from evar.ar_base import BaseAudioRepr 11 | import torch 12 | import librosa 13 | import numpy as np 14 | import logging 15 | from evar.model_utils import ensure_weights 16 | try: 17 | from external.esresnext.model.esresnet_fbsp import ESResNeXtFBSP 18 | except: 19 | logging.info('Make your copy of ESResNeXt-fbsp under external folder. Check Preparing-models.md for the details.') 20 | class ESResNeXtFBSP: 21 | pass 22 | 23 | 24 | class ESResNeXtFBSP_(ESResNeXtFBSP): 25 | 26 | def forward_reduced_featues(self, x, tfm=None): 27 | x = self._forward_pre_processing(x) 28 | if tfm is not None: 29 | x = tfm(x) 30 | x = self._forward_features(x) 31 | x = self._forward_reduction(x) 32 | return x 33 | 34 | 35 | class AR_ESResNeXtFBSP(BaseAudioRepr): 36 | def __init__(self, cfg): 37 | super().__init__(cfg=cfg) 38 | 39 | self.backbone = ESResNeXtFBSP_( 40 | **{"n_fft": 2048, 41 | "hop_length": 561, 42 | "win_length": 1654, 43 | "window": "blackmanharris", 44 | "normalized": True, 45 | "onesided": True, 46 | "spec_height": -1, 47 | "spec_width": -1, 48 | "num_classes": 527, 49 | "apply_attention": True, 50 | } 51 | ) 52 | ensure_weights('external/ESResNeXtFBSP_AudioSet.pt', 53 | 'https://github.com/AndreyGuzhov/ESResNeXt-fbsp/releases/download/v0.1/ESResNeXtFBSP_AudioSet.pt') 54 | self.backbone.load_state_dict(torch.load('external/ESResNeXtFBSP_AudioSet.pt')) 55 | 56 | def encode_frames(self, batch_audio): 57 | X = self.forward(batch_audio) 58 | X = X.unsqueeze(1) # Already have temporally pooled, just adding extra frame dimension [B, 2048] -> [B, 1, 2048] 59 | return X 60 | 61 | def forward(self, batch_audio): 62 | return self.backbone.forward_reduced_featues(batch_audio * 32767) # [B, 2048] 63 | -------------------------------------------------------------------------------- /external/wavcaps.patch: -------------------------------------------------------------------------------- 1 | diff --git a/retrieval/models/ase_model.py b/retrieval/models/ase_model.py 2 | index 04e2d02..ca8ae98 100644 3 | --- a/retrieval/models/ase_model.py 4 | +++ b/retrieval/models/ase_model.py 5 | @@ -6,12 +6,12 @@ 6 | 7 | import torch 8 | import torch.nn as nn 9 | -from models.audio_encoder import AudioEncoder 10 | -from models.text_encoder import TextEncoder 11 | +from ..models.audio_encoder import AudioEncoder 12 | +from ..models.text_encoder import TextEncoder 13 | import torch.nn.functional as F 14 | import copy 15 | -from tools.losses import AudioTextContrastiveLoss, NTXent 16 | -from tools.utils import remove_grad 17 | +from ..tools.losses import AudioTextContrastiveLoss, NTXent 18 | +from ..tools.utils import remove_grad 19 | 20 | 21 | class ASE(nn.Module): 22 | diff --git a/retrieval/models/audio_encoder.py b/retrieval/models/audio_encoder.py 23 | index e3b9394..2201e7f 100644 24 | --- a/retrieval/models/audio_encoder.py 25 | +++ b/retrieval/models/audio_encoder.py 26 | @@ -6,8 +6,8 @@ 27 | 28 | import torch 29 | import torch.nn as nn 30 | -from models.cnns import ResNet38, Cnn14 31 | -from models.htsat 
import HTSAT_Swin_Transformer 32 | +from ..models.cnns import ResNet38, Cnn14 33 | +from ..models.htsat import HTSAT_Swin_Transformer 34 | 35 | 36 | class AudioEncoder(nn.Module): 37 | diff --git a/retrieval/models/cnns.py b/retrieval/models/cnns.py 38 | index be2ed5a..61ccd7c 100644 39 | --- a/retrieval/models/cnns.py 40 | +++ b/retrieval/models/cnns.py 41 | @@ -12,7 +12,7 @@ import torch 42 | import torch.nn as nn 43 | import torch.nn.functional as F 44 | from torchlibrosa.augmentation import SpecAugmentation 45 | -from models.feature_extractor import AudioFeature 46 | +from ..models.feature_extractor import AudioFeature 47 | 48 | 49 | def init_layer(layer): 50 | diff --git a/retrieval/models/htsat.py b/retrieval/models/htsat.py 51 | index b5a9ff2..4795f45 100644 52 | --- a/retrieval/models/htsat.py 53 | +++ b/retrieval/models/htsat.py 54 | @@ -23,7 +23,7 @@ from torch.nn.init import _calculate_fan_in_and_fan_out 55 | from itertools import repeat 56 | from typing import List 57 | 58 | -from models.feature_extractor import AudioFeature 59 | +from ..models.feature_extractor import AudioFeature 60 | 61 | 62 | def interpolate(x, ratio): 63 | -------------------------------------------------------------------------------- /prepare_wav.py: -------------------------------------------------------------------------------- 1 | """Audio file converter. 2 | 3 | This converts the original audio files found in the source folder recursively, 4 | then store under the destination folder with the same relative path structure. 5 | 6 | The conversion process includes the following steps: 7 | - Stereo to mono 8 | - Resample to the sampling rate 9 | 10 | Usage: 11 | python convert_wav.py /path/to/fsd50k work/16k/fsd50k 16000 12 | python convert_wav.py /path/to/speech_commands_v0.02 work/16k/spcv2 16000 13 | python convert_wav.py /data/A/VoxCeleb1 work/16k/vc1 16000 14 | """ 15 | 16 | from pathlib import Path 17 | from multiprocessing import Pool 18 | import fire 19 | from tqdm import tqdm 20 | import soundfile as sf 21 | import librosa 22 | 23 | 24 | def _converter_worker(args): 25 | subpathname, from_dir, to_dir, sample_rate, verbose = args 26 | from_dir, to_dir = Path(from_dir), Path(to_dir) 27 | to_name = to_dir/subpathname 28 | if verbose: 29 | print(from_dir, '->', to_name) 30 | 31 | # load wav 32 | wav, org_sr = sf.read(from_dir/subpathname, dtype='float32', always_2d=True) 33 | wav = wav.T # (wave length, 1 or 2) -> (1 or 2, wave length) 34 | 35 | # stereo to mono (compatible with librosa) 36 | # ref: https://librosa.org/doc/main/generated/librosa.to_mono.html#librosa.to_mono 37 | wav = wav.mean(axis=0) 38 | 39 | # resample 40 | wav = librosa.resample(wav, orig_sr=org_sr, target_sr=sample_rate) 41 | 42 | # save wav 43 | to_name.parent.mkdir(exist_ok=True, parents=True) 44 | sf.write(to_name, data=wav, samplerate=sample_rate) # subtype=sf.default_subtype('WAV') -- not always wav 45 | 46 | return to_name.name 47 | 48 | 49 | def convert_wav(from_dir, to_dir, sample_rate, suffix='.wav', verbose=False) -> None: 50 | from_dir = str(from_dir) 51 | files = [str(f).replace(from_dir, '') for f in Path(from_dir).glob(f'**/*{suffix}')] 52 | files = [f[1:] if f[0] == '/' else f for f in files] 53 | print(f'Processing {len(files)} {suffix} files at a sampling rate of {sample_rate} Hz...') 54 | assert len(files) > 0 55 | 56 | with Pool() as p: 57 | args = [[f, from_dir, to_dir, sample_rate, verbose] for f in files] 58 | shapes = list(tqdm(p.imap(_converter_worker, args), total=len(args))) 59 | 60 | print('finished.') 
61 | 62 | 63 | if __name__ == "__main__": 64 | fire.Fire(convert_wav) 65 | -------------------------------------------------------------------------------- /evar/utils/download_voxforge.py: -------------------------------------------------------------------------------- 1 | """ 2 | Download VoxForge dataset to your "to_folder". 3 | This code uses a file list from TFDS, downloads .tgz files, and extract them. 4 | The definition of labels and data splits is available in evar/metadata/voxforge.csv. 5 | 6 | Following TFDS implementation for the details. 7 | 8 | ## Usage 9 | 10 | '''sh 11 | python download_voxforge.py 12 | ''' 13 | 14 | ## Reference 15 | 16 | - [1] http://www.voxforge.org/ 17 | - [2] TFDS: https://github.com/tensorflow/datasets/blob/master/tensorflow_datasets/audio/voxforge.py 18 | 19 | @article{maclean2018voxforge, 20 | title={Voxforge}, 21 | author={MacLean, Ken}, 22 | journal={Ken MacLean.[Online]. Available: http://www.voxforge.org/home.[Acedido em 2012]}, 23 | year={2018} 24 | } 25 | """ 26 | 27 | import urllib.request 28 | import shutil 29 | import os 30 | from pathlib import Path 31 | from multiprocessing import Pool 32 | from tqdm import tqdm 33 | import fire 34 | 35 | 36 | TFDS_URL = 'https://storage.googleapis.com/tfds-data/downloads/voxforge/voxforge_urls.txt' 37 | 38 | 39 | def _download_extract_worker(args): 40 | url, filename, dest_path = args 41 | 42 | if (Path(dest_path)/Path(filename).stem).exists(): 43 | #print(' skip', Path(filename).stem) 44 | #print('.', end='') 45 | return 46 | 47 | tmpfile = '/tmp/' + filename 48 | try: 49 | urllib.request.urlretrieve('http://' + url, tmpfile) 50 | except: 51 | print('ERROR to download', url) 52 | return 53 | try: 54 | shutil.unpack_archive(tmpfile, dest_path) 55 | except: 56 | print('ERROR to extract', url) 57 | 58 | os.remove(tmpfile) 59 | 60 | 61 | def download_extract_voxforge(dest_path): 62 | file = urllib.request.urlopen(TFDS_URL) 63 | urls = [line.decode('utf-8').strip() for line in file] 64 | filenames = [url.split('/')[-1] for url in urls] 65 | assert len(set(filenames)) == len(urls) 66 | 67 | print('Downloading voxforge for', len(urls), 'tgz archives.') 68 | Path(dest_path).mkdir(exist_ok=True, parents=True) 69 | with Pool() as p: 70 | args = [[url, filename, dest_path] for url, filename in zip(urls, filenames)] 71 | shapes = list(tqdm(p.imap(_download_extract_worker, args), total=len(args))) 72 | 73 | print('finished.') 74 | 75 | 76 | if __name__ == "__main__": 77 | fire.Fire(download_extract_voxforge) 78 | -------------------------------------------------------------------------------- /app/circor/README_CirCor.md: -------------------------------------------------------------------------------- 1 | # CirCor evaluation 2 | 3 | We provide code to evaluate CirCor with various models. 4 | In addition, the exact stratified data splits used in the paper are provided for reproducibility. 5 | 6 | **NOTE: The code freezes the audio representation model weights.** 7 | 8 | Prepare code and download datasets before your evaluation. 
9 | 10 | ## Prepare codebase 11 | 12 | In this folder `app/circor`, run the following: 13 | 14 | ```sh 15 | git clone https://github.com/Benjamin-Walker/heart-murmur-detection.git 16 | (cd heart-murmur-detection && git checkout 60f5420918b151e06932f70a52649d9562f0be2d) 17 | patch -p1 < patch-heart-murmur-detection.diff 18 | 19 | wget https://raw.githubusercontent.com/nttcslab/m2d/refs/heads/master/app/circor/datalist_stratified_data1.csv 20 | wget https://raw.githubusercontent.com/nttcslab/m2d/refs/heads/master/app/circor/datalist_stratified_data2.csv 21 | wget https://raw.githubusercontent.com/nttcslab/m2d/refs/heads/master/app/circor/datalist_stratified_data3.csv 22 | ``` 23 | 24 | ## Download and rearrange dataset 25 | 26 | In this folder `app/circor`, download the dataset: 27 | 28 | ```sh 29 | wget -r -N -c -np https://physionet.org/files/circor-heart-sound/1.0.3/ 30 | ``` 31 | 32 | Then, do the following to rearrange data files into stratified splits and copy them under `heart-murmur-detection/data` and `../../work/16k/circor`. 33 | 34 | ```sh 35 | python rearrange_data.py 36 | ``` 37 | 38 | It also creates metadata files as `../../evar/metadata/circor[1-3].csv`. 39 | 40 | ## Run evaluations 41 | 42 | In the **root folder of EVAR**, run the scripts `ev_*.sh`. The following is the complete set of command lines for the paper. 43 | 44 | The results will be recorded in `results/circor-scores.csv`. 45 | 46 | ```sh 47 | bash app/circor/ev_ast.sh 1 5 7 0.03 48 | bash app/circor/ev_ast.sh 2 5 7 0.03 49 | bash app/circor/ev_ast.sh 3 5 7 0.03 50 | 51 | bash app/circor/ev_beats.sh 1 5 7 0.03 52 | bash app/circor/ev_beats.sh 2 5 7 0.03 53 | bash app/circor/ev_beats.sh 3 5 7 0.03 54 | 55 | bash app/circor/ev_byola.sh 1 5 7 0.1 56 | bash app/circor/ev_byola.sh 2 5 7 0.1 57 | bash app/circor/ev_byola.sh 3 5 7 0.1 58 | 59 | bash app/circor/ev_m2d.sh m2d_vit_base-80x608p16x16-221006-mr7/checkpoint-300.pth 1 5 7 0.1 60 | bash app/circor/ev_m2d.sh m2d_vit_base-80x608p16x16-221006-mr7/checkpoint-300.pth 2 5 7 0.1 61 | bash app/circor/ev_m2d.sh m2d_vit_base-80x608p16x16-221006-mr7/checkpoint-300.pth 3 5 7 0.1 62 | ``` 63 | -------------------------------------------------------------------------------- /evar/utils/download_cremad.py: -------------------------------------------------------------------------------- 1 | """ 2 | Download CREMA-D dataset to your "to_folder". 3 | This code uses a file list from TFDS and downloads .wav files only. 4 | The definition of labels and data splits is available in evar/metadata/cremad.csv. 5 | 6 | Following NOSS [2] split. We assign 70 % of speakers (63) as training, 10 % (9) as validation, 7 | and the remaining 20 % (19) as test splits, with no speaker duplication in multiple splits. 8 | 9 | ## Usage 10 | 11 | '''sh 12 | python download_cremad.py 13 | ''' 14 | 15 | ## Reference 16 | 17 | - [1] H. Cao, D. G. Cooper, M. K. Keutmann, R. C. Gur, A. Nenkova and R. Verma, "CREMA-D: Crowd-Sourced Emotional Multimodal Actors Dataset," in IEEE Transactions on Affective Computing, vol. 5, no. 4, pp. 377-390, 1 Oct.-Dec. 2014, doi: 10.1109/TAFFC.2014.2336244. 18 | - [2] J. Shor, A. Jansen, R. Maor, O. Lang, O. Tuval, F. d. C. Quitry, M. Tagliasacchi, I. Shavitt, D. Emanuel, and Y. Haviv, “Towards learning a universal non-semantic representation of speech,” in Interspeech, Oct 2020. 
19 | - [3] https://github.com/tensorflow/datasets/blob/master/tensorflow_datasets/audio/crema_d.py 20 | """ 21 | 22 | import urllib.request 23 | from pathlib import Path 24 | from multiprocessing import Pool 25 | from tqdm import tqdm 26 | import fire 27 | 28 | 29 | TFDS_URL = 'https://storage.googleapis.com/tfds-data/manual_checksums/crema_d.txt' 30 | 31 | 32 | def _download_worker(args): 33 | url, dest_path = args 34 | filename = url.split('/')[-1] 35 | 36 | if (Path(dest_path)/Path(filename).name).exists(): 37 | print(' skip', Path(filename).stem) 38 | return 39 | 40 | destfile = f'{dest_path}/{filename}' 41 | try: 42 | urllib.request.urlretrieve(url, destfile) 43 | except: 44 | print('ERROR to download', url) 45 | 46 | 47 | def download_extract_cremad(dest_path): 48 | lines = urllib.request.urlopen(TFDS_URL) 49 | urls = [line.decode('utf-8').strip().split()[0] for line in lines] 50 | urls = [url for url in urls if url[-4:] == '.wav'] # wav only, excluding summaryTable.csv 51 | 52 | print('Downloading CREMA-D for', len(urls), 'wav files.') 53 | Path(dest_path).mkdir(exist_ok=True, parents=True) 54 | with Pool() as p: 55 | args = [[url, dest_path] for url in urls] 56 | shapes = list(tqdm(p.imap(_download_worker, args), total=len(args))) 57 | 58 | print('finished.') 59 | 60 | 61 | if __name__ == "__main__": 62 | fire.Fire(download_extract_cremad) 63 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /evar/ar_htsat.py: -------------------------------------------------------------------------------- 1 | """Wrapper code for: 2 | 3 | HTS-AT: A Hierarchical Token-Semantic Audio Transformer for Sound Classification and Detection 4 | 5 | ## Reference 6 | - [1] https://arxiv.org/abs/2202.00874 7 | - [2] https://github.com/RetroCirce/HTS-Audio-Transformer 8 | """ 9 | 10 | from evar.ar_base import BaseAudioRepr, temporal_pooling 11 | import sys 12 | import logging 13 | import torch 14 | try: 15 | import os 16 | evar_home = os.getenv('EVAR', '') 17 | sys.path.append(os.path.join(evar_home, 'external/htsat')) 18 | sys.path.append('../../external/htsat') 19 | from model.htsat import HTSAT_Swin_Transformer 20 | import config 21 | except: 22 | pass # please install HTS-AT 23 | 24 | 25 | class AR_HTSAT(BaseAudioRepr): 26 | 27 | def __init__(self, cfg): 28 | super().__init__(cfg=cfg) 29 | 30 | # load the pre-trained checkpoints 31 | checkpoint = torch.load(cfg.weight_file) 32 | logging.info(f' Using weight_file: {cfg.weight_file}') 33 | 34 | self.backbone = HTSAT_Swin_Transformer( 35 | spec_size=config.htsat_spec_size, 36 | patch_size=config.htsat_patch_size, 37 | in_chans=1, 38 | num_classes=config.classes_num, 39 | window_size=config.htsat_window_size, 40 | config = config, 41 | depths = config.htsat_depth, 42 | embed_dim = config.htsat_dim, 43 | patch_stride=config.htsat_stride, 44 | num_heads=config.htsat_num_head) 45 | 46 | states, L = {}, len('sed_model.') 47 | for k in checkpoint["state_dict"]: 48 | new_k = k[L:] if k.startswith('sed_model.') else k 49 | states[new_k] = checkpoint["state_dict"][k] 50 | self.backbone.load_state_dict(states) 51 | # cfg = checkpoint['config'] 52 | 53 | def encode_frames(self, batch_audio): 54 | assert False, 'encode_frames for HTS-AT is not supported for now' 55 | 56 | def forward(self, batch_audio): 57 | # Split long audio into pieces and average the features. 58 | features = [] 59 | for chunk_index in range((batch_audio.shape[-1] + config.clip_samples - 1) // config.clip_samples): 60 | chunk = batch_audio[:, chunk_index*config.clip_samples:(chunk_index + 1)*config.clip_samples] 61 | features.append(self.backbone(chunk, mixup_lambda=None, infer_mode=True)['latent_output']) 62 | features = torch.stack(features) 63 | features = torch.mean(features, dim=0) 64 | return features -------------------------------------------------------------------------------- /app/icbhi_sprs/README_ICBHI_SPRS.md: -------------------------------------------------------------------------------- 1 | # ICBHI 2017 and SPRSound evaluation 2 | 3 | We provide code to evaluate ICBHI 2017 and SPRSound with various models. 4 | 5 | **NOTE: The code freezes the audio representation model weights.** 6 | 7 | Prepare code and download datasets before running the evaluation. 
8 | 9 | ## Prepare code 10 | 11 | ```sh 12 | pip install torchinfo 13 | git clone https://github.com/ilyassmoummad/scl_icbhi2017.git 14 | cd scl_icbhi2017 15 | git reset --hard 915c1120719a9357d662c5fe484bce7fbe845139 16 | mv dataset.py augmentations.py utils.py losses.py args.py .. 17 | mv data .. 18 | mv ce.py .. 19 | cd .. 20 | patch -p2 < patch_scl_icbhi2017_evar.diff 21 | rm -fr scl_icbhi2017 22 | ``` 23 | 24 | ## Download ICBHI 2017 25 | 26 | ```sh 27 | wget https://bhichallenge.med.auth.gr/sites/default/files/ICBHI_final_database/ICBHI_final_database.zip --no-check-certificate 28 | 29 | unzip ICBHI_final_database.zip | awk 'BEGIN {ORS=" "} {if(NR%10==0)print "."}' 30 | mv ICBHI_final_database/* data/ICBHI 31 | rmdir ICBHI_final_database 32 | ``` 33 | 34 | ## Download SPRS 35 | 36 | ```sh 37 | git clone https://github.com/SJTU-YONGFU-RESEARCH-GRP/SPRSound.git 38 | (cd SPRSound && git reset --hard 45b0d5d435ff320c46585762fa1090afd0ebb318) 39 | cp -r SPRSound/train_wav SPRSound/test_wav data/SPRS/ 40 | ``` 41 | 42 | ## Run evaluations 43 | 44 | The following examples run evaluations on ICBHI 2017 for the models. 45 | 46 | ```sh 47 | bash ev_icbhi_beats.sh 48 | bash ev_icbhi_m2d.sh ../../m2d_clap_vit_base-80x1001p16x16-240128_AS-FT_enconly/weights_ep67it3124-0.48558.pth 49 | ``` 50 | 51 | Find the shell scripts for more evaluations. 52 | 53 | **NOTE: All the evaluations employ a transformer head except for ev_icbhi_mlp_m2d.sh, which uses MLP instead.** 54 | 55 | The following is the list of command lines for reproduction. 56 | 57 | ```sh 58 | bash ev_icbhi_ast.sh 5 59 | bash ev_icbhi_beats.sh 5 60 | bash ev_icbhi_byola.sh 5 61 | bash ev_icbhi_opera.sh 5 62 | bash ev_icbhi_m2d.sh m2d_vit_base-80x608p16x16-221006-mr7/checkpoint-300.pth 5 63 | bash ev_icbhi_m2d.sh m2d_vit_base-80x200p16x4-230529/checkpoint-300.pth 5 64 | 65 | bash ev_sprs_ast.sh 5 66 | bash ev_sprs_beats.sh 5 67 | bash ev_sprs_byola.sh 5 68 | bash ev_sprs_opera.sh 5 69 | bash ev_sprs_m2d.sh m2d_vit_base-80x608p16x16-221006-mr7/checkpoint-300.pth 5 70 | bash ev_sprs_m2d.sh m2d_vit_base-80x200p16x4-230529/checkpoint-300.pth 5 71 | 72 | # Ablations: M2D (16×4, MLP) 73 | bash ev_icbhi_mlp_m2d.sh m2d_vit_base-80x200p16x4-230529/checkpoint-300.pth 5 74 | bash ev_sprs_mlp_m2d.sh m2d_vit_base-80x200p16x4-230529/checkpoint-300.pth 5 75 | ``` 76 | -------------------------------------------------------------------------------- /evar/ar_beats.py: -------------------------------------------------------------------------------- 1 | """Wrapper code for: 2 | 3 | BEATs: Audio Pre-Training with Acoustic Tokenizers 4 | 5 | ## Reference 6 | - [1] https://arxiv.org/abs/2212.09058 7 | - [2] https://github.com/microsoft/unilm/blob/master/beats/README.md 8 | """ 9 | 10 | from evar.ar_base import BaseAudioRepr, temporal_pooling 11 | import sys 12 | import logging 13 | import torch 14 | try: 15 | import os 16 | evar_home = os.getenv('EVAR', '') 17 | sys.path.append(os.path.join(evar_home, 'external/unilm/beats')) 18 | sys.path.append('../../external/unilm/beats') 19 | from Tokenizers import TokenizersConfig, Tokenizers 20 | from BEATs import BEATs, BEATsConfig 21 | except: 22 | pass 23 | 24 | 25 | class AR_BEATs(BaseAudioRepr): 26 | 27 | def __init__(self, cfg): 28 | super().__init__(cfg=cfg) 29 | 30 | # load the pre-trained checkpoints 31 | checkpoint = torch.load(cfg.weight_file) 32 | logging.info(f' Using weight_file: {cfg.weight_file}') 33 | 34 | cfg = BEATsConfig(checkpoint['cfg']) 35 | BEATs_model = BEATs(cfg) 36 | 
BEATs_model.load_state_dict(checkpoint['model']) 37 | self.backbone = BEATs_model.eval() 38 | 39 | def encode_frames(self, batch_audio): 40 | padding_mask = torch.zeros_like(batch_audio).bool() 41 | features = self.backbone.extract_features(batch_audio, padding_mask=padding_mask)[0] 42 | return features.transpose(1, 2) # [B, D, T] 43 | 44 | def forward(self, batch_audio): 45 | x = self.encode_frames(batch_audio) 46 | return x.mean(dim=-1) # [B, D, T] -> [B, D] 47 | 48 | 49 | class AR_BEATsTokenizer(BaseAudioRepr): 50 | """EXPERIMENTAL""" 51 | 52 | def __init__(self, cfg): 53 | super().__init__(cfg=cfg) 54 | 55 | # load the pre-trained checkpoints 56 | checkpoint = torch.load(cfg.weight_file) 57 | logging.info(f' Using weight_file: {cfg.weight_file}') 58 | 59 | cfg = TokenizersConfig(checkpoint['cfg']) 60 | BEATs_tokenizer = Tokenizers(cfg) 61 | BEATs_tokenizer.load_state_dict(checkpoint['model']) 62 | self.backbone = BEATs_tokenizer.eval() 63 | 64 | def encode_frames(self, batch_audio): 65 | padding_mask = torch.zeros_like(batch_audio).bool() 66 | features = self.backbone.extract_labels(batch_audio, padding_mask=padding_mask) 67 | features = features.reshape(batch_audio.shape[0], -1).unsqueeze(-1) 68 | return features.to(float) # [B, D, T] 69 | 70 | def forward(self, batch_audio): 71 | x = self.encode_frames(batch_audio) 72 | return x.mean(dim=-1) # [B, D, T] -> [B, D] 73 | -------------------------------------------------------------------------------- /evar/ar_wavcaps.py: -------------------------------------------------------------------------------- 1 | """Wrapper code for: 2 | 3 | WavCaps: A ChatGPT-Assisted Weakly-Labelled Audio Captioning Dataset for Audio-Language Multimodal Research 4 | 5 | ## Reference 6 | - [1] https://arxiv.org/abs/2303.17395 7 | - [2] https://github.com/XinhaoMei/WavCaps 8 | """ 9 | 10 | from evar.ar_base import BaseCLAP 11 | import sys 12 | import torch 13 | try: 14 | sys.path.append('external/WavCaps') 15 | from retrieval.models.ase_model import ASE 16 | except: 17 | pass # please install WavCaps 18 | 19 | 20 | class AR_WavCaps(BaseCLAP): 21 | 22 | def __init__(self, cfg): 23 | super().__init__(cfg=cfg) 24 | cp = torch.load(cfg.weight_file) 25 | config = cp["config"] 26 | config['audio_encoder_args']['pretrained'] = False 27 | model = ASE(config) 28 | model.load_state_dict(cp["model"], strict=False) 29 | self.backbone = model 30 | 31 | def encode_frames(self, batch_audio): 32 | assert False, 'encode_frames for MS CLAP is not supported for now' 33 | 34 | def forward(self, batch_audio): 35 | # Split long audio into pieces and average the features. 
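        # The chunk length below (32000 * 10 samples) corresponds to 10-second clips at 32 kHz,
        # presumably the fixed input length expected by the WavCaps audio encoder. The trailing
        # chunk is zero-padded to this length, and the per-chunk embeddings are mean-pooled
        # into a single clip-level feature.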
36 | features, clip_samples = [], 32000 * 10 37 | for chunk_index in range((batch_audio.shape[-1] + clip_samples - 1) // clip_samples): 38 | chunk = batch_audio[:, chunk_index*clip_samples:(chunk_index + 1)*clip_samples] 39 | if chunk.shape[-1] < clip_samples: # from https://github.com/XinhaoMei/WavCaps/blob/master/retrieval/zero_shot_classification.py 40 | pad_length = clip_samples - chunk.shape[-1] 41 | chunk = torch.nn.functional.pad(chunk, [0, pad_length], "constant", 0.0) 42 | features.append(self.backbone.encode_audio(chunk)) 43 | features = torch.stack(features) 44 | features = torch.mean(features, dim=0) 45 | return features 46 | 47 | def encode_audio(self, batch_audio): 48 | audio_embeddings = self.forward(batch_audio) 49 | return audio_embeddings 50 | 51 | def encode_text(self, batch_text): 52 | text_input = self.backbone.text_encoder.tokenizer(batch_text, 53 | padding='longest', 54 | truncation=True, 55 | max_length=30, 56 | return_tensors="pt").to(self.backbone.text_encoder.device) 57 | text_feats = self.backbone.text_encoder.text_encoder(input_ids=text_input.input_ids, 58 | attention_mask=text_input.attention_mask)[0] 59 | text_feats = self.backbone.text_proj(text_feats[:, 0, :]) 60 | return text_feats 61 | -------------------------------------------------------------------------------- /external/ast_models.patch: -------------------------------------------------------------------------------- 1 | diff --git a/src/models/ast_models.py b/src/models/ast_models.py 2 | index 897d6b5..e542ad2 100644 3 | --- a/ast/src/models/ast_models.py 4 | +++ b/ast/src/models/ast_models.py 5 | @@ -44,7 +44,7 @@ class ASTModel(nn.Module): 6 | :param audioset_pretrain: if pretrain the model with full AudioSet in addition to ImageNet 7 | :param model_size: the model size of AST, should be in [tiny224, small224, base224, base384], base224 and base 384 are same model, but are trained differently during pretraining. 8 | """ 9 | - def __init__(self, label_dim=527, fstride=10, tstride=10, input_fdim=128, input_tdim=1024, imagenet_pretrain=True, audioset_pretrain=False, model_size='base384', verbose=True): 10 | + def __init__(self, label_dim=527, fstride=10, tstride=10, input_fdim=128, input_tdim=1024, imagenet_pretrain=True, audioset_pretrain=False, model_size='base384', verbose=True, pretrained_weight='../../pretrained_models/ast_audioset.pth'): 11 | 12 | super(ASTModel, self).__init__() 13 | assert timm.__version__ == '0.4.5', 'Please use timm == 0.4.5, the code might not be compatible with newer versions.' 
14 | @@ -119,11 +119,11 @@ class ASTModel(nn.Module): 15 | if model_size != 'base384': 16 | raise ValueError('currently only has base384 AudioSet pretrained model.') 17 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 18 | - if os.path.exists('../../pretrained_models/audioset_10_10_0.4593.pth') == False: 19 | + if os.path.exists(pretrained_weight) == False: 20 | # this model performs 0.4593 mAP on the audioset eval set 21 | audioset_mdl_url = 'https://www.dropbox.com/s/cv4knew8mvbrnvq/audioset_0.4593.pth?dl=1' 22 | - wget.download(audioset_mdl_url, out='../../pretrained_models/audioset_10_10_0.4593.pth') 23 | - sd = torch.load('../../pretrained_models/audioset_10_10_0.4593.pth', map_location=device) 24 | + wget.download(audioset_mdl_url, out=pretrained_weight) 25 | + sd = torch.load(pretrained_weight, map_location=device) 26 | audio_model = ASTModel(label_dim=527, fstride=10, tstride=10, input_fdim=128, input_tdim=1024, imagenet_pretrain=False, audioset_pretrain=False, model_size='base384', verbose=False) 27 | audio_model = torch.nn.DataParallel(audio_model) 28 | audio_model.load_state_dict(sd, strict=False) 29 | @@ -178,7 +178,7 @@ class ASTModel(nn.Module): 30 | x = self.v.norm(x) 31 | x = (x[:, 0] + x[:, 1]) / 2 32 | 33 | - x = self.mlp_head(x) 34 | + # x = self.mlp_head(x) 35 | return x 36 | 37 | if __name__ == '__main__': 38 | -------------------------------------------------------------------------------- /evar/ar_coala.py: -------------------------------------------------------------------------------- 1 | """Wrapper code for: 2 | 3 | COALA: Co-Aligned Autoencoders for Learning Semantically Enriched Audio Representations 4 | 5 | ## Note 6 | 7 | - FS is 22,000: https://github.com/xavierfav/coala/blob/master/utils.py#L66 8 | - Fixed the original scaler_top_1000.pkl: https://github.com/xavierfav/coala/issues/3 9 | 10 | ## Reference 11 | - [1] https://arxiv.org/abs/2006.08386 12 | - [2] https://github.com/xavierfav/coala 13 | """ 14 | 15 | from evar.ar_base import (BaseAudioRepr, temporal_pooling) 16 | import torch 17 | import librosa 18 | import numpy as np 19 | import logging 20 | try: 21 | from external.coala.encode import return_loaded_model, scaler 22 | from external.coala.models_t1000 import AudioEncoder 23 | from external.coala.utils import pad 24 | except: 25 | pass # logging.error('Make your copy of COALA under external folder. Check Preparing-models.md for the details.') 26 | 27 | 28 | def _compute_spectrogram(audio, sr=22000, n_mels=96): 29 | """Borrowed from coala/utils.py, removed wav loading to accept raw audio input.""" 30 | # zero pad and compute log mel spec 31 | try: 32 | x = pad(audio, sr) 33 | except ValueError: 34 | x = audio 35 | audio_rep = librosa.feature.melspectrogram(y=x, sr=sr, hop_length=512, n_fft=1024, n_mels=n_mels, power=1.) 
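    # power=1. yields a magnitude (not power) mel spectrogram; the next line applies log
    # compression, with float32 eps added to avoid log(0) on all-zero frames.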
36 | audio_rep = np.log(audio_rep + np.finfo(np.float32).eps) 37 | return audio_rep 38 | 39 | 40 | def _extract_audio_embedding_chunks(model, audio): 41 | """Borrowed from coala/encode.py, modified to accept torch tensor raw audio input.""" 42 | with torch.no_grad(): 43 | device = audio.device 44 | x = _compute_spectrogram(audio.cpu().numpy()) 45 | x_chunks = np.array([scaler.transform(chunk.T) for chunk in 46 | librosa.util.frame(np.asfortranarray(x), frame_length=96, hop_length=96, axis=-1).T]) 47 | x_chunks = torch.unsqueeze(torch.tensor(x_chunks), 1).to(device) 48 | embedding_chunks, embedding_d_chunks = model(x_chunks) 49 | return embedding_chunks, embedding_d_chunks 50 | 51 | 52 | class AR_COALA(BaseAudioRepr): 53 | def __init__(self, cfg): 54 | super().__init__(cfg=cfg) 55 | self.model = return_loaded_model(AudioEncoder, 'external/coala/saved_models/dual_ae_c/audio_encoder_epoch_200.pt') 56 | 57 | def encode_frames(self, batch_audio): 58 | xs = [_extract_audio_embedding_chunks(self.model, x)[0] for x in batch_audio] 59 | x = torch.stack(xs).transpose(1, 2) # [Frame, D] x B -> [B, Frame, D] -> [B, D, Frame (T)] 60 | return x 61 | 62 | def forward(self, batch_audio): 63 | x = self.encode_frames(batch_audio) 64 | x = temporal_pooling(self, x) 65 | return x 66 | -------------------------------------------------------------------------------- /evar/ar_ast.py: -------------------------------------------------------------------------------- 1 | """Wrapper code for: 2 | 3 | AST: Audio Spectrogram Transformer 4 | 5 | ## Reference 6 | - [1] https://arxiv.org/abs/2104.01778 Y. Gong, Y.-A. Chung, and J. Glass, “Ast: Audio spectrogram transformer,” arXiv preprint arXiv:2104.01778, 2021. 7 | - [2] https://github.com/YuanGongND/ast 8 | """ 9 | 10 | from evar.ar_base import (BaseAudioRepr, calculate_norm_stats) 11 | import torch 12 | import torchaudio 13 | 14 | try: 15 | from external.ast.src.models import ASTModel 16 | except Exception as e: 17 | pass # print(f'(For AST users) Make your copy of AST under external folder. Check Preparing-models.md for the details.') 18 | 19 | 20 | class AST_Feature(torch.nn.Module): 21 | def __init__(self, cfg): 22 | super().__init__() 23 | self.cfg = cfg 24 | 25 | def forward(self, waveforms): 26 | def get_one(waveform): 27 | waveform = waveform - waveform.mean() 28 | fbank = torchaudio.compliance.kaldi.fbank(waveform, htk_compat=True, 29 | sample_frequency=self.cfg.sample_rate, use_energy=False, 30 | window_type=self.cfg.window, num_mel_bins=self.cfg.n_mels, 31 | dither=0.0, frame_shift=10) 32 | return fbank 33 | device = waveforms.device 34 | if len(waveforms.shape) == 1: # [L] -> [1, L] 35 | waveforms = waveforms.unsqueeze(0) 36 | fbanks = torch.stack([get_one(w.unsqueeze(0)) for w in waveforms]) 37 | return fbanks.to(device) 38 | 39 | 40 | class AR_AST(BaseAudioRepr): 41 | def __init__(self, cfg): 42 | super().__init__(cfg=cfg) 43 | self.to_feature = AST_Feature(cfg) 44 | tdim = self.to_feature(torch.rand(1, cfg.unit_samples)).shape[1] 45 | self.backbone = ASTModel(label_dim=10, input_tdim=tdim, imagenet_pretrain=True, 46 | audioset_pretrain=True, pretrained_weight=cfg.weight_file) 47 | 48 | def precompute(self, device, data_loader): 49 | self.norm_stats = calculate_norm_stats(device, data_loader, self.to_feature) 50 | 51 | def encode_frames(self, batch_audio): 52 | # AST returns a single embeddings for one audio, then simply add time axis. 
53 | return self.forward(batch_audio).unsqueeze(-1) # B,D -> B,D,1 54 | 55 | def forward(self, batch_audio): 56 | x = self.to_feature(batch_audio) 57 | x = self.normalize_spectrogram(x) 58 | x = self.augment_if_training(x) 59 | x = self.backbone(x) 60 | return x 61 | 62 | def normalize_spectrogram(self, spectrograms): 63 | mu, sigma = self.norm_stats 64 | spectrograms = (spectrograms - mu) / (sigma * 2) # follows the original AudiosetDataset 65 | return spectrograms 66 | -------------------------------------------------------------------------------- /Evaluation-examples.md: -------------------------------------------------------------------------------- 1 | # Example command lines for evaluating models 2 | 3 | ## AST 4 | 5 | python 2pass_blackbox.py config/ast.yaml fsd50k 6 | python 2pass_blackbox.py config/ast.yaml spcv2 7 | python 2pass_blackbox.py config/ast.yaml us8k 8 | python 2pass_blackbox.py config/ast.yaml surge --lr=0.0001 9 | python 2pass_blackbox.py config/ast.yaml nsynth 10 | python 2pass_blackbox.py config/ast.yaml nspitch 11 | python 2pass_blackbox.py config/ast.yaml vc1 12 | python 2pass_blackbox.py config/ast.yaml cremad 13 | python 2pass_blackbox.py config/ast.yaml voxforge 14 | python 2pass_blackbox.py config/ast.yaml esc50 15 | python 2pass_blackbox.py config/ast.yaml gtzan batch_size=12 16 | 17 | ## BYOL-A 18 | 19 | python 2pass_blackbox.py config/byola.yaml fsd50k 20 | python 2pass_blackbox.py config/byola.yaml spcv2 21 | python 2pass_blackbox.py config/byola.yaml us8k 22 | python 2pass_blackbox.py config/byola.yaml surge --lr=0.0001 23 | python 2pass_blackbox.py config/byola.yaml nsynth 24 | python 2pass_blackbox.py config/byola.yaml nspitch 25 | python 2pass_blackbox.py config/byola.yaml vc1 26 | python 2pass_blackbox.py config/byola.yaml cremad 27 | python 2pass_blackbox.py config/byola.yaml voxforge 28 | python 2pass_blackbox.py config/byola.yaml esc50 29 | python 2pass_blackbox.py config/byola.yaml gtzan batch_size=64 --lr=0.001 30 | 31 | ## PANNs' CNN14 32 | 33 | python 2pass_blackbox.py config/cnn14.yaml cremad 34 | python 2pass_blackbox.py config/cnn14.yaml voxforge 35 | python 2pass_blackbox.py config/cnn14.yaml esc50 36 | python 2pass_blackbox.py config/cnn14.yaml gtzan 37 | python 2pass_blackbox.py config/cnn14.yaml fsd50k 38 | python 2pass_blackbox.py config/cnn14.yaml nsynth --lr=0.00001 39 | python 2pass_blackbox.py config/cnn14.yaml nspitch 40 | python 2pass_blackbox.py config/cnn14.yaml surge 41 | python 2pass_blackbox.py config/cnn14.yaml vc1 42 | python 2pass_blackbox.py config/cnn14.yaml spcv2 43 | python 2pass_blackbox.py config/cnn14.yaml us8k 44 | 45 | ## VGGish 46 | 47 | python 2pass_blackbox.py config/vggish.yaml fsd50k 48 | python 2pass_blackbox.py config/vggish.yaml nsynth 49 | python 2pass_blackbox.py config/vggish.yaml nspitch 50 | python 2pass_blackbox.py config/vggish.yaml surge 51 | python 2pass_blackbox.py config/vggish.yaml vc1 --lr=0.0005 52 | python 2pass_blackbox.py config/vggish.yaml spcv2 53 | python 2pass_blackbox.py config/vggish.yaml us8k 54 | python 2pass_blackbox.py config/vggish.yaml cremad 55 | python 2pass_blackbox.py config/vggish.yaml voxforge 56 | python 2pass_blackbox.py config/vggish.yaml esc50 --lr=0.003 57 | python 2pass_blackbox.py config/vggish.yaml gtzan batch_size=128 58 | 59 | -------------------------------------------------------------------------------- /summarize.py: -------------------------------------------------------------------------------- 1 | """Summarize results for a model. 
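Usage:
    python summarize.py <weight_file>

The weight_file string is matched against the `report` column of results/scores.csv
(and the `weight` column of results/retrieval_scores.csv, if present) to collect the
linear-evaluation and retrieval results of the corresponding model.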
2 | """ 3 | 4 | from evar.common import (np, pd, Path, RESULT_DIR) 5 | import fire 6 | 7 | 8 | def get_weight(weight_file): 9 | weight_file = Path(weight_file) 10 | weight = weight_file.parent.name + '/' + weight_file.stem 11 | return weight 12 | 13 | 14 | def available_tasks(df): 15 | ALL_TASKS = ['esc50', 'us8k', 'spcv2', 'vc1', 'voxforge', 'cremad', 'gtzan', 'nsynth', 'surge', 'fsd50k'] \ 16 | + ['zs_esc50', 'zs_us8k', 'zs_spcv2', 'zs_vc1', 'zs_voxforge', 'zs_cremad', 'zs_gtzan', 'zs_nsynth', 'zs_surge', 'zs_fsd50k', 'zs_as'] 17 | tasks = [t for t in ALL_TASKS if t in list(df.columns)] 18 | return tasks 19 | 20 | 21 | def summarize(weight_file, post=True): 22 | # Summarize LE 23 | df = pd.read_csv(f'{RESULT_DIR}/scores.csv') 24 | df = df[df.report.str.contains(weight_file, na=False, regex=False)] 25 | df['weight'] = get_weight(weight_file) 26 | src_df = df.copy() 27 | 28 | df = pd.pivot_table(df, index=['weight'], columns=['task'], values=['score'], aggfunc=np.mean) 29 | df.columns = df.columns.get_level_values(1) 30 | df = df[available_tasks(df)] 31 | if len(df) == 0: 32 | print(f'No data for {weight_file}.') 33 | return 34 | df['average'] = df.mean(1) 35 | 36 | # Summarize ATR 37 | if Path(f'{RESULT_DIR}/retrieval_scores.csv').exists(): 38 | d = pd.read_csv(f'{RESULT_DIR}/retrieval_scores.csv') 39 | d = d[d.weight.str.contains(weight_file, na=False, regex=False)] 40 | if len(d) > 0: 41 | d = d.set_index('model') 42 | d['weight'] = get_weight(weight_file) 43 | d.columns = ['task', 'a2tR1', 'a2tR5', 'a2tR10', 'a2tmAP10', 't2aR1', 't2aR5', 't2aR10', 't2amAP10', 'weight'] 44 | new_d = None 45 | for t, shortname in [('audiocaps', 'A'), ('clotho', 'C')]: 46 | d_ = d[d.task == t][['a2tR1', 'a2tR5', 'a2tR10', 't2aR1', 't2aR5', 't2aR10']] 47 | d_.columns = [shortname + c for c in list(d_.columns)] 48 | d_.index = ['same_index'] 49 | new_d = d_ if new_d is None else pd.concat([new_d, d_], axis=1) 50 | new_d['weight'] = get_weight(weight_file) 51 | new_d = new_d.set_index('weight') * 0.01 52 | df = pd.concat([df, new_d], axis=1) 53 | 54 | # Report 55 | report = df.applymap(lambda x: f'{x*100:.2f}%' if str(x).isnumeric else x).to_markdown() 56 | print(report) 57 | 58 | # Save source results to a csv 59 | report_csv = RESULT_DIR + '/' + str(df.index[0]).replace('/', '_') + '.csv' 60 | src_df.report = src_df.report.str.replace('\n', ' ') 61 | src_df.to_csv(report_csv, index=None) 62 | 63 | 64 | if __name__ == '__main__': 65 | fire.Fire(summarize) 66 | -------------------------------------------------------------------------------- /evar/ds_tasks.py: -------------------------------------------------------------------------------- 1 | """Downstream task definitions.""" 2 | 3 | from evar.common import (os, Path, WORK, METADATA_DIR) 4 | 5 | 6 | _defs = { 7 | # folds, unit_sec, data_folder (None if task name is the folder name), balanced training when fine-tining 8 | 'us8k': [10, 4.0, None, False], 9 | 'esc50': [5, 5.0, None, False], 10 | 'fsd50k': [1, 7.6358, None, False], ## Changed to NOT balanced: to make it the same as PaSST. 
11 | 'fsdnoisy18k': [1, 8.25, None, False], 12 | 'gtzan': [1, 30.0, None, False], 13 | 'nsynth': [1, 4.0, None, False], 14 | 'cremad': [1, 2.5, None, False], 15 | 'spcv1': [1, 1.0, None, False], 16 | 'spcv2': [1, 1.0, None, False], 17 | 'surge': [1, 4.0, None, False], 18 | 'vc1': [1, 8.2, None, False], 19 | 'vocalsound': [1, 4.18, None, False], 20 | 'voxforge': [1, 5.8, None, False], 21 | 'as20k': [1, 10.0, 'as', False], 22 | 'as': [1, 10.0, 'as', True], 23 | 'audiocaps': [1, 10.0, None, False], 24 | 'ja_audiocaps': [1, 10.0, 'audiocaps', False], 25 | 'clotho': [1, 30.0, None, False], 26 | 'circor1': [1, 5.0, None, False], 27 | 'circor2': [1, 5.0, None, False], 28 | 'circor3': [1, 5.0, None, False], 29 | 'bmdhs1': [1, 20.0, 'bmdhs', False], 30 | 'bmdhs2': [1, 20.0, 'bmdhs', False], 31 | 'bmdhs3': [1, 20.0, 'bmdhs', False], 32 | 'xacle': [1, 10.0, None, False], 33 | 'xacle_test': [1, 10.0, 'xacle', False], 34 | } 35 | 36 | _fs_table = { 37 | 16000: '16k', 38 | 22000: '22k', # Following COALA that uses 22,000 Hz 39 | 32000: '32k', 40 | 44100: '44k', 41 | 48000: '48k', 42 | } 43 | 44 | def get_original_folder(task, folder): 45 | orgs = { 46 | 'us8k': 'UrbanSound8K', 47 | 'esc50': 'ESC-50-master', 48 | 'as20k': 'AudioSet', 49 | 'as': 'AudioSet', 50 | 'vocalsound': 'vocalsound_44k/data_44k', 51 | } 52 | return orgs[task] if task in orgs else folder 53 | 54 | 55 | def get_defs(cfg, task, original_data=False): 56 | """Get task definition parameters. 57 | 58 | Returns: 59 | pathname (str): Metadata .csv file path. 60 | wav_folder (str): "work/16k/us8k" for example. 61 | folds (int): Number of LOOCV folds or 1. 1 means no cross validation. 62 | unit_sec (float): Unit duration in seconds. 63 | weighted (bool): True if the training requires a weighted loss calculation. 64 | balanced (bool): True if the training requires a class-balanced sampling. 65 | """ 66 | folds, unit_sec, folder, balanced = _defs[task] 67 | folder = folder or task 68 | evar_path = Path(os.environ.get('EVAR', '.')) 69 | workfolder = f'{WORK}/original/{get_original_folder(task, folder)}' if original_data else f'{WORK}/{_fs_table[cfg.sample_rate]}/{folder}' 70 | return str(evar_path/f'{METADATA_DIR}/{task}.csv'), str(evar_path/workfolder), folds, unit_sec, balanced 71 | -------------------------------------------------------------------------------- /evar/ar_byola.py: -------------------------------------------------------------------------------- 1 | """Wrapper code for: 2 | 3 | BYOL for Audio: Self-Supervised Learning for General-Purpose Audio Representation 4 | 5 | ## Reference 6 | - [1] https://arxiv.org/abs/2103.06695 7 | - [2] https://github.com/nttcslab/byol-a 8 | """ 9 | 10 | from evar.ar_base import (BaseAudioRepr, ToLogMelSpec, calculate_norm_stats, normalize_spectrogram, temporal_pooling) 11 | from evar.model_utils import load_pretrained_weights 12 | import logging 13 | try: 14 | from external.byol_a.byol_a.models import AudioNTT2020Task6, AudioNTT2020Task6X 15 | except Exception as e: 16 | pass # logging.info(f'Make your copy of BYOL-A under external folder. 
Check Preparing-models.md for the details.') 17 | 18 | 19 | class AR_BYOLA(BaseAudioRepr): 20 | def __init__(self, cfg): 21 | super().__init__(cfg=cfg) 22 | self.to_feature = ToLogMelSpec(cfg) 23 | 24 | self.body = AudioNTT2020Task6(n_mels=cfg.n_mels, d=cfg.feature_d) 25 | if cfg.weight_file is not None: 26 | load_pretrained_weights(self.body, cfg.weight_file, model_key='body') 27 | 28 | def precompute(self, device, data_loader): 29 | self.norm_stats = calculate_norm_stats(device, data_loader, self.to_feature) 30 | 31 | def encode_frames(self, batch_audio): 32 | x = self.to_feature(batch_audio) 33 | x = normalize_spectrogram(self.norm_stats, x) # B,F,T 34 | x = self.augment_if_training(x) 35 | x = x.unsqueeze(1) # -> B,1,F,T 36 | x = self.body(x) # -> B,T,D=C*F 37 | x = x.transpose(1, 2) # -> B,D,T 38 | return x 39 | 40 | def forward(self, batch_audio): 41 | x = self.encode_frames(batch_audio) 42 | x = temporal_pooling(self, x) 43 | return x 44 | 45 | 46 | class AR_BYOLAX(BaseAudioRepr): 47 | """A BYOL-A variant extended to stack features from all the layers.""" 48 | def __init__(self, cfg): 49 | super().__init__(cfg=cfg) 50 | self.to_feature = ToLogMelSpec(cfg) 51 | 52 | self.body = AudioNTT2020Task6X(n_mels=cfg.n_mels, d=cfg.feature_d) 53 | if cfg.weight_file is not None: 54 | self.body.load_weight(cfg.weight_file, device='cpu') 55 | self.cfg.feature_d = self.cfg.feature_d * self.body.n_feature_layer 56 | 57 | def precompute(self, device, data_loader): 58 | self.norm_stats = calculate_norm_stats(device, data_loader, self.to_feature) 59 | 60 | def encode_frames(self, batch_audio): 61 | x = self.to_feature(batch_audio) 62 | x = normalize_spectrogram(self.norm_stats, x) # B,F,T 63 | x = self.augment_if_training(x) 64 | x = x.unsqueeze(1) # -> B,1,F,T 65 | x = self.body(x, layered=True) # -> B,T,D=C*F*Layer 66 | x = x.transpose(1, 2) # -> B,D,T 67 | return x 68 | 69 | def forward(self, batch_audio): 70 | x = self.encode_frames(batch_audio) 71 | x = temporal_pooling(self, x) 72 | return x 73 | 74 | -------------------------------------------------------------------------------- /plugin/MARBLE/REAEDME_MARBLE.md: -------------------------------------------------------------------------------- 1 | ## MARBLE Benchmark Integration 2 | 3 | This repository provides the code to integrate EVAR as a plugin model into the [MARBLE](https://github.com/a43992899/MARBLE) benchmark for music tasks, enabling MARBLE to evaluate pre-trained audio representation models supported by EVAR. Based on this MARBLE extension, it also provides scripts and instructions to reproduce the results from the M2D2 paper. 4 | 5 | NOTE: We support MARBLE v1 for now. 6 | 7 | ## How to Integrate with the MARBLE Benchmark 8 | 9 | Follow the steps below to integrate EVAR into your MARBLE directory cloned from GitHub. 10 | 11 | *NOTE*: In addition to the integration steps, set the environment variable `EVAR` to point to the local EVAR folder so that MARBLE can reference it. 12 | 13 | ```sh 14 | export EVAR=/lab/eval-audio-repr 15 | 16 | git clone https://github.com/a43992899/MARBLE-Benchmark.git 17 | cd MARBLE-Benchmark 18 | git checkout d9300e335eefdad8d6b825418e8c44b22d0919c7 19 | 20 | patch -p1 < $EVAR/plugin/MARBLE/evar_marble_diff.patch 21 | cp -r $EVAR/plugin/MARBLE/benchmark/models/evar benchmark/models 22 | cp -r $EVAR/plugin/MARBLE/configs/evar configs 23 | cp $EVAR/plugin/MARBLE/evar_marble.sh . 24 | ``` 25 | 26 | For the task datasets for MARBLE, follow the instructions provided by the MARBLE. 
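After the steps above, a quick sanity check can confirm that the copied plugin files are in place and that `EVAR` resolves. The following is a minimal sketch (a hypothetical check, run from the MARBLE-Benchmark root):

```python
import os
from pathlib import Path

assert os.environ.get('EVAR'), 'Set EVAR to your local eval-audio-repr folder.'
for p in ['benchmark/models/evar/extract_evar_features.py',  # copied plugin extractor
          'configs/evar',                                     # copied task configs
          'evar_marble.sh']:                                  # copied driver script
    print(p, 'OK' if Path(p).exists() else 'MISSING')
```

If anything prints `MISSING`, re-run the corresponding copy command above.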
27 | 28 | ## Evaluating models on MARBLE 29 | 30 | Once you prepare EVAR on MARBLE, you can use the script `evar_marble.sh` to evaluate models. The following is an example of M2D. 31 | 32 | ```sh 33 | bash evar_marble.sh m2d /your/m2d_vit_base-80x608p16x16-221006-mr7/checkpoint-300.pth 7 5 feat-m2d-mr7 3840 34 | ``` 35 | 36 | The results will be stored in CSV files, such as `score_EMO.csv`. 37 | 38 | ### More example command lines 39 | 40 | ```sh 41 | EVAR=/your/evar bash evar_marble.sh beats_plus /your/BEATs_iter3_plus_AS2M.pt 7 5 42 | EVAR=/your/evar bash evar_marble.sh atst_frame /your/atstframe_base.ckpt 7 5 43 | EVAR=/your/evar bash evar_marble.sh msclap 2023 7 5 44 | EVAR=/your/evar bash evar_marble.sh m2d /your/m2d_vit_base-80x608p16x16-221006-mr7/checkpoint-300.pth 7 5 feat-m2d-mr7 45 | EVAR=/your/evar bash evar_marble.sh m2d /your/m2d_vit_base-80x1001p16x16-221006-mr7_as_46ab246d/weights_ep67it3124-0.47941.pth 7 5 feat-m2d-mr7-as 46 | EVAR=/your/evar bash evar_marble.sh m2d /your/clap/m2d_clap_vit_base-80x608p16x16-240128/checkpoint-300.pth 7 5 feat-m2d-clap 47 | EVAR=/your/evar bash evar_marble.sh m2d /your/msm_mae_vit_base-80x608p16x16-220924-mr75/checkpoint-300.pth 7 5 feat-msm-mae 48 | ``` 49 | 50 | ## Referecnces 51 | 52 | - MARBLE: *[R. Yuan, Y. Ma, Y. Li, G. Zhang, X. Chen, H. Yin, z. le, Y. Liu, J. Huang, Z. Tian, B. Deng, N. Wang, C. Lin, E. Benetos, A. Ragni, N. Gyenge, R. Dannenberg, W. Chen, G. Xia, W. Xue, S. Liu, S. Wang, R. Liu, Y. Guo, and J. Fu, “MARBLE: Music audio representation benchmark for universal evaluation,” in NeurIPS, vol. 36, 2023, pp. 39 626–39 647.](https://proceedings.neurips.cc/paper_files/paper/2023/hash/7cbeec46f979618beafb4f46d8f39f36-Abstract-Datasets_and_Benchmarks.html).* 👉 [GitHub](https://github.com/a43992899/MARBLE/tree/main-v1-archived). 53 | 54 | -------------------------------------------------------------------------------- /evar/ar_wav2vec2.py: -------------------------------------------------------------------------------- 1 | """Wrapper code for: 2 | 3 | wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations 4 | 5 | ## Reference 6 | - [1] https://ai.facebook.com/blog/wav2vec-20-learning-the-structure-of-speech-from-raw-audio/ 7 | - [2] https://huggingface.co/facebook/wav2vec2-large-960h-lv60 8 | """ 9 | 10 | from evar.ar_base import BaseAudioRepr, temporal_pooling 11 | import torch 12 | import logging 13 | try: 14 | from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC 15 | except: 16 | logging.error('Install transformers.\n>>> pip install transformers') 17 | 18 | 19 | class AR_Wav2Vec2Logit(BaseAudioRepr): 20 | """Wav2Vec2.0 logits from LM output. 
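    Uses the per-frame CTC vocabulary logits returned by Wav2Vec2ForCTC as
    frame-level features (see encode_frames below).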
21 | https://huggingface.co/facebook/wav2vec2-large-960h-lv60 22 | """ 23 | def __init__(self, cfg): 24 | super().__init__(cfg=cfg) 25 | 26 | self.processor = Wav2Vec2Processor.from_pretrained(cfg.wav2vec_model) 27 | self.backbone = Wav2Vec2ForCTC.from_pretrained(cfg.wav2vec_model) 28 | 29 | def encode_frames(self, batch_audio): 30 | device = batch_audio.device 31 | preprocessed = self.processor(batch_audio.cpu().numpy(), return_tensors="pt", sampling_rate=16000).input_values 32 | preprocessed = preprocessed[0] if preprocessed.shape[0] == 1 else preprocessed # [1, B, raw wave length] -> [B, raw wave length] 33 | preprocessed = preprocessed.to(device) 34 | logits = self.backbone(preprocessed).logits # [B, T, D] 35 | return logits.transpose(1, 2) # [B, D, T] 36 | 37 | def forward(self, batch_audio): 38 | return temporal_pooling(self, self.encode_frames(batch_audio)) 39 | 40 | 41 | class AR_Wav2Vec2Context(AR_Wav2Vec2Logit): 42 | """Wav2Vec2.0 context network. 43 | https://github.com/huggingface/transformers/blob/master/src/transformers/models/wav2vec2/modeling_wav2vec2.py#L1529 44 | """ 45 | 46 | def encode_frames(self, batch_audio): 47 | device = batch_audio.device 48 | preprocessed = self.processor(batch_audio.cpu().numpy(), return_tensors="pt", sampling_rate=16000).input_values 49 | preprocessed = preprocessed[0] if preprocessed.shape[0] == 1 else preprocessed # [1, B, raw wave length] -> [B, raw wave length] 50 | preprocessed = preprocessed.to(device) 51 | features = self.backbone.wav2vec2(preprocessed, output_hidden_states=True).hidden_states # [B, T, D] 52 | hidden_states = self.backbone(preprocessed, output_hidden_states=True).hidden_states # [B, T, D] 53 | # stack layer outputs 54 | states_to_stack = [hidden_states[index] for index in self.cfg.output_layers] if self.cfg.output_layers else hidden_states 55 | features = torch.cat(states_to_stack, axis=-1) 56 | return features.transpose(1, 2) # [B, D, T] 57 | 58 | 59 | class AR_Wav2Vec2Feature(AR_Wav2Vec2Logit): 60 | """Wav2Vec2.0 feature encoder.""" 61 | 62 | def encode_frames(self, batch_audio): 63 | device = batch_audio.device 64 | preprocessed = self.processor(batch_audio.cpu().numpy(), return_tensors="pt", sampling_rate=16000).input_values 65 | preprocessed = preprocessed[0] if preprocessed.shape[0] == 1 else preprocessed # [1, B, raw wave length] -> [B, raw wave length] 66 | preprocessed = preprocessed.to(device) 67 | features = self.backbone.wav2vec2.feature_extractor(preprocessed) # [B, D, T] 68 | features = features.transpose(1, 2) # -> [B, T, D] 69 | return features.transpose(1, 2) # [B, D, T] 70 | -------------------------------------------------------------------------------- /evar/common.py: -------------------------------------------------------------------------------- 1 | """Common imports/constants/small functions.""" 2 | 3 | from evar.utils import * 4 | import shutil 5 | import re 6 | from torch import nn 7 | import torch.nn.functional as F 8 | import torchaudio 9 | import torchaudio.functional as AF 10 | import torchaudio.transforms as AT 11 | from torch.utils.data import DataLoader, Dataset 12 | 13 | 14 | # Folders 15 | WORK = 'work' 16 | METADATA_DIR = 'evar/metadata' 17 | RESULT_DIR = 'results' 18 | LOG_DIR = 'logs' 19 | 20 | 21 | def eval_if_possible(text): 22 | for pat in [r'\[.*\]', r'\(.*\)']: 23 | if re.search(pat, text): 24 | return eval(text) 25 | if re_valuable.match(text): 26 | return eval(text) 27 | return text 28 | 29 | 30 | def split_camma(text): 31 | flag = None 32 | elements = [] 33 | cur = [] 34 | 
for c in text: 35 | if flag is not None: 36 | cur.append(c) 37 | if flag == '[' and c == ']': flag = None 38 | if flag == '(' and c == ')': flag = None 39 | if flag == '"' and c == '"': flag = None 40 | if flag == "'" and c == "'": flag = None 41 | continue 42 | if c in ['[', '(', '"', "'"]: 43 | cur.append(c) 44 | flag = c 45 | continue 46 | if c == ',': 47 | elements.append(''.join(cur)) 48 | cur = [] 49 | else: 50 | cur.append(c) 51 | if cur: 52 | elements.append(''.join(cur)) 53 | return elements 54 | 55 | 56 | # App level utilities 57 | def complete_cfg(cfg, options, no_id=False): 58 | # Override parameter values with given "options". 59 | if 'name' not in cfg or not isinstance(cfg['name'], str): 60 | cfg['name'] = '' 61 | print(options) 62 | for item in split_camma(options): 63 | if item == '': continue 64 | keyvalues = item.split('=') 65 | assert len(keyvalues) == 2, f'An option need one and only one "=" in the option {item} in {options}.' 66 | key, value = keyvalues 67 | value = eval_if_possible(value) 68 | if key[0] == '+': 69 | key = key[1:] 70 | cfg[key] = None 71 | if key not in cfg.keys(): 72 | raise Exception(f'Cannot find a setting named: {key} of the option {item}') 73 | cfg[key] = value 74 | # Set ID. 75 | if not no_id: 76 | task = Path(cfg.task_metadata).stem if 'task_metadata' in cfg else '' 77 | if 'name' in cfg and len(cfg['name']) > 0: 78 | name = cfg.name 79 | elif 'weight_file' in cfg and len(str(cfg['weight_file'])) > 0: 80 | weight_path = Path(str(cfg['weight_file'])) 81 | parent = weight_path.parent.name.replace('.', '_') if len(weight_path.parent.name) > 0 else str(cfg.audio_repr.split(',')[-1]) 82 | name = f'{parent}-{weight_path.stem}' 83 | else: 84 | name = str(cfg.audio_repr.split(',')[-1]) 85 | cfg.id = name + '_' + task + '_' + hash_text(str(cfg), L=8) 86 | return cfg 87 | 88 | 89 | def kwarg_cfg(**kwargs): 90 | cfg = EasyDict(kwargs) 91 | cfg.id = hash_text(str(cfg), L=8) 92 | return cfg 93 | 94 | 95 | def app_setup_logger(cfg, level=logging.INFO): 96 | logpath = Path(LOG_DIR)/cfg.id 97 | logpath.mkdir(parents=True, exist_ok=True) 98 | setup_logger(filename=logpath/'log.txt', level=level) 99 | print('Logging to', logpath/'log.txt') 100 | logging.info(str(cfg)) 101 | return logpath 102 | 103 | 104 | def setup_dir(dirs=[]): 105 | for d in dirs: 106 | Path(d).mkdir(parents=True, exist_ok=True) 107 | -------------------------------------------------------------------------------- /evar/utils/__init__.py: -------------------------------------------------------------------------------- 1 | """Utilities for EVAR 2 | """ 3 | 4 | import os 5 | import sys 6 | from itertools import chain 7 | import subprocess 8 | import re 9 | import logging 10 | from easydict import EasyDict 11 | from pathlib import Path 12 | import pandas as pd 13 | import yaml 14 | import numpy as np 15 | import random 16 | import datetime 17 | import hashlib 18 | import torch 19 | try: 20 | import pickle5 as pickle 21 | except: 22 | import pickle 23 | 24 | 25 | # Regular expression to check string can be converted into variables 26 | # Thanks to -- https://stackoverflow.com/a/385597/6528729 27 | re_valuable = re.compile("""(?x) 28 | ^ 29 | ( # int|float|double 30 | [+-]?\ * # first, match an optional sign *and space* 31 | ( # then match integers or f.p. mantissas: 32 | \d+ # start out with a ... 33 | ( 34 | \.\d* # mantissa of the form a.b or a. 35 | )? # ? takes care of integers of the form a 36 | |\.\d+ # mantissa of the form .b 37 | ) 38 | ([eE][+-]?\d+)? 
# finally, optionally match an exponent 39 | ) 40 | |( # bool 41 | False|True 42 | ) 43 | $""") 44 | 45 | 46 | def run_command(cmd_line): 47 | print('>>>', ' '.join(cmd_line)) 48 | def runner(): 49 | proc = subprocess.Popen(cmd_line, bufsize=0, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) 50 | 51 | while True: 52 | line = proc.stdout.readline() 53 | if line: 54 | yield line 55 | 56 | if not line and proc.poll() is not None: 57 | break 58 | 59 | for line in runner(): 60 | sys.stdout.write(line.decode()) 61 | 62 | 63 | def seed_everything(seed=42): 64 | random.seed(seed) 65 | os.environ['PYTHONHASHSEED'] = str(seed) 66 | np.random.seed(seed) 67 | torch.manual_seed(seed) 68 | torch.backends.cudnn.deterministic = True 69 | torch.backends.cudnn.benchmark = False 70 | 71 | 72 | def get_timestamp(): 73 | """ex) Outputs 202104220830""" 74 | return datetime.datetime.now().strftime('%y%m%d%H%M') 75 | 76 | 77 | def load_yaml_config(path_to_config): 78 | """Loads yaml configuration settings as an EasyDict object.""" 79 | path_to_config = Path(path_to_config) 80 | assert path_to_config.is_file(), f'{path_to_config} not found, cwd={Path(".").resolve()}' 81 | with open(path_to_config) as f: 82 | yaml_contents = yaml.safe_load(f) 83 | cfg = EasyDict(yaml_contents) 84 | return cfg 85 | 86 | 87 | def hash_text(text, L=128): 88 | hashed = hashlib.shake_128(text.encode()).hexdigest(L//2 + 1) 89 | return hashed[:L] 90 | 91 | 92 | def setup_logger(name='', filename=None, level=logging.INFO): 93 | # Thanks to https://stackoverflow.com/a/53553516/6528729 94 | from imp import reload 95 | reload(logging) 96 | 97 | logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s', 98 | datefmt='%Y-%m-%d %H:%M:%S', level=level, filename=filename) 99 | logger = logging.getLogger(name) 100 | console = logging.StreamHandler() 101 | console.setLevel(level) 102 | logger.addHandler(console) 103 | 104 | 105 | def flatten_list(lists): 106 | return list(chain.from_iterable(lists)) 107 | 108 | 109 | def append_to_csv(csv_filename, data): 110 | filename = Path(csv_filename) 111 | filename.parent.mkdir(parents=True, exist_ok=True) 112 | df = pd.read_csv(filename) if filename.exists() else pd.DataFrame() 113 | df = pd.concat([df, data], ignore_index=True).to_csv(filename, index=False) 114 | -------------------------------------------------------------------------------- /app/circor/rearrange_data.py: -------------------------------------------------------------------------------- 1 | """CirCor evaluation utility. 2 | 3 | This program stratified-splits physionet.org/files/circor-heart-sound/1.0.3/training_data to heart-murmur-detection/data. 4 | Then, it copies stratified data (under heart-murmur-detection/data) to evar working folder (under evar/work/16k). 5 | It also creates metadata files as ../../evar/metadata/circor[1-3].csv. 
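Run it from the app/circor folder so that the relative output paths above resolve
correctly; the PhysioNet recordings (physionet.org/files/circor-heart-sound/1.0.3/)
and the stratified split lists (datalist_stratified_data[1-3].csv) are expected in
the current working directory.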
6 | """ 7 | 8 | import pandas as pd 9 | import numpy as np 10 | from pathlib import Path 11 | import shutil 12 | import librosa 13 | import torch 14 | import torchaudio 15 | 16 | 17 | ## Copy raw data under physionet.org/files/circor-heart-sound/1.0.3/training_data to heart-murmur-detection/data 18 | split_csvs = ['./datalist_stratified_data1.csv', './datalist_stratified_data2.csv', './datalist_stratified_data3.csv'] 19 | df = pd.concat([pd.read_csv(f) for f in split_csvs], ignore_index=True) 20 | 21 | dest = Path('heart-murmur-detection/data') 22 | for f in df.dest_file.values: 23 | f = Path(f) 24 | f.parent.mkdir(exist_ok=True, parents=True) 25 | from_file = Path('physionet.org/files/circor-heart-sound/1.0.3/training_data')/f.name 26 | #print('Copy', from_file, 'to', f) 27 | shutil.copy(from_file, f) 28 | 29 | 30 | ## Copy stratified data (under heart-murmur-detection/data) to evar working folder (evar/work/16k) 31 | dfs = [] 32 | 33 | for split_no in [1, 2, 3]: 34 | trn = sorted(Path(f'heart-murmur-detection/data/stratified_data{split_no}/train_data/').glob('*.wav')) 35 | val = sorted(Path(f'heart-murmur-detection/data/stratified_data{split_no}/vali_data/').glob('*.wav')) 36 | tst = sorted(Path(f'heart-murmur-detection/data/stratified_data{split_no}/test_data/').glob('*.wav')) 37 | #Tr, V, Te = len(trn), len(val), len(tst) 38 | 39 | itrn = sorted(list(set([int(f.stem.split('_')[0]) for f in trn]))) 40 | ival = sorted(list(set([int(f.stem.split('_')[0]) for f in val]))) 41 | itst = sorted(list(set([int(f.stem.split('_')[0]) for f in tst]))) 42 | Tr, V, Te = len(itrn), len(ival), len(itst) 43 | N = Tr + V + Te 44 | print(f'Split #{split_no} has samples: Training:{Tr}({Tr/N*100:.2f}%), Val:{V}({V/N*100:.2f}%), Test:{Te}({Te/N*100:.2f}%)') 45 | print(' Training sample IDs are:', itrn[:3], '...') 46 | 47 | df = pd.read_csv('physionet.org/files/circor-heart-sound/1.0.3/training_data.csv') 48 | 49 | def get_split(pid): 50 | if pid in itrn: return 'train' 51 | if pid in ival: return 'valid' 52 | if pid in itst: return 'test' 53 | assert False, f'Patient ID {pid} Unknown' 54 | df['split'] = df['Patient ID'].apply(get_split) 55 | 56 | 57 | SR = 16000 58 | L = int(SR * 5.0) 59 | STEP = int(SR * 2.5) 60 | 61 | ROOT = Path('physionet.org/files/circor-heart-sound/1.0.3/training_data/') 62 | TO_FOLDER = Path(f'../../work/16k/circor{split_no}') 63 | 64 | evardf = pd.DataFrame() 65 | 66 | for i, r in df.iterrows(): 67 | pid, recloc, split, label = str(r['Patient ID']), r['Recording locations:'], r.split, r.Murmur 68 | # Not using recloc. Search real recordings... 
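        # i.e., the 'Recording locations:' metadata column is ignored; the location codes
        # are recovered from the {pid}_*.wav files that actually exist under training_data.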
69 | recloc = [f.stem.replace(pid+'_', '') for f in sorted(ROOT.glob(f'{pid}_*.wav'))] 70 | #print(pid, recloc, split, label) 71 | for rl in recloc: 72 | wav, sr = librosa.load(f'{ROOT}/{pid}_{rl}.wav', sr=SR) 73 | for widx, pos in enumerate(range(0, len(wav) - STEP + 1, STEP)): 74 | w = wav[pos:pos+L] 75 | org_len = len(w) 76 | if org_len < L: 77 | w = np.pad(w, (0, L - org_len)) 78 | assert len(w) == L 79 | to_name = TO_FOLDER/split/f'{pid}_{rl}_{widx}.wav' 80 | to_rel_name = to_name.relative_to(TO_FOLDER) 81 | #print(pid, rl, len(wav)/SR, to_name, to_rel_name, org_len, len(w), pos) 82 | evardf.loc[to_name.stem, 'file_name'] = to_rel_name 83 | evardf.loc[to_name.stem, 'label'] = label 84 | evardf.loc[to_name.stem, 'split'] = split 85 | 86 | to_name.parent.mkdir(exist_ok=True, parents=True) 87 | w = torch.tensor(w * 32767.0).to(torch.int16).unsqueeze(0) 88 | torchaudio.save(to_name, w, SR) 89 | evardf.to_csv(f'../../evar/metadata/circor{split_no}.csv', index=None) 90 | print('Split', split_no) 91 | print(evardf[:3]) 92 | 93 | df[:3] 94 | -------------------------------------------------------------------------------- /plugin/MARBLE/evar_marble_diff.patch: -------------------------------------------------------------------------------- 1 | diff --git a/benchmark/constants/model_constants.py b/benchmark/constants/model_constants.py 2 | index beaf1a6..ff474df 100644 3 | --- a/benchmark/constants/model_constants.py 4 | +++ b/benchmark/constants/model_constants.py 5 | @@ -54,6 +54,7 @@ NAME_TO_EXTRACT_FEATURES_MAIN = { 6 | "music2vec_target12": "extract_data2vec_audio_features_main", 7 | "music2vec_span15": "extract_data2vec_audio_features_main", 8 | "yue": "extract_yue_features_main", 9 | + 'evar': "extract_evar_features_main", 10 | } 11 | 12 | SUPPORTED_REPRESENTATIONS = list(NAME_TO_EXTRACT_FEATURES_MAIN.keys()) 13 | diff --git a/benchmark/extract.py b/benchmark/extract.py 14 | index 3869b43..8308d98 100644 15 | --- a/benchmark/extract.py 16 | +++ b/benchmark/extract.py 17 | @@ -7,11 +7,12 @@ def main(args): 18 | from benchmark.models.data2vec.extract_data2vec_features import main as extract_data2vec_audio_features_main #data2vec-audio 19 | from benchmark.models.handcrafted.extract_handcrafted_features import main as extract_handcrafted_features_main 20 | from benchmark.models.jukebox.extract_jukemir_features import main as extract_jukemir_features_main 21 | - from benchmark.models.musicnn.extract_musicnn_features import main as extract_musicnn_features_main 22 | + #from benchmark.models.musicnn.extract_musicnn_features import main as extract_musicnn_features_main 23 | from benchmark.models.clmr.extract_clmr_features import main as extract_clmr_features_main 24 | - from benchmark.models.mule.extract_mule_features import main as extract_mule_features_main 25 | + #from benchmark.models.mule.extract_mule_features import main as extract_mule_features_main 26 | from benchmark.models.hubert.extract_hubert_features import main as extract_speech_hubert_features_main #hubert 27 | - from benchmark.models.yue.extract_yue_features import main as extract_yue_features_main 28 | + #from benchmark.models.yue.extract_yue_features import main as extract_yue_features_main 29 | + from benchmark.models.evar.extract_evar_features import main as extract_evar_features_main 30 | 31 | config = load_config(args.config, namespace=True) 32 | 33 | diff --git a/benchmark/probe.py b/benchmark/probe.py 34 | index 4f2b746..7aaad6d 100644 35 | --- a/benchmark/probe.py 36 | +++ b/benchmark/probe.py 37 | @@ -1,6 +1,8 @@ 38 | 
import wandb 39 | import argparse 40 | import torch 41 | +import pandas as pd 42 | +from pathlib import Path 43 | import pytorch_lightning as pl 44 | 45 | import benchmark as bench 46 | @@ -27,6 +29,12 @@ def main(args): 47 | assert cfg.trainer.paradigm == 'probe', "paradigm must be probe for probe.py" 48 | pl.seed_everything(cfg.trainer.seed) 49 | 50 | + if cfg.dataset.pre_extract.feature_extractor.pretrain.num_features is None: 51 | + import yaml 52 | + with open(cfg.dataset.pre_extract.feature_extractor.pretrain.evar_config) as f: 53 | + evar_cfg = yaml.safe_load(f) 54 | + cfg.dataset.pre_extract.feature_extractor.pretrain.num_features = evar_cfg['feature_d'] 55 | + 56 | logger = get_logger(cfg) 57 | model = get_model(cfg) 58 | train_loader, valid_loader, test_loader = get_dataloaders(cfg) 59 | @@ -73,5 +81,26 @@ def main(args): 60 | # does it really save the best model? 61 | if cfg.checkpoint.save_best_to is not None: trainer.save_checkpoint(cfg.checkpoint.save_best_to) 62 | 63 | + def append_to_csv(csv_filename, data): 64 | + filename = Path(csv_filename) 65 | + filename.parent.mkdir(parents=True, exist_ok=True) 66 | + df = pd.read_csv(filename) if filename.exists() else pd.DataFrame() 67 | + df = pd.concat([df, data], ignore_index=True).to_csv(filename, index=False) 68 | + 69 | + csvname = f'score_{cfg.dataset.dataset}.csv' 70 | + model = cfg.dataset.pre_extract.feature_extractor.pretrain.name 71 | + model = Path(cfg.dataset.pre_extract.feature_extractor.pretrain.evar_config).stem if model == 'evar' else model 72 | + weight = Path(str(cfg.dataset.pre_extract.feature_extractor.pretrain.weight)) 73 | + report = { 74 | + 'model': [model], 75 | + 'weight': [weight.parent.name + '/' + weight.name], 76 | + 'task': [cfg.dataset.dataset], 77 | + } 78 | + for k in trainer.logged_metrics: 79 | + report[k] = trainer.logged_metrics[k].item() 80 | + result_df = pd.DataFrame(report) 81 | + append_to_csv(csvname, result_df) 82 | + print(report) 83 | + 84 | wandb.finish() 85 | 86 | -------------------------------------------------------------------------------- /evar/model_utils.py: -------------------------------------------------------------------------------- 1 | """Model utilities. 2 | """ 3 | 4 | import logging 5 | from pathlib import Path 6 | import torch 7 | from torch import nn 8 | 9 | 10 | def ensure_weights(filename, url): 11 | """Ensures thar `filename` exists, or download from the `url`""" 12 | 13 | if not Path(filename).is_file(): 14 | import urllib.request 15 | logging.info(f'Downloading {url} as {filename} ...') 16 | urllib.request.urlretrieve(url, filename) 17 | 18 | 19 | def load_pretrained_weights(model, pathname, model_key='model', strict=True): 20 | state_dict = torch.load(pathname) 21 | if 'state_dict' in state_dict: 22 | state_dict = state_dict['state_dict'] 23 | if 'model' in state_dict: 24 | state_dict = state_dict['model'] 25 | children = sorted([n + '.' 
for n, _ in model.named_children()]) 26 | 27 | # 'model.xxx' -> 'xxx" 28 | weights = {} 29 | for k in state_dict: 30 | weights[k[len(model_key)+1:] if k.startswith(model_key+'.') else k] = state_dict[k] 31 | state_dict = weights 32 | 33 | # model's parameter only 34 | def find_model_prm(k): 35 | for name in children: 36 | if name in k: # ex) "conv_block1" in "model.conv_block1.conv1.weight" 37 | return k 38 | return None 39 | 40 | weights = {} 41 | for k in state_dict: 42 | if find_model_prm(k) is None: continue 43 | weights[k] = state_dict[k] 44 | 45 | logging.info(f' using network pretrained weight: {Path(pathname).name}') 46 | print(list(weights.keys())) 47 | logging.info(str(model.load_state_dict(weights, strict=strict))) 48 | return sorted(list(weights.keys())) 49 | 50 | 51 | def set_layers_trainable(layer, trainable=False): 52 | for n, p in layer.named_parameters(): 53 | p.requires_grad = trainable 54 | 55 | 56 | def show_layers_trainable(layer, show_all_trainable=True, print_str=True): 57 | total_params = sum(p.numel() for p in layer.parameters()) 58 | total_trainable_params = sum(p.numel() for p in layer.parameters() if p.requires_grad) 59 | str_total = f'Total number of parameters: {total_params:,} (trainable {total_trainable_params:,})\n' 60 | if print_str: print(str_total) 61 | trainable = [n for n, p in layer.named_parameters() if p.requires_grad] 62 | str_trainable = f'Trainable parameters: {trainable if show_all_trainable else trainable[:10]} ...\n' 63 | frozen = [n for n, p in layer.named_parameters() if not p.requires_grad] 64 | str_frozen = f'\nOthers are frozen such as: {frozen[:3]} ...' if len(frozen) >= 3 else '' 65 | if print_str: print(str_trainable) 66 | if print_str: print(str_frozen) 67 | return str_total + str_trainable + str_frozen 68 | 69 | 70 | def initialize_layers(layer): 71 | # initialize all childrens first. 72 | for l in layer.children(): 73 | initialize_layers(l) 74 | 75 | # initialize only linaer 76 | if type(layer) != nn.Linear: 77 | return 78 | 79 | # Thanks to https://github.com/qiuqiangkong/audioset_tagging_cnn/blob/d2f4b8c18eab44737fcc0de1248ae21eb43f6aa4/pytorch/models.py#L10 80 | logging.debug(f' initialize {layer}.weight') 81 | nn.init.xavier_uniform_(layer.weight) 82 | if hasattr(layer, 'bias'): 83 | if layer.bias is not None: 84 | logging.debug(f' initialize {layer}.bias') 85 | layer.bias.data.fill_(0.) 
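# Illustrative usage of the helpers above (a sketch only, not part of the EVAR pipeline):
# freeze a pretrained backbone for linear probing and re-initialize a small linear head.
#
#   backbone = SomeEncoder()  # hypothetical nn.Module with a pretrained checkpoint
#   load_pretrained_weights(backbone, 'weights.pth', model_key='model', strict=False)
#   set_layers_trainable(backbone, trainable=False)   # freeze all backbone parameters
#   head = nn.Linear(2048, 527)
#   initialize_layers(head)                           # Xavier-uniform weight, zero bias
#   print(show_layers_trainable(nn.Sequential(backbone, head), print_str=False))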
86 | 87 | 88 | class MLP(nn.Module): 89 | def __init__(self, input_size, hidden_sizes, output_size, hidden_dropout=0.5, mean=0.0, std=0.01, bias=0.): 90 | super().__init__() 91 | sizes = [input_size] + list(hidden_sizes) + [output_size] 92 | fcs = [] 93 | for l, (in_size, out_size) in enumerate(zip(sizes[:-1], sizes[1:])): 94 | if l > 0: 95 | fcs.append(nn.Dropout(hidden_dropout)) 96 | linear = nn.Linear(in_size, out_size) 97 | nn.init.normal_(linear.weight, mean=mean, std=std) 98 | nn.init.constant_(linear.bias, bias) 99 | fcs.append(linear) 100 | fcs.append(nn.ReLU()) 101 | self.mlp = nn.Sequential(*fcs[:-1]) 102 | 103 | def forward(self, x): 104 | out = self.mlp(x) 105 | return out 106 | 107 | 108 | def mean_max_pooling(frame_embeddings, dim=-1): 109 | assert len(frame_embeddings.shape) == 3 # Batch,Feature Dimension,Time 110 | (x1, _) = torch.max(frame_embeddings, dim=dim) 111 | x2 = torch.mean(frame_embeddings, dim=dim) 112 | x = x1 + x2 113 | return x 114 | 115 | 116 | def mean_pooling(frame_embeddings, dim=-1): 117 | assert len(frame_embeddings.shape) == 3 # Batch,Feature Dimension,Time 118 | x2 = torch.mean(frame_embeddings, dim=dim) 119 | return x2 120 | 121 | 122 | def max_pooling(frame_embeddings, dim=-1): 123 | assert len(frame_embeddings.shape) == 3 # Batch,Feature Dimension,Time 124 | (x1, _) = torch.max(frame_embeddings, dim=dim) 125 | return x1 126 | -------------------------------------------------------------------------------- /evar/ar_laionclap.py: -------------------------------------------------------------------------------- 1 | """Wrapper code for: 2 | 3 | Large-Scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation 4 | 5 | ## Reference 6 | - [1] https://arxiv.org/abs/2211.06687 7 | - [2] https://github.com/LAION-AI/CLAP 8 | """ 9 | 10 | from evar.ar_base import BaseCLAP 11 | try: 12 | from packaging import version 13 | import torch 14 | import transformers, os 15 | import laion_clap 16 | except: 17 | pass # please install: pip install laion-clap 18 | 19 | 20 | def load_state_dict(checkpoint_path: str, map_location="cpu", skip_params=True): 21 | # https://github.com/LAION-AI/CLAP/blob/817041c079af560fa2c610287c68c7c97ace50b6/src/laion_clap/clap_module/factory.py#L53 22 | checkpoint = torch.load(checkpoint_path, map_location=map_location) 23 | if isinstance(checkpoint, dict) and "state_dict" in checkpoint: 24 | state_dict = checkpoint["state_dict"] 25 | else: 26 | state_dict = checkpoint 27 | if skip_params: 28 | if next(iter(state_dict.items()))[0].startswith("module"): 29 | state_dict = {k[7:]: v for k, v in state_dict.items()} 30 | 31 | # removing position_ids to maintain compatibility with latest transformers update 32 | if version.parse(transformers.__version__) >= version.parse("4.31.0"): 33 | del state_dict["text_branch.embeddings.position_ids"] 34 | return state_dict 35 | 36 | 37 | def load_ckpt(self, ckpt = None, model_id = -1, verbose = True): 38 | # https://github.com/LAION-AI/CLAP/blob/817041c079af560fa2c610287c68c7c97ace50b6/src/laion_clap/hook.py#L74C2-L119C5 39 | """Load the pretrained checkpoint of CLAP model 40 | 41 | Parameters 42 | ---------- 43 | ckpt: str 44 | if ckpt is specified, the model will load this ckpt, otherwise the model will download the ckpt from zenodo. \n 45 | For fusion model, it will download the 630k+audioset fusion model (id=3). For non-fusion model, it will download the 630k+audioset model (id=1). 
46 | model_id: 47 | if model_id is specified, you can download our best ckpt, as: 48 | id = 0 --> 630k non-fusion ckpt \n 49 | id = 1 --> 630k+audioset non-fusion ckpt \n 50 | id = 2 --> 630k fusion ckpt \n 51 | id = 3 --> 630k+audioset fusion ckpt \n 52 | Note that if your model is specied as non-fusion model but you download a fusion model ckpt, you will face an error. 53 | """ 54 | import wget 55 | download_link = 'https://huggingface.co/lukewys/laion_clap/resolve/main/' 56 | download_names = [ 57 | '630k-best.pt', 58 | '630k-audioset-best.pt', 59 | '630k-fusion-best.pt', 60 | '630k-audioset-fusion-best.pt' 61 | ] 62 | if ckpt is not None: 63 | print(f'Load the specified checkpoint {ckpt} from users.') 64 | else: 65 | print(f'Load our best checkpoint in the paper.') 66 | if model_id == -1: 67 | model_id = 3 if self.enable_fusion else 1 68 | package_dir = os.path.dirname(os.path.realpath(__file__)) 69 | weight_file_name = download_names[model_id] 70 | ckpt = os.path.join(package_dir, weight_file_name) 71 | print(ckpt) 72 | if os.path.exists(ckpt): 73 | print(f'The checkpoint is already downloaded') 74 | else: 75 | print('Downloading laion_clap weight files...') 76 | ckpt = wget.download(download_link + weight_file_name, os.path.dirname(ckpt)) 77 | print('Download completed!') 78 | print('Load Checkpoint...') 79 | ckpt = load_state_dict(ckpt, skip_params=True) 80 | self.model.load_state_dict(ckpt) 81 | if verbose: 82 | param_names = [n for n, p in self.model.named_parameters()] 83 | for n in param_names: 84 | print(n, "\t", "Loaded" if n in ckpt else "Unloaded") 85 | 86 | 87 | class AR_LAIONCLAP(BaseCLAP): 88 | 89 | def __init__(self, cfg): 90 | super().__init__(cfg=cfg) 91 | 92 | self.backbone = laion_clap.CLAP_Module() 93 | # workaround to make sure: del state_dict["text_branch.embeddings.position_ids"] 94 | print(version.parse(transformers.__version__)) 95 | self.backbone.load_ckpt = load_ckpt.__get__(self.backbone, laion_clap.CLAP_Module) 96 | self.backbone.load_ckpt() 97 | 98 | def encode_frames(self, batch_audio): 99 | assert False, 'encode_frames for LAION-CLAP is not supported for now' 100 | 101 | def forward(self, batch_audio): 102 | audio_embeddings = self.backbone.get_audio_embedding_from_data(x=batch_audio, use_tensor=True) 103 | return audio_embeddings 104 | 105 | def encode_audio(self, batch_audio): 106 | audio_embeddings = self.forward(batch_audio) 107 | return audio_embeddings 108 | 109 | def encode_text(self, batch_text): 110 | text_embeddings = self.backbone.get_text_embedding(batch_text, use_tensor=True) 111 | return text_embeddings 112 | -------------------------------------------------------------------------------- /Preparing-models.md: -------------------------------------------------------------------------------- 1 | # Instructions for preparing models 2 | 3 | The followings are command lines to prepare models. 4 | 5 | **Note: you can setup only the models you need.** 6 | 7 | ## AST 8 | 9 | cd external/ 10 | git clone https://github.com/YuanGongND/ast.git 11 | patch -p1 < ast_models.patch 12 | pip install wget 13 | cd .. 14 | 15 | ## ATST & ATST-Frame 16 | 17 | In addition to the following steps, please download the ATST-Frame checkpoint as `external/atstframe_base.ckpt` from https://github.com/Audio-WestlakeU/audiossl/tree/main/audiossl/methods/atstframe. 
18 | 19 | (cd external && git clone https://github.com/Audio-WestlakeU/audiossl.git) 20 | (cd external && wget https://checkpointstorage.oss-cn-beijing.aliyuncs.com/atst/base.ckpt -O atst_base.ckpt) 21 | pip install pytorch_lightning fairseq 22 | 23 | ## BEATs 24 | 25 | In addition to the following steps, please download the BEATs_iter3 and BEATs_iter3_plus checkpoints as `external/BEATs_iter3.pt` and `external/BEATs_iter3_plus_AS2M.pt` from https://github.com/microsoft/unilm/tree/master/beats. 26 | 27 | (cd external && git clone https://github.com/microsoft/unilm.git) 28 | 29 | ## BYOL-A (IJCNN2021) & BYOL-A v2 (TASLP2023) 30 | 31 | cd external/ 32 | git clone https://github.com/nttcslab/byol-a.git 33 | mv byol-a byol_a 34 | cd .. 35 | 36 | ## CED 37 | 38 | (cd external && git clone https://github.com/jimbozhang/hf_transformers_custom_model_ced.git) 39 | pip install transformers 40 | 41 | ## COALA 42 | 43 | cd external/ 44 | git clone https://github.com/xavierfav/coala.git 45 | cd coala 46 | patch -p1 < ../../external/coala.patch 47 | cd ../.. 48 | 49 | ## Dasheng 50 | 51 | pip install git+https://github.com/jimbozhang/hf_transformers_custom_model_dasheng.git 52 | 53 | ## ESResNe(X)t-fbsp 54 | 55 | cd external 56 | wget https://github.com/AndreyGuzhov/ESResNeXt-fbsp/releases/download/v0.1/ESResNeXtFBSP_AudioSet.pt 57 | git clone https://github.com/AndreyGuzhov/ESResNeXt-fbsp.git esresnext 58 | pip install msgpack_numpy 59 | cd esresnext 60 | sed -i 's/import ignite_trainer as it/#import ignite_trainer as it/' model/esresnet_base.py utils/transforms.py utils/datasets.py utils/datasets.py 61 | sed -i 's/it\.AbstractNet/torch.nn\.Module/' model/esresnet_base.py 62 | sed -i 's/it\.AbstractTransform/torch.nn\.Module/' utils/transforms.py 63 | sed -i 's/from model /from \. /' model/esresnet_base.py 64 | sed -i 's/from model\./from \./' model/esresnet_fbsp.py 65 | sed -i 's/from utils/from \.\.utils/' model/esresnet_base.py model/esresnet_fbsp.py 66 | sed -i 's/from utils/from \./' utils/datasets.py 67 | cd ../.. 68 | 69 | ## HTS-AT 70 | 71 | In addition to the following steps, please download the checkpoint as `external/HTSAT_AudioSet_Saved_1.ckpt` from https://github.com/RetroCirce/HTS-Audio-Transformer?tab=readme-ov-file#model-checkpoints. 72 | 73 | (cd external && git clone https://github.com/RetroCirce/HTS-Audio-Transformer.git htsat) 74 | pip install h5py museval torchlibrosa 75 | 76 | ## M2D 77 | 78 | To get M2D ready, follow the steps 👉 [M2D setup](https://github.com/nttcslab/m2d?tab=readme-ov-file#1-setup): 79 | 80 | cd external 81 | << follow the steps described in https://github.com/nttcslab/m2d?tab=readme-ov-file#1-setup >> 82 | 83 | Download the weights from the GitHub. Example: 84 | 85 | wget https://github.com/nttcslab/m2d/releases/download/v0.3.0/m2d_clap_vit_base-80x1001p16x16-240128_AS-FT_enconly.zip 86 | unzip m2d_clap_vit_base-80x1001p16x16-240128_AS-FT_enconly.zip 87 | 88 | You will find the `m2d_clap_vit_base-80x1001p16x16-240128_AS-FT_enconly` folder. 89 | The following runs a linear evaluation on CREMA-D. 
90 | 91 | python lineareval.py config/m2d.yaml cremad weight_file=m2d_clap_vit_base-80x1001p16x16-240128_AS-FT_enconly/weights_ep67it3124-0.48558.pth 92 | 93 | ## MS-CLAP, LAION-CLAP 94 | 95 | pip install msclap 96 | pip install laion-clap 97 | 98 | ## Opera 99 | 100 | (cd external && git clone https://github.com/evelyn0414/OPERA.git) 101 | (cd external/OPERA && curl -L -O https://huggingface.co/evelyn0414/OPERA/resolve/main/encoder-operaCT.ckpt) 102 | (cd external/OPERA && patch -p0 < ../opera.patch) 103 | 104 | ## VGGish 105 | 106 | cd external 107 | git clone https://github.com/tcvrick/audioset-vggish-tensorflow-to-pytorch.git tcvrick_vggish 108 | sed -i 's/from audioset import/from \. import/' tcvrick_vggish/audioset/vggish_input.py 109 | wget https://github.com/tcvrick/audioset-vggish-tensorflow-to-pytorch/releases/download/v0.1/pytorch_vggish.zip 110 | unzip pytorch_vggish.zip 111 | cd .. 112 | 113 | ## WavCaps 114 | 115 | In addition to the following steps, please download the checkpoint `HTSAT-BERT-PT.pt` in the folder `external/WavCaps` from https://github.com/XinhaoMei/WavCaps/tree/master/retrieval. 116 | 117 | (cd external && git clone https://github.com/XinhaoMei/WavCaps.git) 118 | (cd external/WavCaps && git apply ../../external/wavcaps.patch) 119 | pip install ruamel.yaml sentence_transformers wandb loguru torchlibrosa 120 | 121 | -------------------------------------------------------------------------------- /evar/ar_opera.py: -------------------------------------------------------------------------------- 1 | """Wrapper code for: 2 | 3 | Towards Open Respiratory Acoustic Foundation Models: Pretraining and Benchmarking 4 | 5 | ## Reference 6 | - [1] https://arxiv.org/abs/2406.16148 7 | - [2] https://github.com/evelyn0414/OPERA 8 | """ 9 | 10 | from evar.ar_base import BaseAudioRepr, np 11 | import torch 12 | import librosa 13 | import logging 14 | 15 | try: 16 | import sys 17 | sys.path.append('../../external/OPERA') 18 | import os 19 | evar_home = os.getenv('EVAR', '') 20 | sys.path.append(os.path.join(evar_home, 'external/OPERA')) 21 | from src.model.models_cola import Cola 22 | from src.util import _equally_slice_pad_sample, _duplicate_padding 23 | except Exception as e: 24 | pass # print(f'(For M2D users) Build your EVAR in your M2D folder.') 25 | 26 | 27 | def split_pad_sample(sample, desired_length, sample_rate, types='repeat'): 28 | # Quoted from https://github.com/evelyn0414/OPERA/blob/main/src/util.py 29 | """ 30 | if the audio sample length > desired_length, then split and pad samples 31 | else simply pad samples according to pad_types 32 | * types 'zero' : simply pad by zeros (zero-padding) 33 | * types 'repeat' : pad with duplicate on both sides (half-n-half) 34 | * types 'aug' : pad with augmented sample on both sides (half-n-half) 35 | """ 36 | if types == 'zero': 37 | return _equally_slice_pad_sample(sample, desired_length, sample_rate) 38 | 39 | output_length = int(desired_length * sample_rate) 40 | soundclip = sample[0].copy() 41 | n_samples = len(soundclip) 42 | 43 | output = [] 44 | if n_samples > output_length: 45 | """ 46 | if sample length > desired_length, slice samples with desired_length then just use them, 47 | and the last sample is padded according to the padding types 48 | """ 49 | # frames[j] = x[j * hop_length : j * hop_length + frame_length] 50 | frames = librosa.util.frame( 51 | soundclip, frame_length=output_length, hop_length=output_length//2, axis=0) 52 | for i in range(frames.shape[0]): 53 | output.append((frames[i], sample[1], 
sample[2])) 54 | 55 | # get the last sample 56 | last_id = frames.shape[0] * (output_length//2) 57 | last_sample = soundclip[last_id:] 58 | 59 | padded = _duplicate_padding( 60 | soundclip, last_sample, output_length, sample_rate, types) 61 | output.append((padded, sample[1], sample[2])) 62 | else: # only pad 63 | padded = _duplicate_padding( 64 | soundclip, soundclip, output_length, sample_rate, types) 65 | output.append((padded, sample[1], sample[2])) 66 | 67 | return output 68 | 69 | 70 | def pre_process_audio_mel_t(audio, sample_rate=16000, n_mels=64, f_min=50, f_max=8000, nfft=1024, hop=512): 71 | # Quoted from https://github.com/evelyn0414/OPERA/blob/main/src/util.py 72 | S = librosa.feature.melspectrogram( 73 | y=audio, sr=sample_rate, n_mels=n_mels, fmin=f_min, fmax=f_max, n_fft=nfft, hop_length=hop) 74 | # convert scale to dB from magnitude 75 | S = librosa.power_to_db(S, ref=np.max) 76 | if S.max() != S.min(): 77 | mel_db = (S - S.min()) / (S.max() - S.min()) 78 | else: 79 | mel_db = S 80 | print("warning in producing spectrogram!") 81 | 82 | return mel_db 83 | 84 | 85 | def get_entire_signal_librosa(data, input_sec=8, sample_rate=16000, butterworth_filter=None, pad=False, from_cycle=False, yt=None, types='repeat'): 86 | device = data.device 87 | # Cut from https://github.com/evelyn0414/OPERA/blob/main/src/util.py 88 | # Trim leading and trailing silence from an audio signal. 89 | FRAME_LEN = int(sample_rate / 10) # 90 | HOP = int(FRAME_LEN / 2) # 50% overlap, meaning 5ms hop length 91 | yt, index = librosa.effects.trim(data.cpu().numpy(), frame_length=FRAME_LEN, hop_length=HOP) 92 | 93 | # check audio not too short 94 | duration = librosa.get_duration(y=yt, sr=sample_rate) 95 | if duration < input_sec: 96 | yt = split_pad_sample([yt, 0,0], input_sec, sample_rate, types)[0][0] 97 | 98 | # # visualization for testing the spectrogram parameters 99 | # plot_melspectrogram(yt.squeeze(), title=filename.replace("/", "-")) 100 | return torch.tensor(pre_process_audio_mel_t(yt.squeeze(), f_max=8000)).to(device) 101 | 102 | 103 | class AR_OPERA_CT(BaseAudioRepr): 104 | 105 | def __init__(self, cfg): 106 | super().__init__(cfg=cfg) 107 | if 'icbhi_sprs_mode' not in cfg: 108 | logging.error('\n\n *** The model supports app/ICBHI_SPRT only. Exiting... 
***\n') 109 | exit(-1) 110 | self.backbone = Cola(encoder="htsat") 111 | ckpt = torch.load(cfg.weight_file) 112 | self.backbone.load_state_dict(ckpt["state_dict"], strict=False) 113 | 114 | def encode_frames(self, batch_audio): 115 | x = get_entire_signal_librosa(batch_audio, input_sec=8) #, input_sec=self.cfg.unit_samples / self.cfg.sample_rate) 116 | x = self.augment_if_training(x) 117 | x = x.transpose(-2, -1) # B,D,T -> B,T,D 118 | features = self.backbone.extract_feature(x, self.cfg.feature_d) 119 | return features.unsqueeze(-1) # [B, D] -> [B, D, 1] 120 | 121 | def forward(self, batch_audio): 122 | x = self.encode_frames(batch_audio) 123 | return x.mean(dim=-1) # [B, D, T] -> [B, D] 124 | 125 | -------------------------------------------------------------------------------- /plugin/MARBLE/benchmark/models/evar/extract_evar_features.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | 4 | import torch 5 | import numpy as np 6 | from tqdm import tqdm 7 | import wget 8 | import sys 9 | import pandas as pd 10 | import librosa 11 | 12 | from benchmark.utils.audio_utils import load_audio, find_audios 13 | 14 | sys.path.append(os.environ.get('EVAR', '')) 15 | import evar 16 | from lineareval import make_cfg 17 | 18 | 19 | def select_args(config): 20 | args = argparse.Namespace() 21 | args.accelerator = config.dataset.pre_extract.accelerator 22 | args.output_dir = config.dataset.pre_extract.output_dir 23 | args.overwrite = config.dataset.pre_extract.overwrite 24 | args.audio_dir = config.dataset.pre_extract.audio_dir 25 | args.n_shard = config.args.n_shard 26 | args.shard_rank = config.args.shard_rank 27 | args.keep_folder_structure = config.dataset.pre_extract.keep_folder_structure 28 | args.evar_config = config.dataset.pre_extract.feature_extractor.pretrain.evar_config 29 | args.weight = config.dataset.pre_extract.feature_extractor.pretrain.weight 30 | args.options = config.dataset.pre_extract.feature_extractor.pretrain.options 31 | return args 32 | 33 | 34 | class WavDataset(evar.data.BaseRawAudioDataset): 35 | def __init__(self, cfg, files): 36 | super().__init__(cfg.unit_samples, tfms=None, random_crop=False, return_filename=cfg.return_filename) 37 | self.cfg = cfg 38 | self.df = pd.DataFrame({'file_name': files}) 39 | self.cfg.task_data = 'dummy' 40 | 41 | def __len__(self): 42 | return len(self.df) 43 | 44 | def get_audio(self, index): 45 | filename = self.df.file_name.values[index] 46 | if self.cfg.return_filename: 47 | return filename 48 | wav, sr = librosa.load(filename, sr=self.cfg.sample_rate, mono=True) 49 | wav = torch.tensor(wav).to(torch.float32) 50 | return wav 51 | 52 | def __getitem__(self, index): 53 | wav = self.get_audio(index) 54 | return wav 55 | 56 | 57 | def collate_trunc_wav(original_batch): 58 | if isinstance(original_batch[0], (str)): 59 | return original_batch # return_filename 60 | # truncate all items to the size of the shortest item 61 | truncated = [] 62 | shortest = min([b.shape[-1] for b in original_batch]) 63 | for item in original_batch: 64 | l = item.shape[-1] 65 | if l > shortest: 66 | i = np.random.randint(l - shortest) 67 | item = item[..., i:i+shortest] 68 | truncated.append(item) 69 | return torch.stack(truncated) 70 | 71 | 72 | def main(config): 73 | args = select_args(config) 74 | 75 | os.makedirs(args.output_dir, exist_ok=True) 76 | 77 | audio_files = find_audios(args.audio_dir) 78 | print(f'Found {len(audio_files)} audio files') 79 | 80 | if args.n_shard > 1: 81 | 
print(f'processing shard {args.shard_rank} of {args.n_shard}') 82 | audio_files.sort() # make sure no intersetction 83 | audio_files = audio_files[args.shard_rank * len(audio_files) // args.n_shard : (args.shard_rank + 1) * len(audio_files) // args.n_shard] 84 | 85 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 86 | options = f'weight_file={args.weight},' + ('' if args.options is None else args.options) 87 | cfg, n_folds, balanced = make_cfg(args.evar_config, 'as20k', options, extras={}, abs_unit_sec=10) # as20k is a dummy task, 10s is a dummy input unit second 88 | model = eval('evar.'+cfg.audio_repr)(cfg).to(device) 89 | 90 | batch_size = 32 # TODO make it flexible 91 | dataset = WavDataset(cfg, np.random.default_rng().choice(audio_files, min(len(audio_files), 1000), replace=False)) # choose random 1000< samples for calculating statistics 92 | data_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_trunc_wav) 93 | print(f'Precomputing using the audio representation: {cfg.id} for {len(dataset)} files ({len(data_loader)} batches)') 94 | model.precompute(device, data_loader) 95 | 96 | print(f'Extracting features using {cfg.id}') 97 | for audio_file in tqdm(audio_files): 98 | # load audio 99 | try: 100 | waveform = load_audio( 101 | audio_file, 102 | target_sr=config.dataset.pre_extract.feature_extractor.pretrain.target_sr, 103 | is_mono=True, 104 | is_normalize=False, 105 | crop_to_length_in_sec=None, 106 | ) 107 | except Exception as e: 108 | print(f"skip audio {audio_file} because of {e}") 109 | continue 110 | 111 | # extract features 112 | #waveform = waveform.squeeze().cpu().numpy() 113 | with torch.no_grad(): 114 | audio_data = [audio_file] if cfg.return_filename else waveform.to('cuda') 115 | embeddings = model(audio_data) # [dims] 116 | # reshape to [1, 1, dims] 117 | out = embeddings.reshape(1, 1, -1).cpu().detach().numpy() 118 | 119 | # save to npy 120 | if args.keep_folder_structure: 121 | output_file = os.path.join( 122 | args.output_dir, 123 | os.path.relpath(audio_file, args.audio_dir)+'.npy', 124 | ) 125 | os.makedirs(os.path.dirname(output_file), exist_ok=True) 126 | else: 127 | output_file = os.path.join( 128 | args.output_dir, 129 | os.path.basename(audio_file)+'.npy', 130 | ) 131 | if not args.overwrite: 132 | assert not os.path.exists(output_file), f"{output_file} exists" 133 | np.save(output_file, out) 134 | --------------------------------------------------------------------------------
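The features saved by `plugin/MARBLE/benchmark/models/evar/extract_evar_features.py` above are one NumPy array of shape `[1, 1, feature_d]` per audio file (a single clip-level embedding). A minimal sketch for loading them back, assuming a hypothetical feature folder that corresponds to the `output_dir` used at extraction time:

```python
import numpy as np
from pathlib import Path

feature_dir = Path('data/evar_features')  # hypothetical: the output_dir set in the MARBLE config
for npy_file in sorted(feature_dir.rglob('*.npy'))[:3]:
    emb = np.load(npy_file).reshape(-1)   # [1, 1, feature_d] -> [feature_d]
    print(npy_file.name, emb.shape)
```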