├── evar ├── __init__.py ├── utils │ ├── m2d_add_norm_stats.py │ ├── calculations.py │ ├── download_voxforge.py │ ├── download_cremad.py │ └── __init__.py ├── ar_openl3.py ├── ar_trill.py ├── ar_msclap.py ├── ar_spec.py ├── ar_dasheng.py ├── ar_ced.py ├── ar_atst_frame.py ├── ar_cnn14.py ├── ar_data2vec.py ├── ar_byola2.py ├── ar_wavlm.py ├── ar_hubert.py ├── ar_vggish.py ├── ar_esresnext_fbsp.py ├── ar_htsat.py ├── ar_beats.py ├── ar_wavcaps.py ├── ar_coala.py ├── ar_ast.py ├── ds_tasks.py ├── ar_byola.py ├── ar_wav2vec2.py ├── common.py ├── model_utils.py ├── ar_laionclap.py └── ar_opera.py ├── external ├── coala_scaler_top_1000_plus_clip.pkl ├── opera.patch ├── coala.patch ├── wavcaps.patch └── ast_models.patch ├── requirements.txt ├── plugin ├── MARBLE │ ├── benchmark │ │ └── models │ │ │ └── evar │ │ │ ├── evar.yaml │ │ │ └── extract_evar_features.py │ ├── configs │ │ └── evar │ │ │ ├── MTT.yaml │ │ │ ├── GTZAN.yaml │ │ │ ├── NSynthI.yaml │ │ │ ├── NSynthP.yaml │ │ │ ├── GS.yaml │ │ │ ├── VocalSetS.yaml │ │ │ ├── VocalSetT.yaml │ │ │ ├── MTGMood.yaml │ │ │ ├── MTGGenre.yaml │ │ │ ├── MTGTop50.yaml │ │ │ ├── MTGInstrument.yaml │ │ │ └── EMO.yaml │ ├── evar_marble.sh │ ├── REAEDME_MARBLE.md │ └── evar_marble_diff.patch └── OPERA │ ├── evar_openl3env.sh │ ├── evar_atst_clip.sh │ ├── evar_m2d.sh │ ├── evar_beats.sh │ ├── evar_ast.sh │ ├── evar_htsat.sh │ ├── evar_byola.sh │ ├── evar_hubert.sh │ ├── evar_wavlm.sh │ ├── evar_atst_frame.sh │ ├── evar_beats_plus.sh │ ├── evar_wav2vec2.sh │ └── evar_m2d_layers.sh ├── config ├── coala.yaml ├── esresnextfbsp.yaml ├── linspec.yaml ├── melspec.yaml ├── openl3env.yaml ├── openl3mus.yaml ├── wav2vec2feature.yaml ├── dasheng.yaml ├── wavlm.yaml ├── hubert.yaml ├── trill.yaml ├── data2vec.yaml ├── wav2vec2logit.yaml ├── wav2vec2context.yaml ├── laionclap.yaml ├── atst.yaml ├── atst_frame.yaml ├── byola2.yaml ├── cnn14.yaml ├── byola.yaml ├── byolax.yaml ├── opera.yaml ├── ast.yaml ├── vggish.yaml ├── vggish_4k.yaml ├── msclap.yaml ├── wavcaps.yaml ├── ced.yaml ├── htsat.yaml ├── beats.yaml ├── m2d_clap.yaml ├── m2d_clap_32k.yaml ├── beats_plus.yaml ├── m2d.yaml └── m2d_32k.yaml ├── app ├── bmdhs │ ├── ev_ast.sh │ ├── ev_beats.sh │ ├── ev_byola.sh │ ├── ev_m2d.sh │ ├── make_metadata.py │ └── README_BMDHS.md ├── circor │ ├── ev_ast.sh │ ├── ev_beats.sh │ ├── ev_byola.sh │ ├── ev_m2d.sh │ ├── patch-heart-murmur-detection.diff │ ├── README_CirCor.md │ └── rearrange_data.py ├── icbhi_sprs │ ├── ev_icbhi_ast.sh │ ├── ev_icbhi_m2d.sh │ ├── ev_icbhi_mlp_m2d.sh │ ├── ev_icbhi_opera.sh │ ├── ev_sprs_ast.sh │ ├── ev_icbhi_beats.sh │ ├── ev_sprs_m2d.sh │ ├── ev_sprs_opera.sh │ ├── ev_sprs_byola.sh │ ├── ev_icbhi_byola.sh │ ├── ev_sprs_beats.sh │ └── README_ICBHI_SPRS.md └── README.md ├── run ├── all_byola2.sh ├── all_ced.sh ├── all_atst.sh ├── all_beats.sh ├── all_htsat.sh ├── all_m2d.sh ├── all_atst_frame.sh ├── all_beats_plus.sh ├── all_msclap.sh ├── all_laionclap.sh └── all_wavcaps.sh ├── 2pass_lineareval.py ├── prepare_wav.py ├── .gitignore ├── Evaluation-examples.md ├── summarize.py └── Preparing-models.md /evar/__init__.py: -------------------------------------------------------------------------------- 1 | # EVAR 2 | -------------------------------------------------------------------------------- /external/coala_scaler_top_1000_plus_clip.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nttcslab/eval-audio-repr/HEAD/external/coala_scaler_top_1000_plus_clip.pkl 
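A quick orientation before the individual files: each evar/ar_*.py module wraps one audio representation, each config/*.yaml carries that model's settings, 2pass_lineareval.py runs the two-pass linear evaluation, and summarize.py aggregates the results for a given model name or weight file; the run/all_*.sh scripts later in this listing chain these calls per model. A minimal sketch of a single run, assuming the BYOL-A v2 weights referenced in config/byola2.yaml are already placed under external/ (see Preparing-models.md):

# Hedged example mirroring run/all_byola2.sh: linear evaluation of BYOL-A v2 on ESC-50, then summarize.
# Swap the config, task name, and --lr to evaluate other models; batch_size can be overridden as in the run scripts.
python 2pass_lineareval.py config/byola2.yaml esc50 --lr=0.001
python summarize.py external/byol_a/v2/AudioNTT2022-BYOLA-64x96d2048.pth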
-------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | numpy 3 | torch 4 | torchvision 5 | torchaudio 6 | easydict 7 | fire 8 | tqdm 9 | scikit-learn 10 | nnAudio 11 | torchlibrosa 12 | torchopenl3 13 | pyyaml -------------------------------------------------------------------------------- /plugin/MARBLE/benchmark/models/evar/evar.yaml: -------------------------------------------------------------------------------- 1 | name: evar 2 | target_sr: 16000 3 | num_features: 4 | pretrain_folder: benchmark/models/evar 5 | evar_config: m2d.yaml 6 | weight: m2d_vit_base-80x608p16x16-221006-mr7/checkpoint-300.pth 7 | options: -------------------------------------------------------------------------------- /config/coala.yaml: -------------------------------------------------------------------------------- 1 | # AR parameters such as FFT parameters. 2 | name: AR_COALA 3 | audio_repr: ar_coala.AR_COALA 4 | feature_d: 1152 5 | sample_rate: 22000 # COALA Special 6 | temporal_pooling_type: mean_max 7 | 8 | # Training parameters. 9 | batch_size: 64 10 | lr_lineareval: 0.0003 11 | report_per_epochs: 20 12 | early_stop_epochs: 20 13 | -------------------------------------------------------------------------------- /config/esresnextfbsp.yaml: -------------------------------------------------------------------------------- 1 | # AR parameters such as FFT parameters. 2 | audio_repr: ar_esresnext_fbsp.AR_ESResNeXtFBSP 3 | feature_d: 2048 4 | sample_rate: 44100 5 | # temporal_pooling_type: -> not using common temporal pooling. 6 | 7 | # Training parameters. 8 | batch_size: 64 9 | lr_lineareval: 0.0003 10 | report_per_epochs: 20 11 | early_stop_epochs: 20 12 | -------------------------------------------------------------------------------- /config/linspec.yaml: -------------------------------------------------------------------------------- 1 | # AR parameters such as FFT parameters. 2 | audio_repr: ar_spec.AR_LinSpec 3 | feature_d: 513 4 | sample_rate: 16000 5 | n_fft: 1024 6 | window_size: 1024 7 | hop_size: 160 8 | #n_mels: 64 9 | f_min: 60 10 | f_max: 7800 11 | temporal_pooling_type: mean_max 12 | 13 | # Training parameters. 14 | batch_size: 256 15 | lr_lineareval: 0.0003 16 | report_per_epochs: 20 17 | early_stop_epochs: 20 18 | -------------------------------------------------------------------------------- /config/melspec.yaml: -------------------------------------------------------------------------------- 1 | # AR parameters such as FFT parameters. 2 | audio_repr: ar_spec.AR_MelSpec 3 | feature_d: 64 4 | sample_rate: 16000 5 | n_fft: 1024 6 | window_size: 1024 7 | hop_size: 160 8 | n_mels: 64 9 | f_min: 60 10 | f_max: 7800 11 | temporal_pooling_type: mean_max 12 | 13 | # Training parameters. 14 | batch_size: 256 15 | lr_lineareval: 0.0003 16 | report_per_epochs: 20 17 | early_stop_epochs: 20 18 | -------------------------------------------------------------------------------- /config/openl3env.yaml: -------------------------------------------------------------------------------- 1 | # AR parameters such as FFT parameters. 2 | name: AR_OpenL3Env 3 | audio_repr: ar_openl3.AR_OpenL3 4 | feature_d: 6144 5 | sample_rate: 48000 6 | temporal_pooling_type: mean_max 7 | 8 | # Training parameters. 9 | batch_size: 64 10 | lr_lineareval: 0.0003 11 | report_per_epochs: 20 12 | early_stop_epochs: 20 13 | 14 | # Model specific parameters. 
15 | openl3_input_repr: mel256 16 | openl3_content_type: env 17 | -------------------------------------------------------------------------------- /config/openl3mus.yaml: -------------------------------------------------------------------------------- 1 | # AR parameters such as FFT parameters. 2 | name: AR_OpenL3Mus 3 | audio_repr: ar_openl3.AR_OpenL3 4 | feature_d: 6144 5 | sample_rate: 48000 6 | temporal_pooling_type: mean_max 7 | 8 | # Training parameters. 9 | batch_size: 64 10 | lr_lineareval: 0.0003 11 | report_per_epochs: 20 12 | early_stop_epochs: 20 13 | 14 | # Model specific parameters. 15 | openl3_input_repr: mel256 16 | openl3_content_type: music 17 | -------------------------------------------------------------------------------- /config/wav2vec2feature.yaml: -------------------------------------------------------------------------------- 1 | # AR parameters such as FFT parameters. 2 | audio_repr: ar_wav2vec2.AR_Wav2Vec2Feature 3 | feature_d: 512 4 | sample_rate: 16000 5 | temporal_pooling_type: mean_max 6 | 7 | # Training parameters. 8 | batch_size: 256 9 | lr_lineareval: 0.0003 10 | report_per_epochs: 20 11 | early_stop_epochs: 20 12 | 13 | # Model specific parameters. 14 | wav2vec_model: facebook/wav2vec2-base-960h 15 | # wav2vec_model: facebook/wav2vec2-large-960h-lv60 16 | -------------------------------------------------------------------------------- /config/dasheng.yaml: -------------------------------------------------------------------------------- 1 | # AR parameters such as FFT parameters. 2 | audio_repr: ar_dasheng.AR_Dasheng 3 | feature_d: 768 4 | sample_rate: 16000 5 | # temporal_pooling_type: mean_max 6 | 7 | # Training parameters. 8 | batch_size: 256 9 | lr_lineareval: 0.0003 10 | report_per_epochs: 20 11 | early_stop_epochs: 20 12 | 13 | # Model specific parameters. 14 | # model_name: mispeech/dasheng-base 15 | # model_name: mispeech/dasheng-0.6B 16 | model_name: mispeech/dasheng-1.2B 17 | -------------------------------------------------------------------------------- /config/wavlm.yaml: -------------------------------------------------------------------------------- 1 | # AR parameters such as FFT parameters. 2 | audio_repr: ar_wavlm.AR_WavLM 3 | feature_d: 768 4 | sample_rate: 16000 5 | temporal_pooling_type: mean 6 | 7 | # Training parameters. 8 | batch_size: 64 9 | lr_lineareval: 0.0003 10 | report_per_epochs: 20 11 | early_stop_epochs: 20 12 | 13 | # Model specific parameters. 14 | pretrained_model: microsoft/wavlm-base 15 | # pretrained_model: microsoft/wavlm-large 16 | output_layers: [-1] # list of layers to stack 17 | -------------------------------------------------------------------------------- /config/hubert.yaml: -------------------------------------------------------------------------------- 1 | # AR parameters such as FFT parameters. 2 | audio_repr: ar_hubert.AR_Hubert 3 | feature_d: 768 4 | sample_rate: 16000 5 | temporal_pooling_type: mean 6 | 7 | # Training parameters. 8 | batch_size: 64 9 | lr_lineareval: 0.0003 10 | report_per_epochs: 20 11 | early_stop_epochs: 20 12 | 13 | # Model specific parameters. 14 | pretrained_model: facebook/hubert-base-ls960 15 | # pretrained_model: facebook/hubert-large-ls960-ft 16 | output_layers: [-1] # list of layers to stack 17 | -------------------------------------------------------------------------------- /config/trill.yaml: -------------------------------------------------------------------------------- 1 | # AR parameters such as FFT parameters. 
2 | name: AR_TRILL 3 | audio_repr: ar_trill.AR_TRILL 4 | feature_d: 12288 5 | sample_rate: 16000 6 | temporal_pooling_type: mean_max 7 | 8 | # Training parameters. 9 | batch_size: 64 10 | lr_lineareval: 0.0003 11 | report_per_epochs: 20 12 | early_stop_epochs: 20 13 | 14 | # Model specific parameters. 15 | trill_emb_type: layer19 16 | # trill_emb_type: embedding 17 | trill_url: https://tfhub.dev/google/nonsemantic-speech-benchmark/trill/2 18 | -------------------------------------------------------------------------------- /config/data2vec.yaml: -------------------------------------------------------------------------------- 1 | # AR parameters such as FFT parameters. 2 | audio_repr: ar_data2vec.AR_Data2Vec 3 | feature_d: 768 4 | sample_rate: 16000 5 | temporal_pooling_type: mean 6 | 7 | # Training parameters. 8 | batch_size: 64 9 | lr_lineareval: 0.0003 10 | report_per_epochs: 20 11 | early_stop_epochs: 20 12 | 13 | # Model specific parameters. 14 | # pretrained_model: facebook/data2vec-audio-base-960h 15 | pretrained_model: facebook/data2vec-audio-large-960h 16 | output_layers: [-1] # list of layers to stack 17 | -------------------------------------------------------------------------------- /external/opera.patch: -------------------------------------------------------------------------------- 1 | diff -ur src_org/model/models_cola.py src/model/models_cola.py 2 | --- src_org/model/models_cola.py 2024-12-13 22:35:31.594477687 +0900 3 | +++ src/model/models_cola.py 2024-12-14 00:05:54.756510677 +0900 4 | @@ -1,6 +1,6 @@ 5 | import pytorch_lightning as pl 6 | import torch 7 | -from efficientnet_pytorch import EfficientNet 8 | +#from efficientnet_pytorch import EfficientNet 9 | from torch.nn import functional as F 10 | import numpy as np 11 | from src.model.htsat.htsat import HTSATWrapper 12 | -------------------------------------------------------------------------------- /app/bmdhs/ev_ast.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | split=$1 4 | n_iter=$2 5 | seed=$3 6 | lr_prm=$4 7 | bs=64 8 | epochs=30 9 | gpu=0 10 | 11 | echo Repeating $n_iter times... 12 | 13 | for i in $(seq $n_iter); do 14 | seed=$((seed + 1)) 15 | cmdline="CUDA_VISIBLE_DEVICES=$gpu python -m app.bmdhs.solve_bmdhs config/ast.yaml bmdhs$split --lr=$lr_prm --freq_mask 0 --time_mask 0 --training_mask 0.0 --mixup 0.0 --rrc False --epochs $epochs --warmup_epochs 0 --seed $seed --batch_size $bs" 16 | echo $cmdline 17 | eval $cmdline 18 | done 19 | -------------------------------------------------------------------------------- /app/bmdhs/ev_beats.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | split=$1 4 | n_iter=$2 5 | seed=$3 6 | lr_prm=$4 7 | bs=128 8 | epochs=30 9 | gpu=0 10 | 11 | echo Repeating $n_iter times... 
12 | 13 | for i in $(seq $n_iter); do 14 | seed=$((seed + 1)) 15 | cmdline="CUDA_VISIBLE_DEVICES=$gpu python -m app.bmdhs.solve_bmdhs config/beats.yaml bmdhs$split --lr=$lr_prm --freq_mask 0 --time_mask 0 --training_mask 0.0 --mixup 0.0 --rrc False --epochs $epochs --warmup_epochs 0 --seed $seed --batch_size $bs" 16 | echo $cmdline 17 | eval $cmdline 18 | done 19 | -------------------------------------------------------------------------------- /app/bmdhs/ev_byola.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | split=$1 4 | n_iter=$2 5 | seed=$3 6 | lr_prm=$4 7 | bs=128 8 | epochs=30 9 | gpu=0 10 | 11 | echo Repeating $n_iter times... 12 | 13 | for i in $(seq $n_iter); do 14 | seed=$((seed + 1)) 15 | cmdline="CUDA_VISIBLE_DEVICES=$gpu python -m app.bmdhs.solve_bmdhs config/byola.yaml bmdhs$split --lr=$lr_prm --freq_mask 0 --time_mask 0 --training_mask 0.0 --mixup 0.0 --rrc False --epochs $epochs --warmup_epochs 0 --seed $seed --batch_size $bs" 16 | echo $cmdline 17 | eval $cmdline 18 | done 19 | -------------------------------------------------------------------------------- /app/circor/ev_ast.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | split=$1 4 | n_iter=$2 5 | seed=$3 6 | lr_prm=$4 7 | bs=1024 8 | gpu=0 9 | hidden='\(128,\)' 10 | 11 | echo Repeating $n_iter times... 12 | 13 | for i in $(seq $n_iter); do 14 | seed=$((seed + 1)) 15 | cmdline="CUDA_VISIBLE_DEVICES=$gpu python -m app.circor.solve_circor config/ast.yaml circor$split --lr=$lr_prm --freq_mask 0 --time_mask 0 --training_mask 0.0 --mixup 0.0 --rrc False --epochs 50 --warmup_epochs 0 --seed $seed --batch_size $bs --hidden $hidden" 16 | echo $cmdline 17 | eval $cmdline 18 | done 19 | -------------------------------------------------------------------------------- /config/wav2vec2logit.yaml: -------------------------------------------------------------------------------- 1 | # AR parameters such as FFT parameters. 2 | audio_repr: ar_wav2vec2.AR_Wav2Vec2Logit 3 | feature_d: 32 4 | sample_rate: 16000 5 | temporal_pooling_type: mean # `mean+max` severely degrades performances on some tasks such as VC1. We set the `mean` for wav2vec2. 6 | 7 | # Training parameters. 8 | batch_size: 256 9 | lr_lineareval: 0.0003 10 | report_per_epochs: 20 11 | early_stop_epochs: 20 12 | 13 | # Model specific parameters. 14 | wav2vec_model: facebook/wav2vec2-base-960h 15 | # wav2vec_model: facebook/wav2vec2-large-960h-lv60 16 | -------------------------------------------------------------------------------- /app/circor/ev_beats.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | split=$1 4 | n_iter=$2 5 | seed=$3 6 | lr_prm=$4 7 | bs=1024 8 | gpu=0 9 | hidden='\(128,\)' 10 | 11 | echo Repeating $n_iter times... 
12 | 13 | for i in $(seq $n_iter); do 14 | seed=$((seed + 1)) 15 | cmdline="CUDA_VISIBLE_DEVICES=$gpu python -m app.circor.solve_circor config/beats.yaml circor$split --lr=$lr_prm --freq_mask 0 --time_mask 0 --training_mask 0.0 --mixup 0.0 --rrc False --epochs 50 --warmup_epochs 0 --seed $seed --batch_size $bs --hidden $hidden" 16 | echo $cmdline 17 | eval $cmdline 18 | done 19 | -------------------------------------------------------------------------------- /app/circor/ev_byola.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | split=$1 4 | n_iter=$2 5 | seed=$3 6 | lr_prm=$4 7 | bs=1024 8 | gpu=0 9 | hidden='\(128,\)' 10 | 11 | echo Repeating $n_iter times... 12 | 13 | for i in $(seq $n_iter); do 14 | seed=$((seed + 1)) 15 | cmdline="CUDA_VISIBLE_DEVICES=$gpu python -m app.circor.solve_circor config/byola.yaml circor$split --lr=$lr_prm --freq_mask 0 --time_mask 0 --training_mask 0.0 --mixup 0.0 --rrc False --epochs 50 --warmup_epochs 0 --seed $seed --batch_size $bs --hidden $hidden" 16 | echo $cmdline 17 | eval $cmdline 18 | done 19 | -------------------------------------------------------------------------------- /app/icbhi_sprs/ev_icbhi_ast.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ $# -lt 1 ]; then 4 | n_iter=3 5 | else 6 | n_iter=$1 7 | fi 8 | 9 | if [ $# -lt 2 ]; then 10 | lr_prm=0.00003 11 | else 12 | lr_prm=$2 13 | fi 14 | bs=256 15 | spl=1 16 | head=tfm 17 | extra=--freeze_body 18 | # --freeze_embed 19 | 20 | echo Repeating $n_iter times... 21 | 22 | for i in $(seq $n_iter); do 23 | cmdline="CUDA_VISIBLE_DEVICES=0 python solve.py ../../config/ast.yaml --epochs 150 --bs $bs --lr $lr_prm --head $head $extra --split_iter $spl" 24 | echo $cmdline 25 | eval $cmdline 26 | done 27 | -------------------------------------------------------------------------------- /app/bmdhs/ev_m2d.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | weight=$1 4 | split=$2 5 | n_iter=$3 6 | seed=$4 7 | lr_prm=$5 8 | bs=128 9 | epochs=100 10 | gpu=0 11 | 12 | echo Repeating $n_iter times... 13 | 14 | for i in $(seq $n_iter); do 15 | seed=$((seed + 1)) 16 | cmdline="CUDA_VISIBLE_DEVICES=$gpu python -m app.bmdhs.solve_bmdhs config/m2d.yaml bmdhs$split weight_file=$weight,encoder_only=True --lr=$lr_prm --freq_mask 0 --time_mask 0 --training_mask 0.0 --mixup 0.0 --rrc False --epochs $epochs --warmup_epochs 0 --seed $seed --batch_size $bs" 17 | echo $cmdline 18 | eval $cmdline 19 | done 20 | -------------------------------------------------------------------------------- /app/icbhi_sprs/ev_icbhi_m2d.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ $# -lt 2 ]; then 4 | n_iter=3 5 | else 6 | n_iter=$2 7 | fi 8 | 9 | if [ $# -lt 3 ]; then 10 | lr_prm=0.00003 11 | else 12 | lr_prm=$3 13 | fi 14 | bs=256 15 | spl=1 16 | head=tfm 17 | extra=--freeze_body 18 | # --freeze_embed 19 | 20 | echo Repeating $n_iter times... 
21 | 22 | for i in $(seq $n_iter); do 23 | cmdline="CUDA_VISIBLE_DEVICES=0 python solve.py ../../config/m2d.yaml --epochs 150 --bs $bs --lr $lr_prm --weightspath $1 --head $head $extra --split_iter $spl" 24 | echo $cmdline 25 | eval $cmdline 26 | done 27 | -------------------------------------------------------------------------------- /app/icbhi_sprs/ev_icbhi_mlp_m2d.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ $# -lt 2 ]; then 4 | n_iter=3 5 | else 6 | n_iter=$2 7 | fi 8 | 9 | if [ $# -lt 3 ]; then 10 | lr_prm=0.0003 11 | else 12 | lr_prm=$3 13 | fi 14 | bs=256 15 | spl=1 16 | head=mlp 17 | extra=--freeze_body 18 | # --freeze_embed 19 | 20 | echo Repeating $n_iter times... 21 | 22 | for i in $(seq $n_iter); do 23 | cmdline="CUDA_VISIBLE_DEVICES=0 python solve.py ../../config/m2d.yaml --epochs 150 --bs $bs --lr $lr_prm --weightspath $1 --head $head $extra --split_iter $spl" 24 | echo $cmdline 25 | eval $cmdline 26 | done 27 | -------------------------------------------------------------------------------- /app/icbhi_sprs/ev_icbhi_opera.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ $# -lt 1 ]; then 4 | n_iter=3 5 | else 6 | n_iter=$1 7 | fi 8 | 9 | if [ $# -lt 2 ]; then 10 | lr_prm=0.003 11 | else 12 | lr_prm=$2 13 | fi 14 | bs=256 15 | spl=1 16 | head=tfm 17 | extra=--freeze_body 18 | 19 | echo Repeating $n_iter times... 20 | 21 | for i in $(seq $n_iter); do 22 | cmdline="CUDA_VISIBLE_DEVICES=0 python solve.py ../../config/opera.yaml --epochs 150 --weightspath ../../external/OPERA/encoder-operaCT.ckpt --bs $bs --lr $lr_prm --head $head $extra --split_iter $spl" 23 | echo $cmdline 24 | eval $cmdline 25 | done 26 | -------------------------------------------------------------------------------- /app/icbhi_sprs/ev_sprs_ast.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ $# -lt 1 ]; then 4 | n_iter=3 5 | else 6 | n_iter=$1 7 | fi 8 | 9 | if [ $# -lt 2 ]; then 10 | lr_prm=0.00003 11 | else 12 | lr_prm=$2 13 | fi 14 | bs=256 15 | spl=1 16 | head=tfm 17 | extra=--freeze_body 18 | # --freeze_embed 19 | 20 | echo Repeating $n_iter times... 21 | 22 | for i in $(seq $n_iter); do 23 | cmdline="CUDA_VISIBLE_DEVICES=0 python solve.py ../../config/ast.yaml --dataset SPRS --datapath data/SPRS --epochs 50 --bs $bs --lr $lr_prm --head $head $extra --split_iter $spl" 24 | echo $cmdline 25 | eval $cmdline 26 | done 27 | -------------------------------------------------------------------------------- /config/wav2vec2context.yaml: -------------------------------------------------------------------------------- 1 | # AR parameters such as FFT parameters. 2 | audio_repr: ar_wav2vec2.AR_Wav2Vec2Context 3 | feature_d: 1024 4 | sample_rate: 16000 5 | temporal_pooling_type: mean # `mean+max` severely degrades performances on some tasks such as VC1. We set the `mean` for wav2vec2. 6 | 7 | # Training parameters. 8 | batch_size: 64 9 | lr_lineareval: 0.0003 10 | report_per_epochs: 20 11 | early_stop_epochs: 20 12 | 13 | # Model specific parameters. 
14 | wav2vec_model: facebook/wav2vec2-base-960h 15 | # wav2vec_model: facebook/wav2vec2-large-960h-lv60 16 | output_layers: [-1] # list of layers to stack 17 | -------------------------------------------------------------------------------- /app/icbhi_sprs/ev_icbhi_beats.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ $# -lt 1 ]; then 4 | n_iter=3 5 | else 6 | n_iter=$1 7 | fi 8 | 9 | if [ $# -lt 2 ]; then 10 | lr_prm=0.00003 11 | else 12 | lr_prm=$2 13 | fi 14 | bs=256 15 | spl=1 16 | head=tfm 17 | extra=--freeze_body 18 | # --freeze_embed 19 | 20 | echo Repeating $n_iter times... 21 | 22 | for i in $(seq $n_iter); do 23 | cmdline="CUDA_VISIBLE_DEVICES=0 python solve.py ../../config/beats.yaml --epochs 150 --weightspath ../../external/BEATs_iter3.pt --bs $bs --lr $lr_prm --head $head $extra --split_iter $spl" 24 | echo $cmdline 25 | eval $cmdline 26 | done 27 | -------------------------------------------------------------------------------- /app/icbhi_sprs/ev_sprs_m2d.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ $# -lt 2 ]; then 4 | n_iter=3 5 | else 6 | n_iter=$2 7 | fi 8 | 9 | if [ $# -lt 3 ]; then 10 | lr_prm=0.00003 11 | else 12 | lr_prm=$3 13 | fi 14 | bs=256 15 | spl=1 16 | head=tfm 17 | extra=--freeze_body 18 | # --freeze_embed 19 | 20 | echo Repeating $n_iter times... 21 | 22 | for i in $(seq $n_iter); do 23 | cmdline="CUDA_VISIBLE_DEVICES=0 python solve.py --dataset SPRS --datapath data/SPRS ../../config/m2d.yaml --epochs 50 --bs $bs --lr $lr_prm --weightspath $1 --head $head $extra --split_iter $spl" 24 | echo $cmdline 25 | eval $cmdline 26 | done 27 | -------------------------------------------------------------------------------- /app/icbhi_sprs/ev_sprs_opera.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ $# -lt 1 ]; then 4 | n_iter=3 5 | else 6 | n_iter=$1 7 | fi 8 | 9 | if [ $# -lt 2 ]; then 10 | lr_prm=0.0003 11 | else 12 | lr_prm=$2 13 | fi 14 | bs=256 15 | spl=1 16 | head=tfm 17 | extra=--freeze_body 18 | 19 | echo Repeating $n_iter times... 20 | 21 | for i in $(seq $n_iter); do 22 | cmdline="CUDA_VISIBLE_DEVICES=0 python solve.py ../../config/opera.yaml --dataset SPRS --datapath data/SPRS --epochs 50 --weightspath ../../external/OPERA/encoder-operaCT.ckpt --bs $bs --lr $lr_prm --head $head $extra --split_iter $spl" 23 | echo $cmdline 24 | eval $cmdline 25 | done 26 | -------------------------------------------------------------------------------- /app/bmdhs/make_metadata.py: -------------------------------------------------------------------------------- 1 | # Generate EVAR metadata csv files. 
2 | import pandas as pd 3 | 4 | 5 | splits = [pd.read_csv(f) for f in ['split1.csv', 'split2.csv', 'split3.csv']] 6 | 7 | for df_index, spl_df in enumerate(splits): 8 | d = pd.DataFrame() 9 | for index, row in spl_df.iterrows(): 10 | labels, split = row['label'], row['split'] 11 | for filestem in row[[f'recording_{i}' for i in range(1, 8+1)]].values: 12 | d = pd.concat([d, pd.DataFrame({'file_name': [f'train/{filestem}.wav'], 'label': [labels], 'split': [split]})]) 13 | d.to_csv(f'../../evar/metadata/bmdhs{df_index + 1}.csv') 14 | print(d[:3]) 15 | -------------------------------------------------------------------------------- /app/circor/ev_m2d.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | weight=$1 4 | split=$2 5 | n_iter=$3 6 | seed=$4 7 | lr_prm=$5 8 | bs=1024 9 | gpu=0 10 | hidden='\(128,\)' 11 | reweight=True 12 | 13 | echo Repeating $n_iter times... 14 | 15 | for i in $(seq $n_iter); do 16 | seed=$((seed + 1)) 17 | cmdline="CUDA_VISIBLE_DEVICES=$gpu python -m app.circor.solve_circor config/m2d.yaml circor$split weight_file=$weight,encoder_only=True --lr=$lr_prm --freq_mask 0 --time_mask 0 --training_mask 0.0 --mixup 0.0 --rrc False --epochs 50 --warmup_epochs 0 --seed $seed --batch_size $bs --hidden $hidden --reweight $reweight" 18 | echo $cmdline 19 | eval $cmdline 20 | done 21 | -------------------------------------------------------------------------------- /app/icbhi_sprs/ev_sprs_byola.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ $# -lt 1 ]; then 4 | n_iter=3 5 | else 6 | n_iter=$1 7 | fi 8 | 9 | if [ $# -lt 2 ]; then 10 | lr_prm=0.00003 11 | else 12 | lr_prm=$2 13 | fi 14 | bs=256 15 | spl=1 16 | head=tfm 17 | extra=--freeze_body 18 | # --freeze_embed 19 | 20 | echo Repeating $n_iter times... 21 | 22 | for i in $(seq $n_iter); do 23 | cmdline="CUDA_VISIBLE_DEVICES=0 python solve.py ../../config/byola.yaml --dataset SPRS --datapath data/SPRS --epochs 50 --weightspath ../../external/byol_a/pretrained_weights/AudioNTT2020-BYOLA-64x96d2048.pth --bs $bs --lr $lr_prm --head $head $extra --split_iter $spl" 24 | echo $cmdline 25 | eval $cmdline 26 | done 27 | -------------------------------------------------------------------------------- /app/icbhi_sprs/ev_icbhi_byola.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ $# -lt 1 ]; then 4 | n_iter=3 5 | else 6 | n_iter=$1 7 | fi 8 | 9 | if [ $# -lt 2 ]; then 10 | lr_prm=0.00003 11 | #5e-5 @bs64 12 | else 13 | lr_prm=$2 14 | fi 15 | bs=256 16 | spl=1 17 | head=tfm 18 | extra=--freeze_body 19 | # --freeze_embed 20 | 21 | echo Repeating $n_iter times... 22 | 23 | for i in $(seq $n_iter); do 24 | cmdline="CUDA_VISIBLE_DEVICES=0 python solve.py ../../config/byola.yaml --epochs 150 --weightspath ../../external/byol_a/pretrained_weights/AudioNTT2020-BYOLA-64x96d2048.pth --bs $bs --lr $lr_prm --head $head $extra --split_iter $spl" 25 | echo $cmdline 26 | eval $cmdline 27 | done 28 | -------------------------------------------------------------------------------- /config/laionclap.yaml: -------------------------------------------------------------------------------- 1 | # Zero-shot-ready 2 | 3 | # AR parameters such as FFT parameters. 4 | audio_repr: ar_laionclap.AR_LAIONCLAP 5 | weight_file: 6 | feature_d: 768 7 | sample_rate: 48000 8 | 9 | # Model specific parameters. 10 | 11 | # Linear evaluation/Fine-tuning common parameters. 12 | 13 | # Linear evaluation parameters. 
14 | batch_size: 128 15 | lr_lineareval: 0.0003 16 | report_per_epochs: 50 17 | early_stop_epochs: 20 18 | 19 | # Fine-tuning parameters. 20 | warmup_epochs: 5 21 | mixup: 0.5 22 | ft_bs: 128 23 | ft_lr: 2.0 24 | ft_early_stop_epochs: -1 # -1: no early stopping 25 | ft_epochs: 200 26 | ft_freq_mask: 8 27 | ft_time_mask: 64 28 | ft_noise: 0.0 29 | ft_rrc: True 30 | -------------------------------------------------------------------------------- /run/all_byola2.sh: -------------------------------------------------------------------------------- 1 | python 2pass_lineareval.py config/byola2.yaml esc50 --lr=0.001 2 | python 2pass_lineareval.py config/byola2.yaml us8k --lr=0.00003 3 | python 2pass_lineareval.py config/byola2.yaml spcv2 --lr=0.00003 4 | python 2pass_lineareval.py config/byola2.yaml nsynth --lr=0.001 5 | python 2pass_lineareval.py config/byola2.yaml vc1 --lr=0.00004 6 | python 2pass_lineareval.py config/byola2.yaml voxforge --lr=0.0001 7 | python 2pass_lineareval.py config/byola2.yaml cremad 8 | python 2pass_lineareval.py config/byola2.yaml surge --lr=0.00003 9 | python 2pass_lineareval.py config/byola2.yaml gtzan batch_size=64 --lr=0.001 10 | python summarize.py external/byol_a/v2/AudioNTT2022-BYOLA-64x96d2048.pth 11 | -------------------------------------------------------------------------------- /app/icbhi_sprs/ev_sprs_beats.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ $# -lt 1 ]; then 4 | n_iter=3 5 | else 6 | n_iter=$1 7 | fi 8 | 9 | if [ $# -lt 2 ]; then 10 | lr_prm=0.00003 11 | #5e-5 @bs64 12 | else 13 | lr_prm=$2 14 | fi 15 | bs=256 16 | spl=1 17 | head=tfm 18 | extra=--freeze_body 19 | # --freeze_embed 20 | 21 | echo Repeating $n_iter times... 22 | 23 | for i in $(seq $n_iter); do 24 | cmdline="CUDA_VISIBLE_DEVICES=0 python solve.py ../../config/beats.yaml --dataset SPRS --datapath data/SPRS --epochs 50 --weightspath ../../external/BEATs_iter3.pt --bs $bs --lr $lr_prm --head $head $extra --split_iter $spl" 25 | echo $cmdline 26 | eval $cmdline 27 | done 28 | -------------------------------------------------------------------------------- /run/all_ced.sh: -------------------------------------------------------------------------------- 1 | NAME=CED 2 | python 2pass_lineareval.py config/ced.yaml cremad batch_size=16,name=$NAME 3 | python 2pass_lineareval.py config/ced.yaml gtzan batch_size=16,name=$NAME 4 | python 2pass_lineareval.py config/ced.yaml spcv2 batch_size=64,name=$NAME 5 | python 2pass_lineareval.py config/ced.yaml esc50 batch_size=64,name=$NAME 6 | python 2pass_lineareval.py config/ced.yaml us8k batch_size=64,name=$NAME 7 | python 2pass_lineareval.py config/ced.yaml vc1 batch_size=64,name=$NAME 8 | python 2pass_lineareval.py config/ced.yaml voxforge batch_size=64,name=$NAME 9 | python 2pass_lineareval.py config/ced.yaml nsynth batch_size=64,name=$NAME 10 | python 2pass_lineareval.py config/ced.yaml surge batch_size=64,name=$NAME 11 | python summarize.py $NAME 12 | -------------------------------------------------------------------------------- /run/all_atst.sh: -------------------------------------------------------------------------------- 1 | NAME=ATST 2 | python 2pass_lineareval.py config/atst.yaml cremad batch_size=16,name=$NAME 3 | python 2pass_lineareval.py config/atst.yaml gtzan batch_size=16,name=$NAME 4 | python 2pass_lineareval.py config/atst.yaml spcv2 batch_size=64,name=$NAME 5 | python 2pass_lineareval.py config/atst.yaml esc50 
batch_size=64,name=$NAME 6 | python 2pass_lineareval.py config/atst.yaml us8k batch_size=64,name=$NAME 7 | python 2pass_lineareval.py config/atst.yaml vc1 batch_size=64,name=$NAME 8 | python 2pass_lineareval.py config/atst.yaml voxforge batch_size=64,name=$NAME 9 | python 2pass_lineareval.py config/atst.yaml nsynth batch_size=64,name=$NAME 10 | python 2pass_lineareval.py config/atst.yaml surge batch_size=64,name=$NAME 11 | python summarize.py $NAME 12 | -------------------------------------------------------------------------------- /config/atst.yaml: -------------------------------------------------------------------------------- 1 | # AR parameters such as FFT parameters. 2 | audio_repr: ar_atst.AR_ATST 3 | weight_file: external/atst_base.ckpt 4 | feature_d: 1536 5 | sample_rate: 16000 6 | n_fft: 1024 7 | window_size: 1024 8 | hop_size: 160 9 | n_mels: 64 10 | window: hanning 11 | 12 | n_blocks: 1 13 | 14 | # Training parameters. 15 | batch_size: 128 16 | lr_lineareval: 0.0003 17 | # not ready lr_finetune_frozen: 0.001 18 | # not ready lr_finetune_finetune: 0.00003 19 | report_per_epochs: 20 20 | early_stop_epochs: 20 21 | 22 | # Fine-tuning parameters. 23 | warmup_epochs: 5 24 | mixup: 0.5 25 | ft_bs: 64 26 | ft_lr: 2.0 27 | ft_early_stop_epochs: -1 # -1: no early stopping 28 | ft_epochs: 200 29 | ft_freq_mask: 30 30 | ft_time_mask: 192 31 | ft_noise: 0.0 32 | ft_rrc: True 33 | -------------------------------------------------------------------------------- /run/all_beats.sh: -------------------------------------------------------------------------------- 1 | NAME=BEATs 2 | python 2pass_lineareval.py config/beats.yaml cremad batch_size=16,name=$NAME 3 | python 2pass_lineareval.py config/beats.yaml gtzan batch_size=16,name=$NAME 4 | python 2pass_lineareval.py config/beats.yaml spcv2 batch_size=64,name=$NAME 5 | python 2pass_lineareval.py config/beats.yaml esc50 batch_size=64,name=$NAME 6 | python 2pass_lineareval.py config/beats.yaml us8k batch_size=64,name=$NAME 7 | python 2pass_lineareval.py config/beats.yaml vc1 batch_size=64,name=$NAME 8 | python 2pass_lineareval.py config/beats.yaml voxforge batch_size=64,name=$NAME 9 | python 2pass_lineareval.py config/beats.yaml nsynth batch_size=64,name=$NAME 10 | python 2pass_lineareval.py config/beats.yaml surge batch_size=64,name=$NAME 11 | python summarize.py $NAME 12 | -------------------------------------------------------------------------------- /run/all_htsat.sh: -------------------------------------------------------------------------------- 1 | name=HTSAT 2 | python 2pass_lineareval.py config/htsat.yaml cremad batch_size=16,name=$name 3 | python 2pass_lineareval.py config/htsat.yaml gtzan batch_size=16,name=$name 4 | python 2pass_lineareval.py config/htsat.yaml spcv2 batch_size=64,name=$name 5 | python 2pass_lineareval.py config/htsat.yaml esc50 batch_size=64,name=$name 6 | python 2pass_lineareval.py config/htsat.yaml us8k batch_size=64,name=$name 7 | python 2pass_lineareval.py config/htsat.yaml vc1 batch_size=64,name=$name 8 | python 2pass_lineareval.py config/htsat.yaml voxforge batch_size=64,name=$name 9 | python 2pass_lineareval.py config/htsat.yaml nsynth batch_size=64,name=$name 10 | python 2pass_lineareval.py config/htsat.yaml surge batch_size=64,name=$name 11 | python summarize.py $name 12 | -------------------------------------------------------------------------------- /run/all_m2d.sh: -------------------------------------------------------------------------------- 1 | python 2pass_lineareval.py config/m2d.yaml cremad 
batch_size=16,weight_file=$1 2 | python 2pass_lineareval.py config/m2d.yaml gtzan batch_size=16,weight_file=$1 3 | python 2pass_lineareval.py config/m2d.yaml spcv2 batch_size=64,weight_file=$1 4 | python 2pass_lineareval.py config/m2d.yaml esc50 batch_size=64,weight_file=$1 5 | python 2pass_lineareval.py config/m2d.yaml us8k batch_size=64,weight_file=$1 6 | python 2pass_lineareval.py config/m2d.yaml vc1 batch_size=64,weight_file=$1 7 | python 2pass_lineareval.py config/m2d.yaml voxforge batch_size=64,weight_file=$1 8 | python 2pass_lineareval.py config/m2d.yaml nsynth batch_size=64,weight_file=$1 9 | python 2pass_lineareval.py config/m2d.yaml surge batch_size=64,weight_file=$1 10 | 11 | python summarize.py $1 12 | -------------------------------------------------------------------------------- /config/atst_frame.yaml: -------------------------------------------------------------------------------- 1 | # AR parameters such as FFT parameters. 2 | audio_repr: ar_atst_frame.AR_ATST_Frame 3 | weight_file: external/atstframe_base.ckpt 4 | feature_d: 9216 5 | sample_rate: 16000 6 | n_fft: 1024 7 | window_size: 1024 8 | hop_size: 160 9 | n_mels: 64 10 | window: hanning 11 | 12 | n_blocks: 1 13 | 14 | # Training parameters. 15 | batch_size: 128 16 | lr_lineareval: 0.00003 17 | # not ready lr_finetune_frozen: 0.001 18 | # not ready lr_finetune_finetune: 0.00003 19 | report_per_epochs: 20 20 | early_stop_epochs: 20 21 | 22 | # Fine-tuning parameters. 23 | warmup_epochs: 5 24 | mixup: 0.5 25 | ft_bs: 64 26 | ft_lr: 2.0 27 | ft_early_stop_epochs: -1 # -1: no early stopping 28 | ft_epochs: 200 29 | ft_freq_mask: 30 30 | ft_time_mask: 192 31 | ft_noise: 0.0 32 | ft_rrc: True 33 | -------------------------------------------------------------------------------- /config/byola2.yaml: -------------------------------------------------------------------------------- 1 | # AR parameters such as FFT parameters. 2 | audio_repr: ar_byola2.AR_BYOLA2 3 | weight_file: external/byol_a/v2/AudioNTT2022-BYOLA-64x96d2048.pth 4 | feature_d: 3072 5 | sample_rate: 16000 6 | n_fft: 1024 7 | window_size: 1024 8 | hop_size: 160 9 | n_mels: 64 10 | f_min: 60 11 | f_max: 7800 12 | temporal_pooling_type: mean_max 13 | 14 | # Training parameters. 15 | batch_size: 256 16 | lr_lineareval: 0.0003 17 | report_per_epochs: 20 18 | early_stop_epochs: 20 19 | 20 | # Fine-tuning parameters. 21 | ## CAUTION: The following parameters not confirmed to work. ## 22 | warmup_epochs: 5 23 | mixup: 0.5 24 | ft_bs: 256 25 | ft_lr: 0.001 26 | ft_early_stop_epochs: -1 # -1: no early stopping 27 | ft_epochs: 200 28 | ft_freq_mask: 30 29 | ft_time_mask: 100 30 | ft_rrc: True 31 | -------------------------------------------------------------------------------- /config/cnn14.yaml: -------------------------------------------------------------------------------- 1 | # AR parameters such as FFT parameters. 2 | audio_repr: ar_cnn14.AR_Cnn14 3 | weight_file: external/Cnn14_16k_mAP=0.438.pth 4 | feature_d: 2048 5 | sample_rate: 16000 6 | n_fft: 512 7 | window_size: 512 8 | hop_size: 160 9 | n_mels: 64 10 | f_min: 50 11 | f_max: 8000 12 | # temporal_pooling_type: -> not using common temporal pooling. 13 | 14 | # Training parameters. 15 | batch_size: 256 16 | lr_lineareval: 0.0003 17 | report_per_epochs: 20 18 | early_stop_epochs: 20 19 | 20 | # Fine-tuning parameters. 21 | ## CAUTION: The following parameters not confirmed to work. 
## 22 | warmup_epochs: 5 23 | mixup: 0.5 24 | ft_bs: 256 25 | ft_lr: 0.001 26 | ft_early_stop_epochs: -1 # -1: no early stopping 27 | ft_epochs: 200 28 | ft_freq_mask: 30 29 | ft_time_mask: 100 30 | ft_rrc: False 31 | -------------------------------------------------------------------------------- /config/byola.yaml: -------------------------------------------------------------------------------- 1 | # AR parameters such as FFT parameters. 2 | audio_repr: ar_byola.AR_BYOLA 3 | weight_file: external/byol_a/pretrained_weights/AudioNTT2020-BYOLA-64x96d2048.pth 4 | feature_d: 2048 5 | sample_rate: 16000 6 | n_fft: 1024 7 | window_size: 1024 8 | hop_size: 160 9 | n_mels: 64 10 | f_min: 60 11 | f_max: 7800 12 | temporal_pooling_type: mean_max 13 | 14 | # Training parameters. 15 | batch_size: 256 16 | lr_lineareval: 0.0003 17 | report_per_epochs: 20 18 | early_stop_epochs: 20 19 | 20 | # Fine-tuning parameters. 21 | ## CAUTION: The following parameters not confirmed to work. ## 22 | warmup_epochs: 5 23 | mixup: 0.5 24 | ft_bs: 256 25 | ft_lr: 0.001 26 | ft_early_stop_epochs: -1 # -1: no early stopping 27 | ft_epochs: 200 28 | ft_freq_mask: 30 29 | ft_time_mask: 100 30 | ft_rrc: True 31 | -------------------------------------------------------------------------------- /config/byolax.yaml: -------------------------------------------------------------------------------- 1 | # AR parameters such as FFT parameters. 2 | audio_repr: ar_byola.AR_BYOLAX 3 | weight_file: external/byol_a/pretrained_weights/AudioNTT2020-BYOLA-64x96d2048.pth 4 | feature_d: 2048 5 | sample_rate: 16000 6 | n_fft: 1024 7 | window_size: 1024 8 | hop_size: 160 9 | n_mels: 64 10 | f_min: 60 11 | f_max: 7800 12 | temporal_pooling_type: mean_max 13 | 14 | # Training parameters. 15 | batch_size: 256 16 | lr_lineareval: 0.0003 17 | report_per_epochs: 20 18 | early_stop_epochs: 20 19 | 20 | # Fine-tuning parameters. 21 | ## CAUTION: The following parameters not confirmed to work. ## 22 | warmup_epochs: 5 23 | mixup: 0.5 24 | ft_bs: 256 25 | ft_lr: 0.001 26 | ft_early_stop_epochs: -1 # -1: no early stopping 27 | ft_epochs: 200 28 | ft_freq_mask: 30 29 | ft_time_mask: 100 30 | ft_rrc: True 31 | -------------------------------------------------------------------------------- /config/opera.yaml: -------------------------------------------------------------------------------- 1 | # AR parameters such as FFT parameters. 2 | audio_repr: ar_opera.AR_OPERA_CT 3 | weight_file: external/OPERA/encoder-operaCT.ckpt 4 | feature_d: 768 5 | sample_rate: 16000 6 | n_fft: 1024 7 | # window_size: 1024 8 | hop_size: 512 9 | n_mels: 64 10 | f_min: 50 11 | f_max: 8000 12 | # window: hanning 13 | 14 | # Linear evaluation/Fine-tuning common parameters. 15 | training_mask: 0.0 16 | 17 | # Linear evaluaition parameters. 18 | batch_size: 128 19 | lr_lineareval: 0.0003 20 | report_per_epochs: 50 21 | early_stop_epochs: 20 22 | 23 | # Fine-tuning parameters. 
24 | warmup_epochs: 5 25 | mixup: 0.5 26 | ft_bs: 128 27 | ft_lr: 2.0 28 | ft_early_stop_epochs: -1 # -1: no early stopping 29 | ft_epochs: 200 30 | ft_freq_mask: 8 31 | ft_time_mask: 64 32 | ft_noise: 0.0 33 | ft_rrc: True 34 | -------------------------------------------------------------------------------- /external/coala.patch: -------------------------------------------------------------------------------- 1 | diff --git a/encode.py b/encode.py 2 | index d8d892f..3646540 100755 3 | --- a/encode.py 4 | +++ b/encode.py 5 | @@ -12,12 +12,12 @@ from pathlib import Path 6 | from tqdm import tqdm 7 | import librosa 8 | 9 | -from utils import compute_spectrogram 10 | -from models_t1000 import AudioEncoder, TagEncoder, CNN 11 | +from .utils import compute_spectrogram 12 | +from .models_t1000 import AudioEncoder, TagEncoder, CNN 13 | 14 | 15 | -scaler = pickle.load(open('./scaler_top_1000.pkl', 'rb')) 16 | -id2tag = json.load(open('./json/id2token_top_1000.json', 'rb')) 17 | +scaler = pickle.load(open('external/coala_scaler_top_1000_plus_clip.pkl', 'rb')) 18 | +id2tag = json.load(open('external/coala/json/id2token_top_1000.json', 'rb')) 19 | tag2id = {tag: id for id, tag in id2tag.items()} 20 | 21 | -------------------------------------------------------------------------------- /config/ast.yaml: -------------------------------------------------------------------------------- 1 | # AR parameters such as FFT parameters. 2 | audio_repr: ar_ast.AR_AST 3 | weight_file: external/ast/pretrained_models/ast_audioset.pth 4 | feature_d: 768 5 | sample_rate: 16000 6 | n_fft: 400 7 | window_size: 400 8 | hop_size: 160 9 | n_mels: 128 10 | window: hanning # paper is typo (https://github.com/YuanGongND/ast/issues/13) 11 | 12 | # Training parameters. 13 | batch_size: 128 14 | lr_lineareval: 0.0003 15 | report_per_epochs: 20 16 | early_stop_epochs: 20 17 | 18 | # Fine-tuning parameters. 19 | ## CAUTION: The following parameters not confirmed to work. ## 20 | warmup_epochs: 4 21 | mixup: 0.5 22 | ft_bs: 64 23 | ft_lr: 2.0 24 | ft_early_stop_epochs: -1 # -1: no early stopping 25 | ft_epochs: 200 26 | ft_freq_mask: 24 # for ESC-50 in this case 27 | ft_time_mask: 96 # for ESC-50 28 | ft_rrc: True 29 | -------------------------------------------------------------------------------- /config/vggish.yaml: -------------------------------------------------------------------------------- 1 | # AR parameters such as FFT parameters. 2 | audio_repr: ar_vggish.AR_VGGish 3 | feature_d: 128 4 | sample_rate: 16000 5 | n_fft: 400 6 | window_size: 400 7 | hop_size: 160 8 | n_mels: 64 9 | f_min: 125 10 | f_max: 7500 11 | temporal_pooling_type: mean_max 12 | 13 | # Training parameters. 14 | batch_size: 256 15 | lr_lineareval: 0.0003 16 | report_per_epochs: 20 17 | early_stop_epochs: 20 18 | 19 | # SAMPLE_RATE = 16000 20 | # STFT_WINDOW_LENGTH_SECONDS = 0.025 -> 400 21 | # STFT_HOP_LENGTH_SECONDS = 0.010 -> 160 22 | # NUM_MEL_BINS = NUM_BANDS 23 | # MEL_MIN_HZ = 125 24 | # MEL_MAX_HZ = 7500 25 | # LOG_OFFSET = 0.01 # Offset used for stabilized log of input mel-spectrogram. 26 | # EXAMPLE_WINDOW_SECONDS = 0.96 # Each example contains 96 10ms frames 27 | # EXAMPLE_HOP_SECONDS = 0.96 # with zero overlap. 
28 | -------------------------------------------------------------------------------- /run/all_atst_frame.sh: -------------------------------------------------------------------------------- 1 | NAME=ATSTFrame 2 | python 2pass_lineareval.py config/atst_frame.yaml cremad batch_size=16,name=$NAME 3 | python 2pass_lineareval.py config/atst_frame.yaml gtzan batch_size=16,name=$NAME 4 | python 2pass_lineareval.py config/atst_frame.yaml spcv2 batch_size=64,name=$NAME 5 | python 2pass_lineareval.py config/atst_frame.yaml esc50 batch_size=64,name=$NAME 6 | python 2pass_lineareval.py config/atst_frame.yaml us8k batch_size=64,name=$NAME 7 | python 2pass_lineareval.py config/atst_frame.yaml vc1 batch_size=64,name=$NAME 8 | python 2pass_lineareval.py config/atst_frame.yaml voxforge batch_size=64,name=$NAME 9 | python 2pass_lineareval.py config/atst_frame.yaml nsynth batch_size=64,name=$NAME 10 | python 2pass_lineareval.py config/atst_frame.yaml surge batch_size=64,name=$NAME 11 | python summarize.py $NAME 12 | -------------------------------------------------------------------------------- /run/all_beats_plus.sh: -------------------------------------------------------------------------------- 1 | NAME=BEATsPlus 2 | python 2pass_lineareval.py config/beats_plus.yaml cremad batch_size=16,name=$NAME 3 | python 2pass_lineareval.py config/beats_plus.yaml gtzan batch_size=16,name=$NAME 4 | python 2pass_lineareval.py config/beats_plus.yaml spcv2 batch_size=64,name=$NAME 5 | python 2pass_lineareval.py config/beats_plus.yaml esc50 batch_size=64,name=$NAME 6 | python 2pass_lineareval.py config/beats_plus.yaml us8k batch_size=64,name=$NAME 7 | python 2pass_lineareval.py config/beats_plus.yaml vc1 batch_size=64,name=$NAME 8 | python 2pass_lineareval.py config/beats_plus.yaml voxforge batch_size=64,name=$NAME 9 | python 2pass_lineareval.py config/beats_plus.yaml nsynth batch_size=64,name=$NAME 10 | python 2pass_lineareval.py config/beats_plus.yaml surge batch_size=64,name=$NAME 11 | python summarize.py $NAME 12 | -------------------------------------------------------------------------------- /config/vggish_4k.yaml: -------------------------------------------------------------------------------- 1 | # AR parameters such as FFT parameters. 2 | audio_repr: ar_vggish.AR_VGGish_4K 3 | feature_d: 4096 4 | sample_rate: 16000 5 | n_fft: 400 6 | window_size: 400 7 | hop_size: 160 8 | n_mels: 64 9 | f_min: 125 10 | f_max: 7500 11 | temporal_pooling_type: mean_max 12 | 13 | # Training parameters. 14 | batch_size: 256 15 | lr_lineareval: 0.0003 16 | report_per_epochs: 20 17 | early_stop_epochs: 20 18 | 19 | # SAMPLE_RATE = 16000 20 | # STFT_WINDOW_LENGTH_SECONDS = 0.025 -> 400 21 | # STFT_HOP_LENGTH_SECONDS = 0.010 -> 160 22 | # NUM_MEL_BINS = NUM_BANDS 23 | # MEL_MIN_HZ = 125 24 | # MEL_MAX_HZ = 7500 25 | # LOG_OFFSET = 0.01 # Offset used for stabilized log of input mel-spectrogram. 26 | # EXAMPLE_WINDOW_SECONDS = 0.96 # Each example contains 96 10ms frames 27 | # EXAMPLE_HOP_SECONDS = 0.96 # with zero overlap. 
28 | -------------------------------------------------------------------------------- /plugin/MARBLE/configs/evar/MTT.yaml: -------------------------------------------------------------------------------- 1 | _import: 2 | - !include benchmark/tasks/MTT/MTT_base_config.yaml 3 | 4 | dataset: 5 | pre_extract: 6 | accelerator: gpu 7 | audio_dir: data/MTT/mp3 8 | output_dir: 9 | keep_folder_structure: true 10 | overwrite: true 11 | 12 | feature_extractor: 13 | pretrain: 14 | !include benchmark/models/evar/evar.yaml 15 | 16 | dataset: MTT 17 | input_type: feature # [audio, feature] 18 | input_dir: 19 | metadata_dir: data/MTT 20 | 21 | model: 22 | downstream_structure: 23 | components: 24 | - name: feature_selector 25 | layer: 0 # not a transformer model, so layer is set to 0 26 | normalized_weight_sum: false 27 | - name: mlp 28 | hidden_layer_sizes: [512] 29 | dropout_p: 0.2 30 | num_outputs: 50 31 | -------------------------------------------------------------------------------- /config/msclap.yaml: -------------------------------------------------------------------------------- 1 | # Zero-shot-ready 2 | 3 | # AR parameters such as FFT parameters. 4 | audio_repr: ar_msclap.AR_MSCLAP 5 | weight_file: 2023 6 | feature_d: 1024 7 | sample_rate: 32000 8 | n_fft: 1024 9 | window_size: 1024 10 | hop_size: 320 11 | n_mels: 64 12 | f_min: 50 13 | f_max: 14000 14 | window: hanning 15 | 16 | # Model specific parameters. 17 | return_filename: True 18 | 19 | # Linear evaluation/Fine-tuning common parameters. 20 | 21 | # Linear evaluaition parameters. 22 | batch_size: 128 23 | lr_lineareval: 0.0003 24 | report_per_epochs: 50 25 | early_stop_epochs: 20 26 | 27 | # Fine-tuning parameters. 28 | warmup_epochs: 5 29 | mixup: 0.5 30 | ft_bs: 128 31 | ft_lr: 2.0 32 | ft_early_stop_epochs: -1 # -1: no early stopping 33 | ft_epochs: 200 34 | ft_freq_mask: 8 35 | ft_time_mask: 64 36 | ft_noise: 0.0 37 | ft_rrc: True 38 | -------------------------------------------------------------------------------- /config/wavcaps.yaml: -------------------------------------------------------------------------------- 1 | # Zero-shot-ready 2 | 3 | # AR parameters such as FFT parameters. 4 | audio_repr: ar_wavcaps.AR_WavCaps 5 | weight_file: external/WavCaps/HTSAT-BERT-PT.pt 6 | feature_d: 768 7 | sample_rate: 32000 8 | n_fft: 1024 9 | window_size: 1024 10 | hop_size: 320 11 | n_mels: 64 12 | f_min: 50 13 | f_max: 14000 14 | window: hanning 15 | 16 | # Model specific parameters. 17 | 18 | # Linear evaluation/Fine-tuning common parameters. 19 | 20 | # Linear evaluaition parameters. 21 | batch_size: 128 22 | lr_lineareval: 0.0003 23 | report_per_epochs: 50 24 | early_stop_epochs: 20 25 | 26 | # Fine-tuning parameters. 
27 | warmup_epochs: 5 28 | mixup: 0.5 29 | ft_bs: 128 30 | ft_lr: 2.0 31 | ft_early_stop_epochs: -1 # -1: no early stopping 32 | ft_epochs: 200 33 | ft_freq_mask: 8 34 | ft_time_mask: 64 35 | ft_noise: 0.0 36 | ft_rrc: True 37 | -------------------------------------------------------------------------------- /plugin/MARBLE/configs/evar/GTZAN.yaml: -------------------------------------------------------------------------------- 1 | _import: 2 | - !include benchmark/tasks/GTZAN/GTZAN_base_config.yaml 3 | 4 | dataset: 5 | pre_extract: 6 | accelerator: gpu 7 | audio_dir: data/GTZAN/genres 8 | output_dir: 9 | keep_folder_structure: true 10 | overwrite: true 11 | 12 | feature_extractor: 13 | pretrain: 14 | !include benchmark/models/evar/evar.yaml 15 | 16 | dataset: GTZAN 17 | input_type: feature # [audio, feature] 18 | input_dir: 19 | metadata_dir: data/GTZAN 20 | 21 | model: 22 | downstream_structure: 23 | components: 24 | - name: feature_selector 25 | layer: 0 # not a transformer model, so layer is set to 0 26 | normalized_weight_sum: false 27 | - name: mlp 28 | hidden_layer_sizes: [512] 29 | dropout_p: 0.2 30 | num_outputs: 10 31 | -------------------------------------------------------------------------------- /plugin/MARBLE/configs/evar/NSynthI.yaml: -------------------------------------------------------------------------------- 1 | _import: 2 | - !include benchmark/tasks/NSynth/NSynthI_base_config.yaml 3 | 4 | dataset: 5 | pre_extract: 6 | accelerator: gpu 7 | audio_dir: data/NSynth 8 | output_dir: 9 | keep_folder_structure: true 10 | overwrite: true 11 | 12 | feature_extractor: 13 | pretrain: 14 | !include benchmark/models/evar/evar.yaml 15 | 16 | dataset: NSynthI 17 | input_type: feature # [audio, feature] 18 | input_dir: 19 | metadata_dir: data/NSynth 20 | 21 | model: 22 | downstream_structure: 23 | components: 24 | - name: feature_selector 25 | layer: 0 # not a transformer model, so layer is set to 0 26 | normalized_weight_sum: false 27 | - name: mlp 28 | hidden_layer_sizes: [512] 29 | dropout_p: 0.2 30 | num_outputs: 11 31 | -------------------------------------------------------------------------------- /plugin/MARBLE/configs/evar/NSynthP.yaml: -------------------------------------------------------------------------------- 1 | _import: 2 | - !include benchmark/tasks/NSynth/NSynthP_base_config.yaml 3 | 4 | dataset: 5 | pre_extract: 6 | accelerator: gpu 7 | audio_dir: data/NSynth 8 | output_dir: 9 | keep_folder_structure: true 10 | overwrite: true 11 | 12 | feature_extractor: 13 | pretrain: 14 | !include benchmark/models/evar/evar.yaml 15 | 16 | dataset: NSynthP 17 | input_type: feature # [audio, feature] 18 | input_dir: 19 | metadata_dir: data/NSynth 20 | 21 | model: 22 | downstream_structure: 23 | components: 24 | - name: feature_selector 25 | layer: 0 # not a transformer model, so layer is set to 0 26 | normalized_weight_sum: false 27 | - name: mlp 28 | hidden_layer_sizes: [512] 29 | dropout_p: 0.2 30 | num_outputs: 128 31 | -------------------------------------------------------------------------------- /plugin/MARBLE/configs/evar/GS.yaml: -------------------------------------------------------------------------------- 1 | _import: 2 | - !include benchmark/tasks/GS/GS_base_config.yaml 3 | 4 | dataset: 5 | pre_extract: 6 | accelerator: gpu 7 | audio_dir: data/GS/giantsteps_clips/wav 8 | output_dir: 9 | keep_folder_structure: true 10 | overwrite: true 11 | 12 | feature_extractor: 13 | pretrain: 14 | !include benchmark/models/evar/evar.yaml 15 | 16 | dataset: GS 17 | input_type: 
feature # [audio, feature] 18 | input_dir: 19 | metadata_dir: data/GS/giantsteps_clips 20 | 21 | model: 22 | downstream_structure: 23 | components: 24 | - name: feature_selector 25 | layer: 0 # not a transformer model, so layer is set to 0 26 | normalized_weight_sum: false 27 | - name: mlp 28 | hidden_layer_sizes: [512] 29 | dropout_p: 0.2 30 | num_outputs: 24 31 | -------------------------------------------------------------------------------- /plugin/MARBLE/configs/evar/VocalSetS.yaml: -------------------------------------------------------------------------------- 1 | _import: 2 | - !include benchmark/tasks/VocalSet/VocalSetS_base_config.yaml 3 | 4 | dataset: 5 | pre_extract: 6 | accelerator: gpu 7 | audio_dir: data/VocalSet/audio 8 | output_dir: 9 | keep_folder_structure: true 10 | overwrite: true 11 | 12 | feature_extractor: 13 | pretrain: 14 | !include benchmark/models/evar/evar.yaml 15 | 16 | dataset: VocalSetS 17 | input_type: feature # [audio, feature] 18 | input_dir: 19 | metadata_dir: data/VocalSet 20 | 21 | model: 22 | downstream_structure: 23 | components: 24 | - name: feature_selector 25 | layer: 0 # not a transformer model, so layer is set to 0 26 | normalized_weight_sum: false 27 | - name: mlp 28 | hidden_layer_sizes: [512] 29 | dropout_p: 0.2 30 | num_outputs: 20 31 | -------------------------------------------------------------------------------- /plugin/MARBLE/configs/evar/VocalSetT.yaml: -------------------------------------------------------------------------------- 1 | _import: 2 | - !include benchmark/tasks/VocalSet/VocalSetT_base_config.yaml 3 | 4 | dataset: 5 | pre_extract: 6 | accelerator: gpu 7 | audio_dir: data/VocalSet/audio 8 | output_dir: 9 | keep_folder_structure: true 10 | overwrite: true 11 | 12 | feature_extractor: 13 | pretrain: 14 | !include benchmark/models/evar/evar.yaml 15 | 16 | dataset: VocalSetT 17 | input_type: feature # [audio, feature] 18 | input_dir: 19 | metadata_dir: data/VocalSet 20 | 21 | model: 22 | downstream_structure: 23 | components: 24 | - name: feature_selector 25 | layer: 0 # not a transformer model, so layer is set to 0 26 | normalized_weight_sum: false 27 | - name: mlp 28 | hidden_layer_sizes: [512] 29 | dropout_p: 0.2 30 | num_outputs: 10 31 | -------------------------------------------------------------------------------- /plugin/MARBLE/configs/evar/MTGMood.yaml: -------------------------------------------------------------------------------- 1 | _import: 2 | - !include benchmark/tasks/MTG/MTGMood_base_config.yaml 3 | 4 | dataset: 5 | pre_extract: 6 | accelerator: gpu 7 | audio_dir: data/MTG/audio-low 8 | output_dir: 9 | keep_folder_structure: true 10 | overwrite: true 11 | 12 | feature_extractor: 13 | pretrain: 14 | !include benchmark/models/evar/evar.yaml 15 | 16 | dataset: MTGMood 17 | input_type: feature # [audio, feature] 18 | input_dir: 19 | metadata_dir: data/MTG/mtg-jamendo-dataset 20 | 21 | model: 22 | downstream_structure: 23 | components: 24 | - name: feature_selector 25 | layer: 0 # not a transformer model, so layer is set to 0 26 | normalized_weight_sum: false 27 | - name: mlp 28 | hidden_layer_sizes: [512] 29 | dropout_p: 0.2 30 | num_outputs: 56 31 | -------------------------------------------------------------------------------- /plugin/MARBLE/configs/evar/MTGGenre.yaml: -------------------------------------------------------------------------------- 1 | _import: 2 | - !include benchmark/tasks/MTG/MTGGenre_base_config.yaml 3 | 4 | dataset: 5 | pre_extract: 6 | accelerator: gpu 7 | audio_dir: data/MTG/audio-low 8 
| output_dir: 9 | keep_folder_structure: true 10 | overwrite: true 11 | 12 | feature_extractor: 13 | pretrain: 14 | !include benchmark/models/evar/evar.yaml 15 | 16 | dataset: MTGGenre 17 | input_type: feature # [audio, feature] 18 | input_dir: 19 | metadata_dir: data/MTG/mtg-jamendo-dataset 20 | 21 | model: 22 | downstream_structure: 23 | components: 24 | - name: feature_selector 25 | layer: 0 # not a transformer model, so layer is set to 0 26 | normalized_weight_sum: false 27 | - name: mlp 28 | hidden_layer_sizes: [512] 29 | dropout_p: 0.2 30 | num_outputs: 87 31 | -------------------------------------------------------------------------------- /plugin/MARBLE/configs/evar/MTGTop50.yaml: -------------------------------------------------------------------------------- 1 | _import: 2 | - !include benchmark/tasks/MTG/MTGTop50_base_config.yaml 3 | 4 | dataset: 5 | pre_extract: 6 | accelerator: gpu 7 | audio_dir: data/MTG/audio-low 8 | output_dir: 9 | keep_folder_structure: true 10 | overwrite: true 11 | 12 | feature_extractor: 13 | pretrain: 14 | !include benchmark/models/evar/evar.yaml 15 | 16 | dataset: MTGTop50 17 | input_type: feature # [audio, feature] 18 | input_dir: 19 | metadata_dir: data/MTG/mtg-jamendo-dataset 20 | 21 | model: 22 | downstream_structure: 23 | components: 24 | - name: feature_selector 25 | layer: 0 # not a transformer model, so layer is set to 0 26 | normalized_weight_sum: false 27 | - name: mlp 28 | hidden_layer_sizes: [512] 29 | dropout_p: 0.2 30 | num_outputs: 50 31 | -------------------------------------------------------------------------------- /plugin/MARBLE/configs/evar/MTGInstrument.yaml: -------------------------------------------------------------------------------- 1 | _import: 2 | - !include benchmark/tasks/MTG/MTGInstrument_base_config.yaml 3 | 4 | dataset: 5 | pre_extract: 6 | accelerator: gpu 7 | audio_dir: data/MTG/audio-low 8 | output_dir: 9 | keep_folder_structure: true 10 | overwrite: true 11 | 12 | feature_extractor: 13 | pretrain: 14 | !include benchmark/models/evar/evar.yaml 15 | 16 | dataset: MTGInstrument 17 | input_type: feature # [audio, feature] 18 | input_dir: 19 | metadata_dir: data/MTG/mtg-jamendo-dataset 20 | 21 | model: 22 | downstream_structure: 23 | components: 24 | - name: feature_selector 25 | layer: 0 # not a transformer model, so layer is set to 0 26 | normalized_weight_sum: false 27 | - name: mlp 28 | hidden_layer_sizes: [512] 29 | dropout_p: 0.2 30 | num_outputs: 40 31 | -------------------------------------------------------------------------------- /plugin/MARBLE/configs/evar/EMO.yaml: -------------------------------------------------------------------------------- 1 | _import: 2 | - !include benchmark/tasks/EMO/EMO_base_config.yaml 3 | 4 | dataset: 5 | pre_extract: 6 | accelerator: gpu 7 | audio_dir: data/EMO/emomusic/wav 8 | output_dir: 9 | keep_folder_structure: true 10 | overwrite: true 11 | 12 | feature_extractor: 13 | pretrain: 14 | !include benchmark/models/evar/evar.yaml 15 | 16 | dataset: EMO 17 | input_type: feature # [audio, feature] 18 | input_dir: 19 | metadata_dir: data/EMO/emomusic 20 | 21 | model: 22 | downstream_structure: 23 | components: 24 | - name: feature_selector 25 | layer: 0 # [all, 0, 1, 2, ..., $n_tranformer_layer] 26 | # weighted sum is only effective when layer is set to all 27 | normalized_weight_sum: false 28 | 29 | - name: mlp 30 | hidden_layer_sizes: [512] 31 | dropout_p: 0.2 32 | num_outputs: 2 33 | -------------------------------------------------------------------------------- 
/evar/utils/m2d_add_norm_stats.py: -------------------------------------------------------------------------------- 1 | """A small utility for M2D fine-tuned by EVAR. 2 | This utility adds a parameter "module.ar.runtime.backbone.norm_stats" to a checkpoint file with constant normalization statistic values [-7.1, 4.2]. 3 | These values are the dataset average and standard deviation when pre-trained on AudioSet with M2D. 4 | 5 | Usage: python [this script] [source checkpoint file] [output checkpoint file] 6 | """ 7 | 8 | import torch 9 | import sys 10 | 11 | src_file = sys.argv[1] 12 | dest_file = sys.argv[2] 13 | 14 | checkpoint = torch.load(src_file, map_location='cpu') 15 | if 'module.ar.runtime.backbone.cls_token' not in checkpoint: 16 | print(f'{src_file} is not a fine-tuned checkpoint; no "module.ar.runtime.backbone.cls_token".') 17 | exit(1) 18 | 19 | checkpoint['module.ar.runtime.backbone.norm_stats'] = torch.tensor([-7.1, 4.2]) 20 | torch.save(checkpoint, dest_file) 21 | print(f'Saved {dest_file} with an additional parameter "module.ar.runtime.backbone.norm_stats".') 22 | -------------------------------------------------------------------------------- /config/ced.yaml: -------------------------------------------------------------------------------- 1 | # AR parameters such as FFT parameters. 2 | audio_repr: ar_ced.AR_CED 3 | weight_file: mispeech/ced-base 4 | feature_d: 768 5 | sample_rate: 16000 6 | n_fft: 512 7 | window_size: 512 8 | hop_size: 160 9 | n_mels: 64 10 | f_min: 0 11 | f_max: 8000 12 | # window: hanning 13 | 14 | # Model specific parameters. 15 | cls_token: False # Use CLS token 16 | output_layers: [-1] # List of layers to stack 17 | encoder_only: False 18 | dur_frames: # None for no desired number of frames 19 | freeze_embed: # Set True if freezing PatchEmbed during fine-tuning [2211.09359] How to Fine-Tune Vision Models with SGD 20 | 21 | # Linear evaluation/Fine-tuning common parameters. 22 | training_mask: 0.0 23 | flat_features: False # 768-d if True else 3840-d 24 | 25 | # Linear evaluaition parameters. 26 | batch_size: 128 27 | lr_lineareval: 0.0003 28 | report_per_epochs: 50 29 | early_stop_epochs: 20 30 | 31 | # Fine-tuning parameters. 
32 | warmup_epochs: 5 33 | mixup: 0.5 34 | ft_bs: 128 35 | ft_lr: 2.0 36 | ft_early_stop_epochs: -1 # -1: no early stopping 37 | ft_epochs: 200 38 | ft_freq_mask: 30 39 | ft_time_mask: 192 40 | ft_noise: 0.0 41 | ft_rrc: True 42 | -------------------------------------------------------------------------------- /run/all_msclap.sh: -------------------------------------------------------------------------------- 1 | NAME=MSCLAP 2 | python 2pass_lineareval.py config/msclap.yaml cremad batch_size=16,name=$NAME 3 | python 2pass_lineareval.py config/msclap.yaml gtzan batch_size=16,name=$NAME 4 | python 2pass_lineareval.py config/msclap.yaml spcv2 batch_size=64,name=$NAME 5 | python 2pass_lineareval.py config/msclap.yaml esc50 batch_size=64,name=$NAME 6 | python 2pass_lineareval.py config/msclap.yaml us8k batch_size=64,name=$NAME 7 | python 2pass_lineareval.py config/msclap.yaml vc1 batch_size=64,name=$NAME 8 | python 2pass_lineareval.py config/msclap.yaml voxforge batch_size=64,name=$NAME 9 | python 2pass_lineareval.py config/msclap.yaml nsynth batch_size=64,name=$NAME 10 | python 2pass_lineareval.py config/msclap.yaml surge batch_size=64,name=$NAME 11 | 12 | python zeroshot.py config/msclap.yaml cremad batch_size=16,name=$NAME 13 | python zeroshot.py config/msclap.yaml gtzan batch_size=16,name=$NAME 14 | python zeroshot.py config/msclap.yaml nsynth batch_size=64,name=$NAME 15 | python zeroshot.py config/msclap.yaml esc50 batch_size=64,name=$NAME 16 | python zeroshot.py config/msclap.yaml us8k batch_size=64,name=$NAME 17 | 18 | python summarize.py $NAME 19 | -------------------------------------------------------------------------------- /config/htsat.yaml: -------------------------------------------------------------------------------- 1 | # AR parameters such as FFT parameters. 2 | audio_repr: ar_htsat.AR_HTSAT 3 | weight_file: external/HTSAT_AudioSet_Saved_1.ckpt 4 | feature_d: 768 5 | sample_rate: 32000 6 | n_fft: 1024 7 | window_size: 1024 8 | hop_size: 320 9 | n_mels: 64 10 | f_min: 50 11 | f_max: 14000 12 | window: hanning 13 | 14 | # Model specific parameters. 15 | cls_token: False # Use CLS token 16 | output_layers: [-1] # List of layers to stack 17 | encoder_only: False 18 | dur_frames: # None for no desired number of frames 19 | freeze_embed: # Set True if freezing PatchEmbed during fine-tuning [2211.09359] How to Fine-Tune Vision Models with SGD 20 | 21 | # Linear evaluation/Fine-tuning common parameters. 22 | training_mask: 0.0 23 | flat_features: False # 768-d if True else 3840-d 24 | 25 | # Linear evaluaition parameters. 26 | batch_size: 128 27 | lr_lineareval: 0.0003 28 | report_per_epochs: 50 29 | early_stop_epochs: 20 30 | 31 | # Fine-tuning parameters. 32 | warmup_epochs: 5 33 | mixup: 0.5 34 | ft_bs: 128 35 | ft_lr: 2.0 36 | ft_early_stop_epochs: -1 # -1: no early stopping 37 | ft_epochs: 200 38 | ft_freq_mask: 8 39 | ft_time_mask: 64 40 | ft_noise: 0.0 41 | ft_rrc: True 42 | -------------------------------------------------------------------------------- /config/beats.yaml: -------------------------------------------------------------------------------- 1 | # AR parameters such as FFT parameters. 2 | audio_repr: ar_beats.AR_BEATs 3 | name: BEATs_iter3 4 | weight_file: external/BEATs_iter3.pt 5 | feature_d: 768 6 | sample_rate: 16000 7 | n_fft: 400 8 | window_size: 400 9 | hop_size: 160 10 | n_mels: 80 11 | f_min: 50 12 | f_max: 8000 13 | window: hanning 14 | 15 | # Model specific parameters. 
16 | cls_token: False # Use CLS token 17 | output_layers: [-1] # List of layers to stack 18 | encoder_only: False 19 | dur_frames: # None for no desired number of frames 20 | freeze_embed: # Set True if freezing PatchEmbed during fine-tuning [2211.09359] How to Fine-Tune Vision Models with SGD 21 | 22 | # Linear evaluation/Fine-tuning common parameters. 23 | training_mask: 0.0 24 | flat_features: False # 768-d if True else 3840-d 25 | 26 | # Linear evaluaition parameters. 27 | batch_size: 128 28 | lr_lineareval: 0.0003 29 | report_per_epochs: 50 30 | early_stop_epochs: 20 31 | 32 | # Fine-tuning parameters. 33 | warmup_epochs: 5 34 | mixup: 0.5 35 | ft_bs: 128 36 | ft_lr: 2.0 37 | ft_early_stop_epochs: -1 # -1: no early stopping 38 | ft_epochs: 200 39 | ft_freq_mask: 30 40 | ft_time_mask: 192 41 | ft_noise: 0.0 42 | ft_rrc: True 43 | -------------------------------------------------------------------------------- /config/m2d_clap.yaml: -------------------------------------------------------------------------------- 1 | # Zero-shot-ready 2 | 3 | # AR parameters such as FFT parameters. 4 | audio_repr: ar_m2d.AR_M2D_CLAP 5 | weight_file: m2d_clap_vit_base-80x208p16x16p16k-random/random 6 | feature_d: 3840 7 | sample_rate: 16000 8 | n_fft: 400 9 | window_size: 400 10 | hop_size: 160 11 | n_mels: 80 12 | f_min: 50 13 | f_max: 8000 14 | window: hanning 15 | 16 | # Model specific parameters. 17 | output_layers: [-1] # List of layers to stack 18 | encoder_only: True 19 | dur_frames: # None for no desired number of frames 20 | freeze_embed: # Set True if freezing PatchEmbed during fine-tuning [2211.09359] How to Fine-Tune Vision Models with SGD 21 | 22 | # Linear evaluation/Fine-tuning common parameters. 23 | flat_features: True # 768-d if True else 3840-d 24 | 25 | # Linear evaluaition parameters. 26 | batch_size: 128 27 | lr_lineareval: 0.00003 28 | report_per_epochs: 50 29 | early_stop_epochs: 20 30 | 31 | # Fine-tuning parameters. 32 | training_mask: 0.0 33 | warmup_epochs: 5 34 | mixup: 0.5 35 | ft_bs: 128 36 | ft_lr: 2.0 37 | ft_early_stop_epochs: -1 # -1: no early stopping 38 | ft_epochs: 200 39 | ft_freq_mask: 30 40 | ft_time_mask: 192 41 | ft_noise: 0.0 42 | ft_rrc: True 43 | -------------------------------------------------------------------------------- /config/m2d_clap_32k.yaml: -------------------------------------------------------------------------------- 1 | # Zero-shot-ready 2 | 3 | # AR parameters such as FFT parameters. 4 | audio_repr: ar_m2d.AR_M2D_CLAP 5 | weight_file: m2d_clap_vit_base-80x208p16x16p32k-random/random 6 | feature_d: 3840 7 | sample_rate: 32000 8 | n_fft: 800 9 | window_size: 800 10 | hop_size: 320 11 | n_mels: 80 12 | f_min: 50 13 | f_max: 16000 14 | window: hanning 15 | 16 | # Model specific parameters. 17 | output_layers: [-1] # List of layers to stack 18 | encoder_only: False 19 | dur_frames: # None for no desired number of frames 20 | freeze_embed: # Set True if freezing PatchEmbed during fine-tuning [2211.09359] How to Fine-Tune Vision Models with SGD 21 | 22 | # Linear evaluation/Fine-tuning common parameters. 23 | flat_features: True # 768-d if True else 3840-d 24 | 25 | # Linear evaluaition parameters. 26 | batch_size: 128 27 | lr_lineareval: 0.00003 28 | report_per_epochs: 50 29 | early_stop_epochs: 20 30 | 31 | # Fine-tuning parameters. 
32 | training_mask: 0.0 33 | warmup_epochs: 5 34 | mixup: 0.5 35 | ft_bs: 128 36 | ft_lr: 2.0 37 | ft_early_stop_epochs: -1 # -1: no early stopping 38 | ft_epochs: 200 39 | ft_freq_mask: 30 40 | ft_time_mask: 192 41 | ft_noise: 0.0 42 | ft_rrc: True 43 | -------------------------------------------------------------------------------- /run/all_laionclap.sh: -------------------------------------------------------------------------------- 1 | NAME=LAIONCLAP 2 | python 2pass_lineareval.py config/laionclap.yaml cremad batch_size=16,name=$NAME 3 | python 2pass_lineareval.py config/laionclap.yaml gtzan batch_size=16,name=$NAME 4 | python 2pass_lineareval.py config/laionclap.yaml spcv2 batch_size=64,name=$NAME 5 | python 2pass_lineareval.py config/laionclap.yaml esc50 batch_size=64,name=$NAME 6 | python 2pass_lineareval.py config/laionclap.yaml us8k batch_size=64,name=$NAME 7 | python 2pass_lineareval.py config/laionclap.yaml vc1 batch_size=64,name=$NAME 8 | python 2pass_lineareval.py config/laionclap.yaml voxforge batch_size=64,name=$NAME 9 | python 2pass_lineareval.py config/laionclap.yaml nsynth batch_size=64,name=$NAME 10 | python 2pass_lineareval.py config/laionclap.yaml surge batch_size=64,name=$NAME 11 | 12 | python zeroshot.py config/laionclap.yaml cremad batch_size=16,name=$NAME 13 | python zeroshot.py config/laionclap.yaml gtzan batch_size=16,name=$NAME 14 | python zeroshot.py config/laionclap.yaml nsynth batch_size=64,name=$NAME 15 | python zeroshot.py config/laionclap.yaml esc50 batch_size=64,name=$NAME 16 | python zeroshot.py config/laionclap.yaml us8k batch_size=64,name=$NAME 17 | 18 | python summarize.py $NAME 19 | -------------------------------------------------------------------------------- /config/beats_plus.yaml: -------------------------------------------------------------------------------- 1 | # AR parameters such as FFT parameters. 2 | audio_repr: ar_beats.AR_BEATs 3 | name: BEATs_iter3_plus_AS2M 4 | weight_file: external/BEATs_iter3_plus_AS2M.pt 5 | feature_d: 768 6 | sample_rate: 16000 7 | n_fft: 400 8 | window_size: 400 9 | hop_size: 160 10 | n_mels: 80 11 | f_min: 50 12 | f_max: 8000 13 | window: hanning 14 | 15 | # Model specific parameters. 16 | cls_token: False # Use CLS token 17 | output_layers: [-1] # List of layers to stack 18 | encoder_only: False 19 | dur_frames: # None for no desired number of frames 20 | freeze_embed: # Set True if freezing PatchEmbed during fine-tuning [2211.09359] How to Fine-Tune Vision Models with SGD 21 | 22 | # Linear evaluation/Fine-tuning common parameters. 23 | training_mask: 0.0 24 | flat_features: False # 768-d if True else 3840-d 25 | 26 | # Linear evaluaition parameters. 27 | batch_size: 128 28 | lr_lineareval: 0.0003 29 | report_per_epochs: 50 30 | early_stop_epochs: 20 31 | 32 | # Fine-tuning parameters. 33 | warmup_epochs: 5 34 | mixup: 0.5 35 | ft_bs: 128 36 | ft_lr: 2.0 37 | ft_early_stop_epochs: -1 # -1: no early stopping 38 | ft_epochs: 200 39 | ft_freq_mask: 30 40 | ft_time_mask: 192 41 | ft_noise: 0.0 42 | ft_rrc: True 43 | -------------------------------------------------------------------------------- /config/m2d.yaml: -------------------------------------------------------------------------------- 1 | # AR parameters such as FFT parameters. 
2 | audio_repr: ar_m2d.AR_M2D 3 | weight_file: m2d_vit_base-80x208p16x16p16k-random/random 4 | feature_d: 3840 5 | sample_rate: 16000 6 | n_fft: 400 7 | window_size: 400 8 | hop_size: 160 9 | n_mels: 80 10 | f_min: 50 11 | f_max: 8000 12 | window: hanning 13 | 14 | # Statistics for normalization: average and standard deviation 15 | mean: -7.1 16 | std: 4.2 17 | 18 | # Model specific parameters. 19 | output_layers: [-1] # List of layers to stack 20 | encoder_only: True 21 | dur_frames: # None for no desired number of frames 22 | freeze_embed: # Set True if freezing PatchEmbed during fine-tuning [2211.09359] How to Fine-Tune Vision Models with SGD 23 | 24 | # Linear evaluation/Fine-tuning common parameters. 25 | flat_features: False # 768-d if True else 3840-d 26 | 27 | # Linear evaluaition parameters. 28 | batch_size: 128 29 | lr_lineareval: 0.00003 30 | report_per_epochs: 50 31 | early_stop_epochs: 20 32 | 33 | # Fine-tuning parameters. 34 | training_mask: 0.0 35 | warmup_epochs: 5 36 | mixup: 0.5 37 | ft_bs: 128 38 | ft_lr: 2.0 39 | ft_early_stop_epochs: -1 # -1: no early stopping 40 | ft_epochs: 200 41 | ft_freq_mask: 30 42 | ft_time_mask: 192 43 | ft_noise: 0.0 44 | ft_rrc: True 45 | -------------------------------------------------------------------------------- /config/m2d_32k.yaml: -------------------------------------------------------------------------------- 1 | # AR parameters such as FFT parameters. 2 | audio_repr: ar_m2d.AR_M2D 3 | weight_file: m2d_vit_base-80x208p16x16p32k-random/random 4 | feature_d: 3840 5 | sample_rate: 32000 6 | n_fft: 800 7 | window_size: 800 8 | hop_size: 320 9 | n_mels: 80 10 | f_min: 50 11 | f_max: 16000 12 | window: hanning 13 | 14 | # Statistics for normalization: average and standard deviation 15 | mean: -7.1 16 | std: 4.2 17 | 18 | # Model specific parameters. 19 | output_layers: [-1] # List of layers to stack 20 | encoder_only: False 21 | dur_frames: # None for no desired number of frames 22 | freeze_embed: # Set True if freezing PatchEmbed during fine-tuning [2211.09359] How to Fine-Tune Vision Models with SGD 23 | 24 | # Linear evaluation/Fine-tuning common parameters. 25 | flat_features: False # 768-d if True else 3840-d 26 | 27 | # Linear evaluaition parameters. 28 | batch_size: 128 29 | lr_lineareval: 0.00003 30 | report_per_epochs: 50 31 | early_stop_epochs: 20 32 | 33 | # Fine-tuning parameters. 
34 | training_mask: 0.0 35 | warmup_epochs: 5 36 | mixup: 0.5 37 | ft_bs: 128 38 | ft_lr: 2.0 39 | ft_early_stop_epochs: -1 # -1: no early stopping 40 | ft_epochs: 200 41 | ft_freq_mask: 30 42 | ft_time_mask: 192 43 | ft_noise: 0.0 44 | ft_rrc: True 45 | -------------------------------------------------------------------------------- /evar/ar_openl3.py: -------------------------------------------------------------------------------- 1 | """Wrapper code for: 2 | 3 | Look, Listen and Learn More: Design Choices for Deep Audio Embeddings 4 | 5 | ## Reference 6 | - [1] https://arxiv.org/abs/2104.11587 7 | - [2] https://github.com/marl/openl3 8 | - [3] https://github.com/torchopenl3/torchopenl3 9 | """ 10 | 11 | from evar.ar_base import (BaseAudioRepr, temporal_pooling) 12 | import torch 13 | import logging 14 | try: 15 | import torchopenl3 16 | from torchopenl3.utils import preprocess_audio_batch 17 | except: 18 | pass # logging.error('Install torchopenl3.\n>>> pip install torchopenl3') 19 | 20 | 21 | class AR_OpenL3(BaseAudioRepr): 22 | 23 | def __init__(self, cfg): 24 | super().__init__(cfg=cfg) 25 | self.openl3_model = torchopenl3.models.load_audio_embedding_model( 26 | cfg.openl3_input_repr, cfg.openl3_content_type, cfg.feature_d) 27 | 28 | def encode_frames(self, batch_audio): 29 | frame_embeddings, ts_list = torchopenl3.get_audio_embedding(batch_audio, 30 | self.cfg.sample_rate, model=self.openl3_model) # -> [B, T, D] 31 | return frame_embeddings.transpose(1, 2) # -> [B, D, T] 32 | 33 | def forward(self, batch_audio): 34 | frame_embeddings = self.encode_frames(batch_audio) 35 | return temporal_pooling(self, frame_embeddings) 36 | -------------------------------------------------------------------------------- /app/README.md: -------------------------------------------------------------------------------- 1 | # Application-specific evaluation 2 | 3 | Some applications use their own evaluation protocols, including specialized metrics, and their benchmarking code typically restricts the models that can be evaluated. To address this limitation and enable evaluation code to work with models available on EVAR (with wrapper implementations), we modify these applications to support a broader range of models. This subproject outlines the precise steps and code required to integrate EVAR into each application. 4 | 5 | ## Assessing the Utility of Audio Foundation Models for Heart and Respiratory Sound Analysis 6 | 7 | For our paper: 8 | 9 | *[D. Niizumi, D. Takeuchi, M. Yasuda, B. T. Nguyen, Y. Ohishi, and N. Harada, "Assessing the Utility of Audio Foundation Models for Heart and Respiratory Sound Analysis," to appear at IEEE EMBC, 2025](https://arxiv.org/abs/2504.18004).* 10 | 11 | We provide code to reproduce experiments for the following tasks: 12 | 13 | - Heart sound task: CirCor 👉 [circor](circor/README_CirCor.md). 14 | - Heart sound task: BMD-HS 👉 [bmdhs](bmdhs/README_BMDHS.md). 15 | - Respiratory sound task: SPRSound (SPRS) 👉 [icbhi_sprs](icbhi_sprs/README_ICBHI_SPRS.md) 16 | - Respiratory sound task: ICBHI2017 👉 [icbhi_sprs](icbhi_sprs/README_ICBHI_SPRS.md) 17 | 18 | Please follow the instructions in each folder.
19 | -------------------------------------------------------------------------------- /evar/ar_trill.py: -------------------------------------------------------------------------------- 1 | """Wrapper code for: 2 | 3 | Towards Learning a Universal Non-Semantic Representation of Speech 4 | 5 | ## Reference 6 | - [1] http://arxiv.org/abs/2002.12764 7 | - [2] https://aihub.cloud.google.com/u/0/p/products%2F41239b97-c960-479a-be50-ae7a23ae1561 8 | """ 9 | 10 | from evar.ar_base import (BaseAudioRepr, temporal_pooling) 11 | import torch 12 | import logging 13 | try: 14 | import tensorflow.compat.v2 as tf 15 | tf.enable_v2_behavior() 16 | assert tf.executing_eagerly() 17 | import tensorflow_hub as hub 18 | except: 19 | pass # logging.error('Install tensorflow and tensorflow_hub.\n>>> pip install tensorflow tensorflow_hub') 20 | 21 | 22 | class AR_TRILL(BaseAudioRepr): 23 | def __init__(self, cfg): 24 | super().__init__(cfg=cfg) 25 | self.model = hub.load(cfg.trill_url) 26 | self.emb_type = cfg.trill_emb_type 27 | 28 | def encode_frames(self, batch_audio): 29 | device = batch_audio.device 30 | x = self.model(samples=tf.convert_to_tensor(batch_audio.cpu().numpy()), sample_rate=16000)[self.emb_type].numpy() 31 | x = torch.tensor(x.transpose(0, 2, 1)).float().to(device) # transpose: [B,T,D] -> [B,D,T] 32 | return x 33 | 34 | def forward(self, batch_audio): 35 | x = self.encode_frames(batch_audio) 36 | x = temporal_pooling(self, x) 37 | return x 38 | -------------------------------------------------------------------------------- /evar/ar_msclap.py: -------------------------------------------------------------------------------- 1 | """Wrapper code for: 2 | 3 | Natural Language Supervision for General-Purpose Audio Representations 4 | 5 | ## Reference 6 | - [1] https://arxiv.org/abs/2309.05767 7 | - [2] https://github.com/microsoft/CLAP 8 | """ 9 | 10 | from evar.ar_base import BaseCLAP 11 | try: 12 | from msclap import CLAP 13 | except: 14 | pass # please install: pip install msclap 15 | 16 | 17 | class AR_MSCLAP(BaseCLAP): 18 | 19 | def __init__(self, cfg): 20 | super().__init__(cfg=cfg) 21 | # MS CLAP accepts file name as audio input. 22 | self.filename_mode = True 23 | 24 | self.backbone = CLAP(version=str(cfg.weight_file), use_cuda=True) 25 | 26 | def encode_frames(self, batch_audio): 27 | assert False, 'encode_frames for MS CLAP is not supported for now' 28 | 29 | def forward(self, batch_audio): 30 | audio_embeddings = self.backbone.get_audio_embeddings(batch_audio) 31 | return audio_embeddings 32 | 33 | def encode_audio(self, batch_audio): 34 | audio_embeddings = self.forward(batch_audio) 35 | return audio_embeddings 36 | 37 | def encode_text(self, batch_text): 38 | text_embeddings = self.backbone.get_text_embeddings(batch_text) 39 | return text_embeddings 40 | 41 | def compute_similarity(self, text_embs, audio_embs): 42 | similarity = self.backbone.compute_similarity(audio_embs, text_embs) 43 | return similarity.T 44 | -------------------------------------------------------------------------------- /evar/ar_spec.py: -------------------------------------------------------------------------------- 1 | """Wrapper code for: 2 | 3 | Mel-spectrogram and linear spectrogram. 
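Both classes compute a log-scaled spectrogram (via ToLogMelSpec / ToLogLinSpec below) and normalize it with dataset statistics gathered in precompute().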
4 | """ 5 | 6 | from evar.ar_base import (BaseAudioRepr, ToLogMelSpec, 7 | calculate_norm_stats, normalize_spectrogram, temporal_pooling) 8 | import nnAudio.features 9 | 10 | 11 | class AR_MelSpec(BaseAudioRepr): 12 | def __init__(self, cfg): 13 | super().__init__(cfg=cfg) 14 | self.to_feature = ToLogMelSpec(cfg) 15 | 16 | def precompute(self, device, data_loader): 17 | self.norm_stats = calculate_norm_stats(device, data_loader, self.to_feature) 18 | 19 | def encode_frames(self, batch_audio): 20 | x = self.to_feature(batch_audio) 21 | return normalize_spectrogram(self.norm_stats, x) 22 | 23 | def forward(self, batch_audio): 24 | x = self.encode_frames(batch_audio) 25 | return temporal_pooling(self, x) 26 | 27 | 28 | class ToLogLinSpec(ToLogMelSpec): 29 | def __init__(self, cfg): 30 | super().__init__(cfg) 31 | self.to_spec = nnAudio.features.STFT(n_fft=cfg.n_fft, win_length=cfg.window_size, 32 | freq_bins=None, hop_length=cfg.hop_size, 33 | center=True, sr=cfg.sample_rate, 34 | output_format="Magnitude", 35 | verbose=False, 36 | ) 37 | 38 | 39 | class AR_LinSpec(AR_MelSpec): 40 | def __init__(self, cfg): 41 | cfg.n_mels = 64 # dummy for making reuse of AR_MelSpec easy 42 | super().__init__(cfg=cfg) 43 | self.to_feature = ToLogLinSpec(cfg) 44 | -------------------------------------------------------------------------------- /evar/ar_dasheng.py: -------------------------------------------------------------------------------- 1 | """Wrapper code for: 2 | 3 | Scaling up masked audio encoder learning for general audio classification 4 | 5 | ## Reference 6 | - [1] https://www.isca-archive.org/interspeech_2024/dinkel24b_interspeech.html 7 | - [2] https://huggingface.co/mispeech/dasheng-base 8 | """ 9 | 10 | from evar.ar_base import BaseAudioRepr 11 | import torch 12 | import logging 13 | try: 14 | from dasheng_model.feature_extraction_dasheng import DashengFeatureExtractor 15 | from dasheng_model.modeling_dasheng import DashengModel 16 | except: 17 | logging.error('Install as follows.\n>>> pip install git+https://github.com/jimbozhang/hf_transformers_custom_model_dasheng.git') 18 | 19 | 20 | class AR_Dasheng(BaseAudioRepr): 21 | def __init__(self, cfg): 22 | super().__init__(cfg=cfg) 23 | 24 | self.preprocessor = DashengFeatureExtractor.from_pretrained(cfg.model_name) 25 | self.backbone = DashengModel.from_pretrained(cfg.model_name, outputdim=None) 26 | 27 | def encode_frames(self, batch_audio): 28 | preprocessed = self.preprocessor(batch_audio.cpu(), sampling_rate=16000, return_tensors="pt") 29 | preprocessed = preprocessed.to(batch_audio.device) 30 | hidden_states = self.backbone(**preprocessed).hidden_states # [B, T, D] 31 | return hidden_states.transpose(1, 2) # [B, D, T] 32 | 33 | def forward(self, batch_audio): 34 | preprocessed = self.preprocessor(batch_audio.cpu(), sampling_rate=16000, return_tensors="pt") 35 | preprocessed = preprocessed.to(batch_audio.device) 36 | return self.backbone(**preprocessed).logits 37 | -------------------------------------------------------------------------------- /evar/ar_ced.py: -------------------------------------------------------------------------------- 1 | """Wrapper code for: 2 | 3 | CED: Consistent ensemble distillation for audio tagging 4 | 5 | ## Reference 6 | - [1] https://arxiv.org/abs/2308.11957 7 | - [2] https://github.com/RicherMans/ced 8 | """ 9 | 10 | from evar.ar_base import BaseAudioRepr, temporal_pooling 11 | import sys 12 | import logging 13 | import torch 14 | try: 15 | sys.path.append('../../external/hf_transformers_custom_model_ced') 16 |
sys.path.append('external/hf_transformers_custom_model_ced') 17 | from ced_model.feature_extraction_ced import CedFeatureExtractor 18 | from ced_model.modeling_ced import CedForAudioClassification 19 | from transformers.modeling_outputs import SequenceClassifierOutput 20 | except: 21 | pass # please install CED 22 | 23 | 24 | class AR_CED(BaseAudioRepr): 25 | 26 | def __init__(self, cfg): 27 | super().__init__(cfg=cfg) 28 | 29 | model_path = cfg.weight_file 30 | self.feature_extractor = CedFeatureExtractor.from_pretrained(model_path) 31 | self.backbone = CedForAudioClassification.from_pretrained(model_path) 32 | 33 | logging.info(f' Using weight from Hugging Face: {cfg.weight_file}') 34 | 35 | def encode_frames(self, batch_audio): 36 | inputs = self.feature_extractor(batch_audio.to('cpu'), sampling_rate=16000, return_tensors="pt") 37 | inputs['input_values'] = inputs['input_values'].to('cuda') 38 | features = self.backbone(**inputs).hidden_states 39 | return features.transpose(1, 2) # [B, D, T] 40 | 41 | def forward(self, batch_audio): 42 | features = self.encode_frames(batch_audio) 43 | return features.mean(-1) 44 | 45 | -------------------------------------------------------------------------------- /evar/ar_atst_frame.py: -------------------------------------------------------------------------------- 1 | """Wrapper code for: 2 | 3 | Self-supervised Audio Teacher-Student Transformer for Both Clip-level and Frame-level Tasks 4 | 5 | ## Reference 6 | - [1] https://arxiv.org/abs/2306.04186 7 | - [2] https://github.com/Audio-WestlakeU/audiossl/blob/main/audiossl/methods/atstframe 8 | """ 9 | 10 | from evar.ar_base import BaseAudioRepr 11 | import logging 12 | import sys 13 | from einops import rearrange 14 | try: 15 | import os 16 | evar_home = os.getenv('EVAR', '') 17 | sys.path.append(os.path.join(evar_home, 'external/audiossl')) 18 | sys.path.append('../../external/audiossl') 19 | from audiossl.methods.atstframe.embedding import load_model, get_scene_embedding, get_timestamp_embedding 20 | except Exception as e: 21 | pass # Please clone audiossl 22 | 23 | 24 | class AR_ATST_Frame(BaseAudioRepr): 25 | def __init__(self, cfg): 26 | super().__init__(cfg=cfg) 27 | 28 | self.backbone = load_model(cfg.weight_file) 29 | logging.info(f' Using weight file: {cfg.weight_file}') 30 | 31 | def encode_frames(self, batch_audio): 32 | batch_audio = batch_audio.unsqueeze(1) # [B, L] -> [B, 1, L] as described in the README 33 | x, _ = get_timestamp_embedding(batch_audio, self.backbone) # -> [B,T,N_BLOCKS*emb_size] 34 | # no need x = rearrange(x, 'B 1 T N D -> B (N * D) T') 35 | return x 36 | 37 | def forward(self, batch_audio): 38 | #import pdb; pdb.set_trace() 39 | batch_audio = batch_audio.unsqueeze(1) # [B, L] -> [B, 1, L] as described in the README 40 | x = get_scene_embedding(batch_audio, self.backbone) # [B,N_BLOCKS*emb_size] 41 | return x 42 | -------------------------------------------------------------------------------- /app/circor/patch-heart-murmur-detection.diff: -------------------------------------------------------------------------------- 1 | --- org/heart-murmur-detection/ModelEvaluation/evaluate_model.py 2024-01-12 15:29:10.126397375 +0900 2 | +++ /heart-murmur-detection/ModelEvaluation/evaluate_model.py 2023-11-15 16:47:47.351524689 +0900 3 | @@ -59,6 +59,10 @@ 4 | murmur_weighted_accuracy = compute_weighted_accuracy( 5 | murmur_labels, output_labels, murmur_classes 6 | ) # This is the murmur scoring metric. 
7 | + 8 | + # UAR 9 | + murmur_uar = murmur_accuracy_classes.mean() 10 | + 11 | murmur_scores = ( 12 | murmur_classes, 13 | murmur_auroc, 14 | @@ -70,6 +74,7 @@ 15 | murmur_accuracy, 16 | murmur_accuracy_classes, 17 | murmur_weighted_accuracy, 18 | + murmur_uar, 19 | ) 20 | 21 | ( 22 | @@ -83,11 +88,12 @@ 23 | accuracy, 24 | accuracy_classes, 25 | weighted_accuracy, 26 | + uar, 27 | ) = murmur_scores 28 | murmur_output_string = ( 29 | - "AUROC,AUPRC,F-measure,Accuracy,Weighted Accuracy" 30 | - "\n{:.3f},{:.3f},{:.3f},{:.3f},{:.3f}\n".format( 31 | - auroc, auprc, f_measure, accuracy, weighted_accuracy 32 | + "AUROC,AUPRC,F-measure,Accuracy,Weighted Accuracy,UAR" 33 | + "\n{:.3f},{:.3f},{:.3f},{:.3f},{:.3f},{:.3f}\n".format( 34 | + auroc, auprc, f_measure, accuracy, weighted_accuracy, uar 35 | ) 36 | ) 37 | murmur_class_output_string = ( 38 | @@ -109,8 +115,10 @@ 39 | + murmur_class_output_string 40 | ) 41 | 42 | + print(output_string) 43 | + 44 | # Return the results. 45 | - return output_string 46 | + return murmur_scores 47 | 48 | 49 | # Find Challenge files. -------------------------------------------------------------------------------- /evar/ar_cnn14.py: -------------------------------------------------------------------------------- 1 | """Wrapper code for: 2 | 3 | PANNs: Large-Scale Pretrained Audio Neural Networks for Audio Pattern Recognition 4 | 5 | ## Reference 6 | - [1] https://arxiv.org/abs/1912.10211 7 | - [2] https://github.com/qiuqiangkong/audioset_tagging_cnn 8 | """ 9 | 10 | from evar.ar_base import BaseAudioRepr 11 | from evar.model_utils import ensure_weights, load_pretrained_weights 12 | import logging 13 | try: 14 | from evar.cnn14_decoupled import AudioFeatureExtractor, Cnn14_Decoupled 15 | except: 16 | logging.info('** Install torchlibrosa if you use Cnn14 **') 17 | 18 | 19 | class AR_Cnn14(BaseAudioRepr): 20 | def __init__(self, cfg): 21 | super().__init__(cfg=cfg) 22 | self.feature_extractor = AudioFeatureExtractor(n_fft=cfg.n_fft, hop_length=cfg.hop_size, win_length=cfg.window_size, 23 | sample_rate=cfg.sample_rate, n_mels=cfg.n_mels, f_min=cfg.f_min, f_max=cfg.f_max) 24 | self.body = Cnn14_Decoupled() 25 | weight_file = 'external/Cnn14_16k_mAP=0.438.pth' if cfg.weight_file is None else cfg.weight_file 26 | ensure_weights(weight_file, 'https://zenodo.org/record/3987831/files/Cnn14_16k_mAP%3D0.438.pth') 27 | load_pretrained_weights(self.body, weight_file) 28 | 29 | def encode_frames(self, batch_audio): 30 | x = self.feature_extractor(batch_audio) # (B, 1, T, F(mel_bins)) 31 | x = self.augment_if_training(x.transpose(-2, -1)).transpose(-2, -1) # (..., T, F) -> (..., F, T) -augment-> (..., T, F) 32 | return self.body.encode(x) # (B, D, T) 33 | 34 | def forward(self, batch_audio): 35 | frame_embeddings = self.encode_frames(batch_audio) # (B, D, T) 36 | return self.body.temporal_pooling(frame_embeddings) # (B, D) 37 | -------------------------------------------------------------------------------- /evar/ar_data2vec.py: -------------------------------------------------------------------------------- 1 | """Wrapper code for: 2 | 3 | Data2vec: A General Framework for Self-supervised Learning in Speech, Vision and Language 4 | 5 | ## Reference 6 | - [1] https://ai.facebook.com/research/data2vec-a-general-framework-for-self-supervised-learning-in-speech-vision-and-language/ 7 | - [2] https://huggingface.co/facebook/data2vec-audio-large-960h 8 | """ 9 | 10 | from evar.ar_base import BaseAudioRepr, temporal_pooling 11 | import logging 12 | import torch 13 | try: 14 | from 
transformers import Data2VecAudioModel, Wav2Vec2Processor 15 | except: 16 | logging.error('Install transformers.\n>>> pip install transformers') 17 | 18 | 19 | class AR_Data2Vec(BaseAudioRepr): 20 | 21 | def __init__(self, cfg): 22 | super().__init__(cfg=cfg) 23 | 24 | self.processor = Wav2Vec2Processor.from_pretrained(cfg.pretrained_model) 25 | self.backbone = Data2VecAudioModel.from_pretrained(cfg.pretrained_model) 26 | 27 | def encode_frames(self, batch_audio): 28 | device = batch_audio.device 29 | preprocessed = self.processor(batch_audio.cpu().numpy(), return_tensors="pt", sampling_rate=16000).input_values 30 | preprocessed = preprocessed[0].to(device) # [1, B, raw wave length] -> [B, raw wave length] 31 | hidden_states = self.backbone(preprocessed, output_hidden_states=True).hidden_states # [B, T, D] 32 | # stack layer outputs 33 | states_to_stack = [hidden_states[index] for index in self.cfg.output_layers] if self.cfg.output_layers else hidden_states 34 | features = torch.cat(states_to_stack, axis=-1) 35 | return features.transpose(1, 2) # [B, D, T] 36 | 37 | def forward(self, batch_audio): 38 | return temporal_pooling(self, self.encode_frames(batch_audio)) 39 | -------------------------------------------------------------------------------- /evar/ar_byola2.py: -------------------------------------------------------------------------------- 1 | """Wrapper code for: 2 | 3 | BYOL for Audio: Exploring Pre-trained General-purpose Audio Representations 4 | 5 | ## Reference 6 | - [1] https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=9944865 7 | - [2] https://arxiv.org/abs/2204.07402 8 | """ 9 | 10 | from evar.ar_base import (BaseAudioRepr, ToLogMelSpec, calculate_norm_stats, normalize_spectrogram, temporal_pooling) 11 | from evar.model_utils import load_pretrained_weights 12 | import logging 13 | try: 14 | from external.byol_a.v2.byol_a2.models import AudioNTT2022Encoder 15 | except Exception as e: 16 | pass # logging.info(f'Make your copy of BYOL-A under external folder. Check Preparing-models.md for the details.') 17 | 18 | 19 | class AR_BYOLA2(BaseAudioRepr): 20 | def __init__(self, cfg): 21 | super().__init__(cfg=cfg) 22 | self.to_feature = ToLogMelSpec(cfg) 23 | 24 | self.body = AudioNTT2022Encoder(n_mels=cfg.n_mels, d=cfg.feature_d) 25 | if cfg.weight_file is not None and cfg.weight_file != '': 26 | load_pretrained_weights(self.body, cfg.weight_file, model_key='body') 27 | 28 | def precompute(self, device, data_loader): 29 | self.norm_stats = calculate_norm_stats(device, data_loader, self.to_feature) 30 | 31 | def encode_frames(self, batch_audio): 32 | x = self.to_feature(batch_audio) 33 | x = normalize_spectrogram(self.norm_stats, x) # B,F,T 34 | x = self.augment_if_training(x) 35 | x = x.unsqueeze(1) # -> B,1,F,T 36 | x = self.body(x) # -> B,T,D=C*F 37 | x = x.transpose(1, 2) # -> B,D,T 38 | return x 39 | 40 | def forward(self, batch_audio): 41 | x = self.encode_frames(batch_audio) 42 | x = temporal_pooling(self, x) 43 | return x 44 | 45 | -------------------------------------------------------------------------------- /2pass_lineareval.py: -------------------------------------------------------------------------------- 1 | """2-pass linear evaluation runner. 2 | 3 | This program is a wrapper for lineareval.py that enables: 4 | - Multiple runs of lineareval.py with a cache of embeddings. 5 | - Evaluating TensorFlow models.
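Example (from run/all_msclap.sh): python 2pass_lineareval.py config/msclap.yaml cremad batch_size=16,name=MSCLAP. The third positional argument is forwarded to lineareval.py as --options, a comma-separated list of config overrides (here, batch_size and name).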
6 | 7 | ## Evaluation flow 8 | 9 | This will run lineareval.py twice or more so that we can decouple inference and linear evaluation phase, 10 | making it possible to use the TF model in the inference phase. 11 | 12 | 1. Run lineareval.py with `--step=2pass_1_precompute_only`. 13 | Conduct inference by any model (whichever TF or torch) to convert raw audio into embeddings, and store embedding in a cache. 14 | 2. Run lineareval.py with `--step=2pass_2_train_test`. Conduct linear evaluation by using embeddings from the cache using torch. 15 | 3. (if repeat > 1) Repeat the step 2 with incremented random seed. 16 | """ 17 | 18 | from evar.utils import run_command 19 | import fire 20 | 21 | 22 | def lineareval_two_pass(config_file, task, options='', lr=None, hidden=(), standard_scaler=True, mixup=False, 23 | early_stop_epochs=None, step=None, repeat=3, seed=None): 24 | 25 | seed = seed or 42 26 | command_line = [ 27 | 'python', 28 | 'lineareval.py', 29 | config_file, 30 | task, 31 | f'--options={options}', 32 | f'--lr={lr}', 33 | f'--hidden={hidden}', 34 | f'--standard_scaler={standard_scaler}', 35 | f'--mixup={mixup}', 36 | f'--early_stop_epochs={early_stop_epochs}' 37 | ] 38 | 39 | run_command(command_line + [f'--seed={seed}', '--step=2pass_1_precompute_only']) 40 | for i in range(repeat): 41 | run_command(command_line + [f'--seed={seed + i}', '--step=2pass_2_train_test']) 42 | 43 | 44 | if __name__ == '__main__': 45 | fire.Fire(lineareval_two_pass) 46 | -------------------------------------------------------------------------------- /plugin/OPERA/evar_openl3env.sh: -------------------------------------------------------------------------------- 1 | base=$EVAR 2 | dim=6144 3 | name=OpenL3 4 | GPU=0 5 | # filename=$(basename $weight) 6 | # "$(basename "$(dirname "$weight")")_${filename%.*}" 7 | 8 | python -m src.benchmark.processing.copd_processing --pretrain evar:$base:config/openl3env.yaml:$name 9 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task copd --pretrain $name --dim $dim 10 | python -m src.benchmark.processing.icbhi_processing --pretrain evar:$base:config/openl3env.yaml:$name 11 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task icbhidisease --pretrain $name --dim $dim 12 | python -m src.benchmark.processing.kauh_processing --pretrain evar:$base:config/openl3env.yaml:$name 13 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task kauh --pretrain $name --dim $dim 14 | 15 | python -m src.benchmark.processing.coughvid_processing --pretrain evar:$base:config/openl3env.yaml:$name --label covid 16 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task coughvidcovid --pretrain $name --dim $dim 17 | python -m src.benchmark.processing.coughvid_processing --pretrain evar:$base:config/openl3env.yaml:$name --label gender 18 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task coughvidsex --pretrain $name --dim $dim 19 | 20 | python -m src.benchmark.processing.coswara_processing --pretrain evar:$base:config/openl3env.yaml:$name --label smoker 21 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task coswarasmoker --pretrain $name --dim $dim --modality cough-shallow 22 | python -m src.benchmark.processing.coswara_processing --pretrain evar:$base:config/openl3env.yaml:$name --label sex 23 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task coswarasex --pretrain $name --dim $dim --modality cough-shallow 24 | 25 | 
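# Note: a sketch of the --pretrain string convention shared by these OPERA plugin scripts, inferred from the invocations above: evar:<EVAR root>:<EVAR config yaml>:<feature name>[:<option>=<value>]
# Here it expands to evar:$EVAR:config/openl3env.yaml:OpenL3; other scripts append options such as weight_file=<path> or output_layers=[<layer>].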
-------------------------------------------------------------------------------- /plugin/OPERA/evar_atst_clip.sh: -------------------------------------------------------------------------------- 1 | base=$EVAR 2 | weight=${base}/external/atst_base.ckpt 3 | dim=1536 4 | name=ATST-CLIP 5 | GPU=0 6 | 7 | python -m src.benchmark.processing.copd_processing --pretrain evar:$base:config/atst.yaml:$name:weight_file=$weight 8 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task copd --pretrain $name --dim $dim 9 | python -m src.benchmark.processing.icbhi_processing --pretrain evar:$base:config/atst.yaml:$name:weight_file=$weight 10 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task icbhidisease --pretrain $name --dim $dim 11 | python -m src.benchmark.processing.kauh_processing --pretrain evar:$base:config/atst.yaml:$name:weight_file=$weight 12 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task kauh --pretrain $name --dim $dim 13 | 14 | python -m src.benchmark.processing.coughvid_processing --pretrain evar:$base:config/atst.yaml:$name:weight_file=$weight --label covid 15 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task coughvidcovid --pretrain $name --dim $dim 16 | python -m src.benchmark.processing.coughvid_processing --pretrain evar:$base:config/atst.yaml:$name:weight_file=$weight --label gender 17 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task coughvidsex --pretrain $name --dim $dim 18 | 19 | python -m src.benchmark.processing.coswara_processing --pretrain evar:$base:config/atst.yaml:$name:weight_file=$weight --label smoker 20 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task coswarasmoker --pretrain $name --dim $dim --modality cough-shallow 21 | python -m src.benchmark.processing.coswara_processing --pretrain evar:$base:config/atst.yaml:$name:weight_file=$weight --label sex 22 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task coswarasex --pretrain $name --dim $dim --modality cough-shallow 23 | 24 | -------------------------------------------------------------------------------- /plugin/OPERA/evar_m2d.sh: -------------------------------------------------------------------------------- 1 | base=$EVAR 2 | weight=$1 3 | dim=$2 4 | filename=$(basename $weight) 5 | name="$(basename "$(dirname "$weight")")_${filename%.*}" 6 | 7 | python -m src.benchmark.processing.copd_processing --pretrain evar:$base:config/m2d.yaml:$name:weight_file=$weight 8 | CUDA_VISIBLE_DEVICES=0 python -m src.benchmark.linear_eval --task copd --pretrain $name --dim $dim 9 | python -m src.benchmark.processing.icbhi_processing --pretrain evar:$base:config/m2d.yaml:$name:weight_file=$weight 10 | CUDA_VISIBLE_DEVICES=0 python -m src.benchmark.linear_eval --task icbhidisease --pretrain $name --dim $dim 11 | python -m src.benchmark.processing.kauh_processing --pretrain evar:$base:config/m2d.yaml:$name:weight_file=$weight 12 | CUDA_VISIBLE_DEVICES=0 python -m src.benchmark.linear_eval --task kauh --pretrain $name --dim $dim 13 | 14 | python -m src.benchmark.processing.coughvid_processing --pretrain evar:$base:config/m2d.yaml:$name:weight_file=$weight --label covid 15 | CUDA_VISIBLE_DEVICES=0 python -m src.benchmark.linear_eval --task coughvidcovid --pretrain $name --dim $dim 16 | python -m src.benchmark.processing.coughvid_processing --pretrain evar:$base:config/m2d.yaml:$name:weight_file=$weight --label gender 17 | CUDA_VISIBLE_DEVICES=0 python -m src.benchmark.linear_eval --task coughvidsex 
--pretrain $name --dim $dim 18 | 19 | python -m src.benchmark.processing.coswara_processing --pretrain evar:$base:config/m2d.yaml:$name:weight_file=$weight --label smoker 20 | CUDA_VISIBLE_DEVICES=0 python -m src.benchmark.linear_eval --task coswarasmoker --pretrain $name --dim $dim --modality cough-shallow 21 | python -m src.benchmark.processing.coswara_processing --pretrain evar:$base:config/m2d.yaml:$name:weight_file=$weight --label sex 22 | CUDA_VISIBLE_DEVICES=0 python -m src.benchmark.linear_eval --task coswarasex --pretrain $name --dim $dim --modality cough-shallow 23 | 24 | -------------------------------------------------------------------------------- /run/all_wavcaps.sh: -------------------------------------------------------------------------------- 1 | NAME=WavCapsZS 2 | python 2pass_lineareval.py config/wavcaps.yaml cremad batch_size=16,name=$NAME,weight_file=external/WavCaps/HTSAT_BERT_zero_shot.pt 3 | python 2pass_lineareval.py config/wavcaps.yaml gtzan batch_size=16,name=$NAME,weight_file=external/WavCaps/HTSAT_BERT_zero_shot.pt 4 | python 2pass_lineareval.py config/wavcaps.yaml spcv2 batch_size=64,name=$NAME,weight_file=external/WavCaps/HTSAT_BERT_zero_shot.pt 5 | python 2pass_lineareval.py config/wavcaps.yaml esc50 batch_size=64,name=$NAME,weight_file=external/WavCaps/HTSAT_BERT_zero_shot.pt 6 | python 2pass_lineareval.py config/wavcaps.yaml us8k batch_size=64,name=$NAME,weight_file=external/WavCaps/HTSAT_BERT_zero_shot.pt 7 | python 2pass_lineareval.py config/wavcaps.yaml vc1 batch_size=64,name=$NAME,weight_file=external/WavCaps/HTSAT_BERT_zero_shot.pt 8 | python 2pass_lineareval.py config/wavcaps.yaml voxforge batch_size=64,name=$NAME,weight_file=external/WavCaps/HTSAT_BERT_zero_shot.pt 9 | python 2pass_lineareval.py config/wavcaps.yaml nsynth batch_size=64,name=$NAME,weight_file=external/WavCaps/HTSAT_BERT_zero_shot.pt 10 | python 2pass_lineareval.py config/wavcaps.yaml surge batch_size=64,name=$NAME,weight_file=external/WavCaps/HTSAT_BERT_zero_shot.pt 11 | 12 | python zeroshot.py config/wavcaps.yaml cremad batch_size=16,name=$NAME,weight_file=external/WavCaps/HTSAT_BERT_zero_shot.pt 13 | python zeroshot.py config/wavcaps.yaml gtzan batch_size=16,name=$NAME,weight_file=external/WavCaps/HTSAT_BERT_zero_shot.pt 14 | python zeroshot.py config/wavcaps.yaml nsynth batch_size=64,name=$NAME,weight_file=external/WavCaps/HTSAT_BERT_zero_shot.pt 15 | python zeroshot.py config/wavcaps.yaml esc50 batch_size=64,name=$NAME,weight_file=external/WavCaps/HTSAT_BERT_zero_shot.pt 16 | python zeroshot.py config/wavcaps.yaml us8k batch_size=64,name=$NAME,weight_file=external/WavCaps/HTSAT_BERT_zero_shot.pt 17 | 18 | python summarize.py $NAME 19 | -------------------------------------------------------------------------------- /app/bmdhs/README_BMDHS.md: -------------------------------------------------------------------------------- 1 | # BMD-HS evaluation 2 | 3 | We provide code to evaluate BMD-HS with various models. 4 | In addition, the exact stratified data splits used in the paper are provided for reproducibility. 5 | 6 | **NOTE: The code freezes the audio representation model weights.** 7 | 8 | Prepare data and metadata files before your evaluation.
9 | 10 | In this folder `app/bmdhs`, download the dataset and fix one file name: 11 | 12 | ```sh 13 | git clone https://github.com/mHealthBuet/BMD-HS-Dataset 14 | mv BMD-HS-Dataset/train/MD_085_sit_Tri6_06.wav BMD-HS-Dataset/train/MD_085_sit_Tri.wav 15 | ``` 16 | 17 | Then, the following will resample/copy data files from `BMD-HS-Dataset` to `../../work/16k/bmdhs`. 18 | 19 | ```sh 20 | python ../../prepare_wav.py BMD-HS-Dataset/ ../../work/16k/bmdhs 16000 21 | ``` 22 | 23 | In addition, the following will create metadata files as `../../evar/metadata/bmdhs[1-3].csv`. 24 | 25 | ```sh 26 | python make_metadata.py 27 | ``` 28 | 29 | ## Run evaluations 30 | 31 | In the **root folder of EVAR**, run the scripts `ev_*.sh`. The following is the complete set of command lines for the paper. 32 | 33 | The results will be recorded in `results/bmdhs-scores.csv`. 34 | 35 | ```sh 36 | bash app/bmdhs/ev_ast.sh 1 5 42 0.1 37 | bash app/bmdhs/ev_ast.sh 2 5 42 0.1 38 | bash app/bmdhs/ev_ast.sh 3 5 42 0.1 39 | 40 | bash app/bmdhs/ev_beats.sh 1 5 42 0.1 41 | bash app/bmdhs/ev_beats.sh 2 5 42 0.1 42 | bash app/bmdhs/ev_beats.sh 3 5 42 0.1 43 | 44 | bash app/bmdhs/ev_byola.sh 1 5 42 0.1 45 | bash app/bmdhs/ev_byola.sh 2 5 42 0.1 46 | bash app/bmdhs/ev_byola.sh 3 5 42 0.1 47 | 48 | bash app/bmdhs/ev_m2d.sh m2d_vit_base-80x608p16x16-221006-mr7/checkpoint-300.pth 1 5 42 0.1 49 | bash app/bmdhs/ev_m2d.sh m2d_vit_base-80x608p16x16-221006-mr7/checkpoint-300.pth 2 5 42 0.1 50 | bash app/bmdhs/ev_m2d.sh m2d_vit_base-80x608p16x16-221006-mr7/checkpoint-300.pth 3 5 42 0.1 51 | ``` 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /evar/utils/calculations.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class RunningMean: 5 | """Running mean calculator for arbitrary axis configuration. 6 | Thanks to https://math.stackexchange.com/questions/106700/incremental-averageing 7 | """ 8 | 9 | def __init__(self, axis): 10 | self.n = 0 11 | self.axis = axis 12 | 13 | def put(self, x): 14 | if self.n == 0: 15 | self.mu = x.mean(self.axis, keepdims=True) 16 | else: 17 | self.mu += (x.mean(self.axis, keepdims=True) - self.mu) / self.n 18 | self.n += 1 19 | 20 | def __call__(self): 21 | return self.mu 22 | 23 | def __len__(self): 24 | return self.n 25 | 26 | 27 | class RunningVariance: 28 | """Calculate mean/variance of tensors online. 
29 | Thanks to https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance 30 | """ 31 | 32 | def __init__(self, axis, mean): 33 | self.update_mean(mean) 34 | self.s2 = RunningMean(axis) 35 | 36 | def update_mean(self, mean): 37 | self.mean = mean 38 | 39 | def put(self, x): 40 | self.s2.put((x - self.mean) **2) 41 | 42 | def __call__(self): 43 | return self.s2() 44 | 45 | def std(self): 46 | return np.sqrt(self()) 47 | 48 | 49 | class RunningStats: 50 | def __init__(self, axis=None): 51 | self.axis = axis 52 | self.mean = self.var = None 53 | 54 | def put(self, x): 55 | assert type(x) 56 | if self.mean is None: 57 | if self.axis is None: 58 | self.axis = list(range(len(x.shape))) 59 | self.mean = RunningMean(self.axis) 60 | self.var = RunningVariance(self.axis, 0) 61 | self.mean.put(x) 62 | self.var.update_mean(self.mean()) 63 | self.var.put(x) 64 | 65 | def __call__(self): 66 | return self.mean(), self.var.std() 67 | -------------------------------------------------------------------------------- /plugin/OPERA/evar_beats.sh: -------------------------------------------------------------------------------- 1 | base=$EVAR 2 | weight=${base}/external/BEATs_iter3.pt 3 | dim=768 4 | name=BEATs 5 | GPU=0 6 | # filename=$(basename $weight) 7 | # "$(basename "$(dirname "$weight")")_${filename%.*}" 8 | 9 | python -m src.benchmark.processing.copd_processing --pretrain evar:$base:config/beats.yaml:$name:weight_file=$weight 10 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task copd --pretrain $name --dim $dim 11 | python -m src.benchmark.processing.icbhi_processing --pretrain evar:$base:config/beats.yaml:$name:weight_file=$weight 12 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task icbhidisease --pretrain $name --dim $dim 13 | python -m src.benchmark.processing.kauh_processing --pretrain evar:$base:config/beats.yaml:$name:weight_file=$weight 14 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task kauh --pretrain $name --dim $dim 15 | 16 | python -m src.benchmark.processing.coughvid_processing --pretrain evar:$base:config/beats.yaml:$name:weight_file=$weight --label covid 17 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task coughvidcovid --pretrain $name --dim $dim 18 | python -m src.benchmark.processing.coughvid_processing --pretrain evar:$base:config/beats.yaml:$name:weight_file=$weight --label gender 19 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task coughvidsex --pretrain $name --dim $dim 20 | 21 | python -m src.benchmark.processing.coswara_processing --pretrain evar:$base:config/beats.yaml:$name:weight_file=$weight --label smoker 22 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task coswarasmoker --pretrain $name --dim $dim --modality cough-shallow 23 | python -m src.benchmark.processing.coswara_processing --pretrain evar:$base:config/beats.yaml:$name:weight_file=$weight --label sex 24 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task coswarasex --pretrain $name --dim $dim --modality cough-shallow 25 | 26 | -------------------------------------------------------------------------------- /plugin/OPERA/evar_ast.sh: -------------------------------------------------------------------------------- 1 | base=$EVAR 2 | weight=${base}/external/ast/pretrained_models/ast_audioset.pth 3 | dim=768 4 | name=AST 5 | GPU=0 6 | # filename=$(basename $weight) 7 | # "$(basename "$(dirname "$weight")")_${filename%.*}" 8 | 9 | python -m 
src.benchmark.processing.copd_processing --pretrain evar:$base:config/ast.yaml:$name:weight_file=$weight 10 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task copd --pretrain $name --dim $dim 11 | python -m src.benchmark.processing.icbhi_processing --pretrain evar:$base:config/ast.yaml:$name:weight_file=$weight 12 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task icbhidisease --pretrain $name --dim $dim 13 | python -m src.benchmark.processing.kauh_processing --pretrain evar:$base:config/ast.yaml:$name:weight_file=$weight 14 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task kauh --pretrain $name --dim $dim 15 | 16 | python -m src.benchmark.processing.coughvid_processing --pretrain evar:$base:config/ast.yaml:$name:weight_file=$weight --label covid 17 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task coughvidcovid --pretrain $name --dim $dim 18 | python -m src.benchmark.processing.coughvid_processing --pretrain evar:$base:config/ast.yaml:$name:weight_file=$weight --label gender 19 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task coughvidsex --pretrain $name --dim $dim 20 | 21 | python -m src.benchmark.processing.coswara_processing --pretrain evar:$base:config/ast.yaml:$name:weight_file=$weight --label smoker 22 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task coswarasmoker --pretrain $name --dim $dim --modality cough-shallow 23 | python -m src.benchmark.processing.coswara_processing --pretrain evar:$base:config/ast.yaml:$name:weight_file=$weight --label sex 24 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task coswarasex --pretrain $name --dim $dim --modality cough-shallow 25 | 26 | -------------------------------------------------------------------------------- /plugin/OPERA/evar_htsat.sh: -------------------------------------------------------------------------------- 1 | base=$EVAR 2 | weight=${base}/external/HTSAT_AudioSet_Saved_1.ckpt 3 | dim=768 4 | name=HTS-AT 5 | GPU=0 6 | # filename=$(basename $weight) 7 | # "$(basename "$(dirname "$weight")")_${filename%.*}" 8 | 9 | python -m src.benchmark.processing.copd_processing --pretrain evar:$base:config/htsat.yaml:$name:weight_file=$weight 10 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task copd --pretrain $name --dim $dim 11 | python -m src.benchmark.processing.icbhi_processing --pretrain evar:$base:config/htsat.yaml:$name:weight_file=$weight 12 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task icbhidisease --pretrain $name --dim $dim 13 | python -m src.benchmark.processing.kauh_processing --pretrain evar:$base:config/htsat.yaml:$name:weight_file=$weight 14 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task kauh --pretrain $name --dim $dim 15 | 16 | python -m src.benchmark.processing.coughvid_processing --pretrain evar:$base:config/htsat.yaml:$name:weight_file=$weight --label covid 17 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task coughvidcovid --pretrain $name --dim $dim 18 | python -m src.benchmark.processing.coughvid_processing --pretrain evar:$base:config/htsat.yaml:$name:weight_file=$weight --label gender 19 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task coughvidsex --pretrain $name --dim $dim 20 | 21 | python -m src.benchmark.processing.coswara_processing --pretrain evar:$base:config/htsat.yaml:$name:weight_file=$weight --label smoker 22 | CUDA_VISIBLE_DEVICES=$GPU python -m 
src.benchmark.linear_eval --task coswarasmoker --pretrain $name --dim $dim --modality cough-shallow 23 | python -m src.benchmark.processing.coswara_processing --pretrain evar:$base:config/htsat.yaml:$name:weight_file=$weight --label sex 24 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task coswarasex --pretrain $name --dim $dim --modality cough-shallow 25 | 26 | -------------------------------------------------------------------------------- /plugin/OPERA/evar_byola.sh: -------------------------------------------------------------------------------- 1 | base=$EVAR 2 | weight=${base}/external/byol_a/pretrained_weights/AudioNTT2020-BYOLA-64x96d2048.pth 3 | dim=2048 4 | name=BYOL-A 5 | # filename=$(basename $weight) 6 | # "$(basename "$(dirname "$weight")")_${filename%.*}" 7 | 8 | python -m src.benchmark.processing.copd_processing --pretrain evar:$base:config/byola.yaml:$name:weight_file=$weight 9 | CUDA_VISIBLE_DEVICES=0 python -m src.benchmark.linear_eval --task copd --pretrain $name --dim $dim 10 | python -m src.benchmark.processing.icbhi_processing --pretrain evar:$base:config/byola.yaml:$name:weight_file=$weight 11 | CUDA_VISIBLE_DEVICES=0 python -m src.benchmark.linear_eval --task icbhidisease --pretrain $name --dim $dim 12 | python -m src.benchmark.processing.kauh_processing --pretrain evar:$base:config/byola.yaml:$name:weight_file=$weight 13 | CUDA_VISIBLE_DEVICES=0 python -m src.benchmark.linear_eval --task kauh --pretrain $name --dim $dim 14 | 15 | python -m src.benchmark.processing.coughvid_processing --pretrain evar:$base:config/byola.yaml:$name:weight_file=$weight --label covid 16 | CUDA_VISIBLE_DEVICES=0 python -m src.benchmark.linear_eval --task coughvidcovid --pretrain $name --dim $dim 17 | python -m src.benchmark.processing.coughvid_processing --pretrain evar:$base:config/byola.yaml:$name:weight_file=$weight --label gender 18 | CUDA_VISIBLE_DEVICES=0 python -m src.benchmark.linear_eval --task coughvidsex --pretrain $name --dim $dim 19 | 20 | python -m src.benchmark.processing.coswara_processing --pretrain evar:$base:config/byola.yaml:$name:weight_file=$weight --label smoker 21 | CUDA_VISIBLE_DEVICES=0 python -m src.benchmark.linear_eval --task coswarasmoker --pretrain $name --dim $dim --modality cough-shallow 22 | python -m src.benchmark.processing.coswara_processing --pretrain evar:$base:config/byola.yaml:$name:weight_file=$weight --label sex 23 | CUDA_VISIBLE_DEVICES=0 python -m src.benchmark.linear_eval --task coswarasex --pretrain $name --dim $dim --modality cough-shallow 24 | 25 | -------------------------------------------------------------------------------- /plugin/OPERA/evar_hubert.sh: -------------------------------------------------------------------------------- 1 | base=$EVAR 2 | dim=768 3 | GPU=0 4 | # filename=$(basename $weight) 5 | # "$(basename "$(dirname "$weight")")_${filename%.*}" 6 | 7 | for i in 1 2 3 4 5 6 7 8 9 10 11 12; do 8 | 9 | name="HuBERT_$i" 10 | python -m src.benchmark.processing.copd_processing --pretrain evar:$base:config/hubert.yaml:$name:output_layers=[$i] 11 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task copd --pretrain $name --dim $dim 12 | python -m src.benchmark.processing.icbhi_processing --pretrain evar:$base:config/hubert.yaml:$name:output_layers=[$i] 13 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task icbhidisease --pretrain $name --dim $dim 14 | python -m src.benchmark.processing.kauh_processing --pretrain evar:$base:config/hubert.yaml:$name:output_layers=[$i] 15 | 
CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task kauh --pretrain $name --dim $dim 16 | 17 | python -m src.benchmark.processing.coughvid_processing --pretrain evar:$base:config/hubert.yaml:$name:output_layers=[$i] --label covid 18 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task coughvidcovid --pretrain $name --dim $dim 19 | python -m src.benchmark.processing.coughvid_processing --pretrain evar:$base:config/hubert.yaml:$name:output_layers=[$i] --label gender 20 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task coughvidsex --pretrain $name --dim $dim 21 | 22 | python -m src.benchmark.processing.coswara_processing --pretrain evar:$base:config/hubert.yaml:$name:output_layers=[$i] --label smoker 23 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task coswarasmoker --pretrain $name --dim $dim --modality cough-shallow 24 | python -m src.benchmark.processing.coswara_processing --pretrain evar:$base:config/hubert.yaml:$name:output_layers=[$i] --label sex 25 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task coswarasex --pretrain $name --dim $dim --modality cough-shallow 26 | 27 | done 28 | -------------------------------------------------------------------------------- /plugin/OPERA/evar_wavlm.sh: -------------------------------------------------------------------------------- 1 | base=$EVAR 2 | dim=768 3 | GPU=0 4 | # filename=$(basename $weight) 5 | # "$(basename "$(dirname "$weight")")_${filename%.*}" 6 | 7 | for i in 1 2 3 4 5 6 7 8 9 10 11 12; do 8 | 9 | name="WavLM_$i" 10 | python -m src.benchmark.processing.copd_processing --pretrain evar:$base:config/wavlm.yaml:$name:+output_layers=[$i] 11 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task copd --pretrain $name --dim $dim 12 | python -m src.benchmark.processing.icbhi_processing --pretrain evar:$base:config/wavlm.yaml:$name:+output_layers=[$i] 13 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task icbhidisease --pretrain $name --dim $dim 14 | python -m src.benchmark.processing.kauh_processing --pretrain evar:$base:config/wavlm.yaml:$name:+output_layers=[$i] 15 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task kauh --pretrain $name --dim $dim 16 | 17 | python -m src.benchmark.processing.coughvid_processing --pretrain evar:$base:config/wavlm.yaml:$name:+output_layers=[$i] --label covid 18 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task coughvidcovid --pretrain $name --dim $dim 19 | python -m src.benchmark.processing.coughvid_processing --pretrain evar:$base:config/wavlm.yaml:$name:+output_layers=[$i] --label gender 20 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task coughvidsex --pretrain $name --dim $dim 21 | 22 | python -m src.benchmark.processing.coswara_processing --pretrain evar:$base:config/wavlm.yaml:$name:+output_layers=[$i] --label smoker 23 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task coswarasmoker --pretrain $name --dim $dim --modality cough-shallow 24 | python -m src.benchmark.processing.coswara_processing --pretrain evar:$base:config/wavlm.yaml:$name:+output_layers=[$i] --label sex 25 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task coswarasex --pretrain $name --dim $dim --modality cough-shallow 26 | 27 | done 28 | -------------------------------------------------------------------------------- /plugin/OPERA/evar_atst_frame.sh: 
-------------------------------------------------------------------------------- 1 | base=$EVAR 2 | weight=${base}/external/atstframe_base.ckpt 3 | dim=9216 4 | name=ATST-Frame 5 | GPU=0 6 | # filename=$(basename $weight) 7 | # "$(basename "$(dirname "$weight")")_${filename%.*}" 8 | 9 | python -m src.benchmark.processing.copd_processing --pretrain evar:$base:config/atst_frame.yaml:$name:weight_file=$weight 10 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task copd --pretrain $name --dim $dim 11 | python -m src.benchmark.processing.icbhi_processing --pretrain evar:$base:config/atst_frame.yaml:$name:weight_file=$weight 12 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task icbhidisease --pretrain $name --dim $dim 13 | python -m src.benchmark.processing.kauh_processing --pretrain evar:$base:config/atst_frame.yaml:$name:weight_file=$weight 14 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task kauh --pretrain $name --dim $dim 15 | 16 | python -m src.benchmark.processing.coughvid_processing --pretrain evar:$base:config/atst_frame.yaml:$name:weight_file=$weight --label covid 17 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task coughvidcovid --pretrain $name --dim $dim 18 | python -m src.benchmark.processing.coughvid_processing --pretrain evar:$base:config/atst_frame.yaml:$name:weight_file=$weight --label gender 19 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task coughvidsex --pretrain $name --dim $dim 20 | 21 | python -m src.benchmark.processing.coswara_processing --pretrain evar:$base:config/atst_frame.yaml:$name:weight_file=$weight --label smoker 22 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task coswarasmoker --pretrain $name --dim $dim --modality cough-shallow 23 | python -m src.benchmark.processing.coswara_processing --pretrain evar:$base:config/atst_frame.yaml:$name:weight_file=$weight --label sex 24 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task coswarasex --pretrain $name --dim $dim --modality cough-shallow 25 | 26 | -------------------------------------------------------------------------------- /plugin/OPERA/evar_beats_plus.sh: -------------------------------------------------------------------------------- 1 | base=$EVAR 2 | weight=${base}/external/BEATs_iter3_plus_AS2M.pt 3 | dim=768 4 | name=BEATs 5 | GPU=0 6 | # filename=$(basename $weight) 7 | # "$(basename "$(dirname "$weight")")_${filename%.*}" 8 | 9 | python -m src.benchmark.processing.copd_processing --pretrain evar:$base:config/beats_plus.yaml:$name:weight_file=$weight 10 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task copd --pretrain $name --dim $dim 11 | python -m src.benchmark.processing.icbhi_processing --pretrain evar:$base:config/beats_plus.yaml:$name:weight_file=$weight 12 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task icbhidisease --pretrain $name --dim $dim 13 | python -m src.benchmark.processing.kauh_processing --pretrain evar:$base:config/beats_plus.yaml:$name:weight_file=$weight 14 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task kauh --pretrain $name --dim $dim 15 | 16 | python -m src.benchmark.processing.coughvid_processing --pretrain evar:$base:config/beats_plus.yaml:$name:weight_file=$weight --label covid 17 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task coughvidcovid --pretrain $name --dim $dim 18 | python -m src.benchmark.processing.coughvid_processing --pretrain 
evar:$base:config/beats_plus.yaml:$name:weight_file=$weight --label gender 19 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task coughvidsex --pretrain $name --dim $dim 20 | 21 | python -m src.benchmark.processing.coswara_processing --pretrain evar:$base:config/beats_plus.yaml:$name:weight_file=$weight --label smoker 22 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task coswarasmoker --pretrain $name --dim $dim --modality cough-shallow 23 | python -m src.benchmark.processing.coswara_processing --pretrain evar:$base:config/beats_plus.yaml:$name:weight_file=$weight --label sex 24 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task coswarasex --pretrain $name --dim $dim --modality cough-shallow 25 | 26 | -------------------------------------------------------------------------------- /evar/ar_wavlm.py: -------------------------------------------------------------------------------- 1 | """Wrapper code for: 2 | 3 | WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing 4 | 5 | ## Reference 6 | - [1] https://arxiv.org/abs/2110.13900 7 | - [2] https://huggingface.co/microsoft/wavlm-large 8 | - [3] https://github.com/microsoft/unilm/tree/master/wavlm 9 | - [4] https://github.com/huggingface/transformers/blob/main/src/transformers/models/wavlm/modeling_wavlm.py 10 | """ 11 | 12 | from evar.ar_base import BaseAudioRepr, temporal_pooling 13 | import logging 14 | import torch 15 | try: 16 | from transformers import WavLMModel, Wav2Vec2Processor 17 | except: 18 | logging.error('Install transformers.\n>>> pip install transformers') 19 | 20 | 21 | class AR_WavLM(BaseAudioRepr): 22 | 23 | def __init__(self, cfg): 24 | super().__init__(cfg=cfg) 25 | 26 | self.processor = Wav2Vec2Processor.from_pretrained('facebook/wav2vec2-base-960h') # instead of cfg.pretrained_model because non-ft models fail. wav2vec2-base-960h should be fine for preprocessing. 27 | self.backbone = WavLMModel.from_pretrained(cfg.pretrained_model) 28 | 29 | def encode_frames(self, batch_audio): 30 | device = batch_audio.device 31 | preprocessed = self.processor(batch_audio.cpu().numpy(), return_tensors="pt", sampling_rate=16000).input_values 32 | preprocessed = preprocessed[0] if preprocessed.shape[0] == 1 else preprocessed # [1, B, raw wave length] -> [B, raw wave length] 33 | preprocessed = preprocessed.to(device) 34 | hidden_states = self.backbone(preprocessed, output_hidden_states=True).hidden_states # [B, T, D] 35 | # stack layer outputs 36 | states_to_stack = [hidden_states[index] for index in self.cfg.output_layers] if self.cfg.output_layers else hidden_states 37 | features = torch.cat(states_to_stack, axis=-1) 38 | return features.transpose(1, 2) # [B, D, T] 39 | 40 | def forward(self, batch_audio): 41 | return temporal_pooling(self, self.encode_frames(batch_audio)) 42 | -------------------------------------------------------------------------------- /plugin/MARBLE/evar_marble.sh: -------------------------------------------------------------------------------- 1 | # 2 | NAME=$1 3 | WEIGHT=$2 4 | SEED=42 5 | ITER=5 6 | FEATURES=768 7 | FEAT_NAME=$NAME 8 | 9 | if [ $# -gt 2 ]; then 10 | SEED=$3 11 | echo "Seed = $SEED." 12 | fi 13 | if [ $# -gt 3 ]; then 14 | ITER=$4 15 | echo "Number of iteration = $ITER." 16 | fi 17 | if [ $# -gt 4 ]; then 18 | FEAT_NAME=$5 19 | echo "Feature name = $FEAT_NAME." 20 | fi 21 | if [ $# -gt 5 ]; then 22 | FEATURES=$6 23 | echo "Num_features = $FEATURES." 
24 | fi 25 | 26 | OPTION="dataset.pre_extract.output_dir=outputs/feat/evar_"$FEAT_NAME"_feats,,dataset.input_dir=outputs/feat/evar_"$FEAT_NAME"_feats,,dataset.pre_extract.feature_extractor.pretrain.evar_config=$EVAR/config/$NAME.yaml,,dataset.pre_extract.feature_extractor.pretrain.weight=$WEIGHT,,dataset.pre_extract.feature_extractor.pretrain.num_features=$FEATURES" 27 | 28 | #GS 29 | TASKS="EMO GTZAN MTT" 30 | for task in $TASKS; do 31 | python . extract -c configs/evar/$task.yaml -o $OPTION",,trainer.seed=$SEED" 32 | for i in $(seq $ITER); do 33 | python . probe -c configs/evar/$task.yaml -o $OPTION",,trainer.seed=$SEED" 34 | SEED=$((SEED + 1)) 35 | done 36 | done 37 | 38 | python . extract -c configs/evar/VocalSetS.yaml -o $OPTION 39 | TASKS="VocalSetS VocalSetT" 40 | for task in $TASKS; do 41 | for i in $(seq $ITER); do 42 | python . probe -c configs/evar/$task.yaml -o $OPTION",,trainer.seed=$SEED" 43 | SEED=$((SEED + 1)) 44 | done 45 | done 46 | 47 | python . extract -c configs/evar/NSynthI.yaml -o $OPTION 48 | TASKS="NSynthI NSynthP" 49 | for task in $TASKS; do 50 | for i in $(seq $ITER); do 51 | python . probe -c configs/evar/$task.yaml -o $OPTION",,trainer.seed=$SEED" 52 | SEED=$((SEED + 1)) 53 | done 54 | done 55 | 56 | python . extract -c configs/evar/MTGGenre.yaml -o $OPTION 57 | TASKS="MTGGenre MTGInstrument MTGMood MTGTop50" 58 | for task in $TASKS; do 59 | for i in $(seq $ITER); do 60 | python . probe -c configs/evar/$task.yaml -o $OPTION",,trainer.seed=$SEED" 61 | SEED=$((SEED + 1)) 62 | done 63 | done 64 | -------------------------------------------------------------------------------- /plugin/OPERA/evar_wav2vec2.sh: -------------------------------------------------------------------------------- 1 | base=$EVAR 2 | dim=512 3 | GPU=0 4 | # filename=$(basename $weight) 5 | # "$(basename "$(dirname "$weight")")_${filename%.*}" 6 | 7 | for i in 1 2 3 4 5 6 7 8 9 10 11 12; do 8 | 9 | name="wav2vec2_$i" 10 | python -m src.benchmark.processing.copd_processing --pretrain evar:$base:config/wav2vec2feature.yaml:$name:+output_layers=[$i] 11 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task copd --pretrain $name --dim $dim 12 | python -m src.benchmark.processing.icbhi_processing --pretrain evar:$base:config/wav2vec2feature.yaml:$name:+output_layers=[$i] 13 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task icbhidisease --pretrain $name --dim $dim 14 | python -m src.benchmark.processing.kauh_processing --pretrain evar:$base:config/wav2vec2feature.yaml:$name:+output_layers=[$i] 15 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task kauh --pretrain $name --dim $dim 16 | 17 | python -m src.benchmark.processing.coughvid_processing --pretrain evar:$base:config/wav2vec2feature.yaml:$name:+output_layers=[$i] --label covid 18 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task coughvidcovid --pretrain $name --dim $dim 19 | python -m src.benchmark.processing.coughvid_processing --pretrain evar:$base:config/wav2vec2feature.yaml:$name:+output_layers=[$i] --label gender 20 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task coughvidsex --pretrain $name --dim $dim 21 | 22 | python -m src.benchmark.processing.coswara_processing --pretrain evar:$base:config/wav2vec2feature.yaml:$name:+output_layers=[$i] --label smoker 23 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task coswarasmoker --pretrain $name --dim $dim --modality cough-shallow 24 | python -m 
src.benchmark.processing.coswara_processing --pretrain evar:$base:config/wav2vec2feature.yaml:$name:+output_layers=[$i] --label sex 25 | CUDA_VISIBLE_DEVICES=$GPU python -m src.benchmark.linear_eval --task coswarasex --pretrain $name --dim $dim --modality cough-shallow 26 | 27 | done 28 | -------------------------------------------------------------------------------- /evar/ar_hubert.py: -------------------------------------------------------------------------------- 1 | """Wrapper code for: 2 | 3 | HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units 4 | 5 | ## Reference 6 | - [1] https://ai.facebook.com/blog/hubert-self-supervised-representation-learning-for-speech-recognition-generation-and-compression/ 7 | - [2] https://huggingface.co/facebook/hubert-large-ls960-ft 8 | - [3] https://github.com/huggingface/transformers/blob/main/src/transformers/models/hubert/modeling_hubert.py 9 | """ 10 | 11 | from evar.ar_base import BaseAudioRepr, temporal_pooling 12 | import logging 13 | import torch 14 | try: 15 | from transformers import HubertModel, Wav2Vec2Processor 16 | except: 17 | logging.error('Install transformers.\n>>> pip install transformers') 18 | 19 | 20 | class AR_Hubert(BaseAudioRepr): 21 | 22 | def __init__(self, cfg): 23 | super().__init__(cfg=cfg) 24 | 25 | self.processor = Wav2Vec2Processor.from_pretrained('facebook/wav2vec2-base-960h') 26 | # instead of cfg.pretrained_model because non-ft models fail. wav2vec2-base-960h should be fine for preprocessing. 27 | self.backbone = HubertModel.from_pretrained(cfg.pretrained_model) 28 | 29 | def encode_frames(self, batch_audio): 30 | device = batch_audio.device 31 | preprocessed = self.processor(batch_audio.cpu().numpy(), return_tensors="pt", sampling_rate=16000).input_values 32 | preprocessed = preprocessed[0] if preprocessed.shape[0] == 1 else preprocessed # [1, B, raw wave length] -> [B, raw wave length] 33 | preprocessed = preprocessed.to(device) 34 | hidden_states = self.backbone(preprocessed, output_hidden_states=True).hidden_states # [B, T, D] 35 | # stack layer outputs 36 | states_to_stack = [hidden_states[index] for index in self.cfg.output_layers] if self.cfg.output_layers else hidden_states 37 | features = torch.cat(states_to_stack, axis=-1) 38 | return features.transpose(1, 2) # [B, D, T] 39 | 40 | def forward(self, batch_audio): 41 | return temporal_pooling(self, self.encode_frames(batch_audio)) 42 | -------------------------------------------------------------------------------- /plugin/OPERA/evar_m2d_layers.sh: -------------------------------------------------------------------------------- 1 | base=$EVAR 2 | weight=$1 3 | dim=$2 4 | filename=$(basename $weight) 5 | name="$(basename "$(dirname "$weight")")_${filename%.*}" 6 | 7 | for i in 0 1 2 3 4 5 6 7 8 9 10 11; do 8 | 9 | name="M2D_$i" 10 | 11 | python -m src.benchmark.processing.copd_processing --pretrain evar:$base:config/m2d.yaml:$name:weight_file=$weight,+output_layers=[$i] 12 | CUDA_VISIBLE_DEVICES=0 python -m src.benchmark.linear_eval --task copd --pretrain $name --dim $dim 13 | python -m src.benchmark.processing.icbhi_processing --pretrain evar:$base:config/m2d.yaml:$name:weight_file=$weight,+output_layers=[$i] 14 | CUDA_VISIBLE_DEVICES=0 python -m src.benchmark.linear_eval --task icbhidisease --pretrain $name --dim $dim 15 | python -m src.benchmark.processing.kauh_processing --pretrain evar:$base:config/m2d.yaml:$name:weight_file=$weight,+output_layers=[$i] 16 | CUDA_VISIBLE_DEVICES=0 python -m src.benchmark.linear_eval 
--task kauh --pretrain $name --dim $dim 17 | 18 | python -m src.benchmark.processing.coughvid_processing --pretrain evar:$base:config/m2d.yaml:$name:weight_file=$weight,+output_layers=[$i] --label covid 19 | CUDA_VISIBLE_DEVICES=0 python -m src.benchmark.linear_eval --task coughvidcovid --pretrain $name --dim $dim 20 | python -m src.benchmark.processing.coughvid_processing --pretrain evar:$base:config/m2d.yaml:$name:weight_file=$weight,+output_layers=[$i] --label gender 21 | CUDA_VISIBLE_DEVICES=0 python -m src.benchmark.linear_eval --task coughvidsex --pretrain $name --dim $dim 22 | 23 | python -m src.benchmark.processing.coswara_processing --pretrain evar:$base:config/m2d.yaml:$name:weight_file=$weight,+output_layers=[$i] --label smoker 24 | CUDA_VISIBLE_DEVICES=0 python -m src.benchmark.linear_eval --task coswarasmoker --pretrain $name --dim $dim --modality cough-shallow 25 | python -m src.benchmark.processing.coswara_processing --pretrain evar:$base:config/m2d.yaml:$name:weight_file=$weight,+output_layers=[$i] --label sex 26 | CUDA_VISIBLE_DEVICES=0 python -m src.benchmark.linear_eval --task coswarasex --pretrain $name --dim $dim --modality cough-shallow 27 | 28 | done 29 | -------------------------------------------------------------------------------- /evar/ar_vggish.py: -------------------------------------------------------------------------------- 1 | """Wrapper code for: 2 | 3 | CNN Architectures for Large-Scale Audio Classification 4 | 5 | ## References 6 | - [1] https://research.google/pubs/pub45611/ 7 | - [2] VGGish: https://github.com/tcvrick/audioset-vggish-tensorflow-to-pytorch/blob/master/vggish.py 8 | - [3] VGG: https://github.com/pytorch/vision/blob/master/torchvision/models/vgg.py 9 | """ 10 | 11 | from evar.ar_base import (BaseAudioRepr, temporal_pooling) 12 | import torch 13 | import numpy as np 14 | import logging 15 | try: 16 | from external.tcvrick_vggish import vggish 17 | from external.tcvrick_vggish.audioset import vggish_input 18 | except: 19 | pass # logging.error('Make your copy of VGGish under external folder. Check Preparing-models.md for the details.') 20 | 21 | 22 | class AR_VGGish(BaseAudioRepr): 23 | def __init__(self, cfg, vggish_class=None): 24 | super().__init__(cfg=cfg) 25 | 26 | self.vggish = vggish.VGGish() if vggish_class is None else vggish_class() 27 | weight_file = 'external/pytorch_vggish.pth' 28 | logging.info(f' using pretrained weight: {weight_file}') 29 | self.vggish.load_state_dict(torch.load(weight_file)) 30 | 31 | def to_audio_features(self, batch_audio): 32 | # raw audio -> spectrogram 33 | device = batch_audio.device 34 | X = [vggish_input.waveform_to_examples(x.cpu().numpy(), self.cfg.sample_rate) for x in batch_audio] 35 | X = torch.tensor(np.array(X)).float().to(device) # ex.) [256, 7, 96, 64] if fsd50k. [B,Frame,T,F] 36 | return X 37 | 38 | def encode_frames(self, batch_audio): 39 | X = self.to_audio_features(batch_audio) 40 | Xs = [self.vggish(X[:, i:i+1]) for i in range(X.shape[1])] 41 | X = torch.stack(Xs, dim=2) # [B, D] x Frame -> [B, D, Frame] 42 | return X 43 | 44 | def forward(self, batch_audio): 45 | return temporal_pooling(self, self.encode_frames(batch_audio)) # [B, D] 46 | 47 | 48 | class AR_VGGish_4K(AR_VGGish): 49 | def __init__(self, cfg): 50 | super().__init__(cfg=cfg) 51 | 52 | # Remove all the layers after the first FC layer. 
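        # Assuming the tcvrick VGGish port keeps the standard head of six modules --
        # Linear(..., 4096), ReLU, Linear(4096, 4096), ReLU, Linear(4096, 128), ReLU --
        # the slice below keeps only the first Linear + ReLU pair, so this variant
        # outputs 4096-d ("4K") features instead of the 128-d VGGish embedding.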
53 | self.vggish.fc = torch.nn.Sequential(*list(self.vggish.fc.children())[:-4]) 54 | -------------------------------------------------------------------------------- /evar/ar_esresnext_fbsp.py: -------------------------------------------------------------------------------- 1 | """Wrapper code for: 2 | 3 | ESResNe(X)t-fbsp: Learning Robust Time-Frequency Transformation of Audio 4 | 5 | ## Reference 6 | - [1] https://arxiv.org/abs/2104.11587 7 | - [2] https://github.com/AndreyGuzhov/ESResNeXt-fbsp 8 | """ 9 | 10 | from evar.ar_base import BaseAudioRepr 11 | import torch 12 | import librosa 13 | import numpy as np 14 | import logging 15 | from evar.model_utils import ensure_weights 16 | try: 17 | from external.esresnext.model.esresnet_fbsp import ESResNeXtFBSP 18 | except: 19 | logging.info('Make your copy of ESResNeXt-fbsp under external folder. Check Preparing-models.md for the details.') 20 | class ESResNeXtFBSP: 21 | pass 22 | 23 | 24 | class ESResNeXtFBSP_(ESResNeXtFBSP): 25 | 26 | def forward_reduced_featues(self, x, tfm=None): 27 | x = self._forward_pre_processing(x) 28 | if tfm is not None: 29 | x = tfm(x) 30 | x = self._forward_features(x) 31 | x = self._forward_reduction(x) 32 | return x 33 | 34 | 35 | class AR_ESResNeXtFBSP(BaseAudioRepr): 36 | def __init__(self, cfg): 37 | super().__init__(cfg=cfg) 38 | 39 | self.backbone = ESResNeXtFBSP_( 40 | **{"n_fft": 2048, 41 | "hop_length": 561, 42 | "win_length": 1654, 43 | "window": "blackmanharris", 44 | "normalized": True, 45 | "onesided": True, 46 | "spec_height": -1, 47 | "spec_width": -1, 48 | "num_classes": 527, 49 | "apply_attention": True, 50 | } 51 | ) 52 | ensure_weights('external/ESResNeXtFBSP_AudioSet.pt', 53 | 'https://github.com/AndreyGuzhov/ESResNeXt-fbsp/releases/download/v0.1/ESResNeXtFBSP_AudioSet.pt') 54 | self.backbone.load_state_dict(torch.load('external/ESResNeXtFBSP_AudioSet.pt')) 55 | 56 | def encode_frames(self, batch_audio): 57 | X = self.forward(batch_audio) 58 | X = X.unsqueeze(1) # Already have temporally pooled, just adding extra frame dimension [B, 2048] -> [B, 1, 2048] 59 | return X 60 | 61 | def forward(self, batch_audio): 62 | return self.backbone.forward_reduced_featues(batch_audio * 32767) # [B, 2048] 63 | -------------------------------------------------------------------------------- /external/wavcaps.patch: -------------------------------------------------------------------------------- 1 | diff --git a/retrieval/models/ase_model.py b/retrieval/models/ase_model.py 2 | index 04e2d02..ca8ae98 100644 3 | --- a/retrieval/models/ase_model.py 4 | +++ b/retrieval/models/ase_model.py 5 | @@ -6,12 +6,12 @@ 6 | 7 | import torch 8 | import torch.nn as nn 9 | -from models.audio_encoder import AudioEncoder 10 | -from models.text_encoder import TextEncoder 11 | +from ..models.audio_encoder import AudioEncoder 12 | +from ..models.text_encoder import TextEncoder 13 | import torch.nn.functional as F 14 | import copy 15 | -from tools.losses import AudioTextContrastiveLoss, NTXent 16 | -from tools.utils import remove_grad 17 | +from ..tools.losses import AudioTextContrastiveLoss, NTXent 18 | +from ..tools.utils import remove_grad 19 | 20 | 21 | class ASE(nn.Module): 22 | diff --git a/retrieval/models/audio_encoder.py b/retrieval/models/audio_encoder.py 23 | index e3b9394..2201e7f 100644 24 | --- a/retrieval/models/audio_encoder.py 25 | +++ b/retrieval/models/audio_encoder.py 26 | @@ -6,8 +6,8 @@ 27 | 28 | import torch 29 | import torch.nn as nn 30 | -from models.cnns import ResNet38, Cnn14 31 | -from models.htsat 
import HTSAT_Swin_Transformer 32 | +from ..models.cnns import ResNet38, Cnn14 33 | +from ..models.htsat import HTSAT_Swin_Transformer 34 | 35 | 36 | class AudioEncoder(nn.Module): 37 | diff --git a/retrieval/models/cnns.py b/retrieval/models/cnns.py 38 | index be2ed5a..61ccd7c 100644 39 | --- a/retrieval/models/cnns.py 40 | +++ b/retrieval/models/cnns.py 41 | @@ -12,7 +12,7 @@ import torch 42 | import torch.nn as nn 43 | import torch.nn.functional as F 44 | from torchlibrosa.augmentation import SpecAugmentation 45 | -from models.feature_extractor import AudioFeature 46 | +from ..models.feature_extractor import AudioFeature 47 | 48 | 49 | def init_layer(layer): 50 | diff --git a/retrieval/models/htsat.py b/retrieval/models/htsat.py 51 | index b5a9ff2..4795f45 100644 52 | --- a/retrieval/models/htsat.py 53 | +++ b/retrieval/models/htsat.py 54 | @@ -23,7 +23,7 @@ from torch.nn.init import _calculate_fan_in_and_fan_out 55 | from itertools import repeat 56 | from typing import List 57 | 58 | -from models.feature_extractor import AudioFeature 59 | +from ..models.feature_extractor import AudioFeature 60 | 61 | 62 | def interpolate(x, ratio): 63 | -------------------------------------------------------------------------------- /prepare_wav.py: -------------------------------------------------------------------------------- 1 | """Audio file converter. 2 | 3 | This converts the original audio files found in the source folder recursively, 4 | then store under the destination folder with the same relative path structure. 5 | 6 | The conversion process includes the following steps: 7 | - Stereo to mono 8 | - Resample to the sampling rate 9 | 10 | Usage: 11 | python convert_wav.py /path/to/fsd50k work/16k/fsd50k 16000 12 | python convert_wav.py /path/to/speech_commands_v0.02 work/16k/spcv2 16000 13 | python convert_wav.py /data/A/VoxCeleb1 work/16k/vc1 16000 14 | """ 15 | 16 | from pathlib import Path 17 | from multiprocessing import Pool 18 | import fire 19 | from tqdm import tqdm 20 | import soundfile as sf 21 | import librosa 22 | 23 | 24 | def _converter_worker(args): 25 | subpathname, from_dir, to_dir, sample_rate, verbose = args 26 | from_dir, to_dir = Path(from_dir), Path(to_dir) 27 | to_name = to_dir/subpathname 28 | if verbose: 29 | print(from_dir, '->', to_name) 30 | 31 | # load wav 32 | wav, org_sr = sf.read(from_dir/subpathname, dtype='float32', always_2d=True) 33 | wav = wav.T # (wave length, 1 or 2) -> (1 or 2, wave length) 34 | 35 | # stereo to mono (compatible with librosa) 36 | # ref: https://librosa.org/doc/main/generated/librosa.to_mono.html#librosa.to_mono 37 | wav = wav.mean(axis=0) 38 | 39 | # resample 40 | wav = librosa.resample(wav, orig_sr=org_sr, target_sr=sample_rate) 41 | 42 | # save wav 43 | to_name.parent.mkdir(exist_ok=True, parents=True) 44 | sf.write(to_name, data=wav, samplerate=sample_rate) # subtype=sf.default_subtype('WAV') -- not always wav 45 | 46 | return to_name.name 47 | 48 | 49 | def convert_wav(from_dir, to_dir, sample_rate, suffix='.wav', verbose=False) -> None: 50 | from_dir = str(from_dir) 51 | files = [str(f).replace(from_dir, '') for f in Path(from_dir).glob(f'**/*{suffix}')] 52 | files = [f[1:] if f[0] == '/' else f for f in files] 53 | print(f'Processing {len(files)} {suffix} files at a sampling rate of {sample_rate} Hz...') 54 | assert len(files) > 0 55 | 56 | with Pool() as p: 57 | args = [[f, from_dir, to_dir, sample_rate, verbose] for f in files] 58 | shapes = list(tqdm(p.imap(_converter_worker, args), total=len(args))) 59 | 60 | print('finished.') 
61 | 62 | 63 | if __name__ == "__main__": 64 | fire.Fire(convert_wav) 65 | -------------------------------------------------------------------------------- /evar/utils/download_voxforge.py: -------------------------------------------------------------------------------- 1 | """ 2 | Download VoxForge dataset to your "to_folder". 3 | This code uses a file list from TFDS, downloads .tgz files, and extract them. 4 | The definition of labels and data splits is available in evar/metadata/voxforge.csv. 5 | 6 | Following TFDS implementation for the details. 7 | 8 | ## Usage 9 | 10 | '''sh 11 | python download_voxforge.py 12 | ''' 13 | 14 | ## Reference 15 | 16 | - [1] http://www.voxforge.org/ 17 | - [2] TFDS: https://github.com/tensorflow/datasets/blob/master/tensorflow_datasets/audio/voxforge.py 18 | 19 | @article{maclean2018voxforge, 20 | title={Voxforge}, 21 | author={MacLean, Ken}, 22 | journal={Ken MacLean.[Online]. Available: http://www.voxforge.org/home.[Acedido em 2012]}, 23 | year={2018} 24 | } 25 | """ 26 | 27 | import urllib.request 28 | import shutil 29 | import os 30 | from pathlib import Path 31 | from multiprocessing import Pool 32 | from tqdm import tqdm 33 | import fire 34 | 35 | 36 | TFDS_URL = 'https://storage.googleapis.com/tfds-data/downloads/voxforge/voxforge_urls.txt' 37 | 38 | 39 | def _download_extract_worker(args): 40 | url, filename, dest_path = args 41 | 42 | if (Path(dest_path)/Path(filename).stem).exists(): 43 | #print(' skip', Path(filename).stem) 44 | #print('.', end='') 45 | return 46 | 47 | tmpfile = '/tmp/' + filename 48 | try: 49 | urllib.request.urlretrieve('http://' + url, tmpfile) 50 | except: 51 | print('ERROR to download', url) 52 | return 53 | try: 54 | shutil.unpack_archive(tmpfile, dest_path) 55 | except: 56 | print('ERROR to extract', url) 57 | 58 | os.remove(tmpfile) 59 | 60 | 61 | def download_extract_voxforge(dest_path): 62 | file = urllib.request.urlopen(TFDS_URL) 63 | urls = [line.decode('utf-8').strip() for line in file] 64 | filenames = [url.split('/')[-1] for url in urls] 65 | assert len(set(filenames)) == len(urls) 66 | 67 | print('Downloading voxforge for', len(urls), 'tgz archives.') 68 | Path(dest_path).mkdir(exist_ok=True, parents=True) 69 | with Pool() as p: 70 | args = [[url, filename, dest_path] for url, filename in zip(urls, filenames)] 71 | shapes = list(tqdm(p.imap(_download_extract_worker, args), total=len(args))) 72 | 73 | print('finished.') 74 | 75 | 76 | if __name__ == "__main__": 77 | fire.Fire(download_extract_voxforge) 78 | -------------------------------------------------------------------------------- /app/circor/README_CirCor.md: -------------------------------------------------------------------------------- 1 | # CirCor evaluation 2 | 3 | We provide code to evaluate CirCor with various models. 4 | In addition, the exact stratified data splits used in the paper are provided for reproducibility. 5 | 6 | **NOTE: The code freezes the audio representation model weights.** 7 | 8 | Prepare code and download datasets before your evaluation. 
9 | 10 | ## Prepare codebase 11 | 12 | In this folder `app/circor`, run the following: 13 | 14 | ```sh 15 | git clone https://github.com/Benjamin-Walker/heart-murmur-detection.git 16 | (cd heart-murmur-detection && git checkout 60f5420918b151e06932f70a52649d9562f0be2d) 17 | patch -p1 < patch-heart-murmur-detection.diff 18 | 19 | wget https://raw.githubusercontent.com/nttcslab/m2d/refs/heads/master/app/circor/datalist_stratified_data1.csv 20 | wget https://raw.githubusercontent.com/nttcslab/m2d/refs/heads/master/app/circor/datalist_stratified_data2.csv 21 | wget https://raw.githubusercontent.com/nttcslab/m2d/refs/heads/master/app/circor/datalist_stratified_data3.csv 22 | ``` 23 | 24 | ## Download and rearrange dataset 25 | 26 | In this folder `app/circor`, download the dataset: 27 | 28 | ```sh 29 | wget -r -N -c -np https://physionet.org/files/circor-heart-sound/1.0.3/ 30 | ``` 31 | 32 | Then, do the following to rearrange data files into stratified splits and copy them under `heart-murmur-detection/data` and `../../work/16k/circor`. 33 | 34 | ```sh 35 | python rearrange_data.py 36 | ``` 37 | 38 | It also creates metadata files as `../../evar/metadata/circor[1-3].csv`. 39 | 40 | ## Run evaluations 41 | 42 | In the **root folder of EVAR**, run the scripts `ev_*.sh`. The following is the complete set of command lines for the paper. 43 | 44 | The results will be recorded in `results/circor-scores.csv`. 45 | 46 | ```sh 47 | bash app/circor/ev_ast.sh 1 5 7 0.03 48 | bash app/circor/ev_ast.sh 2 5 7 0.03 49 | bash app/circor/ev_ast.sh 3 5 7 0.03 50 | 51 | bash app/circor/ev_beats.sh 1 5 7 0.03 52 | bash app/circor/ev_beats.sh 2 5 7 0.03 53 | bash app/circor/ev_beats.sh 3 5 7 0.03 54 | 55 | bash app/circor/ev_byola.sh 1 5 7 0.1 56 | bash app/circor/ev_byola.sh 2 5 7 0.1 57 | bash app/circor/ev_byola.sh 3 5 7 0.1 58 | 59 | bash app/circor/ev_m2d.sh m2d_vit_base-80x608p16x16-221006-mr7/checkpoint-300.pth 1 5 7 0.1 60 | bash app/circor/ev_m2d.sh m2d_vit_base-80x608p16x16-221006-mr7/checkpoint-300.pth 2 5 7 0.1 61 | bash app/circor/ev_m2d.sh m2d_vit_base-80x608p16x16-221006-mr7/checkpoint-300.pth 3 5 7 0.1 62 | ``` 63 | -------------------------------------------------------------------------------- /evar/utils/download_cremad.py: -------------------------------------------------------------------------------- 1 | """ 2 | Download CREMA-D dataset to your "to_folder". 3 | This code uses a file list from TFDS and downloads .wav files only. 4 | The definition of labels and data splits is available in evar/metadata/cremad.csv. 5 | 6 | Following NOSS [2] split. We assign 70 % of speakers (63) as training, 10 % (9) as validation, 7 | and the remaining 20 % (19) as test splits, with no speaker duplication in multiple splits. 8 | 9 | ## Usage 10 | 11 | '''sh 12 | python download_cremad.py 13 | ''' 14 | 15 | ## Reference 16 | 17 | - [1] H. Cao, D. G. Cooper, M. K. Keutmann, R. C. Gur, A. Nenkova and R. Verma, "CREMA-D: Crowd-Sourced Emotional Multimodal Actors Dataset," in IEEE Transactions on Affective Computing, vol. 5, no. 4, pp. 377-390, 1 Oct.-Dec. 2014, doi: 10.1109/TAFFC.2014.2336244. 18 | - [2] J. Shor, A. Jansen, R. Maor, O. Lang, O. Tuval, F. d. C. Quitry, M. Tagliasacchi, I. Shavitt, D. Emanuel, and Y. Haviv, “Towards learning a universal non-semantic representation of speech,” in Interspeech, Oct 2020. 
19 | - [3] https://github.com/tensorflow/datasets/blob/master/tensorflow_datasets/audio/crema_d.py 20 | """ 21 | 22 | import urllib.request 23 | from pathlib import Path 24 | from multiprocessing import Pool 25 | from tqdm import tqdm 26 | import fire 27 | 28 | 29 | TFDS_URL = 'https://storage.googleapis.com/tfds-data/manual_checksums/crema_d.txt' 30 | 31 | 32 | def _download_worker(args): 33 | url, dest_path = args 34 | filename = url.split('/')[-1] 35 | 36 | if (Path(dest_path)/Path(filename).name).exists(): 37 | print(' skip', Path(filename).stem) 38 | return 39 | 40 | destfile = f'{dest_path}/{filename}' 41 | try: 42 | urllib.request.urlretrieve(url, destfile) 43 | except: 44 | print('ERROR to download', url) 45 | 46 | 47 | def download_extract_cremad(dest_path): 48 | lines = urllib.request.urlopen(TFDS_URL) 49 | urls = [line.decode('utf-8').strip().split()[0] for line in lines] 50 | urls = [url for url in urls if url[-4:] == '.wav'] # wav only, excluding summaryTable.csv 51 | 52 | print('Downloading CREMA-D for', len(urls), 'wav files.') 53 | Path(dest_path).mkdir(exist_ok=True, parents=True) 54 | with Pool() as p: 55 | args = [[url, dest_path] for url in urls] 56 | shapes = list(tqdm(p.imap(_download_worker, args), total=len(args))) 57 | 58 | print('finished.') 59 | 60 | 61 | if __name__ == "__main__": 62 | fire.Fire(download_extract_cremad) 63 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /evar/ar_htsat.py: -------------------------------------------------------------------------------- 1 | """Wrapper code for: 2 | 3 | HTS-AT: A Hierarchical Token-Semantic Audio Transformer for Sound Classification and Detection 4 | 5 | ## Reference 6 | - [1] https://arxiv.org/abs/2202.00874 7 | - [2] https://github.com/RetroCirce/HTS-Audio-Transformer 8 | """ 9 | 10 | from evar.ar_base import BaseAudioRepr, temporal_pooling 11 | import sys 12 | import logging 13 | import torch 14 | try: 15 | import os 16 | evar_home = os.getenv('EVAR', '') 17 | sys.path.append(os.path.join(evar_home, 'external/htsat')) 18 | sys.path.append('../../external/htsat') 19 | from model.htsat import HTSAT_Swin_Transformer 20 | import config 21 | except: 22 | pass # please install HTS-AT 23 | 24 | 25 | class AR_HTSAT(BaseAudioRepr): 26 | 27 | def __init__(self, cfg): 28 | super().__init__(cfg=cfg) 29 | 30 | # load the pre-trained checkpoints 31 | checkpoint = torch.load(cfg.weight_file) 32 | logging.info(f' Using weight_file: {cfg.weight_file}') 33 | 34 | self.backbone = HTSAT_Swin_Transformer( 35 | spec_size=config.htsat_spec_size, 36 | patch_size=config.htsat_patch_size, 37 | in_chans=1, 38 | num_classes=config.classes_num, 39 | window_size=config.htsat_window_size, 40 | config = config, 41 | depths = config.htsat_depth, 42 | embed_dim = config.htsat_dim, 43 | patch_stride=config.htsat_stride, 44 | num_heads=config.htsat_num_head) 45 | 46 | states, L = {}, len('sed_model.') 47 | for k in checkpoint["state_dict"]: 48 | new_k = k[L:] if k.startswith('sed_model.') else k 49 | states[new_k] = checkpoint["state_dict"][k] 50 | self.backbone.load_state_dict(states) 51 | # cfg = checkpoint['config'] 52 | 53 | def encode_frames(self, batch_audio): 54 | assert False, 'encode_frames for HTS-AT is not supported for now' 55 | 56 | def forward(self, batch_audio): 57 | # Split long audio into pieces and average the features. 58 | features = [] 59 | for chunk_index in range((batch_audio.shape[-1] + config.clip_samples - 1) // config.clip_samples): 60 | chunk = batch_audio[:, chunk_index*config.clip_samples:(chunk_index + 1)*config.clip_samples] 61 | features.append(self.backbone(chunk, mixup_lambda=None, infer_mode=True)['latent_output']) 62 | features = torch.stack(features) 63 | features = torch.mean(features, dim=0) 64 | return features -------------------------------------------------------------------------------- /app/icbhi_sprs/README_ICBHI_SPRS.md: -------------------------------------------------------------------------------- 1 | # ICBHI 2017 and SPRSound evaluation 2 | 3 | We provide code to evaluate ICBHI 2017 and SPRSound with various models. 4 | 5 | **NOTE: The code freezes the audio representation model weights.** 6 | 7 | Prepare code and download datasets before running the evaluation. 
8 | 9 | ## Prepare code 10 | 11 | ```sh 12 | pip install torchinfo 13 | git clone https://github.com/ilyassmoummad/scl_icbhi2017.git 14 | cd scl_icbhi2017 15 | git reset --hard 915c1120719a9357d662c5fe484bce7fbe845139 16 | mv dataset.py augmentations.py utils.py losses.py args.py .. 17 | mv data .. 18 | mv ce.py .. 19 | cd .. 20 | patch -p2 < patch_scl_icbhi2017_evar.diff 21 | rm -fr scl_icbhi2017 22 | ``` 23 | 24 | ## Download ICBHI 2017 25 | 26 | ```sh 27 | wget https://bhichallenge.med.auth.gr/sites/default/files/ICBHI_final_database/ICBHI_final_database.zip --no-check-certificate 28 | 29 | unzip ICBHI_final_database.zip | awk 'BEGIN {ORS=" "} {if(NR%10==0)print "."}' 30 | mv ICBHI_final_database/* data/ICBHI 31 | rmdir ICBHI_final_database 32 | ``` 33 | 34 | ## Download SPRS 35 | 36 | ```sh 37 | git clone https://github.com/SJTU-YONGFU-RESEARCH-GRP/SPRSound.git 38 | (cd SPRSound && git reset --hard 45b0d5d435ff320c46585762fa1090afd0ebb318) 39 | cp -r SPRSound/train_wav SPRSound/test_wav data/SPRS/ 40 | ``` 41 | 42 | ## Run evaluations 43 | 44 | The following examples run evaluations on ICBHI 2017 for the models. 45 | 46 | ```sh 47 | bash ev_icbhi_beats.sh 48 | bash ev_icbhi_m2d.sh ../../m2d_clap_vit_base-80x1001p16x16-240128_AS-FT_enconly/weights_ep67it3124-0.48558.pth 49 | ``` 50 | 51 | Find the shell scripts for more evaluations. 52 | 53 | **NOTE: All the evaluations employ a transformer head except for ev_icbhi_mlp_m2d.sh, which uses MLP instead.** 54 | 55 | The following is the list of command lines for reproduction. 56 | 57 | ```sh 58 | bash ev_icbhi_ast.sh 5 59 | bash ev_icbhi_beats.sh 5 60 | bash ev_icbhi_byola.sh 5 61 | bash ev_icbhi_opera.sh 5 62 | bash ev_icbhi_m2d.sh m2d_vit_base-80x608p16x16-221006-mr7/checkpoint-300.pth 5 63 | bash ev_icbhi_m2d.sh m2d_vit_base-80x200p16x4-230529/checkpoint-300.pth 5 64 | 65 | bash ev_sprs_ast.sh 5 66 | bash ev_sprs_beats.sh 5 67 | bash ev_sprs_byola.sh 5 68 | bash ev_sprs_opera.sh 5 69 | bash ev_sprs_m2d.sh m2d_vit_base-80x608p16x16-221006-mr7/checkpoint-300.pth 5 70 | bash ev_sprs_m2d.sh m2d_vit_base-80x200p16x4-230529/checkpoint-300.pth 5 71 | 72 | # Ablations: M2D (16×4, MLP) 73 | bash ev_icbhi_mlp_m2d.sh m2d_vit_base-80x200p16x4-230529/checkpoint-300.pth 5 74 | bash ev_sprs_mlp_m2d.sh m2d_vit_base-80x200p16x4-230529/checkpoint-300.pth 5 75 | ``` 76 | -------------------------------------------------------------------------------- /evar/ar_beats.py: -------------------------------------------------------------------------------- 1 | """Wrapper code for: 2 | 3 | BEATs: Audio Pre-Training with Acoustic Tokenizers 4 | 5 | ## Reference 6 | - [1] https://arxiv.org/abs/2212.09058 7 | - [2] https://github.com/microsoft/unilm/blob/master/beats/README.md 8 | """ 9 | 10 | from evar.ar_base import BaseAudioRepr, temporal_pooling 11 | import sys 12 | import logging 13 | import torch 14 | try: 15 | import os 16 | evar_home = os.getenv('EVAR', '') 17 | sys.path.append(os.path.join(evar_home, 'external/unilm/beats')) 18 | sys.path.append('../../external/unilm/beats') 19 | from Tokenizers import TokenizersConfig, Tokenizers 20 | from BEATs import BEATs, BEATsConfig 21 | except: 22 | pass 23 | 24 | 25 | class AR_BEATs(BaseAudioRepr): 26 | 27 | def __init__(self, cfg): 28 | super().__init__(cfg=cfg) 29 | 30 | # load the pre-trained checkpoints 31 | checkpoint = torch.load(cfg.weight_file) 32 | logging.info(f' Using weight_file: {cfg.weight_file}') 33 | 34 | cfg = BEATsConfig(checkpoint['cfg']) 35 | BEATs_model = BEATs(cfg) 36 | 
BEATs_model.load_state_dict(checkpoint['model']) 37 | self.backbone = BEATs_model.eval() 38 | 39 | def encode_frames(self, batch_audio): 40 | padding_mask = torch.zeros_like(batch_audio).bool() 41 | features = self.backbone.extract_features(batch_audio, padding_mask=padding_mask)[0] 42 | return features.transpose(1, 2) # [B, D, T] 43 | 44 | def forward(self, batch_audio): 45 | x = self.encode_frames(batch_audio) 46 | return x.mean(dim=-1) # [B, D, T] -> [B, D] 47 | 48 | 49 | class AR_BEATsTokenizer(BaseAudioRepr): 50 | """EXPERIMENTAL""" 51 | 52 | def __init__(self, cfg): 53 | super().__init__(cfg=cfg) 54 | 55 | # load the pre-trained checkpoints 56 | checkpoint = torch.load(cfg.weight_file) 57 | logging.info(f' Using weight_file: {cfg.weight_file}') 58 | 59 | cfg = TokenizersConfig(checkpoint['cfg']) 60 | BEATs_tokenizer = Tokenizers(cfg) 61 | BEATs_tokenizer.load_state_dict(checkpoint['model']) 62 | self.backbone = BEATs_tokenizer.eval() 63 | 64 | def encode_frames(self, batch_audio): 65 | padding_mask = torch.zeros_like(batch_audio).bool() 66 | features = self.backbone.extract_labels(batch_audio, padding_mask=padding_mask) 67 | features = features.reshape(batch_audio.shape[0], -1).unsqueeze(-1) 68 | return features.to(float) # [B, D, T] 69 | 70 | def forward(self, batch_audio): 71 | x = self.encode_frames(batch_audio) 72 | return x.mean(dim=-1) # [B, D, T] -> [B, D] 73 | -------------------------------------------------------------------------------- /evar/ar_wavcaps.py: -------------------------------------------------------------------------------- 1 | """Wrapper code for: 2 | 3 | WavCaps: A ChatGPT-Assisted Weakly-Labelled Audio Captioning Dataset for Audio-Language Multimodal Research 4 | 5 | ## Reference 6 | - [1] https://arxiv.org/abs/2303.17395 7 | - [2] https://github.com/XinhaoMei/WavCaps 8 | """ 9 | 10 | from evar.ar_base import BaseCLAP 11 | import sys 12 | import torch 13 | try: 14 | sys.path.append('external/WavCaps') 15 | from retrieval.models.ase_model import ASE 16 | except: 17 | pass # please install WavCaps 18 | 19 | 20 | class AR_WavCaps(BaseCLAP): 21 | 22 | def __init__(self, cfg): 23 | super().__init__(cfg=cfg) 24 | cp = torch.load(cfg.weight_file) 25 | config = cp["config"] 26 | config['audio_encoder_args']['pretrained'] = False 27 | model = ASE(config) 28 | model.load_state_dict(cp["model"], strict=False) 29 | self.backbone = model 30 | 31 | def encode_frames(self, batch_audio): 32 | assert False, 'encode_frames for MS CLAP is not supported for now' 33 | 34 | def forward(self, batch_audio): 35 | # Split long audio into pieces and average the features. 
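        # The chunk length below (32000 * 10 samples) corresponds to 10-second clips at 32 kHz,
        # presumably the fixed input length expected by the WavCaps audio encoder. The trailing
        # chunk is zero-padded to this length, and the per-chunk embeddings are mean-pooled
        # into a single clip-level feature.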
36 | features, clip_samples = [], 32000 * 10 37 | for chunk_index in range((batch_audio.shape[-1] + clip_samples - 1) // clip_samples): 38 | chunk = batch_audio[:, chunk_index*clip_samples:(chunk_index + 1)*clip_samples] 39 | if chunk.shape[-1] < clip_samples: # from https://github.com/XinhaoMei/WavCaps/blob/master/retrieval/zero_shot_classification.py 40 | pad_length = clip_samples - chunk.shape[-1] 41 | chunk = torch.nn.functional.pad(chunk, [0, pad_length], "constant", 0.0) 42 | features.append(self.backbone.encode_audio(chunk)) 43 | features = torch.stack(features) 44 | features = torch.mean(features, dim=0) 45 | return features 46 | 47 | def encode_audio(self, batch_audio): 48 | audio_embeddings = self.forward(batch_audio) 49 | return audio_embeddings 50 | 51 | def encode_text(self, batch_text): 52 | text_input = self.backbone.text_encoder.tokenizer(batch_text, 53 | padding='longest', 54 | truncation=True, 55 | max_length=30, 56 | return_tensors="pt").to(self.backbone.text_encoder.device) 57 | text_feats = self.backbone.text_encoder.text_encoder(input_ids=text_input.input_ids, 58 | attention_mask=text_input.attention_mask)[0] 59 | text_feats = self.backbone.text_proj(text_feats[:, 0, :]) 60 | return text_feats 61 | -------------------------------------------------------------------------------- /external/ast_models.patch: -------------------------------------------------------------------------------- 1 | diff --git a/src/models/ast_models.py b/src/models/ast_models.py 2 | index 897d6b5..e542ad2 100644 3 | --- a/ast/src/models/ast_models.py 4 | +++ b/ast/src/models/ast_models.py 5 | @@ -44,7 +44,7 @@ class ASTModel(nn.Module): 6 | :param audioset_pretrain: if pretrain the model with full AudioSet in addition to ImageNet 7 | :param model_size: the model size of AST, should be in [tiny224, small224, base224, base384], base224 and base 384 are same model, but are trained differently during pretraining. 8 | """ 9 | - def __init__(self, label_dim=527, fstride=10, tstride=10, input_fdim=128, input_tdim=1024, imagenet_pretrain=True, audioset_pretrain=False, model_size='base384', verbose=True): 10 | + def __init__(self, label_dim=527, fstride=10, tstride=10, input_fdim=128, input_tdim=1024, imagenet_pretrain=True, audioset_pretrain=False, model_size='base384', verbose=True, pretrained_weight='../../pretrained_models/ast_audioset.pth'): 11 | 12 | super(ASTModel, self).__init__() 13 | assert timm.__version__ == '0.4.5', 'Please use timm == 0.4.5, the code might not be compatible with newer versions.' 
14 | @@ -119,11 +119,11 @@ class ASTModel(nn.Module): 15 | if model_size != 'base384': 16 | raise ValueError('currently only has base384 AudioSet pretrained model.') 17 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 18 | - if os.path.exists('../../pretrained_models/audioset_10_10_0.4593.pth') == False: 19 | + if os.path.exists(pretrained_weight) == False: 20 | # this model performs 0.4593 mAP on the audioset eval set 21 | audioset_mdl_url = 'https://www.dropbox.com/s/cv4knew8mvbrnvq/audioset_0.4593.pth?dl=1' 22 | - wget.download(audioset_mdl_url, out='../../pretrained_models/audioset_10_10_0.4593.pth') 23 | - sd = torch.load('../../pretrained_models/audioset_10_10_0.4593.pth', map_location=device) 24 | + wget.download(audioset_mdl_url, out=pretrained_weight) 25 | + sd = torch.load(pretrained_weight, map_location=device) 26 | audio_model = ASTModel(label_dim=527, fstride=10, tstride=10, input_fdim=128, input_tdim=1024, imagenet_pretrain=False, audioset_pretrain=False, model_size='base384', verbose=False) 27 | audio_model = torch.nn.DataParallel(audio_model) 28 | audio_model.load_state_dict(sd, strict=False) 29 | @@ -178,7 +178,7 @@ class ASTModel(nn.Module): 30 | x = self.v.norm(x) 31 | x = (x[:, 0] + x[:, 1]) / 2 32 | 33 | - x = self.mlp_head(x) 34 | + # x = self.mlp_head(x) 35 | return x 36 | 37 | if __name__ == '__main__': 38 | -------------------------------------------------------------------------------- /evar/ar_coala.py: -------------------------------------------------------------------------------- 1 | """Wrapper code for: 2 | 3 | COALA: Co-Aligned Autoencoders for Learning Semantically Enriched Audio Representations 4 | 5 | ## Note 6 | 7 | - FS is 22,000: https://github.com/xavierfav/coala/blob/master/utils.py#L66 8 | - Fixed the original scaler_top_1000.pkl: https://github.com/xavierfav/coala/issues/3 9 | 10 | ## Reference 11 | - [1] https://arxiv.org/abs/2006.08386 12 | - [2] https://github.com/xavierfav/coala 13 | """ 14 | 15 | from evar.ar_base import (BaseAudioRepr, temporal_pooling) 16 | import torch 17 | import librosa 18 | import numpy as np 19 | import logging 20 | try: 21 | from external.coala.encode import return_loaded_model, scaler 22 | from external.coala.models_t1000 import AudioEncoder 23 | from external.coala.utils import pad 24 | except: 25 | pass # logging.error('Make your copy of COALA under external folder. Check Preparing-models.md for the details.') 26 | 27 | 28 | def _compute_spectrogram(audio, sr=22000, n_mels=96): 29 | """Borrowed from coala/utils.py, removed wav loading to accept raw audio input.""" 30 | # zero pad and compute log mel spec 31 | try: 32 | x = pad(audio, sr) 33 | except ValueError: 34 | x = audio 35 | audio_rep = librosa.feature.melspectrogram(y=x, sr=sr, hop_length=512, n_fft=1024, n_mels=n_mels, power=1.) 
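    # power=1. yields a magnitude (not power) mel spectrogram; the next line applies log
    # compression, with float32 eps added to avoid log(0) on all-zero frames.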
36 | audio_rep = np.log(audio_rep + np.finfo(np.float32).eps) 37 | return audio_rep 38 | 39 | 40 | def _extract_audio_embedding_chunks(model, audio): 41 | """Borrowed from coala/encode.py, modified to accept torch tensor raw audio input.""" 42 | with torch.no_grad(): 43 | device = audio.device 44 | x = _compute_spectrogram(audio.cpu().numpy()) 45 | x_chunks = np.array([scaler.transform(chunk.T) for chunk in 46 | librosa.util.frame(np.asfortranarray(x), frame_length=96, hop_length=96, axis=-1).T]) 47 | x_chunks = torch.unsqueeze(torch.tensor(x_chunks), 1).to(device) 48 | embedding_chunks, embedding_d_chunks = model(x_chunks) 49 | return embedding_chunks, embedding_d_chunks 50 | 51 | 52 | class AR_COALA(BaseAudioRepr): 53 | def __init__(self, cfg): 54 | super().__init__(cfg=cfg) 55 | self.model = return_loaded_model(AudioEncoder, 'external/coala/saved_models/dual_ae_c/audio_encoder_epoch_200.pt') 56 | 57 | def encode_frames(self, batch_audio): 58 | xs = [_extract_audio_embedding_chunks(self.model, x)[0] for x in batch_audio] 59 | x = torch.stack(xs).transpose(1, 2) # [Frame, D] x B -> [B, Frame, D] -> [B, D, Frame (T)] 60 | return x 61 | 62 | def forward(self, batch_audio): 63 | x = self.encode_frames(batch_audio) 64 | x = temporal_pooling(self, x) 65 | return x 66 | -------------------------------------------------------------------------------- /evar/ar_ast.py: -------------------------------------------------------------------------------- 1 | """Wrapper code for: 2 | 3 | AST: Audio Spectrogram Transformer 4 | 5 | ## Reference 6 | - [1] https://arxiv.org/abs/2104.01778 Y. Gong, Y.-A. Chung, and J. Glass, “Ast: Audio spectrogram transformer,” arXiv preprint arXiv:2104.01778, 2021. 7 | - [2] https://github.com/YuanGongND/ast 8 | """ 9 | 10 | from evar.ar_base import (BaseAudioRepr, calculate_norm_stats) 11 | import torch 12 | import torchaudio 13 | 14 | try: 15 | from external.ast.src.models import ASTModel 16 | except Exception as e: 17 | pass # print(f'(For AST users) Make your copy of AST under external folder. Check Preparing-models.md for the details.') 18 | 19 | 20 | class AST_Feature(torch.nn.Module): 21 | def __init__(self, cfg): 22 | super().__init__() 23 | self.cfg = cfg 24 | 25 | def forward(self, waveforms): 26 | def get_one(waveform): 27 | waveform = waveform - waveform.mean() 28 | fbank = torchaudio.compliance.kaldi.fbank(waveform, htk_compat=True, 29 | sample_frequency=self.cfg.sample_rate, use_energy=False, 30 | window_type=self.cfg.window, num_mel_bins=self.cfg.n_mels, 31 | dither=0.0, frame_shift=10) 32 | return fbank 33 | device = waveforms.device 34 | if len(waveforms.shape) == 1: # [L] -> [1, L] 35 | waveforms = waveforms.unsqueeze(0) 36 | fbanks = torch.stack([get_one(w.unsqueeze(0)) for w in waveforms]) 37 | return fbanks.to(device) 38 | 39 | 40 | class AR_AST(BaseAudioRepr): 41 | def __init__(self, cfg): 42 | super().__init__(cfg=cfg) 43 | self.to_feature = AST_Feature(cfg) 44 | tdim = self.to_feature(torch.rand(1, cfg.unit_samples)).shape[1] 45 | self.backbone = ASTModel(label_dim=10, input_tdim=tdim, imagenet_pretrain=True, 46 | audioset_pretrain=True, pretrained_weight=cfg.weight_file) 47 | 48 | def precompute(self, device, data_loader): 49 | self.norm_stats = calculate_norm_stats(device, data_loader, self.to_feature) 50 | 51 | def encode_frames(self, batch_audio): 52 | # AST returns a single embeddings for one audio, then simply add time axis. 
53 | return self.forward(batch_audio).unsqueeze(-1) # B,D -> B,D,1 54 | 55 | def forward(self, batch_audio): 56 | x = self.to_feature(batch_audio) 57 | x = self.normalize_spectrogram(x) 58 | x = self.augment_if_training(x) 59 | x = self.backbone(x) 60 | return x 61 | 62 | def normalize_spectrogram(self, spectrograms): 63 | mu, sigma = self.norm_stats 64 | spectrograms = (spectrograms - mu) / (sigma * 2) # follows the original AudiosetDataset 65 | return spectrograms 66 | -------------------------------------------------------------------------------- /Evaluation-examples.md: -------------------------------------------------------------------------------- 1 | # Example command lines for evaluating models 2 | 3 | ## AST 4 | 5 | python 2pass_blackbox.py config/ast.yaml fsd50k 6 | python 2pass_blackbox.py config/ast.yaml spcv2 7 | python 2pass_blackbox.py config/ast.yaml us8k 8 | python 2pass_blackbox.py config/ast.yaml surge --lr=0.0001 9 | python 2pass_blackbox.py config/ast.yaml nsynth 10 | python 2pass_blackbox.py config/ast.yaml nspitch 11 | python 2pass_blackbox.py config/ast.yaml vc1 12 | python 2pass_blackbox.py config/ast.yaml cremad 13 | python 2pass_blackbox.py config/ast.yaml voxforge 14 | python 2pass_blackbox.py config/ast.yaml esc50 15 | python 2pass_blackbox.py config/ast.yaml gtzan batch_size=12 16 | 17 | ## BYOL-A 18 | 19 | python 2pass_blackbox.py config/byola.yaml fsd50k 20 | python 2pass_blackbox.py config/byola.yaml spcv2 21 | python 2pass_blackbox.py config/byola.yaml us8k 22 | python 2pass_blackbox.py config/byola.yaml surge --lr=0.0001 23 | python 2pass_blackbox.py config/byola.yaml nsynth 24 | python 2pass_blackbox.py config/byola.yaml nspitch 25 | python 2pass_blackbox.py config/byola.yaml vc1 26 | python 2pass_blackbox.py config/byola.yaml cremad 27 | python 2pass_blackbox.py config/byola.yaml voxforge 28 | python 2pass_blackbox.py config/byola.yaml esc50 29 | python 2pass_blackbox.py config/byola.yaml gtzan batch_size=64 --lr=0.001 30 | 31 | ## PANNs' CNN14 32 | 33 | python 2pass_blackbox.py config/cnn14.yaml cremad 34 | python 2pass_blackbox.py config/cnn14.yaml voxforge 35 | python 2pass_blackbox.py config/cnn14.yaml esc50 36 | python 2pass_blackbox.py config/cnn14.yaml gtzan 37 | python 2pass_blackbox.py config/cnn14.yaml fsd50k 38 | python 2pass_blackbox.py config/cnn14.yaml nsynth --lr=0.00001 39 | python 2pass_blackbox.py config/cnn14.yaml nspitch 40 | python 2pass_blackbox.py config/cnn14.yaml surge 41 | python 2pass_blackbox.py config/cnn14.yaml vc1 42 | python 2pass_blackbox.py config/cnn14.yaml spcv2 43 | python 2pass_blackbox.py config/cnn14.yaml us8k 44 | 45 | ## VGGish 46 | 47 | python 2pass_blackbox.py config/vggish.yaml fsd50k 48 | python 2pass_blackbox.py config/vggish.yaml nsynth 49 | python 2pass_blackbox.py config/vggish.yaml nspitch 50 | python 2pass_blackbox.py config/vggish.yaml surge 51 | python 2pass_blackbox.py config/vggish.yaml vc1 --lr=0.0005 52 | python 2pass_blackbox.py config/vggish.yaml spcv2 53 | python 2pass_blackbox.py config/vggish.yaml us8k 54 | python 2pass_blackbox.py config/vggish.yaml cremad 55 | python 2pass_blackbox.py config/vggish.yaml voxforge 56 | python 2pass_blackbox.py config/vggish.yaml esc50 --lr=0.003 57 | python 2pass_blackbox.py config/vggish.yaml gtzan batch_size=128 58 | 59 | -------------------------------------------------------------------------------- /summarize.py: -------------------------------------------------------------------------------- 1 | """Summarize results for a model. 
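Usage:
    python summarize.py <weight_file>

The weight_file string is matched against the `report` column of results/scores.csv
(and the `weight` column of results/retrieval_scores.csv, if present) to collect the
linear-evaluation and retrieval results of the corresponding model.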
2 | """ 3 | 4 | from evar.common import (np, pd, Path, RESULT_DIR) 5 | import fire 6 | 7 | 8 | def get_weight(weight_file): 9 | weight_file = Path(weight_file) 10 | weight = weight_file.parent.name + '/' + weight_file.stem 11 | return weight 12 | 13 | 14 | def available_tasks(df): 15 | ALL_TASKS = ['esc50', 'us8k', 'spcv2', 'vc1', 'voxforge', 'cremad', 'gtzan', 'nsynth', 'surge', 'fsd50k'] \ 16 | + ['zs_esc50', 'zs_us8k', 'zs_spcv2', 'zs_vc1', 'zs_voxforge', 'zs_cremad', 'zs_gtzan', 'zs_nsynth', 'zs_surge', 'zs_fsd50k', 'zs_as'] 17 | tasks = [t for t in ALL_TASKS if t in list(df.columns)] 18 | return tasks 19 | 20 | 21 | def summarize(weight_file, post=True): 22 | # Summarize LE 23 | df = pd.read_csv(f'{RESULT_DIR}/scores.csv') 24 | df = df[df.report.str.contains(weight_file, na=False, regex=False)] 25 | df['weight'] = get_weight(weight_file) 26 | src_df = df.copy() 27 | 28 | df = pd.pivot_table(df, index=['weight'], columns=['task'], values=['score'], aggfunc=np.mean) 29 | df.columns = df.columns.get_level_values(1) 30 | df = df[available_tasks(df)] 31 | if len(df) == 0: 32 | print(f'No data for {weight_file}.') 33 | return 34 | df['average'] = df.mean(1) 35 | 36 | # Summarize ATR 37 | if Path(f'{RESULT_DIR}/retrieval_scores.csv').exists(): 38 | d = pd.read_csv(f'{RESULT_DIR}/retrieval_scores.csv') 39 | d = d[d.weight.str.contains(weight_file, na=False, regex=False)] 40 | if len(d) > 0: 41 | d = d.set_index('model') 42 | d['weight'] = get_weight(weight_file) 43 | d.columns = ['task', 'a2tR1', 'a2tR5', 'a2tR10', 'a2tmAP10', 't2aR1', 't2aR5', 't2aR10', 't2amAP10', 'weight'] 44 | new_d = None 45 | for t, shortname in [('audiocaps', 'A'), ('clotho', 'C')]: 46 | d_ = d[d.task == t][['a2tR1', 'a2tR5', 'a2tR10', 't2aR1', 't2aR5', 't2aR10']] 47 | d_.columns = [shortname + c for c in list(d_.columns)] 48 | d_.index = ['same_index'] 49 | new_d = d_ if new_d is None else pd.concat([new_d, d_], axis=1) 50 | new_d['weight'] = get_weight(weight_file) 51 | new_d = new_d.set_index('weight') * 0.01 52 | df = pd.concat([df, new_d], axis=1) 53 | 54 | # Report 55 | report = df.applymap(lambda x: f'{x*100:.2f}%' if str(x).isnumeric else x).to_markdown() 56 | print(report) 57 | 58 | # Save source results to a csv 59 | report_csv = RESULT_DIR + '/' + str(df.index[0]).replace('/', '_') + '.csv' 60 | src_df.report = src_df.report.str.replace('\n', ' ') 61 | src_df.to_csv(report_csv, index=None) 62 | 63 | 64 | if __name__ == '__main__': 65 | fire.Fire(summarize) 66 | -------------------------------------------------------------------------------- /evar/ds_tasks.py: -------------------------------------------------------------------------------- 1 | """Downstream task definitions.""" 2 | 3 | from evar.common import (os, Path, WORK, METADATA_DIR) 4 | 5 | 6 | _defs = { 7 | # folds, unit_sec, data_folder (None if task name is the folder name), balanced training when fine-tining 8 | 'us8k': [10, 4.0, None, False], 9 | 'esc50': [5, 5.0, None, False], 10 | 'fsd50k': [1, 7.6358, None, False], ## Changed to NOT balanced: to make it the same as PaSST. 
11 | 'fsdnoisy18k': [1, 8.25, None, False], 12 | 'gtzan': [1, 30.0, None, False], 13 | 'nsynth': [1, 4.0, None, False], 14 | 'cremad': [1, 2.5, None, False], 15 | 'spcv1': [1, 1.0, None, False], 16 | 'spcv2': [1, 1.0, None, False], 17 | 'surge': [1, 4.0, None, False], 18 | 'vc1': [1, 8.2, None, False], 19 | 'vocalsound': [1, 4.18, None, False], 20 | 'voxforge': [1, 5.8, None, False], 21 | 'as20k': [1, 10.0, 'as', False], 22 | 'as': [1, 10.0, 'as', True], 23 | 'audiocaps': [1, 10.0, None, False], 24 | 'ja_audiocaps': [1, 10.0, 'audiocaps', False], 25 | 'clotho': [1, 30.0, None, False], 26 | 'circor1': [1, 5.0, None, False], 27 | 'circor2': [1, 5.0, None, False], 28 | 'circor3': [1, 5.0, None, False], 29 | 'bmdhs1': [1, 20.0, 'bmdhs', False], 30 | 'bmdhs2': [1, 20.0, 'bmdhs', False], 31 | 'bmdhs3': [1, 20.0, 'bmdhs', False], 32 | 'xacle': [1, 10.0, None, False], 33 | 'xacle_test': [1, 10.0, 'xacle', False], 34 | } 35 | 36 | _fs_table = { 37 | 16000: '16k', 38 | 22000: '22k', # Following COALA that uses 22,000 Hz 39 | 32000: '32k', 40 | 44100: '44k', 41 | 48000: '48k', 42 | } 43 | 44 | def get_original_folder(task, folder): 45 | orgs = { 46 | 'us8k': 'UrbanSound8K', 47 | 'esc50': 'ESC-50-master', 48 | 'as20k': 'AudioSet', 49 | 'as': 'AudioSet', 50 | 'vocalsound': 'vocalsound_44k/data_44k', 51 | } 52 | return orgs[task] if task in orgs else folder 53 | 54 | 55 | def get_defs(cfg, task, original_data=False): 56 | """Get task definition parameters. 57 | 58 | Returns: 59 | pathname (str): Metadata .csv file path. 60 | wav_folder (str): "work/16k/us8k" for example. 61 | folds (int): Number of LOOCV folds or 1. 1 means no cross validation. 62 | unit_sec (float): Unit duration in seconds. 63 | weighted (bool): True if the training requires a weighted loss calculation. 64 | balanced (bool): True if the training requires a class-balanced sampling. 65 | """ 66 | folds, unit_sec, folder, balanced = _defs[task] 67 | folder = folder or task 68 | evar_path = Path(os.environ.get('EVAR', '.')) 69 | workfolder = f'{WORK}/original/{get_original_folder(task, folder)}' if original_data else f'{WORK}/{_fs_table[cfg.sample_rate]}/{folder}' 70 | return str(evar_path/f'{METADATA_DIR}/{task}.csv'), str(evar_path/workfolder), folds, unit_sec, balanced 71 | -------------------------------------------------------------------------------- /evar/ar_byola.py: -------------------------------------------------------------------------------- 1 | """Wrapper code for: 2 | 3 | BYOL for Audio: Self-Supervised Learning for General-Purpose Audio Representation 4 | 5 | ## Reference 6 | - [1] https://arxiv.org/abs/2103.06695 7 | - [2] https://github.com/nttcslab/byol-a 8 | """ 9 | 10 | from evar.ar_base import (BaseAudioRepr, ToLogMelSpec, calculate_norm_stats, normalize_spectrogram, temporal_pooling) 11 | from evar.model_utils import load_pretrained_weights 12 | import logging 13 | try: 14 | from external.byol_a.byol_a.models import AudioNTT2020Task6, AudioNTT2020Task6X 15 | except Exception as e: 16 | pass # logging.info(f'Make your copy of BYOL-A under external folder. 
Check Preparing-models.md for the details.') 17 | 18 | 19 | class AR_BYOLA(BaseAudioRepr): 20 | def __init__(self, cfg): 21 | super().__init__(cfg=cfg) 22 | self.to_feature = ToLogMelSpec(cfg) 23 | 24 | self.body = AudioNTT2020Task6(n_mels=cfg.n_mels, d=cfg.feature_d) 25 | if cfg.weight_file is not None: 26 | load_pretrained_weights(self.body, cfg.weight_file, model_key='body') 27 | 28 | def precompute(self, device, data_loader): 29 | self.norm_stats = calculate_norm_stats(device, data_loader, self.to_feature) 30 | 31 | def encode_frames(self, batch_audio): 32 | x = self.to_feature(batch_audio) 33 | x = normalize_spectrogram(self.norm_stats, x) # B,F,T 34 | x = self.augment_if_training(x) 35 | x = x.unsqueeze(1) # -> B,1,F,T 36 | x = self.body(x) # -> B,T,D=C*F 37 | x = x.transpose(1, 2) # -> B,D,T 38 | return x 39 | 40 | def forward(self, batch_audio): 41 | x = self.encode_frames(batch_audio) 42 | x = temporal_pooling(self, x) 43 | return x 44 | 45 | 46 | class AR_BYOLAX(BaseAudioRepr): 47 | """A BYOL-A variant extended to stack features from all the layers.""" 48 | def __init__(self, cfg): 49 | super().__init__(cfg=cfg) 50 | self.to_feature = ToLogMelSpec(cfg) 51 | 52 | self.body = AudioNTT2020Task6X(n_mels=cfg.n_mels, d=cfg.feature_d) 53 | if cfg.weight_file is not None: 54 | self.body.load_weight(cfg.weight_file, device='cpu') 55 | self.cfg.feature_d = self.cfg.feature_d * self.body.n_feature_layer 56 | 57 | def precompute(self, device, data_loader): 58 | self.norm_stats = calculate_norm_stats(device, data_loader, self.to_feature) 59 | 60 | def encode_frames(self, batch_audio): 61 | x = self.to_feature(batch_audio) 62 | x = normalize_spectrogram(self.norm_stats, x) # B,F,T 63 | x = self.augment_if_training(x) 64 | x = x.unsqueeze(1) # -> B,1,F,T 65 | x = self.body(x, layered=True) # -> B,T,D=C*F*Layer 66 | x = x.transpose(1, 2) # -> B,D,T 67 | return x 68 | 69 | def forward(self, batch_audio): 70 | x = self.encode_frames(batch_audio) 71 | x = temporal_pooling(self, x) 72 | return x 73 | 74 | -------------------------------------------------------------------------------- /plugin/MARBLE/REAEDME_MARBLE.md: -------------------------------------------------------------------------------- 1 | ## MARBLE Benchmark Integration 2 | 3 | This repository provides the code to integrate EVAR as a plugin model into the [MARBLE](https://github.com/a43992899/MARBLE) benchmark for music tasks, enabling MARBLE to evaluate pre-trained audio representation models supported by EVAR. Based on this MARBLE extension, it also provides scripts and instructions to reproduce the results from the M2D2 paper. 4 | 5 | NOTE: We support MARBLE v1 for now. 6 | 7 | ## How to Integrate with the MARBLE Benchmark 8 | 9 | Follow the steps below to integrate EVAR into your MARBLE directory cloned from GitHub. 10 | 11 | *NOTE*: In addition to the integration steps, set the environment variable `EVAR` to point to the local EVAR folder so that MARBLE can reference it. 12 | 13 | ```sh 14 | export EVAR=/lab/eval-audio-repr 15 | 16 | git clone https://github.com/a43992899/MARBLE-Benchmark.git 17 | cd MARBLE-Benchmark 18 | git checkout d9300e335eefdad8d6b825418e8c44b22d0919c7 19 | 20 | patch -p1 < $EVAR/plugin/MARBLE/evar_marble_diff.patch 21 | cp -r $EVAR/plugin/MARBLE/benchmark/models/evar benchmark/models 22 | cp -r $EVAR/plugin/MARBLE/configs/evar configs 23 | cp $EVAR/plugin/MARBLE/evar_marble.sh . 24 | ``` 25 | 26 | For the task datasets for MARBLE, follow the instructions provided by the MARBLE. 
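After the steps above, a quick sanity check can confirm that the copied plugin files are in place and that `EVAR` resolves. The following is a minimal sketch (a hypothetical check, run from the MARBLE-Benchmark root):

```python
import os
from pathlib import Path

assert os.environ.get('EVAR'), 'Set EVAR to your local eval-audio-repr folder.'
for p in ['benchmark/models/evar/extract_evar_features.py',  # copied plugin extractor
          'configs/evar',                                     # copied task configs
          'evar_marble.sh']:                                  # copied driver script
    print(p, 'OK' if Path(p).exists() else 'MISSING')
```

If anything prints `MISSING`, re-run the corresponding copy command above.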
27 | 28 | ## Evaluating models on MARBLE 29 | 30 | Once you prepare EVAR on MARBLE, you can use the script `evar_marble.sh` to evaluate models. The following is an example of M2D. 31 | 32 | ```sh 33 | bash evar_marble.sh m2d /your/m2d_vit_base-80x608p16x16-221006-mr7/checkpoint-300.pth 7 5 feat-m2d-mr7 3840 34 | ``` 35 | 36 | The results will be stored in CSV files, such as `score_EMO.csv`. 37 | 38 | ### More example command lines 39 | 40 | ```sh 41 | EVAR=/your/evar bash evar_marble.sh beats_plus /your/BEATs_iter3_plus_AS2M.pt 7 5 42 | EVAR=/your/evar bash evar_marble.sh atst_frame /your/atstframe_base.ckpt 7 5 43 | EVAR=/your/evar bash evar_marble.sh msclap 2023 7 5 44 | EVAR=/your/evar bash evar_marble.sh m2d /your/m2d_vit_base-80x608p16x16-221006-mr7/checkpoint-300.pth 7 5 feat-m2d-mr7 45 | EVAR=/your/evar bash evar_marble.sh m2d /your/m2d_vit_base-80x1001p16x16-221006-mr7_as_46ab246d/weights_ep67it3124-0.47941.pth 7 5 feat-m2d-mr7-as 46 | EVAR=/your/evar bash evar_marble.sh m2d /your/clap/m2d_clap_vit_base-80x608p16x16-240128/checkpoint-300.pth 7 5 feat-m2d-clap 47 | EVAR=/your/evar bash evar_marble.sh m2d /your/msm_mae_vit_base-80x608p16x16-220924-mr75/checkpoint-300.pth 7 5 feat-msm-mae 48 | ``` 49 | 50 | ## Referecnces 51 | 52 | - MARBLE: *[R. Yuan, Y. Ma, Y. Li, G. Zhang, X. Chen, H. Yin, z. le, Y. Liu, J. Huang, Z. Tian, B. Deng, N. Wang, C. Lin, E. Benetos, A. Ragni, N. Gyenge, R. Dannenberg, W. Chen, G. Xia, W. Xue, S. Liu, S. Wang, R. Liu, Y. Guo, and J. Fu, “MARBLE: Music audio representation benchmark for universal evaluation,” in NeurIPS, vol. 36, 2023, pp. 39 626–39 647.](https://proceedings.neurips.cc/paper_files/paper/2023/hash/7cbeec46f979618beafb4f46d8f39f36-Abstract-Datasets_and_Benchmarks.html).* 👉 [GitHub](https://github.com/a43992899/MARBLE/tree/main-v1-archived). 53 | 54 | -------------------------------------------------------------------------------- /evar/ar_wav2vec2.py: -------------------------------------------------------------------------------- 1 | """Wrapper code for: 2 | 3 | wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations 4 | 5 | ## Reference 6 | - [1] https://ai.facebook.com/blog/wav2vec-20-learning-the-structure-of-speech-from-raw-audio/ 7 | - [2] https://huggingface.co/facebook/wav2vec2-large-960h-lv60 8 | """ 9 | 10 | from evar.ar_base import BaseAudioRepr, temporal_pooling 11 | import torch 12 | import logging 13 | try: 14 | from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC 15 | except: 16 | logging.error('Install transformers.\n>>> pip install transformers') 17 | 18 | 19 | class AR_Wav2Vec2Logit(BaseAudioRepr): 20 | """Wav2Vec2.0 logits from LM output. 
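    Uses the per-frame CTC vocabulary logits returned by Wav2Vec2ForCTC as
    frame-level features (see encode_frames below).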
21 | https://huggingface.co/facebook/wav2vec2-large-960h-lv60 22 | """ 23 | def __init__(self, cfg): 24 | super().__init__(cfg=cfg) 25 | 26 | self.processor = Wav2Vec2Processor.from_pretrained(cfg.wav2vec_model) 27 | self.backbone = Wav2Vec2ForCTC.from_pretrained(cfg.wav2vec_model) 28 | 29 | def encode_frames(self, batch_audio): 30 | device = batch_audio.device 31 | preprocessed = self.processor(batch_audio.cpu().numpy(), return_tensors="pt", sampling_rate=16000).input_values 32 | preprocessed = preprocessed[0] if preprocessed.shape[0] == 1 else preprocessed # [1, B, raw wave length] -> [B, raw wave length] 33 | preprocessed = preprocessed.to(device) 34 | logits = self.backbone(preprocessed).logits # [B, T, D] 35 | return logits.transpose(1, 2) # [B, D, T] 36 | 37 | def forward(self, batch_audio): 38 | return temporal_pooling(self, self.encode_frames(batch_audio)) 39 | 40 | 41 | class AR_Wav2Vec2Context(AR_Wav2Vec2Logit): 42 | """Wav2Vec2.0 context network. 43 | https://github.com/huggingface/transformers/blob/master/src/transformers/models/wav2vec2/modeling_wav2vec2.py#L1529 44 | """ 45 | 46 | def encode_frames(self, batch_audio): 47 | device = batch_audio.device 48 | preprocessed = self.processor(batch_audio.cpu().numpy(), return_tensors="pt", sampling_rate=16000).input_values 49 | preprocessed = preprocessed[0] if preprocessed.shape[0] == 1 else preprocessed # [1, B, raw wave length] -> [B, raw wave length] 50 | preprocessed = preprocessed.to(device) 51 | features = self.backbone.wav2vec2(preprocessed, output_hidden_states=True).hidden_states # [B, T, D] 52 | hidden_states = self.backbone(preprocessed, output_hidden_states=True).hidden_states # [B, T, D] 53 | # stack layer outputs 54 | states_to_stack = [hidden_states[index] for index in self.cfg.output_layers] if self.cfg.output_layers else hidden_states 55 | features = torch.cat(states_to_stack, axis=-1) 56 | return features.transpose(1, 2) # [B, D, T] 57 | 58 | 59 | class AR_Wav2Vec2Feature(AR_Wav2Vec2Logit): 60 | """Wav2Vec2.0 feature encoder.""" 61 | 62 | def encode_frames(self, batch_audio): 63 | device = batch_audio.device 64 | preprocessed = self.processor(batch_audio.cpu().numpy(), return_tensors="pt", sampling_rate=16000).input_values 65 | preprocessed = preprocessed[0] if preprocessed.shape[0] == 1 else preprocessed # [1, B, raw wave length] -> [B, raw wave length] 66 | preprocessed = preprocessed.to(device) 67 | features = self.backbone.wav2vec2.feature_extractor(preprocessed) # [B, D, T] 68 | features = features.transpose(1, 2) # -> [B, T, D] 69 | return features.transpose(1, 2) # [B, D, T] 70 | -------------------------------------------------------------------------------- /evar/common.py: -------------------------------------------------------------------------------- 1 | """Common imports/constants/small functions.""" 2 | 3 | from evar.utils import * 4 | import shutil 5 | import re 6 | from torch import nn 7 | import torch.nn.functional as F 8 | import torchaudio 9 | import torchaudio.functional as AF 10 | import torchaudio.transforms as AT 11 | from torch.utils.data import DataLoader, Dataset 12 | 13 | 14 | # Folders 15 | WORK = 'work' 16 | METADATA_DIR = 'evar/metadata' 17 | RESULT_DIR = 'results' 18 | LOG_DIR = 'logs' 19 | 20 | 21 | def eval_if_possible(text): 22 | for pat in [r'\[.*\]', r'\(.*\)']: 23 | if re.search(pat, text): 24 | return eval(text) 25 | if re_valuable.match(text): 26 | return eval(text) 27 | return text 28 | 29 | 30 | def split_camma(text): 31 | flag = None 32 | elements = [] 33 | cur = [] 34 | 
for c in text: 35 | if flag is not None: 36 | cur.append(c) 37 | if flag == '[' and c == ']': flag = None 38 | if flag == '(' and c == ')': flag = None 39 | if flag == '"' and c == '"': flag = None 40 | if flag == "'" and c == "'": flag = None 41 | continue 42 | if c in ['[', '(', '"', "'"]: 43 | cur.append(c) 44 | flag = c 45 | continue 46 | if c == ',': 47 | elements.append(''.join(cur)) 48 | cur = [] 49 | else: 50 | cur.append(c) 51 | if cur: 52 | elements.append(''.join(cur)) 53 | return elements 54 | 55 | 56 | # App level utilities 57 | def complete_cfg(cfg, options, no_id=False): 58 | # Override parameter values with given "options". 59 | if 'name' not in cfg or not isinstance(cfg['name'], str): 60 | cfg['name'] = '' 61 | print(options) 62 | for item in split_camma(options): 63 | if item == '': continue 64 | keyvalues = item.split('=') 65 | assert len(keyvalues) == 2, f'An option need one and only one "=" in the option {item} in {options}.' 66 | key, value = keyvalues 67 | value = eval_if_possible(value) 68 | if key[0] == '+': 69 | key = key[1:] 70 | cfg[key] = None 71 | if key not in cfg.keys(): 72 | raise Exception(f'Cannot find a setting named: {key} of the option {item}') 73 | cfg[key] = value 74 | # Set ID. 75 | if not no_id: 76 | task = Path(cfg.task_metadata).stem if 'task_metadata' in cfg else '' 77 | if 'name' in cfg and len(cfg['name']) > 0: 78 | name = cfg.name 79 | elif 'weight_file' in cfg and len(str(cfg['weight_file'])) > 0: 80 | weight_path = Path(str(cfg['weight_file'])) 81 | parent = weight_path.parent.name.replace('.', '_') if len(weight_path.parent.name) > 0 else str(cfg.audio_repr.split(',')[-1]) 82 | name = f'{parent}-{weight_path.stem}' 83 | else: 84 | name = str(cfg.audio_repr.split(',')[-1]) 85 | cfg.id = name + '_' + task + '_' + hash_text(str(cfg), L=8) 86 | return cfg 87 | 88 | 89 | def kwarg_cfg(**kwargs): 90 | cfg = EasyDict(kwargs) 91 | cfg.id = hash_text(str(cfg), L=8) 92 | return cfg 93 | 94 | 95 | def app_setup_logger(cfg, level=logging.INFO): 96 | logpath = Path(LOG_DIR)/cfg.id 97 | logpath.mkdir(parents=True, exist_ok=True) 98 | setup_logger(filename=logpath/'log.txt', level=level) 99 | print('Logging to', logpath/'log.txt') 100 | logging.info(str(cfg)) 101 | return logpath 102 | 103 | 104 | def setup_dir(dirs=[]): 105 | for d in dirs: 106 | Path(d).mkdir(parents=True, exist_ok=True) 107 | -------------------------------------------------------------------------------- /evar/utils/__init__.py: -------------------------------------------------------------------------------- 1 | """Utilities for EVAR 2 | """ 3 | 4 | import os 5 | import sys 6 | from itertools import chain 7 | import subprocess 8 | import re 9 | import logging 10 | from easydict import EasyDict 11 | from pathlib import Path 12 | import pandas as pd 13 | import yaml 14 | import numpy as np 15 | import random 16 | import datetime 17 | import hashlib 18 | import torch 19 | try: 20 | import pickle5 as pickle 21 | except: 22 | import pickle 23 | 24 | 25 | # Regular expression to check string can be converted into variables 26 | # Thanks to -- https://stackoverflow.com/a/385597/6528729 27 | re_valuable = re.compile("""(?x) 28 | ^ 29 | ( # int|float|double 30 | [+-]?\ * # first, match an optional sign *and space* 31 | ( # then match integers or f.p. mantissas: 32 | \d+ # start out with a ... 33 | ( 34 | \.\d* # mantissa of the form a.b or a. 35 | )? # ? takes care of integers of the form a 36 | |\.\d+ # mantissa of the form .b 37 | ) 38 | ([eE][+-]?\d+)? 
# finally, optionally match an exponent 39 | ) 40 | |( # bool 41 | False|True 42 | ) 43 | $""") 44 | 45 | 46 | def run_command(cmd_line): 47 | print('>>>', ' '.join(cmd_line)) 48 | def runner(): 49 | proc = subprocess.Popen(cmd_line, bufsize=0, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) 50 | 51 | while True: 52 | line = proc.stdout.readline() 53 | if line: 54 | yield line 55 | 56 | if not line and proc.poll() is not None: 57 | break 58 | 59 | for line in runner(): 60 | sys.stdout.write(line.decode()) 61 | 62 | 63 | def seed_everything(seed=42): 64 | random.seed(seed) 65 | os.environ['PYTHONHASHSEED'] = str(seed) 66 | np.random.seed(seed) 67 | torch.manual_seed(seed) 68 | torch.backends.cudnn.deterministic = True 69 | torch.backends.cudnn.benchmark = False 70 | 71 | 72 | def get_timestamp(): 73 | """ex) Outputs 202104220830""" 74 | return datetime.datetime.now().strftime('%y%m%d%H%M') 75 | 76 | 77 | def load_yaml_config(path_to_config): 78 | """Loads yaml configuration settings as an EasyDict object.""" 79 | path_to_config = Path(path_to_config) 80 | assert path_to_config.is_file(), f'{path_to_config} not found, cwd={Path(".").resolve()}' 81 | with open(path_to_config) as f: 82 | yaml_contents = yaml.safe_load(f) 83 | cfg = EasyDict(yaml_contents) 84 | return cfg 85 | 86 | 87 | def hash_text(text, L=128): 88 | hashed = hashlib.shake_128(text.encode()).hexdigest(L//2 + 1) 89 | return hashed[:L] 90 | 91 | 92 | def setup_logger(name='', filename=None, level=logging.INFO): 93 | # Thanks to https://stackoverflow.com/a/53553516/6528729 94 | from imp import reload 95 | reload(logging) 96 | 97 | logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s', 98 | datefmt='%Y-%m-%d %H:%M:%S', level=level, filename=filename) 99 | logger = logging.getLogger(name) 100 | console = logging.StreamHandler() 101 | console.setLevel(level) 102 | logger.addHandler(console) 103 | 104 | 105 | def flatten_list(lists): 106 | return list(chain.from_iterable(lists)) 107 | 108 | 109 | def append_to_csv(csv_filename, data): 110 | filename = Path(csv_filename) 111 | filename.parent.mkdir(parents=True, exist_ok=True) 112 | df = pd.read_csv(filename) if filename.exists() else pd.DataFrame() 113 | df = pd.concat([df, data], ignore_index=True).to_csv(filename, index=False) 114 | -------------------------------------------------------------------------------- /app/circor/rearrange_data.py: -------------------------------------------------------------------------------- 1 | """CirCor evaluation utility. 2 | 3 | This program stratified-splits physionet.org/files/circor-heart-sound/1.0.3/training_data to heart-murmur-detection/data. 4 | Then, it copies stratified data (under heart-murmur-detection/data) to evar working folder (under evar/work/16k). 5 | It also creates metadata files as ../../evar/metadata/circor[1-3].csv. 
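Run it from the app/circor folder so that the relative output paths above resolve
correctly; the PhysioNet recordings (physionet.org/files/circor-heart-sound/1.0.3/)
and the stratified split lists (datalist_stratified_data[1-3].csv) are expected in
the current working directory.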
6 | """ 7 | 8 | import pandas as pd 9 | import numpy as np 10 | from pathlib import Path 11 | import shutil 12 | import librosa 13 | import torch 14 | import torchaudio 15 | 16 | 17 | ## Copy raw data under physionet.org/files/circor-heart-sound/1.0.3/training_data to heart-murmur-detection/data 18 | split_csvs = ['./datalist_stratified_data1.csv', './datalist_stratified_data2.csv', './datalist_stratified_data3.csv'] 19 | df = pd.concat([pd.read_csv(f) for f in split_csvs], ignore_index=True) 20 | 21 | dest = Path('heart-murmur-detection/data') 22 | for f in df.dest_file.values: 23 | f = Path(f) 24 | f.parent.mkdir(exist_ok=True, parents=True) 25 | from_file = Path('physionet.org/files/circor-heart-sound/1.0.3/training_data')/f.name 26 | #print('Copy', from_file, 'to', f) 27 | shutil.copy(from_file, f) 28 | 29 | 30 | ## Copy stratified data (under heart-murmur-detection/data) to evar working folder (evar/work/16k) 31 | dfs = [] 32 | 33 | for split_no in [1, 2, 3]: 34 | trn = sorted(Path(f'heart-murmur-detection/data/stratified_data{split_no}/train_data/').glob('*.wav')) 35 | val = sorted(Path(f'heart-murmur-detection/data/stratified_data{split_no}/vali_data/').glob('*.wav')) 36 | tst = sorted(Path(f'heart-murmur-detection/data/stratified_data{split_no}/test_data/').glob('*.wav')) 37 | #Tr, V, Te = len(trn), len(val), len(tst) 38 | 39 | itrn = sorted(list(set([int(f.stem.split('_')[0]) for f in trn]))) 40 | ival = sorted(list(set([int(f.stem.split('_')[0]) for f in val]))) 41 | itst = sorted(list(set([int(f.stem.split('_')[0]) for f in tst]))) 42 | Tr, V, Te = len(itrn), len(ival), len(itst) 43 | N = Tr + V + Te 44 | print(f'Split #{split_no} has samples: Training:{Tr}({Tr/N*100:.2f}%), Val:{V}({V/N*100:.2f}%), Test:{Te}({Te/N*100:.2f}%)') 45 | print(' Training sample IDs are:', itrn[:3], '...') 46 | 47 | df = pd.read_csv('physionet.org/files/circor-heart-sound/1.0.3/training_data.csv') 48 | 49 | def get_split(pid): 50 | if pid in itrn: return 'train' 51 | if pid in ival: return 'valid' 52 | if pid in itst: return 'test' 53 | assert False, f'Patient ID {pid} Unknown' 54 | df['split'] = df['Patient ID'].apply(get_split) 55 | 56 | 57 | SR = 16000 58 | L = int(SR * 5.0) 59 | STEP = int(SR * 2.5) 60 | 61 | ROOT = Path('physionet.org/files/circor-heart-sound/1.0.3/training_data/') 62 | TO_FOLDER = Path(f'../../work/16k/circor{split_no}') 63 | 64 | evardf = pd.DataFrame() 65 | 66 | for i, r in df.iterrows(): 67 | pid, recloc, split, label = str(r['Patient ID']), r['Recording locations:'], r.split, r.Murmur 68 | # Not using recloc. Search real recordings... 
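        # i.e., the 'Recording locations:' metadata column is ignored; the location codes
        # are recovered from the {pid}_*.wav files that actually exist under training_data.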
69 | recloc = [f.stem.replace(pid+'_', '') for f in sorted(ROOT.glob(f'{pid}_*.wav'))] 70 | #print(pid, recloc, split, label) 71 | for rl in recloc: 72 | wav, sr = librosa.load(f'{ROOT}/{pid}_{rl}.wav', sr=SR) 73 | for widx, pos in enumerate(range(0, len(wav) - STEP + 1, STEP)): 74 | w = wav[pos:pos+L] 75 | org_len = len(w) 76 | if org_len < L: 77 | w = np.pad(w, (0, L - org_len)) 78 | assert len(w) == L 79 | to_name = TO_FOLDER/split/f'{pid}_{rl}_{widx}.wav' 80 | to_rel_name = to_name.relative_to(TO_FOLDER) 81 | #print(pid, rl, len(wav)/SR, to_name, to_rel_name, org_len, len(w), pos) 82 | evardf.loc[to_name.stem, 'file_name'] = to_rel_name 83 | evardf.loc[to_name.stem, 'label'] = label 84 | evardf.loc[to_name.stem, 'split'] = split 85 | 86 | to_name.parent.mkdir(exist_ok=True, parents=True) 87 | w = torch.tensor(w * 32767.0).to(torch.int16).unsqueeze(0) 88 | torchaudio.save(to_name, w, SR) 89 | evardf.to_csv(f'../../evar/metadata/circor{split_no}.csv', index=None) 90 | print('Split', split_no) 91 | print(evardf[:3]) 92 | 93 | df[:3] 94 | -------------------------------------------------------------------------------- /plugin/MARBLE/evar_marble_diff.patch: -------------------------------------------------------------------------------- 1 | diff --git a/benchmark/constants/model_constants.py b/benchmark/constants/model_constants.py 2 | index beaf1a6..ff474df 100644 3 | --- a/benchmark/constants/model_constants.py 4 | +++ b/benchmark/constants/model_constants.py 5 | @@ -54,6 +54,7 @@ NAME_TO_EXTRACT_FEATURES_MAIN = { 6 | "music2vec_target12": "extract_data2vec_audio_features_main", 7 | "music2vec_span15": "extract_data2vec_audio_features_main", 8 | "yue": "extract_yue_features_main", 9 | + 'evar': "extract_evar_features_main", 10 | } 11 | 12 | SUPPORTED_REPRESENTATIONS = list(NAME_TO_EXTRACT_FEATURES_MAIN.keys()) 13 | diff --git a/benchmark/extract.py b/benchmark/extract.py 14 | index 3869b43..8308d98 100644 15 | --- a/benchmark/extract.py 16 | +++ b/benchmark/extract.py 17 | @@ -7,11 +7,12 @@ def main(args): 18 | from benchmark.models.data2vec.extract_data2vec_features import main as extract_data2vec_audio_features_main #data2vec-audio 19 | from benchmark.models.handcrafted.extract_handcrafted_features import main as extract_handcrafted_features_main 20 | from benchmark.models.jukebox.extract_jukemir_features import main as extract_jukemir_features_main 21 | - from benchmark.models.musicnn.extract_musicnn_features import main as extract_musicnn_features_main 22 | + #from benchmark.models.musicnn.extract_musicnn_features import main as extract_musicnn_features_main 23 | from benchmark.models.clmr.extract_clmr_features import main as extract_clmr_features_main 24 | - from benchmark.models.mule.extract_mule_features import main as extract_mule_features_main 25 | + #from benchmark.models.mule.extract_mule_features import main as extract_mule_features_main 26 | from benchmark.models.hubert.extract_hubert_features import main as extract_speech_hubert_features_main #hubert 27 | - from benchmark.models.yue.extract_yue_features import main as extract_yue_features_main 28 | + #from benchmark.models.yue.extract_yue_features import main as extract_yue_features_main 29 | + from benchmark.models.evar.extract_evar_features import main as extract_evar_features_main 30 | 31 | config = load_config(args.config, namespace=True) 32 | 33 | diff --git a/benchmark/probe.py b/benchmark/probe.py 34 | index 4f2b746..7aaad6d 100644 35 | --- a/benchmark/probe.py 36 | +++ b/benchmark/probe.py 37 | @@ -1,6 +1,8 @@ 38 | 
import wandb 39 | import argparse 40 | import torch 41 | +import pandas as pd 42 | +from pathlib import Path 43 | import pytorch_lightning as pl 44 | 45 | import benchmark as bench 46 | @@ -27,6 +29,12 @@ def main(args): 47 | assert cfg.trainer.paradigm == 'probe', "paradigm must be probe for probe.py" 48 | pl.seed_everything(cfg.trainer.seed) 49 | 50 | + if cfg.dataset.pre_extract.feature_extractor.pretrain.num_features is None: 51 | + import yaml 52 | + with open(cfg.dataset.pre_extract.feature_extractor.pretrain.evar_config) as f: 53 | + evar_cfg = yaml.safe_load(f) 54 | + cfg.dataset.pre_extract.feature_extractor.pretrain.num_features = evar_cfg['feature_d'] 55 | + 56 | logger = get_logger(cfg) 57 | model = get_model(cfg) 58 | train_loader, valid_loader, test_loader = get_dataloaders(cfg) 59 | @@ -73,5 +81,26 @@ def main(args): 60 | # does it really save the best model? 61 | if cfg.checkpoint.save_best_to is not None: trainer.save_checkpoint(cfg.checkpoint.save_best_to) 62 | 63 | + def append_to_csv(csv_filename, data): 64 | + filename = Path(csv_filename) 65 | + filename.parent.mkdir(parents=True, exist_ok=True) 66 | + df = pd.read_csv(filename) if filename.exists() else pd.DataFrame() 67 | + df = pd.concat([df, data], ignore_index=True).to_csv(filename, index=False) 68 | + 69 | + csvname = f'score_{cfg.dataset.dataset}.csv' 70 | + model = cfg.dataset.pre_extract.feature_extractor.pretrain.name 71 | + model = Path(cfg.dataset.pre_extract.feature_extractor.pretrain.evar_config).stem if model == 'evar' else model 72 | + weight = Path(str(cfg.dataset.pre_extract.feature_extractor.pretrain.weight)) 73 | + report = { 74 | + 'model': [model], 75 | + 'weight': [weight.parent.name + '/' + weight.name], 76 | + 'task': [cfg.dataset.dataset], 77 | + } 78 | + for k in trainer.logged_metrics: 79 | + report[k] = trainer.logged_metrics[k].item() 80 | + result_df = pd.DataFrame(report) 81 | + append_to_csv(csvname, result_df) 82 | + print(report) 83 | + 84 | wandb.finish() 85 | 86 | -------------------------------------------------------------------------------- /evar/model_utils.py: -------------------------------------------------------------------------------- 1 | """Model utilities. 2 | """ 3 | 4 | import logging 5 | from pathlib import Path 6 | import torch 7 | from torch import nn 8 | 9 | 10 | def ensure_weights(filename, url): 11 | """Ensures thar `filename` exists, or download from the `url`""" 12 | 13 | if not Path(filename).is_file(): 14 | import urllib.request 15 | logging.info(f'Downloading {url} as {filename} ...') 16 | urllib.request.urlretrieve(url, filename) 17 | 18 | 19 | def load_pretrained_weights(model, pathname, model_key='model', strict=True): 20 | state_dict = torch.load(pathname) 21 | if 'state_dict' in state_dict: 22 | state_dict = state_dict['state_dict'] 23 | if 'model' in state_dict: 24 | state_dict = state_dict['model'] 25 | children = sorted([n + '.' 
for n, _ in model.named_children()]) 26 | 27 | # 'model.xxx' -> 'xxx" 28 | weights = {} 29 | for k in state_dict: 30 | weights[k[len(model_key)+1:] if k.startswith(model_key+'.') else k] = state_dict[k] 31 | state_dict = weights 32 | 33 | # model's parameter only 34 | def find_model_prm(k): 35 | for name in children: 36 | if name in k: # ex) "conv_block1" in "model.conv_block1.conv1.weight" 37 | return k 38 | return None 39 | 40 | weights = {} 41 | for k in state_dict: 42 | if find_model_prm(k) is None: continue 43 | weights[k] = state_dict[k] 44 | 45 | logging.info(f' using network pretrained weight: {Path(pathname).name}') 46 | print(list(weights.keys())) 47 | logging.info(str(model.load_state_dict(weights, strict=strict))) 48 | return sorted(list(weights.keys())) 49 | 50 | 51 | def set_layers_trainable(layer, trainable=False): 52 | for n, p in layer.named_parameters(): 53 | p.requires_grad = trainable 54 | 55 | 56 | def show_layers_trainable(layer, show_all_trainable=True, print_str=True): 57 | total_params = sum(p.numel() for p in layer.parameters()) 58 | total_trainable_params = sum(p.numel() for p in layer.parameters() if p.requires_grad) 59 | str_total = f'Total number of parameters: {total_params:,} (trainable {total_trainable_params:,})\n' 60 | if print_str: print(str_total) 61 | trainable = [n for n, p in layer.named_parameters() if p.requires_grad] 62 | str_trainable = f'Trainable parameters: {trainable if show_all_trainable else trainable[:10]} ...\n' 63 | frozen = [n for n, p in layer.named_parameters() if not p.requires_grad] 64 | str_frozen = f'\nOthers are frozen such as: {frozen[:3]} ...' if len(frozen) >= 3 else '' 65 | if print_str: print(str_trainable) 66 | if print_str: print(str_frozen) 67 | return str_total + str_trainable + str_frozen 68 | 69 | 70 | def initialize_layers(layer): 71 | # initialize all childrens first. 72 | for l in layer.children(): 73 | initialize_layers(l) 74 | 75 | # initialize only linaer 76 | if type(layer) != nn.Linear: 77 | return 78 | 79 | # Thanks to https://github.com/qiuqiangkong/audioset_tagging_cnn/blob/d2f4b8c18eab44737fcc0de1248ae21eb43f6aa4/pytorch/models.py#L10 80 | logging.debug(f' initialize {layer}.weight') 81 | nn.init.xavier_uniform_(layer.weight) 82 | if hasattr(layer, 'bias'): 83 | if layer.bias is not None: 84 | logging.debug(f' initialize {layer}.bias') 85 | layer.bias.data.fill_(0.) 
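# Illustrative usage of the helpers above (a sketch only, not part of the EVAR pipeline):
# freeze a pretrained backbone for linear probing and re-initialize a small linear head.
#
#   backbone = SomeEncoder()  # hypothetical nn.Module with a pretrained checkpoint
#   load_pretrained_weights(backbone, 'weights.pth', model_key='model', strict=False)
#   set_layers_trainable(backbone, trainable=False)   # freeze all backbone parameters
#   head = nn.Linear(2048, 527)
#   initialize_layers(head)                           # Xavier-uniform weight, zero bias
#   print(show_layers_trainable(nn.Sequential(backbone, head), print_str=False))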
86 | 87 | 88 | class MLP(nn.Module): 89 | def __init__(self, input_size, hidden_sizes, output_size, hidden_dropout=0.5, mean=0.0, std=0.01, bias=0.): 90 | super().__init__() 91 | sizes = [input_size] + list(hidden_sizes) + [output_size] 92 | fcs = [] 93 | for l, (in_size, out_size) in enumerate(zip(sizes[:-1], sizes[1:])): 94 | if l > 0: 95 | fcs.append(nn.Dropout(hidden_dropout)) 96 | linear = nn.Linear(in_size, out_size) 97 | nn.init.normal_(linear.weight, mean=mean, std=std) 98 | nn.init.constant_(linear.bias, bias) 99 | fcs.append(linear) 100 | fcs.append(nn.ReLU()) 101 | self.mlp = nn.Sequential(*fcs[:-1]) 102 | 103 | def forward(self, x): 104 | out = self.mlp(x) 105 | return out 106 | 107 | 108 | def mean_max_pooling(frame_embeddings, dim=-1): 109 | assert len(frame_embeddings.shape) == 3 # Batch,Feature Dimension,Time 110 | (x1, _) = torch.max(frame_embeddings, dim=dim) 111 | x2 = torch.mean(frame_embeddings, dim=dim) 112 | x = x1 + x2 113 | return x 114 | 115 | 116 | def mean_pooling(frame_embeddings, dim=-1): 117 | assert len(frame_embeddings.shape) == 3 # Batch,Feature Dimension,Time 118 | x2 = torch.mean(frame_embeddings, dim=dim) 119 | return x2 120 | 121 | 122 | def max_pooling(frame_embeddings, dim=-1): 123 | assert len(frame_embeddings.shape) == 3 # Batch,Feature Dimension,Time 124 | (x1, _) = torch.max(frame_embeddings, dim=dim) 125 | return x1 126 | -------------------------------------------------------------------------------- /evar/ar_laionclap.py: -------------------------------------------------------------------------------- 1 | """Wrapper code for: 2 | 3 | Large-Scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation 4 | 5 | ## Reference 6 | - [1] https://arxiv.org/abs/2211.06687 7 | - [2] https://github.com/LAION-AI/CLAP 8 | """ 9 | 10 | from evar.ar_base import BaseCLAP 11 | try: 12 | from packaging import version 13 | import torch 14 | import transformers, os 15 | import laion_clap 16 | except: 17 | pass # please install: pip install laion-clap 18 | 19 | 20 | def load_state_dict(checkpoint_path: str, map_location="cpu", skip_params=True): 21 | # https://github.com/LAION-AI/CLAP/blob/817041c079af560fa2c610287c68c7c97ace50b6/src/laion_clap/clap_module/factory.py#L53 22 | checkpoint = torch.load(checkpoint_path, map_location=map_location) 23 | if isinstance(checkpoint, dict) and "state_dict" in checkpoint: 24 | state_dict = checkpoint["state_dict"] 25 | else: 26 | state_dict = checkpoint 27 | if skip_params: 28 | if next(iter(state_dict.items()))[0].startswith("module"): 29 | state_dict = {k[7:]: v for k, v in state_dict.items()} 30 | 31 | # removing position_ids to maintain compatibility with latest transformers update 32 | if version.parse(transformers.__version__) >= version.parse("4.31.0"): 33 | del state_dict["text_branch.embeddings.position_ids"] 34 | return state_dict 35 | 36 | 37 | def load_ckpt(self, ckpt = None, model_id = -1, verbose = True): 38 | # https://github.com/LAION-AI/CLAP/blob/817041c079af560fa2c610287c68c7c97ace50b6/src/laion_clap/hook.py#L74C2-L119C5 39 | """Load the pretrained checkpoint of CLAP model 40 | 41 | Parameters 42 | ---------- 43 | ckpt: str 44 | if ckpt is specified, the model will load this ckpt, otherwise the model will download the ckpt from zenodo. \n 45 | For fusion model, it will download the 630k+audioset fusion model (id=3). For non-fusion model, it will download the 630k+audioset model (id=1). 
46 | model_id: 47 | if model_id is specified, you can download our best ckpt, as: 48 | id = 0 --> 630k non-fusion ckpt \n 49 | id = 1 --> 630k+audioset non-fusion ckpt \n 50 | id = 2 --> 630k fusion ckpt \n 51 | id = 3 --> 630k+audioset fusion ckpt \n 52 | Note that if your model is specied as non-fusion model but you download a fusion model ckpt, you will face an error. 53 | """ 54 | import wget 55 | download_link = 'https://huggingface.co/lukewys/laion_clap/resolve/main/' 56 | download_names = [ 57 | '630k-best.pt', 58 | '630k-audioset-best.pt', 59 | '630k-fusion-best.pt', 60 | '630k-audioset-fusion-best.pt' 61 | ] 62 | if ckpt is not None: 63 | print(f'Load the specified checkpoint {ckpt} from users.') 64 | else: 65 | print(f'Load our best checkpoint in the paper.') 66 | if model_id == -1: 67 | model_id = 3 if self.enable_fusion else 1 68 | package_dir = os.path.dirname(os.path.realpath(__file__)) 69 | weight_file_name = download_names[model_id] 70 | ckpt = os.path.join(package_dir, weight_file_name) 71 | print(ckpt) 72 | if os.path.exists(ckpt): 73 | print(f'The checkpoint is already downloaded') 74 | else: 75 | print('Downloading laion_clap weight files...') 76 | ckpt = wget.download(download_link + weight_file_name, os.path.dirname(ckpt)) 77 | print('Download completed!') 78 | print('Load Checkpoint...') 79 | ckpt = load_state_dict(ckpt, skip_params=True) 80 | self.model.load_state_dict(ckpt) 81 | if verbose: 82 | param_names = [n for n, p in self.model.named_parameters()] 83 | for n in param_names: 84 | print(n, "\t", "Loaded" if n in ckpt else "Unloaded") 85 | 86 | 87 | class AR_LAIONCLAP(BaseCLAP): 88 | 89 | def __init__(self, cfg): 90 | super().__init__(cfg=cfg) 91 | 92 | self.backbone = laion_clap.CLAP_Module() 93 | # workaround to make sure: del state_dict["text_branch.embeddings.position_ids"] 94 | print(version.parse(transformers.__version__)) 95 | self.backbone.load_ckpt = load_ckpt.__get__(self.backbone, laion_clap.CLAP_Module) 96 | self.backbone.load_ckpt() 97 | 98 | def encode_frames(self, batch_audio): 99 | assert False, 'encode_frames for LAION-CLAP is not supported for now' 100 | 101 | def forward(self, batch_audio): 102 | audio_embeddings = self.backbone.get_audio_embedding_from_data(x=batch_audio, use_tensor=True) 103 | return audio_embeddings 104 | 105 | def encode_audio(self, batch_audio): 106 | audio_embeddings = self.forward(batch_audio) 107 | return audio_embeddings 108 | 109 | def encode_text(self, batch_text): 110 | text_embeddings = self.backbone.get_text_embedding(batch_text, use_tensor=True) 111 | return text_embeddings 112 | -------------------------------------------------------------------------------- /Preparing-models.md: -------------------------------------------------------------------------------- 1 | # Instructions for preparing models 2 | 3 | The followings are command lines to prepare models. 4 | 5 | **Note: you can setup only the models you need.** 6 | 7 | ## AST 8 | 9 | cd external/ 10 | git clone https://github.com/YuanGongND/ast.git 11 | patch -p1 < ast_models.patch 12 | pip install wget 13 | cd .. 14 | 15 | ## ATST & ATST-Frame 16 | 17 | In addition to the following steps, please download the ATST-Frame checkpoint as `external/atstframe_base.ckpt` from https://github.com/Audio-WestlakeU/audiossl/tree/main/audiossl/methods/atstframe. 
18 | 19 | (cd external && git clone https://github.com/Audio-WestlakeU/audiossl.git) 20 | (cd external && wget https://checkpointstorage.oss-cn-beijing.aliyuncs.com/atst/base.ckpt -O atst_base.ckpt) 21 | pip install pytorch_lightning fairseq 22 | 23 | ## BEATs 24 | 25 | In addition to the following steps, please download the BEATs_iter3 and BEATs_iter3_plus checkpoints as `external/BEATs_iter3.pt` and `external/BEATs_iter3_plus_AS2M.pt` from https://github.com/microsoft/unilm/tree/master/beats. 26 | 27 | (cd external && git clone https://github.com/microsoft/unilm.git) 28 | 29 | ## BYOL-A (IJCNN2021) & BYOL-A v2 (TASLP2023) 30 | 31 | cd external/ 32 | git clone https://github.com/nttcslab/byol-a.git 33 | mv byol-a byol_a 34 | cd .. 35 | 36 | ## CED 37 | 38 | (cd external && git clone https://github.com/jimbozhang/hf_transformers_custom_model_ced.git) 39 | pip install transformers 40 | 41 | ## COALA 42 | 43 | cd external/ 44 | git clone https://github.com/xavierfav/coala.git 45 | cd coala 46 | patch -p1 < ../../external/coala.patch 47 | cd ../.. 48 | 49 | ## Dasheng 50 | 51 | pip install git+https://github.com/jimbozhang/hf_transformers_custom_model_dasheng.git 52 | 53 | ## ESResNe(X)t-fbsp 54 | 55 | cd external 56 | wget https://github.com/AndreyGuzhov/ESResNeXt-fbsp/releases/download/v0.1/ESResNeXtFBSP_AudioSet.pt 57 | git clone https://github.com/AndreyGuzhov/ESResNeXt-fbsp.git esresnext 58 | pip install msgpack_numpy 59 | cd esresnext 60 | sed -i 's/import ignite_trainer as it/#import ignite_trainer as it/' model/esresnet_base.py utils/transforms.py utils/datasets.py utils/datasets.py 61 | sed -i 's/it\.AbstractNet/torch.nn\.Module/' model/esresnet_base.py 62 | sed -i 's/it\.AbstractTransform/torch.nn\.Module/' utils/transforms.py 63 | sed -i 's/from model /from \. /' model/esresnet_base.py 64 | sed -i 's/from model\./from \./' model/esresnet_fbsp.py 65 | sed -i 's/from utils/from \.\.utils/' model/esresnet_base.py model/esresnet_fbsp.py 66 | sed -i 's/from utils/from \./' utils/datasets.py 67 | cd ../.. 68 | 69 | ## HTS-AT 70 | 71 | In addition to the following steps, please download the checkpoint as `external/HTSAT_AudioSet_Saved_1.ckpt` from https://github.com/RetroCirce/HTS-Audio-Transformer?tab=readme-ov-file#model-checkpoints. 72 | 73 | (cd external && git clone https://github.com/RetroCirce/HTS-Audio-Transformer.git htsat) 74 | pip install h5py museval torchlibrosa 75 | 76 | ## M2D 77 | 78 | To get M2D ready, follow the steps 👉 [M2D setup](https://github.com/nttcslab/m2d?tab=readme-ov-file#1-setup): 79 | 80 | cd external 81 | << follow the steps described in https://github.com/nttcslab/m2d?tab=readme-ov-file#1-setup >> 82 | 83 | Download the weights from the GitHub. Example: 84 | 85 | wget https://github.com/nttcslab/m2d/releases/download/v0.3.0/m2d_clap_vit_base-80x1001p16x16-240128_AS-FT_enconly.zip 86 | unzip m2d_clap_vit_base-80x1001p16x16-240128_AS-FT_enconly.zip 87 | 88 | You will find the `m2d_clap_vit_base-80x1001p16x16-240128_AS-FT_enconly` folder. 89 | The following runs a linear evaluation on CREMA-D. 
90 | 91 | python lineareval.py config/m2d.yaml cremad weight_file=m2d_clap_vit_base-80x1001p16x16-240128_AS-FT_enconly/weights_ep67it3124-0.48558.pth 92 | 93 | ## MS-CLAP, LAION-CLAP 94 | 95 | pip install msclap 96 | pip install laion-clap 97 | 98 | ## Opera 99 | 100 | (cd external && git clone https://github.com/evelyn0414/OPERA.git) 101 | (cd external/OPERA && curl -L -O https://huggingface.co/evelyn0414/OPERA/resolve/main/encoder-operaCT.ckpt) 102 | (cd external/OPERA && patch -p0 < ../opera.patch) 103 | 104 | ## VGGish 105 | 106 | cd external 107 | git clone https://github.com/tcvrick/audioset-vggish-tensorflow-to-pytorch.git tcvrick_vggish 108 | sed -i 's/from audioset import/from \. import/' tcvrick_vggish/audioset/vggish_input.py 109 | wget https://github.com/tcvrick/audioset-vggish-tensorflow-to-pytorch/releases/download/v0.1/pytorch_vggish.zip 110 | unzip pytorch_vggish.zip 111 | cd .. 112 | 113 | ## WavCaps 114 | 115 | In addition to the following steps, please download the checkpoint `HTSAT-BERT-PT.pt` in the folder `external/WavCaps` from https://github.com/XinhaoMei/WavCaps/tree/master/retrieval. 116 | 117 | (cd external && git clone https://github.com/XinhaoMei/WavCaps.git) 118 | (cd external/WavCaps && git apply ../../external/wavcaps.patch) 119 | pip install ruamel.yaml sentence_transformers wandb loguru torchlibrosa 120 | 121 | -------------------------------------------------------------------------------- /evar/ar_opera.py: -------------------------------------------------------------------------------- 1 | """Wrapper code for: 2 | 3 | Towards Open Respiratory Acoustic Foundation Models: Pretraining and Benchmarking 4 | 5 | ## Reference 6 | - [1] https://arxiv.org/abs/2406.16148 7 | - [2] https://github.com/evelyn0414/OPERA 8 | """ 9 | 10 | from evar.ar_base import BaseAudioRepr, np 11 | import torch 12 | import librosa 13 | import logging 14 | 15 | try: 16 | import sys 17 | sys.path.append('../../external/OPERA') 18 | import os 19 | evar_home = os.getenv('EVAR', '') 20 | sys.path.append(os.path.join(evar_home, 'external/OPERA')) 21 | from src.model.models_cola import Cola 22 | from src.util import _equally_slice_pad_sample, _duplicate_padding 23 | except Exception as e: 24 | pass # print(f'(For M2D users) Build your EVAR in your M2D folder.') 25 | 26 | 27 | def split_pad_sample(sample, desired_length, sample_rate, types='repeat'): 28 | # Quoted from https://github.com/evelyn0414/OPERA/blob/main/src/util.py 29 | """ 30 | if the audio sample length > desired_length, then split and pad samples 31 | else simply pad samples according to pad_types 32 | * types 'zero' : simply pad by zeros (zero-padding) 33 | * types 'repeat' : pad with duplicate on both sides (half-n-half) 34 | * types 'aug' : pad with augmented sample on both sides (half-n-half) 35 | """ 36 | if types == 'zero': 37 | return _equally_slice_pad_sample(sample, desired_length, sample_rate) 38 | 39 | output_length = int(desired_length * sample_rate) 40 | soundclip = sample[0].copy() 41 | n_samples = len(soundclip) 42 | 43 | output = [] 44 | if n_samples > output_length: 45 | """ 46 | if sample length > desired_length, slice samples with desired_length then just use them, 47 | and the last sample is padded according to the padding types 48 | """ 49 | # frames[j] = x[j * hop_length : j * hop_length + frame_length] 50 | frames = librosa.util.frame( 51 | soundclip, frame_length=output_length, hop_length=output_length//2, axis=0) 52 | for i in range(frames.shape[0]): 53 | output.append((frames[i], sample[1], 
sample[2])) 54 | 55 | # get the last sample 56 | last_id = frames.shape[0] * (output_length//2) 57 | last_sample = soundclip[last_id:] 58 | 59 | padded = _duplicate_padding( 60 | soundclip, last_sample, output_length, sample_rate, types) 61 | output.append((padded, sample[1], sample[2])) 62 | else: # only pad 63 | padded = _duplicate_padding( 64 | soundclip, soundclip, output_length, sample_rate, types) 65 | output.append((padded, sample[1], sample[2])) 66 | 67 | return output 68 | 69 | 70 | def pre_process_audio_mel_t(audio, sample_rate=16000, n_mels=64, f_min=50, f_max=8000, nfft=1024, hop=512): 71 | # Quoted from https://github.com/evelyn0414/OPERA/blob/main/src/util.py 72 | S = librosa.feature.melspectrogram( 73 | y=audio, sr=sample_rate, n_mels=n_mels, fmin=f_min, fmax=f_max, n_fft=nfft, hop_length=hop) 74 | # convert scale to dB from magnitude 75 | S = librosa.power_to_db(S, ref=np.max) 76 | if S.max() != S.min(): 77 | mel_db = (S - S.min()) / (S.max() - S.min()) 78 | else: 79 | mel_db = S 80 | print("warning in producing spectrogram!") 81 | 82 | return mel_db 83 | 84 | 85 | def get_entire_signal_librosa(data, input_sec=8, sample_rate=16000, butterworth_filter=None, pad=False, from_cycle=False, yt=None, types='repeat'): 86 | device = data.device 87 | # Cut from https://github.com/evelyn0414/OPERA/blob/main/src/util.py 88 | # Trim leading and trailing silence from an audio signal. 89 | FRAME_LEN = int(sample_rate / 10) # 90 | HOP = int(FRAME_LEN / 2) # 50% overlap, meaning 5ms hop length 91 | yt, index = librosa.effects.trim(data.cpu().numpy(), frame_length=FRAME_LEN, hop_length=HOP) 92 | 93 | # check audio not too short 94 | duration = librosa.get_duration(y=yt, sr=sample_rate) 95 | if duration < input_sec: 96 | yt = split_pad_sample([yt, 0,0], input_sec, sample_rate, types)[0][0] 97 | 98 | # # visualization for testing the spectrogram parameters 99 | # plot_melspectrogram(yt.squeeze(), title=filename.replace("/", "-")) 100 | return torch.tensor(pre_process_audio_mel_t(yt.squeeze(), f_max=8000)).to(device) 101 | 102 | 103 | class AR_OPERA_CT(BaseAudioRepr): 104 | 105 | def __init__(self, cfg): 106 | super().__init__(cfg=cfg) 107 | if 'icbhi_sprs_mode' not in cfg: 108 | logging.error('\n\n *** The model supports app/ICBHI_SPRT only. Exiting... 
***\n') 109 | exit(-1) 110 | self.backbone = Cola(encoder="htsat") 111 | ckpt = torch.load(cfg.weight_file) 112 | self.backbone.load_state_dict(ckpt["state_dict"], strict=False) 113 | 114 | def encode_frames(self, batch_audio): 115 | x = get_entire_signal_librosa(batch_audio, input_sec=8) #, input_sec=self.cfg.unit_samples / self.cfg.sample_rate) 116 | x = self.augment_if_training(x) 117 | x = x.transpose(-2, -1) # B,D,T -> B,T,D 118 | features = self.backbone.extract_feature(x, self.cfg.feature_d) 119 | return features.unsqueeze(-1) # [B, D] -> [B, D, 1] 120 | 121 | def forward(self, batch_audio): 122 | x = self.encode_frames(batch_audio) 123 | return x.mean(dim=-1) # [B, D, T] -> [B, D] 124 | 125 | -------------------------------------------------------------------------------- /plugin/MARBLE/benchmark/models/evar/extract_evar_features.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | 4 | import torch 5 | import numpy as np 6 | from tqdm import tqdm 7 | import wget 8 | import sys 9 | import pandas as pd 10 | import librosa 11 | 12 | from benchmark.utils.audio_utils import load_audio, find_audios 13 | 14 | sys.path.append(os.environ.get('EVAR', '')) 15 | import evar 16 | from lineareval import make_cfg 17 | 18 | 19 | def select_args(config): 20 | args = argparse.Namespace() 21 | args.accelerator = config.dataset.pre_extract.accelerator 22 | args.output_dir = config.dataset.pre_extract.output_dir 23 | args.overwrite = config.dataset.pre_extract.overwrite 24 | args.audio_dir = config.dataset.pre_extract.audio_dir 25 | args.n_shard = config.args.n_shard 26 | args.shard_rank = config.args.shard_rank 27 | args.keep_folder_structure = config.dataset.pre_extract.keep_folder_structure 28 | args.evar_config = config.dataset.pre_extract.feature_extractor.pretrain.evar_config 29 | args.weight = config.dataset.pre_extract.feature_extractor.pretrain.weight 30 | args.options = config.dataset.pre_extract.feature_extractor.pretrain.options 31 | return args 32 | 33 | 34 | class WavDataset(evar.data.BaseRawAudioDataset): 35 | def __init__(self, cfg, files): 36 | super().__init__(cfg.unit_samples, tfms=None, random_crop=False, return_filename=cfg.return_filename) 37 | self.cfg = cfg 38 | self.df = pd.DataFrame({'file_name': files}) 39 | self.cfg.task_data = 'dummy' 40 | 41 | def __len__(self): 42 | return len(self.df) 43 | 44 | def get_audio(self, index): 45 | filename = self.df.file_name.values[index] 46 | if self.cfg.return_filename: 47 | return filename 48 | wav, sr = librosa.load(filename, sr=self.cfg.sample_rate, mono=True) 49 | wav = torch.tensor(wav).to(torch.float32) 50 | return wav 51 | 52 | def __getitem__(self, index): 53 | wav = self.get_audio(index) 54 | return wav 55 | 56 | 57 | def collate_trunc_wav(original_batch): 58 | if isinstance(original_batch[0], (str)): 59 | return original_batch # return_filename 60 | # truncate all items to the size of the shortest item 61 | truncated = [] 62 | shortest = min([b.shape[-1] for b in original_batch]) 63 | for item in original_batch: 64 | l = item.shape[-1] 65 | if l > shortest: 66 | i = np.random.randint(l - shortest) 67 | item = item[..., i:i+shortest] 68 | truncated.append(item) 69 | return torch.stack(truncated) 70 | 71 | 72 | def main(config): 73 | args = select_args(config) 74 | 75 | os.makedirs(args.output_dir, exist_ok=True) 76 | 77 | audio_files = find_audios(args.audio_dir) 78 | print(f'Found {len(audio_files)} audio files') 79 | 80 | if args.n_shard > 1: 81 | 
print(f'processing shard {args.shard_rank} of {args.n_shard}') 82 | audio_files.sort() # make sure no intersetction 83 | audio_files = audio_files[args.shard_rank * len(audio_files) // args.n_shard : (args.shard_rank + 1) * len(audio_files) // args.n_shard] 84 | 85 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 86 | options = f'weight_file={args.weight},' + ('' if args.options is None else args.options) 87 | cfg, n_folds, balanced = make_cfg(args.evar_config, 'as20k', options, extras={}, abs_unit_sec=10) # as20k is a dummy task, 10s is a dummy input unit second 88 | model = eval('evar.'+cfg.audio_repr)(cfg).to(device) 89 | 90 | batch_size = 32 # TODO make it flexible 91 | dataset = WavDataset(cfg, np.random.default_rng().choice(audio_files, min(len(audio_files), 1000), replace=False)) # choose random 1000< samples for calculating statistics 92 | data_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_trunc_wav) 93 | print(f'Precomputing using the audio representation: {cfg.id} for {len(dataset)} files ({len(data_loader)} batches)') 94 | model.precompute(device, data_loader) 95 | 96 | print(f'Extracting features using {cfg.id}') 97 | for audio_file in tqdm(audio_files): 98 | # load audio 99 | try: 100 | waveform = load_audio( 101 | audio_file, 102 | target_sr=config.dataset.pre_extract.feature_extractor.pretrain.target_sr, 103 | is_mono=True, 104 | is_normalize=False, 105 | crop_to_length_in_sec=None, 106 | ) 107 | except Exception as e: 108 | print(f"skip audio {audio_file} because of {e}") 109 | continue 110 | 111 | # extract features 112 | #waveform = waveform.squeeze().cpu().numpy() 113 | with torch.no_grad(): 114 | audio_data = [audio_file] if cfg.return_filename else waveform.to('cuda') 115 | embeddings = model(audio_data) # [dims] 116 | # reshape to [1, 1, dims] 117 | out = embeddings.reshape(1, 1, -1).cpu().detach().numpy() 118 | 119 | # save to npy 120 | if args.keep_folder_structure: 121 | output_file = os.path.join( 122 | args.output_dir, 123 | os.path.relpath(audio_file, args.audio_dir)+'.npy', 124 | ) 125 | os.makedirs(os.path.dirname(output_file), exist_ok=True) 126 | else: 127 | output_file = os.path.join( 128 | args.output_dir, 129 | os.path.basename(audio_file)+'.npy', 130 | ) 131 | if not args.overwrite: 132 | assert not os.path.exists(output_file), f"{output_file} exists" 133 | np.save(output_file, out) 134 | --------------------------------------------------------------------------------
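The features saved by `plugin/MARBLE/benchmark/models/evar/extract_evar_features.py` above are one NumPy array of shape `[1, 1, feature_d]` per audio file (a single clip-level embedding). A minimal sketch for loading them back, assuming a hypothetical feature folder that corresponds to the `output_dir` used at extraction time:

```python
import numpy as np
from pathlib import Path

feature_dir = Path('data/evar_features')  # hypothetical: the output_dir set in the MARBLE config
for npy_file in sorted(feature_dir.rglob('*.npy'))[:3]:
    emb = np.load(npy_file).reshape(-1)   # [1, 1, feature_d] -> [feature_d]
    print(npy_file.name, emb.shape)
```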