├── superb └── upstream │ └── m2d │ ├── __init__.py │ ├── hubconf.py │ ├── expert.py │ └── README.md ├── LICENSE.pdf ├── clap ├── image_figure2.jpg ├── image-ESC10-Viz.jpg ├── image_Table3_CLAP_LE.png ├── image_Table4_CLAP_FT.png ├── image_Table5_CLAP_ZS.png ├── README.md └── Note-ACalt4_GTEbase.ipynb ├── LICENSE ├── image-AppGuideChart.png ├── image-key-visual-m2d.jpg ├── speech ├── figure-github.jpg ├── figure2system-s.jpg ├── bat │ ├── asv.sh │ ├── ic.sh │ ├── ks.sh │ ├── sid.sh │ ├── pr.sh │ ├── ss.sh │ └── er.sh ├── extract_offline_ls960.py ├── README.md └── speech_dataset.py ├── app ├── circor │ ├── EMBC_TableII.png │ ├── bat │ │ ├── m2d_ftcircor_rand.sh │ │ ├── m2d_ftcircor.sh │ │ ├── ast_ftcircor.sh │ │ ├── ast_ftcircor_noaug.sh │ │ ├── cnn14_ftcircor.sh │ │ ├── cnn14_ftcircor_noaug.sh │ │ ├── byola_ftcircor.sh │ │ └── byola_ftcircor_noaug.sh │ ├── diff-heart-murmur-detection.patch │ ├── diff-evar.patch │ ├── README.md │ └── circor_eval.py └── icbhi_sprs │ ├── eval_icbhi.sh │ ├── eval_sprs.sh │ ├── cut_data_sprs.py │ ├── README_ICBHI_SPRS.md │ └── patch_scl_icbhi2017.diff ├── image-key-vis-m2d-clap.jpg ├── audioset ├── table-V-M2D-AS-le.png ├── table-VI-M2D-AS-ft.png └── README.md ├── requirements.txt ├── data ├── files_audioset.csv └── README.md ├── util ├── ft-spc.sh ├── ft-as20k.sh ├── ft-esc50.sh ├── ft-vc1.sh ├── to_encoder_only_weight.py ├── ft-as2m.sh └── make_as_weighted_list.py ├── quick_eval.sh ├── all_eval.sh ├── .gitignore ├── common.py ├── examples ├── Example_1.ipynb ├── Example_old4_CLAP2024.ipynb └── Example_4_CLAP2025.ipynb ├── wav_to_lms.py ├── Guide_app.md └── audio_dataset.py /superb/upstream/m2d/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /LICENSE.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nttcslab/m2d/HEAD/LICENSE.pdf -------------------------------------------------------------------------------- /clap/image_figure2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nttcslab/m2d/HEAD/clap/image_figure2.jpg -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Please find the LICENSE at https://github.com/nttcslab/m2d/blob/master/LICENSE.pdf 2 | -------------------------------------------------------------------------------- /clap/image-ESC10-Viz.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nttcslab/m2d/HEAD/clap/image-ESC10-Viz.jpg -------------------------------------------------------------------------------- /image-AppGuideChart.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nttcslab/m2d/HEAD/image-AppGuideChart.png -------------------------------------------------------------------------------- /image-key-visual-m2d.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nttcslab/m2d/HEAD/image-key-visual-m2d.jpg -------------------------------------------------------------------------------- /speech/figure-github.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/nttcslab/m2d/HEAD/speech/figure-github.jpg -------------------------------------------------------------------------------- /app/circor/EMBC_TableII.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nttcslab/m2d/HEAD/app/circor/EMBC_TableII.png -------------------------------------------------------------------------------- /image-key-vis-m2d-clap.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nttcslab/m2d/HEAD/image-key-vis-m2d-clap.jpg -------------------------------------------------------------------------------- /speech/figure2system-s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nttcslab/m2d/HEAD/speech/figure2system-s.jpg -------------------------------------------------------------------------------- /audioset/table-V-M2D-AS-le.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nttcslab/m2d/HEAD/audioset/table-V-M2D-AS-le.png -------------------------------------------------------------------------------- /clap/image_Table3_CLAP_LE.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nttcslab/m2d/HEAD/clap/image_Table3_CLAP_LE.png -------------------------------------------------------------------------------- /clap/image_Table4_CLAP_FT.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nttcslab/m2d/HEAD/clap/image_Table4_CLAP_FT.png -------------------------------------------------------------------------------- /clap/image_Table5_CLAP_ZS.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nttcslab/m2d/HEAD/clap/image_Table5_CLAP_ZS.png -------------------------------------------------------------------------------- /audioset/table-VI-M2D-AS-ft.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nttcslab/m2d/HEAD/audioset/table-VI-M2D-AS-ft.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | numpy 3 | matplotlib 4 | torch 5 | torchaudio 6 | torchvision 7 | tensorboard 8 | fire 9 | tqdm 10 | scikit-learn 11 | librosa 12 | nnAudio 13 | timm==0.4.5 14 | transformers 15 | einops 16 | easydict 17 | torchlibrosa 18 | -------------------------------------------------------------------------------- /app/icbhi_sprs/eval_icbhi.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ $# -lt 2 ]; then 4 | n_iter=3 5 | else 6 | n_iter=$2 7 | fi 8 | 9 | if [ $# -lt 3 ]; then 10 | lr_prm=5e-5 11 | else 12 | lr_prm=$3 13 | fi 14 | 15 | echo Repeating $n_iter times... 
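# Arguments: $1 = M2D weight passed to --weightspath, $2 = number of repeats (default 3), $3 = learning rate (default 5e-5).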
16 | 17 | for i in $(seq $n_iter); do 18 | cmdline="CUDA_VISIBLE_DEVICES=0 python app_main.py --method sl --backbone m2d --epochs 150 --bs 64 --weightspath $1 --lr $lr_prm --freeze_embed --split_iter 4" 19 | echo $cmdline 20 | eval $cmdline 21 | done 22 | -------------------------------------------------------------------------------- /data/files_audioset.csv: -------------------------------------------------------------------------------- 1 | file_name 2 | audioset_lms/balanced_train_segments/--aE2O5G5WE_0.000.npy 3 | audioset_lms/balanced_train_segments/--cB2ZVjpnA_30.000.npy 4 | audioset_lms/balanced_train_segments/--aaILOrkII_200.000.npy 5 | audioset_lms/balanced_train_segments/--ZhevVpy1s_50.000.npy 6 | audioset_lms/balanced_train_segments/--aO5cdqSAg_30.000.npy 7 | audioset_lms/balanced_train_segments/--PJHxphWEs_30.000.npy 8 | audioset_lms/balanced_train_segments/--ekDLDTUXA_30.000.npy 9 | ** please replace this sample with yours ** 10 | -------------------------------------------------------------------------------- /app/icbhi_sprs/eval_sprs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ $# -lt 2 ]; then 4 | n_iter=3 5 | else 6 | n_iter=$2 7 | fi 8 | 9 | if [ $# -lt 3 ]; then 10 | lr_prm=5e-6 11 | else 12 | lr_prm=$3 13 | fi 14 | 15 | echo Repeating $n_iter times... 16 | 17 | for i in $(seq $n_iter); do 18 | cmdline="CUDA_VISIBLE_DEVICES=0 python app_main.py --dataset SPRS --datapath data/SPRS --method sl --backbone m2d --epochs 50 --bs 64 --weightspath $1 --lr $lr_prm --freeze_embed --split_iter 4" 19 | echo $cmdline 20 | eval $cmdline 21 | done 22 | -------------------------------------------------------------------------------- /speech/bat/asv.sh: -------------------------------------------------------------------------------- 1 | gpu=$1 2 | upmodel=$2 3 | ckpt=$3 4 | lr=5e-5 5 | task=ASV 6 | seed=$4 7 | 8 | parentpath=$(dirname $ckpt) 9 | parent=$(basename $parentpath) 10 | ckptbase=$(basename $ckpt) 11 | ckptstem=${ckptbase%.*} 12 | expbase=$parent-$ckptstem 13 | 14 | expname=$expbase-$task-lr$lr-s$seed 15 | 16 | echo $expname 17 | CUDA_VISIBLE_DEVICES=$gpu python run_downstream.py -m train -n $expname -u $upmodel -d sv_voxceleb1 -o "config.optimizer.lr=$lr" -k $ckpt,-11.070931,4.1807961 --seed $seed 18 | CUDA_VISIBLE_DEVICES=$gpu ./downstream/sv_voxceleb1/test_expdir.sh result/downstream/$expname /lab/data/superb/voxceleb1 19 | -------------------------------------------------------------------------------- /speech/bat/ic.sh: -------------------------------------------------------------------------------- 1 | gpu=$1 2 | upmodel=$2 3 | ckpt=$3 4 | lr=1e-3 5 | task=IC 6 | seed=$4 7 | 8 | parentpath=$(dirname $ckpt) 9 | parent=$(basename $parentpath) 10 | ckptbase=$(basename $ckpt) 11 | ckptstem=${ckptbase%.*} 12 | expbase=$parent-$ckptstem 13 | 14 | expname=$expbase-$task-lr$lr-s$seed 15 | 16 | echo $expname 17 | CUDA_VISIBLE_DEVICES=$gpu python run_downstream.py -m train -n $expname -u $upmodel -d fluent_commands -o "config.optimizer.lr=$lr" -k $ckpt,-13.017439842224121,4.417759895324707 --seed $seed 18 | CUDA_VISIBLE_DEVICES=$gpu python run_downstream.py -m evaluate -e result/downstream/$expname/dev-best.ckpt 19 | -------------------------------------------------------------------------------- /speech/bat/ks.sh: -------------------------------------------------------------------------------- 1 | gpu=$1 2 | upmodel=$2 3 | ckpt=$3 4 | lr=1e-4 5 | task=KS 6 | seed=$4 7 | 8 | parentpath=$(dirname $ckpt) 9 | 
parent=$(basename $parentpath) 10 | ckptbase=$(basename $ckpt) 11 | ckptstem=${ckptbase%.*} 12 | expbase=$parent-$ckptstem 13 | 14 | expname=$expbase-$task-lr$lr-s$seed 15 | 16 | echo $expname 17 | CUDA_VISIBLE_DEVICES=$gpu python run_downstream.py -m train -n $expname -u $upmodel -d speech_commands -o "config.optimizer.lr=$lr" -k $ckpt,-11.506255149841309,4.314857482910156 --seed $seed 18 | CUDA_VISIBLE_DEVICES=$gpu python run_downstream.py -m evaluate -e result/downstream/$expname/dev-best.ckpt 19 | -------------------------------------------------------------------------------- /app/circor/bat/m2d_ftcircor_rand.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | split=$2 4 | n_iter=$3 5 | seed=$4 6 | #lr_prm=0.001 for bs128 7 | lr_prm=0.00025 8 | bs=32 9 | gpu=0 10 | 11 | echo Repeating $n_iter times... 12 | 13 | for i in $(seq $n_iter); do 14 | weight=$1/random 15 | seed=$((seed + 1)) 16 | cmdline="CUDA_VISIBLE_DEVICES=$gpu python circor_eval.py config/m2d.yaml circor$split weight_file=$weight,encoder_only=True,freeze_embed=True --lr=$lr_prm --freq_mask 0 --time_mask 0 --training_mask 0.2 --mixup 0.0 --rrc False --epochs 50 --warmup_epochs 5 --seed $seed --batch_size $bs" 17 | echo $cmdline 18 | eval $cmdline 19 | done 20 | -------------------------------------------------------------------------------- /speech/bat/sid.sh: -------------------------------------------------------------------------------- 1 | gpu=$1 2 | upmodel=$2 3 | ckpt=$3 4 | lr=1e-3 5 | task=SID 6 | seed=$4 7 | 8 | parentpath=$(dirname $ckpt) 9 | parent=$(basename $parentpath) 10 | ckptbase=$(basename $ckpt) 11 | ckptstem=${ckptbase%.*} 12 | expbase=$parent-$ckptstem 13 | 14 | expname=$expbase-$task-lr$lr-s$seed 15 | 16 | echo $expname 17 | CUDA_VISIBLE_DEVICES=$gpu python run_downstream.py -m train -n $expname -u $upmodel -d voxceleb1 -o "config.optimizer.lr=$lr" -k $ckpt,-10.571271,4.3681135 --seed $seed 18 | CUDA_VISIBLE_DEVICES=$gpu python run_downstream.py -m evaluate -n $expname -d voxceleb1 -e result/downstream/$expname/dev-best.ckpt 19 | -------------------------------------------------------------------------------- /speech/bat/pr.sh: -------------------------------------------------------------------------------- 1 | gpu=$1 2 | upmodel=$2 3 | ckpt=$3 4 | lr=1e-3 5 | task=PR 6 | seed=$4 7 | 8 | parentpath=$(dirname $ckpt) 9 | parent=$(basename $parentpath) 10 | ckptbase=$(basename $ckpt) 11 | ckptstem=${ckptbase%.*} 12 | expbase=$parent-$ckptstem 13 | 14 | expname=$expbase-$task-lr$lr-s$seed 15 | 16 | echo $expname 17 | CUDA_VISIBLE_DEVICES=$gpu python run_downstream.py -m train -n $expname -u $upmodel -d ctc -c downstream/ctc/libriphone.yaml -o "config.optimizer.lr=$lr" -k $ckpt,-10.43253231048584,4.241369724273682 --seed $seed 18 | CUDA_VISIBLE_DEVICES=$gpu python run_downstream.py -m evaluate -n $expname -d ctc -e result/downstream/$expname/dev-best.ckpt 19 | -------------------------------------------------------------------------------- /app/circor/bat/m2d_ftcircor.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | split=$2 4 | n_iter=$3 5 | seed=$4 6 | #lr_prm=0.001 for bs128 7 | lr_prm=0.00025 8 | bs=32 9 | gpu=0 10 | 11 | echo Repeating $n_iter times... 
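# Arguments: $1 = folder containing checkpoint-*.pth, $2 = CirCor split (1/2/3), $3 = number of repeats, $4 = base seed, $5.. = checkpoint epochs to evaluate.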
12 | 13 | for i in $(seq $n_iter); do 14 | for w in ${@:5}; do 15 | weight=$1/checkpoint-$w.pth 16 | seed=$((seed + 1)) 17 | cmdline="CUDA_VISIBLE_DEVICES=$gpu python circor_eval.py config/m2d.yaml circor$split weight_file=$weight,encoder_only=True,freeze_embed=True --lr=$lr_prm --freq_mask 0 --time_mask 0 --training_mask 0.0 --mixup 0.0 --rrc False --epochs 50 --warmup_epochs 5 --seed $seed --batch_size $bs" 18 | echo $cmdline 19 | eval $cmdline 20 | done 21 | done 22 | -------------------------------------------------------------------------------- /app/circor/bat/ast_ftcircor.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # CUDA_VISIBLE_DEVICES=0 python circor_eval.py config/ast.yaml circor1 --lr=1e-5 --freq_mask 40 --time_mask 100 --mixup 0.0 --rrc False --epochs 50 --warmup_epochs 5 --seed 7 --batch_size 64 3 | 4 | split=$1 5 | n_iter=$2 6 | seed=$3 7 | lr_prm=0.00003 8 | bs=64 9 | gpu=0 10 | 11 | echo Repeating $n_iter times... 12 | 13 | for i in $(seq $n_iter); do 14 | seed=$((seed + 1)) 15 | cmdline="CUDA_VISIBLE_DEVICES=$gpu python circor_eval.py config/ast.yaml circor$split --lr=$lr_prm --freq_mask 40 --time_mask 100 --mixup 0.0 --rrc False --epochs 50 --warmup_epochs 5 --seed $seed --batch_size $bs" 16 | echo $cmdline 17 | eval $cmdline 18 | done 19 | -------------------------------------------------------------------------------- /app/circor/bat/ast_ftcircor_noaug.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # CUDA_VISIBLE_DEVICES=0 python circor_eval.py config/ast.yaml circor1 --lr=1e-5 --freq_mask 0 --time_mask 0 --mixup 0.0 --rrc False --epochs 50 --warmup_epochs 5 --seed 7 --batch_size 64 3 | 4 | split=$1 5 | n_iter=$2 6 | seed=$3 7 | lr_prm=0.00003 8 | bs=64 9 | gpu=0 10 | 11 | echo Repeating $n_iter times... 12 | 13 | for i in $(seq $n_iter); do 14 | seed=$((seed + 1)) 15 | cmdline="CUDA_VISIBLE_DEVICES=$gpu python circor_eval.py config/ast.yaml circor$split --lr=$lr_prm --freq_mask 0 --time_mask 0 --mixup 0.0 --rrc False --epochs 50 --warmup_epochs 5 --seed $seed --batch_size $bs" 16 | echo $cmdline 17 | eval $cmdline 18 | done 19 | -------------------------------------------------------------------------------- /app/circor/bat/cnn14_ftcircor.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # CUDA_VISIBLE_DEVICES=0 python circor_eval.py config/cnn14.yaml circor1 --lr=1e-5 --freq_mask 20 --time_mask 200 -mixup 0.5 --rrc False --epochs 50 --warmup_epochs 5 --seed 8 --batch_size 256 3 | 4 | split=$1 5 | n_iter=$2 6 | seed=$3 7 | lr_prm=1e-3 8 | bs=256 9 | gpu=0 10 | 11 | echo Repeating $n_iter times... 
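# Arguments: $1 = CirCor split (1/2/3), $2 = number of repeats, $3 = base seed.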
12 | 13 | for i in $(seq $n_iter); do 14 | seed=$((seed + 1)) 15 | cmdline="CUDA_VISIBLE_DEVICES=$gpu python circor_eval.py config/cnn14.yaml circor$split --lr=$lr_prm --freq_mask 20 --time_mask 200 -mixup 0.0 --rrc False --epochs 50 --warmup_epochs 5 --seed $seed --batch_size $bs" 16 | echo $cmdline 17 | eval $cmdline 18 | done 19 | -------------------------------------------------------------------------------- /app/circor/bat/cnn14_ftcircor_noaug.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # CUDA_VISIBLE_DEVICES=0 python circor_eval.py config/cnn14.yaml circor1 --lr=1e-5 --freq_mask 0 --time_mask 0 -mixup 0.0 --rrc False --epochs 50 --warmup_epochs 5 --seed 8 --batch_size 256 3 | 4 | split=$1 5 | n_iter=$2 6 | seed=$3 7 | lr_prm=1e-3 8 | bs=256 9 | gpu=0 10 | 11 | echo Repeating $n_iter times... 12 | 13 | for i in $(seq $n_iter); do 14 | seed=$((seed + 1)) 15 | cmdline="CUDA_VISIBLE_DEVICES=$gpu python circor_eval.py config/cnn14.yaml circor$split --lr=$lr_prm --freq_mask 0 --time_mask 0 -mixup 0.0 --rrc False --epochs 50 --warmup_epochs 5 --seed $seed --batch_size $bs" 16 | echo $cmdline 17 | eval $cmdline 18 | done 19 | -------------------------------------------------------------------------------- /speech/bat/ss.sh: -------------------------------------------------------------------------------- 1 | gpu=$1 2 | upmodel=$2 3 | ckpt=$3 4 | lr=1e-3 5 | task=SS 6 | seed=$4 7 | 8 | parentpath=$(dirname $ckpt) 9 | parent=$(basename $parentpath) 10 | ckptbase=$(basename $ckpt) 11 | ckptstem=${ckptbase%.*} 12 | expbase=$parent-$ckptstem 13 | 14 | expname=$expbase-$task-lr$lr-s$seed 15 | 16 | echo $expname 17 | CUDA_VISIBLE_DEVICES=$gpu python run_downstream.py -m train -n $expname -u $upmodel -d separation_stft2 -o "config.optimizer.lr=$lr" -k $ckpt,-9.58743667602539,4.168412208557129 --seed $seed -c downstream/separation_stft2/configs/cfg.yaml 18 | CUDA_VISIBLE_DEVICES=$gpu python run_downstream.py -m evaluate -n $expname -d separation_stft2 -e result/downstream/$expname/best-states-dev.ckpt 19 | -------------------------------------------------------------------------------- /speech/bat/er.sh: -------------------------------------------------------------------------------- 1 | gpu=$1 2 | upmodel=$2 3 | ckpt=$3 4 | lr=1e-5 5 | task=ER 6 | seed=$4 7 | 8 | parentpath=$(dirname $ckpt) 9 | parent=$(basename $parentpath) 10 | ckptbase=$(basename $ckpt) 11 | ckptstem=${ckptbase%.*} 12 | expbase=$parent-$ckptstem 13 | 14 | for test_fold in fold1 fold2 fold3 fold4 fold5; 15 | do 16 | expname=$expbase-$task-lr$lr-s$seed-$test_fold 17 | echo $expname 18 | CUDA_VISIBLE_DEVICES=$gpu python run_downstream.py -m train -n $expname -u $upmodel -d emotion -c downstream/emotion/config.yaml -o "config.optimizer.lr=$lr,, config.downstream_expert.datarc.test_fold='$test_fold'" -k $ckpt,-13.037399291992188,3.619741439819336 --seed $seed 19 | CUDA_VISIBLE_DEVICES=$gpu python run_downstream.py -m evaluate -e result/downstream/$expname/dev-best.ckpt 20 | done 21 | -------------------------------------------------------------------------------- /util/ft-spc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Usage: 4 | # bash (your m2d)/util/ft-spc.sh <# of iteration> 5 | # 6 | # Example: The parameter `300` will test the checkpoint-300.pth 7 | # cd evar 8 | # bash (your m2d)/util/ft-spc.sh (your m2d)/m2d_vit_base-80x608p16x16-221006-mr7 3 42 300 9 | 10 | n_iter=$2 11 | seed=$3 12 | 13 | echo 
**SPCV2** Repeating $n_iter times... 14 | 15 | for i in $(seq $n_iter); do 16 | for w in ${@:4}; do 17 | weight=$1/checkpoint-$w.pth 18 | seed=$((seed + 1)) 19 | cmdline="python finetune.py config/m2d.yaml spcv2 weight_file=$weight,encoder_only=True --lr=0.5 --freq_mask 30 --time_mask 48 --training_mask 0.5 --mixup 0.3 --rrc True --seed $seed" 20 | echo $cmdline 21 | eval $cmdline 22 | done 23 | done -------------------------------------------------------------------------------- /util/ft-as20k.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Usage: 4 | # bash (your m2d)/util/ft-as20k.sh <# of iteration> 5 | # 6 | # Example: The parameter `300` will test the checkpoint-300.pth 7 | # cd evar 8 | # bash (your m2d)/util/ft-as20k.sh (your m2d)/m2d_vit_base-80x608p16x16-221006-mr7 3 42 300 9 | 10 | n_iter=$2 11 | seed=$3 12 | 13 | echo **AS20K** Repeating $n_iter times... 14 | 15 | for i in $(seq $n_iter); do 16 | for w in ${@:4}; do 17 | weight=$1/checkpoint-$w.pth 18 | seed=$((seed + 1)) 19 | cmdline="python finetune.py config/m2d.yaml as20k weight_file=$weight,encoder_only=True,dur_frames=1001 --lr=0.5 --freq_mask 30 --time_mask 192 --training_mask 0.5 --mixup 0.3 --rrc True --batch_size 64 --seed $seed" 20 | echo $cmdline 21 | eval $cmdline 22 | done 23 | done -------------------------------------------------------------------------------- /util/ft-esc50.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Usage: 4 | # bash (your m2d)/util/ft-esc50.sh <# of iteration> 5 | # 6 | # Example: The parameter `300` will test the checkpoint-300.pth 7 | # cd evar 8 | # bash (your m2d)/util/ft-esc50.sh (your m2d)/m2d_vit_base-80x608p16x16-221006-mr7 3 42 300 9 | 10 | n_iter=$2 11 | seed=$3 12 | 13 | echo **ESC-50** Repeating $n_iter times... 14 | 15 | for i in $(seq $n_iter); do 16 | for w in ${@:4}; do 17 | weight=$1/checkpoint-$w.pth 18 | seed=$((seed + 1)) 19 | cmdline="python finetune.py config/m2d.yaml esc50 weight_file=$weight,encoder_only=True,dur_frames=501,freeze_embed=True --lr=0.5 --freq_mask 15 --time_mask 48 --training_mask 0.5 --mixup 0.0 --rrc True --seed $seed" 20 | echo $cmdline 21 | eval $cmdline 22 | done 23 | done -------------------------------------------------------------------------------- /util/ft-vc1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Usage: 4 | # bash (your m2d)/util/ft-vc1.sh <# of iteration> 5 | # 6 | # Example: The parameter `300` will test the checkpoint-300.pth 7 | # cd evar 8 | # bash (your m2d)/util/ft-vc1.sh (your m2d)/m2d_vit_base-80x608p16x16-221006-mr7 3 42 300 9 | 10 | n_iter=$2 11 | seed=$3 12 | 13 | echo **VC1** Repeating $n_iter times... 
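# Arguments: $1 = checkpoint folder, $2 = number of repeats, $3 = base seed, $4.. = checkpoint epochs to test (e.g., 300 evaluates checkpoint-300.pth).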
14 | 15 | for i in $(seq $n_iter); do 16 | for w in ${@:4}; do 17 | weight=$1/checkpoint-$w.pth 18 | seed=$((seed + 1)) 19 | cmdline="python finetune.py config/m2d.yaml vc1 weight_file=$weight,encoder_only=True,dur_frames=821 --optim adamw --lr=0.0005 --freq_mask 30 --time_mask 48 --training_mask 0.0 --mixup 0.0 --rrc False --epochs 50 --batch_size 64 --seed $seed" 20 | echo $cmdline 21 | eval $cmdline 22 | done 23 | done -------------------------------------------------------------------------------- /app/circor/bat/byola_ftcircor.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # CUDA_VISIBLE_DEVICES=0 python circor_eval.py config/byola.yaml circor1 weight_file=external/byol_a/pretrained_weights/AudioNTT2020-BYOLA-64x96d2048.pth --lr=3e-5 --freq_mask 20 --time_mask 50 -mixup 0.0 --rrc False --epochs 50 --warmup_epochs 5 --seed 7 --batch_size 256 3 | 4 | split=$1 5 | n_iter=$2 6 | seed=$3 7 | lr_prm=0.001 8 | bs=256 9 | gpu=0 10 | 11 | echo Repeating $n_iter times... 12 | 13 | for i in $(seq $n_iter); do 14 | weight="external/byol_a/pretrained_weights/AudioNTT2020-BYOLA-64x96d2048.pth" 15 | seed=$((seed + 1)) 16 | cmdline="CUDA_VISIBLE_DEVICES=$gpu python circor_eval.py config/byola.yaml circor$split weight_file=$weight --lr=$lr_prm --freq_mask 20 --time_mask 50 -mixup 0.0 --rrc False --epochs 50 --warmup_epochs 5 --seed $seed --batch_size $bs" 17 | echo $cmdline 18 | eval $cmdline 19 | done 20 | -------------------------------------------------------------------------------- /app/circor/bat/byola_ftcircor_noaug.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # CUDA_VISIBLE_DEVICES=0 python circor_eval.py config/byola.yaml circor1 weight_file=external/byol_a/pretrained_weights/AudioNTT2020-BYOLA-64x96d2048.pth --lr=3e-5 --freq_mask 0 --time_mask 0 -mixup 0.0 --rrc False --epochs 50 --warmup_epochs 5 --seed 7 --batch_size 256 3 | 4 | split=$1 5 | n_iter=$2 6 | seed=$3 7 | lr_prm=0.001 8 | bs=256 9 | gpu=0 10 | 11 | echo Repeating $n_iter times... 12 | 13 | for i in $(seq $n_iter); do 14 | weight="external/byol_a/pretrained_weights/AudioNTT2020-BYOLA-64x96d2048.pth" 15 | seed=$((seed + 1)) 16 | cmdline="CUDA_VISIBLE_DEVICES=$gpu python circor_eval.py config/byola.yaml circor$split weight_file=$weight --lr=$lr_prm --freq_mask 0 --time_mask 0 -mixup 0.0 --rrc False --epochs 50 --warmup_epochs 5 --seed $seed --batch_size $bs" 17 | echo $cmdline 18 | eval $cmdline 19 | done 20 | -------------------------------------------------------------------------------- /util/to_encoder_only_weight.py: -------------------------------------------------------------------------------- 1 | """A utility for M2D pre-trained weight files. 2 | This script converts an M2D weight to an encoder-only weight, resulting in a much smaller weight (1.6G to 326M). 3 | 4 | Usage: python [this script] [source checkpoint file] [output checkpoint file] 5 | """ 6 | 7 | import torch 8 | from pathlib import Path 9 | import sys 10 | sys.path.append('examples') 11 | from portable_m2d import PortableM2D 12 | 13 | src_file = sys.argv[1] 14 | dest_file = sys.argv[2] 15 | 16 | if not Path(src_file).stem.startswith('checkpoint'): 17 | print(f' **WARNING** Do not use this converter for the fine-tuned weights. HEAD WEIGHTS WILL BE LOST.') 18 | 19 | # Load the weight. All the parameters not used in the encoder-only model will be deleted. 20 | # The parameter `norm_stats` will be added if the weight does not have it. 
i.e., Old weights. 21 | model = PortableM2D(src_file) 22 | 23 | # Save the weights. 24 | Path(dest_file).parent.mkdir(exist_ok=True, parents=True) 25 | torch.save(model.backbone.state_dict(), dest_file) 26 | print(f'Saved {dest_file}.') 27 | -------------------------------------------------------------------------------- /util/ft-as2m.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Usage: 4 | # bash (your m2d)/util/ft-as2m.sh <# of iteration> 5 | # 6 | # Example: The parameter `300` will test the checkpoint-300.pth 7 | # cd evar 8 | # bash (your m2d)/util/ft-as2m.sh (your m2d)/m2d_vit_base-80x608p16x16-221006-mr7 3 42 300 9 | 10 | # CONFIGURE HERE: Set your the same data path as pre-training here 11 | # Fine-tuning on AS2M requires the log-mel spectrogram audio files. 12 | # Prepare data/audioset_lms according to the [Example preprocessing steps (AudioSet)](../data/README.md#example-preprocessing-steps-audioset). 13 | datapath=../data/audioset_lms 14 | 15 | # Fine-tuning steps follow 16 | n_iter=$2 17 | seed=$3 18 | 19 | echo **AS2M** Repeating $n_iter times... 20 | 21 | for i in $(seq $n_iter); do 22 | for w in ${@:4}; do 23 | weight=$1/checkpoint-$w.pth 24 | seed=$((seed + 1)) 25 | cmdline="python finetune.py config/m2d.yaml as weight_file=$weight,encoder_only=True,dur_frames=1001 --lr=2.0 --freq_mask 30 --time_mask 192 --training_mask 0.5 --mixup 0.5 --rrc False --epochs 70 --warmup_epochs 15 --optim lars --batch_size 64 --data_path $datapath --seed $seed" 26 | echo $cmdline 27 | eval $cmdline 28 | done 29 | done -------------------------------------------------------------------------------- /app/icbhi_sprs/cut_data_sprs.py: -------------------------------------------------------------------------------- 1 | """Data cutter. 2 | 3 | Run under the app/icbhi_sprs folder. 
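Converts SPRS dataset samples into log-mel spectrograms and saves them under ../../data/sprsound_lms/{train,val} as 0000.npy-style files (see the convert() defaults below).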
4 | """ 5 | 6 | import sys 7 | import os 8 | import fire 9 | import torch 10 | import torchaudio 11 | import librosa 12 | from pathlib import Path 13 | import pandas as pd 14 | import numpy as np 15 | sys.path.append('../..') 16 | 17 | from dataset import SPRS 18 | from m2d.runtime_audio import RuntimeM2D, Config 19 | 20 | args_device = 'cuda' 21 | args_metalabel ='sa' 22 | args_samplerate = 16000 23 | args_duration = 8 24 | args_pad = 'circular' 25 | 26 | 27 | def convert(to_dir='../../data/sprsound_lms', data_dir='./data/SPRS', metadata_csv='metadata.csv'): 28 | rt = RuntimeM2D(weight_file='m2d_vit_base-80x100p16x4-random') 29 | train_ds = SPRS(data_path=data_dir, metadatafile=metadata_csv, duration=args_duration, split='train', device="cpu", samplerate=args_samplerate, pad_type=args_pad, meta_label=args_metalabel) 30 | val_ds = SPRS(data_path=data_dir, metadatafile=metadata_csv, duration=args_duration, split='inter_test', device="cpu", samplerate=args_samplerate, pad_type=args_pad, meta_label=args_metalabel) 31 | to_dir = Path(to_dir) 32 | 33 | for split, ds in [('val', val_ds), ('train', train_ds)]: 34 | print(split) 35 | to_split = to_dir/split 36 | to_split.mkdir(parents=True, exist_ok=True) 37 | for i in range(len(ds)): 38 | sample, *_ = ds[i] 39 | with torch.no_grad(): 40 | lms = rt.to_feature(sample).cpu().numpy()[0] # 1,1,80,801 -> 1,80,801 41 | np.save(to_split/f'{i:04d}.npy', lms) 42 | print('.', end=' ') 43 | print(i) 44 | 45 | 46 | fire.Fire(convert) 47 | 48 | -------------------------------------------------------------------------------- /app/circor/diff-heart-murmur-detection.patch: -------------------------------------------------------------------------------- 1 | --- org/heart-murmur-detection/ModelEvaluation/evaluate_model.py 2024-01-12 15:29:10.126397375 +0900 2 | +++ /heart-murmur-detection/ModelEvaluation/evaluate_model.py 2023-11-15 16:47:47.351524689 +0900 3 | @@ -59,6 +59,10 @@ 4 | murmur_weighted_accuracy = compute_weighted_accuracy( 5 | murmur_labels, output_labels, murmur_classes 6 | ) # This is the murmur scoring metric. 7 | + 8 | + # UAR 9 | + murmur_uar = murmur_accuracy_classes.mean() 10 | + 11 | murmur_scores = ( 12 | murmur_classes, 13 | murmur_auroc, 14 | @@ -70,6 +74,7 @@ 15 | murmur_accuracy, 16 | murmur_accuracy_classes, 17 | murmur_weighted_accuracy, 18 | + murmur_uar, 19 | ) 20 | 21 | ( 22 | @@ -83,11 +88,12 @@ 23 | accuracy, 24 | accuracy_classes, 25 | weighted_accuracy, 26 | + uar, 27 | ) = murmur_scores 28 | murmur_output_string = ( 29 | - "AUROC,AUPRC,F-measure,Accuracy,Weighted Accuracy" 30 | - "\n{:.3f},{:.3f},{:.3f},{:.3f},{:.3f}\n".format( 31 | - auroc, auprc, f_measure, accuracy, weighted_accuracy 32 | + "AUROC,AUPRC,F-measure,Accuracy,Weighted Accuracy,UAR" 33 | + "\n{:.3f},{:.3f},{:.3f},{:.3f},{:.3f},{:.3f}\n".format( 34 | + auroc, auprc, f_measure, accuracy, weighted_accuracy, uar 35 | ) 36 | ) 37 | murmur_class_output_string = ( 38 | @@ -109,8 +115,10 @@ 39 | + murmur_class_output_string 40 | ) 41 | 42 | + print(output_string) 43 | + 44 | # Return the results. 45 | - return output_string 46 | + return murmur_scores 47 | 48 | 49 | # Find Challenge files. 
50 | -------------------------------------------------------------------------------- /quick_eval.sh: -------------------------------------------------------------------------------- 1 | cd evar 2 | GPU=0 3 | 4 | if [[ "$1" == *'p32k-'* ]]; then 5 | cfg='config/m2d_32k.yaml' 6 | cfg_clap='config/m2d_clap_32k.yaml' 7 | else 8 | cfg='config/m2d.yaml' 9 | cfg_clap='config/m2d_clap.yaml' 10 | fi 11 | 12 | if [[ "$1" == *'_clap'* ]]; then 13 | zs_opt=',flat_features=True' 14 | fi 15 | 16 | CUDA_VISIBLE_DEVICES=$GPU python lineareval.py $cfg cremad batch_size=16,weight_file=$1 17 | CUDA_VISIBLE_DEVICES=$GPU python lineareval.py $cfg gtzan batch_size=16,weight_file=$1 18 | CUDA_VISIBLE_DEVICES=$GPU python lineareval.py $cfg spcv2 batch_size=64,weight_file=$1 19 | CUDA_VISIBLE_DEVICES=$GPU python lineareval.py $cfg esc50 batch_size=64,weight_file=$1 20 | 21 | if [[ "$1" == *'_clap'* ]]; then 22 | echo 'Zero-shot evaluation' 23 | CUDA_VISIBLE_DEVICES=$GPU python zeroshot.py $cfg_clap cremad batch_size=16,weight_file=$1$zs_opt 24 | CUDA_VISIBLE_DEVICES=$GPU python zeroshot.py $cfg_clap gtzan batch_size=16,weight_file=$1$zs_opt 25 | CUDA_VISIBLE_DEVICES=$GPU python zeroshot.py $cfg_clap nsynth batch_size=64,weight_file=$1$zs_opt 26 | CUDA_VISIBLE_DEVICES=$GPU python zeroshot.py $cfg_clap esc50 batch_size=64,weight_file=$1$zs_opt 27 | CUDA_VISIBLE_DEVICES=$GPU python zeroshot.py $cfg_clap us8k batch_size=64,weight_file=$1$zs_opt 28 | CUDA_VISIBLE_DEVICES=$GPU python zeroshot.py $cfg_clap fsd50k batch_size=64,weight_file=$1$zs_opt 29 | CUDA_VISIBLE_DEVICES=$GPU python zeroshot.py $cfg_clap as batch_size=64,weight_file=$1$zs_opt 30 | fi 31 | 32 | if [[ "$1" == *'_clap'* ]]; then 33 | echo 'Audio-text retrieval evaluation' 34 | CUDA_VISIBLE_DEVICES=$GPU python retr_a2t_t2a.py $cfg_clap audiocaps batch_size=64,weight_file=$1$zs_opt 35 | CUDA_VISIBLE_DEVICES=$GPU python retr_a2t_t2a.py $cfg_clap clotho batch_size=64,weight_file=$1$zs_opt 36 | fi 37 | 38 | python summarize.py $1 39 | -------------------------------------------------------------------------------- /superb/upstream/m2d/hubconf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- # 2 | """*********************************************************************************************""" 3 | # FileName [ upstream/m2d/hubconf.py ] 4 | # Synopsis [ the M2D torch hubconf ] 5 | """*********************************************************************************************""" 6 | 7 | import os 8 | 9 | from .expert import UpstreamExpert as _UpstreamExpert 10 | 11 | 12 | def m2d_local(ckpt, model_config=None, *args, **kwargs): 13 | assert os.path.isfile(ckpt) 14 | if model_config is not None: 15 | assert os.path.isfile(model_config) 16 | if 'feature_d' not in kwargs: 17 | kwargs["feature_d"] = None 18 | return _UpstreamExpert(ckpt, model_config, *args, **kwargs) 19 | 20 | 21 | def m2d_calcnorm(refresh=False, *args, **kwargs): 22 | """Upstream model entry for calculating normalization statistics for M2D on Superb. 23 | """ 24 | 25 | if kwargs['ckpt'] is None: 26 | print('Set -i your-checkpoint. Exit now.') 27 | exit(-1) 28 | 29 | kwargs['ckpt'] = kwargs['ckpt'].split(',')[0] 30 | return m2d_local(*args, **kwargs) 31 | 32 | 33 | def m2d(refresh=False, *args, **kwargs): 34 | """Upstream model entry for running M2D on Superb. 35 | Note: 36 | kwargs['ckpt']: "path-name-of-your-ckpt,dataset-mean,dataset-std". 
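Example: "-k your-checkpoint.pth,-11.07,4.18" (illustrative mean/std values; compute your dataset's stats with the m2d_calcnorm entry above).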
37 | """ 38 | 39 | if kwargs['ckpt'] is None: 40 | print('Set "-k your-checkpoint.pth,dataset-mean,dataset-std". Exit now.') 41 | exit(-1) 42 | try: 43 | ckpt, norm_mean, norm_std = kwargs['ckpt'].split(',') 44 | except: 45 | print(f'Confirm your `ckpt`: {kwargs["ckpt"]}') 46 | exit(-1) 47 | 48 | kwargs['ckpt'] = ckpt 49 | norm_mean, norm_std = float(norm_mean), float(norm_std) 50 | print(' using checkpoint:', ckpt) 51 | print(' norm stats:', norm_mean, norm_std) 52 | return m2d_local(*args, norm_mean=norm_mean, norm_std=norm_std, **kwargs) 53 | -------------------------------------------------------------------------------- /app/circor/diff-evar.patch: -------------------------------------------------------------------------------- 1 | diff --git a/evar/ds_tasks.py b/evar/ds_tasks.py 2 | index 14576f2..b717425 100644 3 | --- a/evar/ds_tasks.py 4 | +++ b/evar/ds_tasks.py 5 | @@ -19,6 +19,9 @@ _defs = { 6 | 'voxforge': [1, 5.8, None, False], 7 | 'as20k': [1, 10.0, 'as', False], 8 | 'as': [1, 10.0, 'as', True], 9 | + 'circor1': [1, 5.0, None, False], 10 | + 'circor2': [1, 5.0, None, False], 11 | + 'circor3': [1, 5.0, None, False], 12 | } 13 | 14 | _fs_table = { 15 | diff --git a/finetune.py b/finetune.py 16 | index e196538..a32cf0d 100644 17 | --- a/finetune.py 18 | +++ b/finetune.py 19 | @@ -126,6 +126,18 @@ def loss_bce(logits, gts): 20 | return F.binary_cross_entropy_with_logits(logits, gts) # no need to apply F.sigmoid(logits) 21 | 22 | 23 | +class WeightedCE: 24 | + def __init__(self, labels, device) -> None: 25 | + weights = utils.class_weight.compute_class_weight('balanced', classes=np.unique(labels), y=labels) 26 | + self.celoss = torch.nn.CrossEntropyLoss(weight=torch.tensor(weights).to(device)) 27 | + self.__name__ = f'CrossEntropyLoss(weight={weights})' 28 | + 29 | + def __call__(self, logits, gts): 30 | + preds = F.softmax(logits, dim=-1) 31 | + loss = self.celoss(preds, gts) 32 | + return loss 33 | + 34 | + 35 | def eval_map(y_score, y_true, classes): 36 | average_precision = metrics.average_precision_score( 37 | y_true, y_score, average=None) 38 | @@ -211,8 +223,8 @@ def arg_conf_str(args, defaults={ 39 | 40 | def _train(cfg, ar_model, device, logpath, train_loader, valid_loader, test_loader, multi_label, seed, lr, balanced, verbose): 41 | classes = train_loader.dataset.classes 42 | - 43 | - loss_fn = loss_bce if multi_label else loss_nll 44 | + labels = np.argmax(train_loader.dataset.labels, axis=1) # For app/circor, OH to numbers 45 | + loss_fn = WeightedCE(labels.numpy(), device) # For app/circor, using class-weighted CE loss 46 | eval_fn = eval_map if multi_label else eval_acc 47 | crit_str = 'mAP' if eval_fn == eval_map else 'acc' 48 | optimizer = { 49 | -------------------------------------------------------------------------------- /data/README.md: -------------------------------------------------------------------------------- 1 | ## Pre-training data 2 | 3 | The pre-trainer (e.g., `train_audio.py` for audio) loads data from the `data` folder by default (`--data_path`), using a list of samples in a CSV file `data/files_audioset.csv` by default (`--dataset`). 4 | 5 | The CSV file should have a `file_name` column containing the relative pathname of the files containing a log-mel spectrogram (LMS) audio. 
Example: 6 | 7 | ``` 8 | file_name 9 | audioset_lms/balanced_train_segments/--aE2O5G5WE_0.000.npy 10 | audioset_lms/balanced_train_segments/--cB2ZVjpnA_30.000.npy 11 | audioset_lms/balanced_train_segments/--aaILOrkII_200.000.npy 12 | audioset_lms/balanced_train_segments/--ZhevVpy1s_50.000.npy 13 | audioset_lms/balanced_train_segments/--aO5cdqSAg_30.000.npy 14 | audioset_lms/balanced_train_segments/--PJHxphWEs_30.000.npy 15 | audioset_lms/balanced_train_segments/--ekDLDTUXA_30.000.npy 16 | ``` 17 | 18 | The folders/files should look like the following: 19 | 20 | (Example of the folder structure) 21 | data/ 22 | audioset_lms/ 23 | balanced_train_segments/ 24 | --aE2O5G5WE_0.000.npy 25 | --cB2ZVjpnA_30.000.npy 26 | : 27 | 28 | If you also have pre-processed FSD50K data, the folder will be as follows: 29 | 30 | (Example of the folder structure) 31 | data/ 32 | audioset_lms/ 33 | : 34 | fsd50k_lms/ 35 | FSD50K.dev_audio/ 36 | 2931.npy 37 | 408195.npy 38 | : 39 | 40 | ### Example preprocessing steps (AudioSet) 41 | 42 | If you have downloaded the AudioSet samples and converted them into .wav files in `/your/local/audioset` folder, the following example steps will preprocess and create a new folder, `data/audioset_lms`. 43 | 44 | 1. Convert your pre-training data to LMS using [`wav_to_lms.py`](../wav_to_lms.py). Example: `python wav_to_lms.py /your/local/audioset data/audioset_lms` 45 | 2. Then, make a list of files under your `data` folder. Example follows: 46 | 47 | ```sh 48 | echo file_name > data/files_audioset.csv 49 | (cd data && find audioset_lms -name "*.npy") >> data/files_audioset.csv 50 | ``` 51 | 52 | -------------------------------------------------------------------------------- /audioset/README.md: -------------------------------------------------------------------------------- 1 | # M2D-AS (M2D-X specialized in AudioSet) 2 | 3 | This sub-repository describes the steps to reproduce M2D-AS pre-training from our [following paper](https://ieeexplore.ieee.org/document/10502167): create a metadata file containing labels and run the pre-training. 4 | 5 | ```BibTeX 6 | @article{niizumi2024m2dx, 7 | title = {{Masked Modeling Duo: Towards a Universal Audio Pre-training Framework}}, 8 | author = {Daisuke Niizumi and Daiki Takeuchi and Yasunori Ohishi and Noboru Harada and Kunio Kashino}, 9 | journal = {IEEE/ACM Trans. Audio, Speech, Language Process.}, 10 | year = {2024}, 11 | volume = {32}, 12 | pages = {2391-2406}, 13 | url = {https://ieeexplore.ieee.org/document/10502167}, 14 | doi = {10.1109/TASLP.2024.3389636}} 15 | ``` 16 | 17 | ## 1. Creating a metadata file 18 | 19 | 1. Make a list of AudioSet files as "data/files_audioset.csv" for the M2D pre-training by following "Example preprocessing steps (AudioSet)" in data/README. 20 | 2. In the M2D folder, create "data/files_as_weighted.csv" containing both sample path and labels (and also sample weights) as follows. 21 | 22 | python util/make_as_weighted_list.py 23 | 24 | You should have a file `data/files_as_weighted.csv`. 25 | 26 | ## 2. Conduct M2D-AS pre-training 27 | 28 | The exact pre-training command line we used is as follows: 29 | 30 | ```shell 31 | OMP_NUM_THREADS=1 torchrun --nproc_per_node=4 -m audioset.train_as --input_size 80x608 --patch_size 16x16 --epochs 300 --batch_size 512 --accum_iter 1 --save_freq 50 --seed 3 --loss_off 1. 32 | ``` 33 | 34 | It requires 4x 48 GB GPU (for about two days), and the following should allow pre-training with 4x 24 GB GPU (3090Ti) within a week. 
35 | 36 | ```shell 37 | OMP_NUM_THREADS=1 torchrun --nproc_per_node=4 -m audioset.train_as --input_size 80x608 --patch_size 16x16 --epochs 300 --batch_size 256 --accum_iter 2 --save_freq 50 --seed 3 --loss_off 1. 38 | ``` 39 | 40 | ## Results on the paper 41 | 42 |
43 | Table-V-M2D-AS-le 44 |
45 | 46 |
47 | Tale-VI-M2D-AS-ft 48 |
49 | 50 | -------------------------------------------------------------------------------- /all_eval.sh: -------------------------------------------------------------------------------- 1 | cd evar 2 | GPU=0 3 | NAME=$(basename $(dirname "$1"))/$(basename "$1") 4 | echo Processing $NAME 5 | 6 | if [[ "$1" == *'p32k'* ]]; then 7 | cfg='config/m2d_32k.yaml' 8 | cfg_clap='config/m2d_clap_32k.yaml' 9 | else 10 | cfg='config/m2d.yaml' 11 | cfg_clap='config/m2d_clap.yaml' 12 | fi 13 | 14 | if [[ $NAME == *'_clap'*'/checkpoint'* ]]; then 15 | zs_opt=',flat_features=True' 16 | fi 17 | 18 | CUDA_VISIBLE_DEVICES=$GPU python 2pass_lineareval.py $cfg cremad batch_size=16,weight_file=$1 19 | CUDA_VISIBLE_DEVICES=$GPU python 2pass_lineareval.py $cfg gtzan batch_size=16,weight_file=$1 20 | CUDA_VISIBLE_DEVICES=$GPU python 2pass_lineareval.py $cfg spcv2 batch_size=64,weight_file=$1 21 | CUDA_VISIBLE_DEVICES=$GPU python 2pass_lineareval.py $cfg esc50 batch_size=64,weight_file=$1 22 | CUDA_VISIBLE_DEVICES=$GPU python 2pass_lineareval.py $cfg us8k batch_size=64,weight_file=$1 23 | CUDA_VISIBLE_DEVICES=$GPU python 2pass_lineareval.py $cfg vc1 batch_size=64,weight_file=$1 24 | CUDA_VISIBLE_DEVICES=$GPU python 2pass_lineareval.py $cfg voxforge batch_size=64,weight_file=$1 25 | CUDA_VISIBLE_DEVICES=$GPU python 2pass_lineareval.py $cfg nsynth batch_size=64,weight_file=$1 26 | CUDA_VISIBLE_DEVICES=$GPU python 2pass_lineareval.py $cfg surge batch_size=64,weight_file=$1 27 | 28 | if [[ $NAME == *'_clap'*'/checkpoint'* ]]; then 29 | echo 'Zero-shot evaluation' 30 | CUDA_VISIBLE_DEVICES=$GPU python zeroshot.py $cfg_clap cremad batch_size=16,weight_file=$1$zs_opt 31 | CUDA_VISIBLE_DEVICES=$GPU python zeroshot.py $cfg_clap gtzan batch_size=16,weight_file=$1$zs_opt 32 | CUDA_VISIBLE_DEVICES=$GPU python zeroshot.py $cfg_clap nsynth batch_size=64,weight_file=$1$zs_opt 33 | CUDA_VISIBLE_DEVICES=$GPU python zeroshot.py $cfg_clap esc50 batch_size=64,weight_file=$1$zs_opt 34 | CUDA_VISIBLE_DEVICES=$GPU python zeroshot.py $cfg_clap us8k batch_size=64,weight_file=$1$zs_opt 35 | CUDA_VISIBLE_DEVICES=$GPU python zeroshot.py $cfg_clap fsd50k batch_size=64,weight_file=$1$zs_opt 36 | CUDA_VISIBLE_DEVICES=$GPU python zeroshot.py $cfg_clap as batch_size=64,weight_file=$1$zs_opt 37 | fi 38 | 39 | if [[ $NAME == *'_clap'*'/checkpoint'* ]]; then 40 | echo 'Audio-text retrieval evaluation' 41 | CUDA_VISIBLE_DEVICES=$GPU python retr_a2t_t2a.py $cfg_clap audiocaps batch_size=64,weight_file=$1$zs_opt 42 | CUDA_VISIBLE_DEVICES=$GPU python retr_a2t_t2a.py $cfg_clap clotho batch_size=64,weight_file=$1$zs_opt 43 | fi 44 | 45 | python summarize.py $1 46 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | slurm*.out 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | pip-wheel-metadata/ 26 | share/python-wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .nox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | *.py,cover 53 | .hypothesis/ 54 | .pytest_cache/ 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | local_settings.py 63 | db.sqlite3 64 | db.sqlite3-journal 65 | 66 | # Flask stuff: 67 | instance/ 68 | .webassets-cache 69 | 70 | # Scrapy stuff: 71 | .scrapy 72 | 73 | # Sphinx documentation 74 | docs/_build/ 75 | 76 | # PyBuilder 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | .python-version 88 | 89 | # pipenv 90 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 91 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 92 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 93 | # install all needed dependencies. 94 | #Pipfile.lock 95 | 96 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 97 | __pypackages__/ 98 | 99 | # Celery stuff 100 | celerybeat-schedule 101 | celerybeat.pid 102 | 103 | # SageMath parsed files 104 | *.sage.py 105 | 106 | # Environments 107 | .env 108 | .venv 109 | env/ 110 | venv/ 111 | ENV/ 112 | env.bak/ 113 | venv.bak/ 114 | 115 | # Spyder project settings 116 | .spyderproject 117 | .spyproject 118 | 119 | # Rope project settings 120 | .ropeproject 121 | 122 | # mkdocs documentation 123 | /site 124 | 125 | # mypy 126 | .mypy_cache/ 127 | .dmypy.json 128 | dmypy.json 129 | 130 | # Pyre type checker 131 | .pyre/ 132 | -------------------------------------------------------------------------------- /common.py: -------------------------------------------------------------------------------- 1 | # Masked Modeling Duo (M2D) 2 | 3 | import datetime 4 | import hashlib 5 | import sys 6 | import re 7 | 8 | 9 | class PrintLogger(object): 10 | def __init__(self, logfile): 11 | self.stdout = sys.stdout 12 | self.log = open(logfile, 'a') 13 | sys.stdout = self 14 | 15 | def write(self, message): 16 | self.stdout.write(message) 17 | self.log.write(message) 18 | 19 | def flush(self): 20 | self.stdout.flush() 21 | 22 | 23 | def get_timestamp(): 24 | """ex) Outputs 202104220830""" 25 | return datetime.datetime.now().strftime('%y%m%d%H%M') 26 | 27 | 28 | def hash_text(text, L=128): 29 | hashed = hashlib.shake_128(text.encode()).hexdigest(L//2 + 1) 30 | return hashed[:L] 31 | 32 | 33 | def short_model_desc(model, head_len=5, tail_len=1): 34 | text = repr(model).split('\n') 35 | text = text[:head_len] + [' :'] + (text[-tail_len:] if tail_len > 0 else ['']) 36 | return '\n'.join(text) 37 | 38 | 39 | def prmstr_z(p): 40 | return str(p).replace('.0', '').replace('0.', '.') 41 | 42 | def prmstr_zz(prm): 43 | ps = [prmstr_z(p) for p in prm] 44 | return '-'.join(ps) 45 | 46 | 47 | conf_defaults={ 48 | 'dataset': ('data/files_audioset.csv', 'D', 'path'), 49 | 'ema_decay_init': (0.99995, 'ema', 'z'), 50 | 'ema_decay': (0.99999, 'ed', 'z'), 51 | 'decoder_depth': (8, 'dd', 'asis'), 52 | 'mask_ratio': (0.7, 'mr', 'z'), 53 | 'seed': (0, 's', 'asis'), 54 | 'norm_pix_loss': (True, '~N', 'b'), 55 | 'loss_fn': ('norm_mse', 'L', 'head'), 56 | 'optim': ('adamw', 'O', 'asis'), 57 | 'blr': (3e-4, 'blr', 'z'), 58 | 'lr': (None, 'lr', 'z'), 59 | 
'eff_batch_size': (2048, 'bs', 'asis'), 60 | 'accum_iter': (1, 'a', 'asis'), 61 | } 62 | 63 | 64 | def arg_conf_str(args, defaults=conf_defaults): 65 | confstr = '' 66 | for k in defaults: 67 | try: 68 | arg_value = eval('args.' + k) 69 | except: 70 | continue # no parameter k for the run. 71 | if arg_value == defaults[k][0]: 72 | continue 73 | arg_key, value_format = defaults[k][1:] 74 | value = str(arg_value) 75 | if value_format == 'z': 76 | value = prmstr_z(arg_value) 77 | elif value_format == 'zz': 78 | value = prmstr_zz(arg_value) 79 | elif value_format == 'b': 80 | value = '' # nothing to add 81 | elif value_format == 'head': 82 | value = value[:1] 83 | elif value_format == 'head_': 84 | value = ''.join([v[:1] for v in value.split('_')]) 85 | elif value_format == 'path': 86 | value = ''.join([v[:1] for v in re.split(r'_|/', value)]) 87 | confstr += arg_key + value 88 | return confstr 89 | -------------------------------------------------------------------------------- /examples/Example_1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Short example" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import warnings; warnings.simplefilter('ignore')\n", 17 | "import logging\n", 18 | "logging.basicConfig(level=logging.INFO)\n", 19 | "import sys\n", 20 | "sys.path.append('..')\n", 21 | "import torch" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 2, 27 | "metadata": {}, 28 | "outputs": [ 29 | { 30 | "name": "stderr", 31 | "output_type": "stream", 32 | "text": [ 33 | "INFO:root:\n", 34 | "INFO:root:Model input size: [80, 608]\n", 35 | "INFO:root:Using weights: m2d_vit_base-80x608p16x16-220930-mr7/checkpoint-300.pth\n", 36 | "INFO:root:Feature dimension: 3840\n", 37 | "INFO:root:Norm stats: -7.1, 4.2\n", 38 | "INFO:root:Runtime MelSpectrogram(16000, 400, 400, 160, 80, 50, 8000):\n", 39 | "INFO:root:MelSpectrogram(\n", 40 | " Mel filter banks size = (80, 201), trainable_mel=False\n", 41 | " (stft): STFT(n_fft=400, Fourier Kernel size=(201, 1, 400), iSTFT=False, trainable=False)\n", 42 | ")\n" 43 | ] 44 | }, 45 | { 46 | "name": "stdout", 47 | "output_type": "stream", 48 | "text": [ 49 | " using 150 parameters, while dropped 250 out of 400 parameters from m2d_vit_base-80x608p16x16-220930-mr7/checkpoint-300.pth\n", 50 | " (dropped: ['mask_token', 'decoder_pos_embed', 'decoder_embed.weight', 'decoder_embed.bias', 'decoder_blocks.0.norm1.weight'] ...)\n", 51 | "\n" 52 | ] 53 | } 54 | ], 55 | "source": [ 56 | "from portable_m2d import PortableM2D\n", 57 | "weight = 'm2d_vit_base-80x608p16x16-220930-mr7/checkpoint-300.pth'\n", 58 | "model = PortableM2D(weight_file=weight)\n" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 3, 64 | "metadata": {}, 65 | "outputs": [ 66 | { 67 | "name": "stdout", 68 | "output_type": "stream", 69 | "text": [ 70 | "torch.Size([1, 63, 3840])\n" 71 | ] 72 | } 73 | ], 74 | "source": [ 75 | "# A single sample of random waveform\n", 76 | "wav = torch.rand(1, 16000 * 10)\n", 77 | "\n", 78 | "# Encode with M2D\n", 79 | "with torch.no_grad():\n", 80 | " embeddings = model(wav)\n", 81 | "\n", 82 | "# The output embeddings has a shape of [Batch, Frame, Dimension]\n", 83 | "print(embeddings.shape) # --> torch.Size([1, 63, 3840])" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | 
"metadata": {}, 90 | "outputs": [], 91 | "source": [] 92 | } 93 | ], 94 | "metadata": { 95 | "kernelspec": { 96 | "display_name": "ar", 97 | "language": "python", 98 | "name": "python3" 99 | }, 100 | "language_info": { 101 | "codemirror_mode": { 102 | "name": "ipython", 103 | "version": 3 104 | }, 105 | "file_extension": ".py", 106 | "mimetype": "text/x-python", 107 | "name": "python", 108 | "nbconvert_exporter": "python", 109 | "pygments_lexer": "ipython3", 110 | "version": "3.9.18" 111 | } 112 | }, 113 | "nbformat": 4, 114 | "nbformat_minor": 2 115 | } 116 | -------------------------------------------------------------------------------- /speech/extract_offline_ls960.py: -------------------------------------------------------------------------------- 1 | """Offline Teacher Model Feature Extractor for M2D-S 2 | 3 | Masked Modeling Duo for Speech: Specializing General-Purpose Audio Representation to Speech using Denoising Distillation 4 | https://arxiv.org/abs/2305.14079 5 | 6 | This script prepares offline features obtained from a teacher model. An example follows: 7 | 8 | (cd to the root folder of your M2D copy) 9 | python speech/extract_offline_ls960.py /path/to/LibriSpeech 10 | 11 | This example will create `data/ls960_hybrid7s_hubaseL9` and `data/files_ls960_hybrid.csv`. 12 | 13 | ## Data file details 14 | 15 | `data/ls960_hybrid7s_hubaseL9` will have converted files in .npz format. Each .npz file consists of three contents: 16 | 17 | - arr_0: Log-mel spectrogram converted from the raw wave. The speech shorter than 7 seconds will be padded with zeros. 18 | - arr_1: Features (hidden_states) extracted from the teacher model. 19 | - arr_2: The length of the original hidden states excluding paddings. 20 | 21 | Find the details for how these contents are used in SpeechHybridDataset class in speech/speech_dataset.py. 22 | """ 23 | 24 | import sys 25 | import numpy as np 26 | import pandas as pd 27 | from pathlib import Path 28 | import torch 29 | import librosa 30 | import fire 31 | from transformers import Wav2Vec2Processor, HubertModel 32 | from tqdm import tqdm 33 | 34 | sys.path.append('.') # for running under your `m2d` folder to find wav_to_lms 35 | from wav_to_lms import ToLogMelSpec, FFT_parameters 36 | 37 | 38 | def prepare_ls960(src, dest='data/ls960_hybrid7s_hubaseL9', dest_csv='data/files_ls960_hybrid.csv', min_seconds=7): 39 | """ 40 | Args: 41 | src: Source LibriSpeech 960h dataset folder. 42 | dest: Destination folder to store .npz files. 43 | dest_csv: The name of the output CSV file listing the .npz file names. 
44 | """ 45 | 46 | dest = Path(dest) 47 | src = Path(src) 48 | files = sorted(src.rglob('train*/**/*.flac')) 49 | min_samples = 16000 * min_seconds 50 | 51 | # Teacher model 52 | output_layers = [9] 53 | device = 'cuda' 54 | processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h") 55 | model = HubertModel.from_pretrained("facebook/hubert-base-ls960") 56 | model.eval() 57 | model.to(device) 58 | 59 | # Spectrogram converter (M2D default) 60 | to_lms = ToLogMelSpec(FFT_parameters()) 61 | 62 | # Extract LS960 features from the teacher 63 | print(f'Processing {len(files)} files..') 64 | csv_rel_paths = [] 65 | for i, f in tqdm(enumerate(files)): 66 | wav, sr = librosa.load(f, mono=True, sr=FFT_parameters.sample_rate) 67 | org_wav_len = len(wav) 68 | 69 | # pad if short 70 | if min_samples is not None: 71 | if wav.shape[-1] < min_samples: 72 | wav = np.pad(wav, (0, min_samples - wav.shape[-1])) 73 | 74 | lms = to_lms(wav).numpy() 75 | wav = torch.tensor(wav).unsqueeze(0) 76 | 77 | preprocessed = processor(wav, return_tensors="pt", sampling_rate=16000).input_values # Batch size 1 78 | preprocessed = preprocessed[0].to(device) # [1, B, raw wave length] -> [B, raw wave length] 79 | with torch.no_grad(): 80 | hidden_states = model(preprocessed, output_hidden_states=True).hidden_states# list of [B, T, D] 81 | # stack layer outputs 82 | states_to_stack = [hidden_states[index] for index in output_layers] if output_layers else hidden_states 83 | hidden_states = torch.cat(states_to_stack, axis=-1).cpu().numpy() 84 | 85 | rel_path = str(f.relative_to(src)).replace('.flac', '.npz') 86 | csv_rel_paths.append(str(dest.relative_to('data')/rel_path)) 87 | newname = dest/rel_path 88 | newname.parent.mkdir(parents=True, exist_ok=True) 89 | 90 | org_hidden_len = (hidden_states.shape[1] * org_wav_len) // wav.shape[-1] 91 | 92 | np.savez(newname, lms, hidden_states, org_hidden_len) # arr_0: lms, arr_1: hidden_states, arr_2: original hidden states length 93 | if (i + 1) % 100 == 0: 94 | print(i, f'{i/len(files)*100:.3f}%', newname, lms.shape, hidden_states.shape, org_hidden_len) 95 | 96 | pd.DataFrame({'file_name': csv_rel_paths}).to_csv(dest_csv, index=None) 97 | print('Done.') 98 | 99 | 100 | if __name__ == '__main__': 101 | fire.Fire(prepare_ls960) 102 | -------------------------------------------------------------------------------- /wav_to_lms.py: -------------------------------------------------------------------------------- 1 | """Wave to log-mel spectrogram (LMS) audio file converter. 2 | 3 | This program converts the original audio files recursively found in the source folder, 4 | then stores them in the destination folder while holding the same relative path structure. 5 | 6 | The conversion includes the following processes: 7 | - Stereo to mono 8 | - Resampling to a sampling rate 9 | - Converting to a log-mel spectrogram 10 | 11 | Example: 12 | python wav_to_lms.py /your/local/fsd50k/FSD50K.dev_audio /your/msm_mae/fsd50kdev_lms 13 | """ 14 | 15 | import numpy as np 16 | from pathlib import Path 17 | import librosa 18 | from multiprocessing import Pool 19 | import torch.multiprocessing as mp 20 | import torch 21 | import fire 22 | from tqdm import tqdm 23 | import nnAudio.features 24 | import warnings 25 | warnings.simplefilter('ignore') 26 | 27 | 28 | class FFT_parameters: 29 | # We extract log-mel spectrograms with 80 features using a window size of 25 ms and a stride of 10 ms from a waveform sampled at 16kHz. 
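# At 16 kHz, the 400-sample window and 160-sample hop below correspond to 25 ms and 10 ms.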
30 | sample_rate = 16000 31 | window_size = 400 32 | n_fft = 400 33 | hop_size = 160 34 | n_mels = 80 35 | f_min = 50 36 | f_max = 8000 37 | 38 | 39 | def _converter_worker(args): 40 | subpathname, from_dir, to_dir, prms, to_lms, suffix, min_length, max_length, verbose = args 41 | from_dir, to_dir = Path(from_dir), Path(to_dir) 42 | to_name = to_dir/(subpathname[:-len(suffix)]+'.npy') 43 | 44 | if to_name.exists(): 45 | print('already exist', subpathname) 46 | return '' 47 | 48 | # load and convert to a log-mel spectrogram 49 | try: 50 | wav, org_sr = librosa.load(str(from_dir/subpathname), mono=True, sr=prms.sample_rate) 51 | 52 | # pad if short 53 | if min_length is not None: 54 | min_length = int(FFT_parameters.sample_rate * min_length) 55 | if wav.shape[-1] < min_length: 56 | print('from', wav.shape) 57 | wav = np.pad(wav, (0, min_length - wav.shape[-1])) 58 | print('to', wav.shape) 59 | 60 | if max_length is not None: 61 | max_length = int(FFT_parameters.sample_rate * max_length) 62 | if max_length < wav.shape[-1]: 63 | print('from', wav.shape) 64 | wav = wav[:max_length] 65 | print('to', wav.shape) 66 | 67 | lms = to_lms(wav) 68 | except Exception as e: 69 | print('ERROR failed to open or convert', subpathname, '-', str(e)) 70 | return '' 71 | 72 | to_name.parent.mkdir(parents=True, exist_ok=True) 73 | np.save(to_name, lms) 74 | 75 | if verbose: 76 | print(from_dir, '->', to_name, lms.shape) 77 | 78 | return to_name.name 79 | 80 | 81 | class ToLogMelSpec: 82 | def __init__(self, cfg): 83 | # Spectrogram extractor 84 | self.cfg = cfg 85 | self.to_spec = nnAudio.features.MelSpectrogram( 86 | sr=cfg.sample_rate, 87 | n_fft=cfg.n_fft, 88 | win_length=cfg.window_size, 89 | hop_length=cfg.hop_size, 90 | n_mels=cfg.n_mels, 91 | fmin=cfg.f_min, 92 | fmax=cfg.f_max, 93 | center=True, 94 | power=2, 95 | verbose=False, 96 | ) 97 | 98 | def __call__(self, audio): 99 | x = self.to_spec(torch.tensor(audio)) 100 | x = (x + torch.finfo().eps).log() 101 | return x 102 | 103 | 104 | def convert_wav(from_dir, to_dir, suffix='.wav', skip=0, min_length=6.1, max_length=30.0, verbose=False) -> None: 105 | from_dir = str(from_dir) 106 | files = [str(f).replace(from_dir, '') for f in Path(from_dir).glob(f'**/*{suffix}')] 107 | files = [f[1:] if f[0] == '/' else f for f in files] 108 | files = sorted(files) 109 | if skip > 0: 110 | files = files[skip:] 111 | 112 | prms = FFT_parameters() 113 | to_lms = ToLogMelSpec(prms) 114 | 115 | print(f'Processing {len(files)} {suffix} files at a sampling rate of {prms.sample_rate} Hz...') 116 | assert len(files) > 0 117 | 118 | with Pool() as p: 119 | args = [[f, from_dir, to_dir, prms, to_lms, suffix, min_length, max_length, verbose] for f in files] 120 | shapes = list(tqdm(p.imap(_converter_worker, args), total=len(args))) 121 | 122 | print('finished.') 123 | 124 | 125 | if __name__ == "__main__": 126 | mp.set_start_method('spawn', force=True) 127 | fire.Fire(convert_wav) 128 | -------------------------------------------------------------------------------- /clap/README.md: -------------------------------------------------------------------------------- 1 | # M2D-CLAP: Masked Modeling Duo Meets CLAP for Learning General-purpose Audio-Language Representation 2 | 3 |
4 | image_figure2 5 |
6 | 
7 | This sub-repository provides the code for our M2D-CLAP papers, including the setup procedure for the training caption data and the pre-training steps.
8 | 
9 | ```bibtex
10 | @article{niizumi2025m2d-clap,
11 |     author = {Niizumi, Daisuke and Takeuchi, Daiki and Yasuda, Masahiro and Nguyen, Binh Thien and Ohishi, Yasunori and Harada, Noboru},
12 |     journal = {IEEE Access},
13 |     title = {M2D-CLAP: Exploring General-purpose Audio-Language Representations Beyond CLAP},
14 |     year = {2025},
15 |     pages = {1-1},
16 |     doi={10.1109/ACCESS.2025.3611348}}
17 | 
18 | @inproceedings{niizumi2024m2d-clap,
19 |     title = {{M2D-CLAP: Masked Modeling Duo Meets CLAP for Learning General-purpose Audio-Language Representation}},
20 |     author = {Daisuke Niizumi and Daiki Takeuchi and Yasunori Ohishi and Noboru Harada and Masahiro Yasuda and Shunsuke Tsubaki and Keisuke Imoto},
21 |     booktitle={Interspeech},
22 |     year = {2024},
23 |     pages = {57--61},
24 |     doi = {10.21437/Interspeech.2024-29}}
25 | ```
26 | 
27 | ## 1. Setup
28 | 
29 | Our implementation does not convert caption texts into sentence (semantic) embeddings on the fly. Instead, we convert them into embeddings in advance (offline) in steps 2 and 3 below.
30 | 
31 | 1. Prepare for the M2D pre-training on AudioSet by following [3. Pre-training From Scratch](../README.md#3-pre-training-from-scratch).
32 |     - In particular, configure data/audioset_lms according to the [Example preprocessing steps (AudioSet)](../data/README.md#example-preprocessing-steps-audioset).
33 | 2. Run `Note-AutoACD-GTEbase.ipynb` to create `data/capemb_GTEbase_Audo_A_C_D.npy` for [Auto-ACD](https://auto-acd.github.io/) captions.
34 | 3. Run `Note-ACalt4_GTEbase.ipynb` to create `data/capemb_GTEbase_AC_alt_4.npy` for [AudioCaps Alternative 4 Captions (ACalt4)](https://github.com/KeisukeImoto/ACalt4).
35 | 
36 | In summary, the following data should be ready:
37 | 
38 | - `data/audioset_lms` -- The log-mel spectrogram audio samples (many .npy files).
39 | - `data/files_audioset.csv` -- The list of the samples in `data/audioset_lms`.
40 | - `data/capemb_GTEbase_Audo_A_C_D.npy` -- The caption embeddings of Auto-ACD.
41 | - `data/capemb_GTEbase_AC_alt_4.npy` -- The caption embeddings of ACalt4.
42 | 
43 | ## 2. Pre-training
44 | 
45 | The exact pre-training command line we used is as follows:
46 | 
47 | ```shell
48 | OMP_NUM_THREADS=1 torchrun --nproc_per_node=4 -m semantics.train_clap --input_size 80x608 --patch_size 16x16 --epochs 300 --batch_size 512 --accum_iter 1 --save_freq 50 --seed 3 --model m2d_clap_vit_base --file_caption data/capemb_GTEbase_Audo_A_C_D.npy,data/capemb_GTEbase_AC_alt_4.npy --loss_off .01
49 | ```
50 | 
51 | ## 3. Evaluation
52 | 
53 | Quick example: [examples/Example_4_CLAP2025.ipynb](../examples/Example_4_CLAP2025.ipynb).
54 | 
55 | The evaluation steps follow the [original M2D](../README.md#2-evaluating-m2d).
56 | 
57 | For the zero-shot evaluation, refer to [../all_eval.sh](../all_eval.sh), which contains the exact command lines used for the paper.
58 | 
59 | ## AudioCaps Alternative 4 Captions (ACalt4)
60 | 
61 | Refer to the repository [ACalt4](https://github.com/KeisukeImoto/ACalt4) for the details.
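As a quick check that the files listed in the Setup section were produced correctly, the following minimal sketch (not part of the repository; it assumes only that the files are NumPy archives at the paths listed above) loads each caption-embedding file and reports what it contains:

```python
import numpy as np

# Hypothetical sanity check for the caption embeddings prepared in the Setup section.
for path in ['data/capemb_GTEbase_Audo_A_C_D.npy', 'data/capemb_GTEbase_AC_alt_4.npy']:
    emb = np.load(path, allow_pickle=True)  # allow_pickle also covers dict/object payloads
    print(path, type(emb), getattr(emb, 'shape', None))
```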
62 | 63 | ## Examples 64 | 65 | | Description | Notebook | 66 | |:------------|:---------| 67 | | Zero-shot ESC-50 classification with M2D-CLAP | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg) examples/Colab_M2D-CLAP_ESC-50_ZS.ipynb](http://colab.research.google.com/github/nttcslab/m2d/blob/master/examples/Colab_M2D-CLAP_ESC-50_ZS.ipynb) | 68 | | Audio feature visualization example with M2D-CLAP | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg) examples/Colab_M2D-CLAP_ESC-50_VizualizeEmbs.ipynb](http://colab.research.google.com/github/nttcslab/m2d/blob/master/examples/Colab_M2D-CLAP_ESC-50_VizualizeEmbs.ipynb) | 69 | 70 | ### t-SNE visualization of ESC-10 samples 71 | 72 | The t-SNE visualization of the audio embeddings encoded by M2D-CLAP. The conventional audio embeddings are the output of the audio encoder for transfer learning. The CLAP audio embeddings are the output of the audio projector for ZS inference. 73 | 74 |
75 | image-ESC10-Viz 76 |
77 | 78 | ## Results on the paper 79 | 80 |
81 | image_Table3_CLAP_LE 82 |
83 | 84 |
85 | image_Table4_CLAP_FT 86 |
87 | 88 |
89 | image_Table5_CLAP_ZS 90 |
91 | -------------------------------------------------------------------------------- /util/make_as_weighted_list.py: -------------------------------------------------------------------------------- 1 | """AudioSet metadata maker for M2D-AS 2 | 3 | This utility requires `data/files_audioset.csv` as input. 4 | Before you begin, make the list of AudioSet files as "data/files_audioset.csv" for the M2D pre-training by following "Example preprocessing steps (AudioSet)" in data/README. 5 | 6 | In the M2D folder, you can create "data/files_as_weighted.csv" containing both sample path and labels (and also sample weights) with the following. 7 | 8 | python util/make_as_weighted_list.py 9 | 10 | """ 11 | 12 | from re import U 13 | import urllib.request 14 | from pathlib import Path 15 | import pandas as pd 16 | import numpy as np 17 | import csv 18 | import fire 19 | 20 | 21 | def download_segment_csv(): 22 | EVAL_URL = 'http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/eval_segments.csv' 23 | BALANCED_TRAIN_URL = 'http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/balanced_train_segments.csv' 24 | UNBALANCED_TRAIN_URL = 'http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/unbalanced_train_segments.csv' 25 | CLASS_LABEL_URL = 'http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/class_labels_indices.csv' 26 | 27 | for subset_url in [EVAL_URL, BALANCED_TRAIN_URL, UNBALANCED_TRAIN_URL, CLASS_LABEL_URL]: 28 | subset_path = '/tmp/' + Path(subset_url).name 29 | if Path(subset_path).is_file(): 30 | continue 31 | with open(subset_path, 'w') as f: 32 | subset_data = urllib.request.urlopen(subset_url).read().decode() 33 | f.write(subset_data) 34 | print('Wrote', subset_path) 35 | 36 | 37 | def gen_weight(df, label_file): 38 | # Following AudioMAE https://github.com/facebookresearch/AudioMAE/blob/main/dataset/audioset/gen_weight.py 39 | 40 | def make_index_dict(label_csv): 41 | index_lookup = {} 42 | with open(label_csv, 'r') as f: 43 | csv_reader = csv.DictReader(f) 44 | line_count = 0 45 | for row in csv_reader: 46 | index_lookup[row['mid']] = row['index'] 47 | line_count += 1 48 | return index_lookup 49 | 50 | index_dict = make_index_dict(label_file) 51 | label_count = np.zeros(527) 52 | 53 | for sample in df.label.values: 54 | sample_labels = sample.split(',') 55 | for label in sample_labels: 56 | label_idx = int(index_dict[label]) 57 | label_count[label_idx] = label_count[label_idx] + 1 58 | 59 | label_weight = 1000.0 / (label_count + 100) 60 | 61 | sample_weight = np.zeros(len(df)) 62 | for i, sample in enumerate(df.label.values): 63 | sample_labels = sample.split(',') 64 | for label in sample_labels: 65 | label_idx = int(index_dict[label]) 66 | # summing up the weight of all appeared classes in the sample, note audioset is multiple-label classification 67 | sample_weight[i] += label_weight[label_idx] 68 | sample_weight = np.power(sample_weight, 1.0/1.5) # making the weights softer 69 | df['weight'] = sample_weight 70 | return df 71 | 72 | 73 | def make_metadata(org_list='data/files_audioset.csv', to_list='data/files_as_weighted.csv'): 74 | # download the original metadata. 75 | download_segment_csv() 76 | 77 | # load label maps. 
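    # Note: each AudioSet segments CSV begins with two comment lines followed by a '# YTID, start_seconds, end_seconds, positive_labels'
    # header row; skiprows=2 keeps that header row, and sep=', ' matches the comma-plus-space field separator used in these files.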
78 | e_df = pd.read_csv('/tmp/eval_segments.csv', skiprows=2, sep=', ', engine='python') 79 | e_df['split'] = 'eval_segments' 80 | b_df = pd.read_csv('/tmp/balanced_train_segments.csv', skiprows=2, sep=', ', engine='python') 81 | b_df['split'] = 'balanced_train_segments' 82 | u_df = pd.read_csv('/tmp/unbalanced_train_segments.csv', skiprows=2, sep=', ', engine='python') 83 | u_df['split'] = 'unbalanced_train_segments' 84 | df = pd.concat([e_df, b_df, u_df]) 85 | df = df[['# YTID', 'positive_labels', 'split']].copy() 86 | df.columns = ['ytid', 'label', 'split'] 87 | # clean labels. 88 | def remove_quotations(s): 89 | assert s[0] == '"' and s[-1] == '"' 90 | return s[1:-1] 91 | df.label = df.label.apply(lambda s: remove_quotations(s)) 92 | label_mapper = {ytid: label for ytid, label in df[['ytid', 'label']].values} 93 | 94 | # calculate weights for each sample in org_list, and store the results in to_list. 95 | org_df = pd.read_csv(org_list) # assert: org_list has only one column "file_name" 96 | org_df['label'] = org_df.file_name.apply(lambda f: label_mapper[f.split('/')[-1][:11]]) # assign labels for each file_name 97 | new_df = gen_weight(org_df, '/tmp/class_labels_indices.csv') # assign sample weights for each file_name 98 | new_df.to_csv(to_list, index=None) 99 | print('Created', to_list, 'based on', org_list) 100 | 101 | 102 | fire.Fire(make_metadata) 103 | -------------------------------------------------------------------------------- /app/circor/README.md: -------------------------------------------------------------------------------- 1 | # Exploring Pre-trained General-purpose Audio Representations for Heart Murmur Detection 2 | 3 | ![EMBC](https://embc.embs.org/2024/wp-content/uploads/sites/102/2023/05/ieee-embc-2024-logo2x.png) 4 | 5 | This sub-repository provides codes for evaluating the performance of pre-trained models intended to reproduce the results in [our IEEE EMBC 2024 paper](https://arxiv.org/abs/2404.17107). 6 | 7 |
8 | Table II 9 |
We compared the results among the previous studies and four pre-trained audio models.
10 |
11 | 12 | Our contents include: 13 | 14 | - Data downloading and formatting notebook. It also covers code setup. 15 | - Training/testing codes and utility batch scripts for reproducing our experiments. 16 | - The command lines used for the paper. 17 | - The notebook used to summarize and format results for the paper. 18 | 19 | Please refer to the following paper (arXiv link) for the details. 20 | 21 | ```bibtex 22 | @article{niizumi2024embc, 23 | title = {{Exploring Pre-trained General-purpose Audio Representations for Heart Murmur Detection}}, 24 | author = {Daisuke Niizumi and Daiki Takeuchi and Yasunori Ohishi and Noboru Harada and Kunio Kashino}, 25 | journal = {to appear at IEEE EMBC}, 26 | year = {2024}, 27 | url = {https://arxiv.org/abs/2404.17107} 28 | } 29 | ``` 30 | 31 | ## 1. Setup 32 | 33 | ### 1-0. Global setup 34 | 35 | Please complete the setup for M2D first. 36 | 37 | [👉️ GLOBAL SETUP, PLEASE BE SURE TO COMPLETE THESE STEPS.](https://github.com/nttcslab/m2d/tree/master?tab=readme-ov-file#1-setup) 38 | 39 | 40 | ### 1-1. Setup for the reproduction of the paper 41 | 42 | [0-Prepare.ipynb](0-Prepare.ipynb) provides complete setup steps, including: 43 | - Code setup (training/test program and external evaluation code) 44 | - Downloading dataset from `physionet.org` 45 | - Format the code for our experiments 46 | - Integrity check for the data 47 | 48 | ### 1-2. Major folders 49 | 50 | You will find the following folders after the setup. 51 | 52 | bat -- Batch scripts for automating experiments 53 | evar -- Experiment runs under this folder 54 | /work -- The data used during the training 55 | heart-murmur-detection -- Copy of the repository of the previous study from Walker et al. 56 | /data -- The data used for the final test 57 | m2d_vit_base-80x608p16x16-220930-mr7_enconly -- The pre-trained M2D weight 58 | physionet.org -- The copy of the dataset 59 | scores -- The results of our paper 60 | 61 | ## 2. Running Experiments 62 | 63 | We provide two example notebooks for running experiments. 64 | 65 | - [1-Run-M2D.ipynb](1-Run-M2D.ipynb) provides an example of a complete command line. You can train a model using an M2D model, and you should obtain a result close to the paper. You can also check the details of fine-tuning parameters. 66 | - [2-Run-BYOL-A.ipynb](2-Run-BYOL-A.ipynb) provides an example of the experiment using a batch file. This is exactly what we performed for the paper. 67 | 68 | Please find the complete command line in [Command lines used for the paper](#command-lines-used-for-the-paper). 69 | 70 | ## 3. Summarizing the results 71 | 72 | [9-Summarize-results-CirCor.ipynb](9-Summarize-results-CirCor.ipynb) provides complete steps to summarize the results using our result files in the `scores` folder. 73 | 74 | ## Files 75 | 76 | This sub-repository contains the following files: 77 | 78 | - 0-Prepare.ipynb -- A notebook for preparing the experiment 79 | - 1-Run-M2D.ipynb -- A notebook for the M2D experiment 80 | - 2-Run-BYOL-A.ipynb -- A notebook for the BYOL-A experiment 81 | - 9-Summarize-results-CirCor.ipynb -- A notebook for summarizing results 82 | - circor_eval.py -- The main program for the experiment 83 | - bat/*.sh -- Scripts for automating experiments for each pre-trained audio representation 84 | - diff-evar.patch -- A patch file for EVAR 85 | - diff-heart-murmur-detection.patch -- A patch file for heart-murmur-detection 86 | 87 | ## Acknowledgements 88 | 89 | We appreciate the previous studies that shared their codes. 
90 | Our code uses [Benjamin-Walker/heart-murmur-detection](https://github.com/Benjamin-Walker/heart-murmur-detection) from the paper: 91 | 92 | ```bibtex 93 | @article{walker2022DBResNet, 94 | title={Dual Bayesian ResNet: A Deep Learning Approach to Heart Murmur Detection}, 95 | author={Benjamin Walker and Felix Krones and Ivan Kiskin and Guy Parsons and Terry Lyons and Adam Mahdi}, 96 | journal={Computing in Cardiology}, 97 | volume={49}, 98 | year={2022} 99 | } 100 | ``` 101 | 102 | ## Command lines used for the paper 103 | 104 | We used the following command lines. 105 | 106 | ```sh 107 | cd evar 108 | bash ../bat/m2d_ftcircor.sh ../m2d_vit_base-80x608p16x16-220930-mr7_enconly 1 5 7 300 109 | bash ../bat/m2d_ftcircor.sh ../m2d_vit_base-80x608p16x16-220930-mr7_enconly 2 5 7 300 110 | bash ../bat/m2d_ftcircor.sh ../m2d_vit_base-80x608p16x16-220930-mr7_enconly 3 5 7 300 111 | 112 | bash ../bat/ast_ftcircor.sh 1 5 42 113 | bash ../bat/ast_ftcircor.sh 2 5 42 114 | bash ../bat/ast_ftcircor.sh 3 5 42 115 | 116 | bash ../bat/byola_ftcircor.sh 1 5 42 117 | bash ../bat/byola_ftcircor.sh 2 5 42 118 | bash ../bat/byola_ftcircor.sh 3 5 42 119 | 120 | bash ../bat/cnn14_ftcircor.sh 1 5 42 121 | bash ../bat/cnn14_ftcircor.sh 2 5 42 122 | bash ../bat/cnn14_ftcircor.sh 3 5 42 123 | 124 | bash ../bat/m2d_ftcircor_rand.sh m2d_vit_base-80x608p16x16-220930-mr7_enconly 1 5 7 125 | bash ../bat/m2d_ftcircor_rand.sh m2d_vit_base-80x608p16x16-220930-mr7_enconly 2 5 7 126 | bash ../bat/m2d_ftcircor_rand.sh m2d_vit_base-80x608p16x16-220930-mr7_enconly 3 5 7 127 | 128 | bash ../bat/ast_ftcircor_noaug.sh 1 5 42 129 | bash ../bat/ast_ftcircor_noaug.sh 2 5 42 130 | bash ../bat/ast_ftcircor_noaug.sh 3 5 42 131 | 132 | bash ../bat/byola_ftcircor_noaug.sh 1 5 42 133 | bash ../bat/byola_ftcircor_noaug.sh 2 5 42 134 | bash ../bat/byola_ftcircor_noaug.sh 3 5 42 135 | 136 | bash ../bat/cnn14_ftcircor_noaug.sh 1 5 42 137 | bash ../bat/cnn14_ftcircor_noaug.sh 2 5 42 138 | bash ../bat/cnn14_ftcircor_noaug.sh 3 5 42 139 | ``` 140 | -------------------------------------------------------------------------------- /superb/upstream/m2d/expert.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- # 2 | """*********************************************************************************************""" 3 | # FileName [ upstream/m2d/expert.py ] 4 | # Synopsis [ the Masked Modeling Duo (M2D) wrapper ] 5 | """*********************************************************************************************""" 6 | 7 | 8 | ############### 9 | # IMPORTATION # 10 | ############### 11 | import math 12 | #-------------# 13 | import torch 14 | import torch.nn as nn 15 | from torch.nn.utils.rnn import pad_sequence 16 | #-------------# 17 | from .m2d.m2d.runtime_audio import RuntimeM2D 18 | 19 | 20 | class RunningMean: 21 | """Running mean calculator for arbitrary axis configuration. 
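    Each put(x) call updates the mean incrementally as mu += (x.mean(axis) - mu) / n, so no sample history needs to be stored.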
22 | Borrowed from https://github.com/nttcslab/byol-a/blob/master/v2/byol_a2/augmentations.py#L147 23 | """ 24 | 25 | def __init__(self, axis): 26 | self.n = 0 27 | self.axis = axis 28 | 29 | def put(self, x): 30 | # https://math.stackexchange.com/questions/106700/incremental-averageing 31 | self.n += 1 32 | if self.n == 1: 33 | self.mu = x.mean(self.axis, keepdims=True) 34 | else: 35 | self.mu += (x.mean(self.axis, keepdims=True) - self.mu) / self.n 36 | 37 | def __call__(self): 38 | return self.mu 39 | 40 | def __len__(self): 41 | return self.n 42 | 43 | 44 | class RunningVariance: 45 | """Calculate mean/variance of tensors online. 46 | Thanks to https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance 47 | Borrowed from https://github.com/nttcslab/byol-a/blob/master/v2/byol_a2/augmentations.py#L147 48 | """ 49 | 50 | def __init__(self, axis, mean): 51 | self.update_mean(mean) 52 | self.s2 = RunningMean(axis) 53 | 54 | def update_mean(self, mean): 55 | self.mean = mean 56 | 57 | def put(self, x): 58 | self.s2.put((x - self.mean) **2) 59 | 60 | def __call__(self): 61 | return self.s2() 62 | 63 | def std(self): 64 | return self().sqrt() 65 | 66 | 67 | class RunningNorm(nn.Module): 68 | """Online Normalization using Running Mean/Std. 69 | Borrowed from https://github.com/nttcslab/byol-a/blob/master/v2/byol_a2/augmentations.py#L147 70 | This module will only update the statistics up to the specified number of epochs. 71 | After the `max_update_epochs`, this will normalize with the last updated statistics. 72 | Args: 73 | epoch_samples: Number of samples in one epoch 74 | max_update_epochs: Number of epochs to allow update of running mean/variance. 75 | axis: Axis setting used to calculate mean/variance. 76 | """ 77 | 78 | def __init__(self, epoch_samples, max_update_epochs=10, axis=[1, 2]): 79 | super().__init__() 80 | self.max_update = epoch_samples * max_update_epochs 81 | self.ema_mean = RunningMean(axis) 82 | self.ema_var = RunningVariance(axis, 0) 83 | self.reported = False 84 | 85 | def forward(self, image): 86 | if len(self.ema_mean) < self.max_update: 87 | self.ema_mean.put(image) 88 | self.ema_var.update_mean(self.ema_mean()) 89 | self.ema_var.put(image) 90 | self.mean = self.ema_mean() 91 | self.std = torch.clamp(self.ema_var.std(), torch.finfo().eps, torch.finfo().max) 92 | elif not self.reported: 93 | self.reported = True 94 | logger.info(f'\n*** Running Norm has finished updates over {self.max_update} times, using the following stats from now on. ***\n mean={float(self.mean.view(-1))}, std={float(self.std.view(-1))}') 95 | logger.info(f'*** Please use these statistics in your model. EXIT... ***\n') 96 | exit(-1) 97 | return ((image - self.mean) / self.std) 98 | 99 | def __repr__(self): 100 | format_string = self.__class__.__name__ + f'(max_update={self.max_update},axis={self.ema_mean.axis})' 101 | return format_string 102 | 103 | 104 | ################### 105 | # UPSTREAM EXPERT # 106 | ################### 107 | class UpstreamExpert(nn.Module): 108 | """ 109 | The M2D wrapper 110 | """ 111 | 112 | def __init__( 113 | self, 114 | ckpt: str, 115 | model_config: str, 116 | feature_d: int, 117 | window_secs: float = (160 * 16) / 16000, 118 | stride_secs: float = (160 * 16) / 16000, 119 | norm_mean: float = None, # Has to be a float value to continue training. 120 | norm_std: float = None, # The same as above. 
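        # Any additional keyword arguments are accepted for compatibility but not used.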
121 |         **kwargs,
122 |     ):
123 |         super(UpstreamExpert, self).__init__()
124 | 
125 |         # Normalizer
126 |         if norm_mean is None or norm_std is None:
127 |             # ** CAUTION **
128 |             # ** Please note that here we calculate statistics using RunningNorm and will exit early in the training. **
129 |             # ** CAUTION **
130 |             self.norm = RunningNorm(epoch_samples=10_000, max_update_epochs=1, axis=[0, 1, 2, 3]) # Use single scalar mean/std values.
131 |         else:
132 |             print(f'*** Using normalization statistics: mean={norm_mean}, std={norm_std} ***')
133 |             self.norm = lambda x: (x - norm_mean) / norm_std
134 | 
135 | 
136 |         # Load pretrained weights.
137 |         self.model = RuntimeM2D(weight_file=ckpt)
138 | 
139 |         # attributes
140 |         self.output_dim = self.model.cfg.feature_d
141 |         self.max_input_length = 1024 # self.model.cfg.input_size[1]
142 | 
143 |     # Interface
144 |     def get_output_dim(self):
145 |         return self.output_dim
146 | 
147 |     # Interface
148 |     def get_downsample_rates(self, key: str) -> int:
149 |         return 160 * self.model.cfg.patch_size[1] # hop_size x time frames
150 | 
151 |     def to_feature(self, batch_audio):
152 |         x = self.model.to_spec(batch_audio)
153 |         x = (x + torch.finfo().eps).log()
154 |         return x.unsqueeze(1) #.to(device)
155 | 
156 |     # Interface
157 |     def forward(self, wavs):
158 |         """
159 |         Args:
160 |             wavs:
161 |                 list of unpadded wavs [wav1, wav2, ...]
162 |                 each wav is in torch.FloatTensor with sample rate 16000
163 |                 and already put in the device assigned by command-line args
164 | 
165 |         Return:
166 |             features:
167 |                 list of unpadded features [feat1, feat2, ...]
168 |                 each feat is in torch.FloatTensor and already
169 |                 put in the device assigned by command-line args
170 |         """
171 |         wavs = pad_sequence(wavs, batch_first=True)
172 |         features = self.to_feature(wavs)
173 |         # normalize
174 |         features = self.norm(features)
175 |         # encode
176 |         layered_features = self.model.encode_lms(features, return_layers=True)
177 |         return {
178 |             "last_hidden_state": layered_features[-1],
179 |             "hidden_states": layered_features,
180 |         }
181 | 
182 | 
-------------------------------------------------------------------------------- /speech/README.md: --------------------------------------------------------------------------------
1 | ![key_visual](figure-github.jpg)
2 | 
3 | # Masked Modeling Duo for Speech (M2D-S)
4 | 
5 | This repository provides a demo implementation of "[Masked Modeling Duo for Speech: Specializing General-Purpose Audio Representation to Speech using Denoising Distillation](https://arxiv.org/abs/2305.14079)."
6 | 
7 | - [x] Code for pre-training and pre-processing LS-960 features.
8 | - [x] Pre-trained weights.
9 | - [x] [SUPERB](https://arxiv.org/abs/2105.01051) evaluation code and instructions.
10 | 
11 | ## 1. Getting Started
12 | 
13 | For installation, follow the instructions in the ["1. Getting Started" in the main README.md](../README.md#1-getting-started).
14 | 
15 | For evaluating on SUPERB, refer to [superb/upstream/m2d/README.md](../superb/upstream/m2d/README.md).
16 | 
17 | ## 2. Pre-trained weights
18 | 
19 | Find pre-trained weight files in [releases](https://github.com/nttcslab/m2d/releases).
20 | 21 | - M2D-S T=4.0s: m2d_s_vit_base-80x400p80x2-230201 22 | - M2D-S T=5.12s: m2d_s_vit_base-80x512p80x2-230301 23 | - M2D-S T=6.08s: m2d_s_vit_base-80x608p80x2-230220 24 | 25 | | Model | Pre-trained dataset | PR | KS | IC | SID | ER | ENV | MUS | 26 | |----------|----------------------|-------|-------|-------|-------|-------|-----------|----------| 27 | | M2D-S T=4.0s | LS-960+AS | 5.72 | 96.47 | 97.80 | 81.97 | 66.36 | _53.22_ | _41.71_ | 28 | | M2D-S T=5.12s | LS-960+AS | 5.64 | 96.87 | 97.65 | 80.69 | 65.35 | _57.34_ | _43.23_ | 29 | | M2D-S T=6.08s | LS-960+AS | 5.33 | 96.80 | 97.63 | 81.74 | 66.13 | _54.77_ | _43.75_ | 30 | 31 | 32 | ## 3. Pre-training from Scratch 33 | 34 | ### 3-1. Pre-processing data files 35 | 36 | M2D-S learns from the following pre-processed files using LibriSpeech (LS-960) and HuBERT-base pre-trained model. 37 | 38 | - `data/ls960_hybrid7s_hubaseL9`: Pre-processed data consists of log-mel spectrogram samples converted from LS-960 and HuBERT layer #9 features encoded from LS-960. 39 | - `data/files_ls960_hybrid.csv`: List of pre-processed files of the `ls960_hybrid7s_hubaseL9` folder. 40 | 41 | M2D-S also requires AudioSet as a background noise. 42 | 43 | - `data/audioset_lms`: Pre-processed log-mel spectrogram samples from AudioSet, as in the original M2D. 44 | - `data/files_audioset.csv`: List of pre-processed AudioSet files, as in the original M2D. 45 | 46 | #### 3-1-1. LS-960 data files 47 | 48 | The following command line will create `data/ls960_hybrid7s_hubaseL9` and `data/files_ls960_hybrid.csv`. 49 | 50 | ``` 51 | python speech/extract_offline_ls960.py /path/to/LibriSpeech 52 | ``` 53 | 54 | #### 3-1-2. AudioSet data files 55 | 56 | For preparing AudioSet data files (`data/audioset_lms` and `data/files_audioset.csv`), please follow the [data/README.md](../data/README.md). 57 | 58 | ### 3-2. Pre-training 59 | 60 | The `train_speech.py` pre-trains for speech. 61 | 62 | The following example would run on any affordable GPU, consuming only 7,170MiB. However, please note that it will take very long (It took over 20 minutes for one epoch). 63 | You can also change the BG noise dataset by adding `--csv_bg_noise data/files_f_s_d_5_0_k.csv`, for example. 64 | 65 | ```sh 66 | python -m speech.train_speech --loss_m2d 1. --loss_off 1. --input_size 80x208 --patch_size 80x4 --noise_ratio 0.2 --batch_size 128 --accum_iter 16 67 | ``` 68 | 69 | The followings are for pre-training high-end models, taking 2.5-3.5 days to complete with 4 A100s. 70 | 71 | ```sh 72 | OMP_NUM_THREADS=1 torchrun --nproc_per_node=4 -m speech.train_speech --loss_m2d 1. --loss_off .5 --input_size 80x400 --patch_size 80x2 --noise_ratio 0.2 73 | OMP_NUM_THREADS=1 torchrun --nproc_per_node=4 -m speech.train_speech --loss_m2d 1. --loss_off .5 --input_size 80x512 --patch_size 80x2 --noise_ratio 0.2 --batch_size 256 --accum_iter 2 74 | OMP_NUM_THREADS=1 torchrun --nproc_per_node=4 -m speech.train_speech --loss_m2d 1. --loss_off .5 --input_size 80x608 --patch_size 80x2 --noise_ratio 0.2 --batch_size 256 --accum_iter 2 75 | ``` 76 | 77 | #### 3-2-1. Major pre-training options 78 | 79 | - --batch_size: Batch size per GPU, 512 by default. 80 | - --epochs: Training epochs, 1000 by default. 81 | - --accum_iter: Iterations to accumulate gradients, 1 by default. 82 | - --input_size: Input spectrogram size, 80x208 by default. 83 | - --patch_size: Patch size, 80x4 by default. 84 | - --mask_ratio: Masking ratio, 0.6 by default. 85 | - --loss_m2d: Loss ratio for M2D masked prediction, 1.0 by default. 
86 | - --loss_off: Loss ratio for offline target, 0.0 by default. 87 | - --blr: Base learning rate: absolute_lr = base_lr * total_batch_size / 256. 88 | - --csv_main: A CSV file to list sample files in the main dataset, 'data/files_ls960_hybrid.csv' by default. 89 | - --csv_bg_noise: A CSV file to list sample files in the BG noise dataset, 'data/files_audioset.csv' by default. 90 | - --noise_ratio: Noise mixing ratio, 0.2 by default. 91 | 92 | ## 4. SUPERB Evaluation 93 | 94 | We provide upstream wrapper implementation, which you can import to your [SUPERB](https://arxiv.org/abs/2105.01051) environment. 95 | 96 | - Copy the `superb/upstream/m2d` folder under your `s3prl/upstream` folder. 97 | - Make a symbolic link to your copy of M2D repository under your `s3prl/upstream/m2d`, making `s3prl/upstream/m2d/m2d`. The wrapper files will find M2D programs under this symbolic link. 98 | - You will need to run `pip install -e .` under your `s3prl` folder, so that you install your local SUPERB in your Python environment. 99 | 100 | Please refer to [superb/upstream/m2d/README.md](../superb/upstream/m2d/README.md) for more details. 101 | 102 | ## Acknowledgements 103 | 104 | - Our code is based on the [MAE PyTorch/GPU re-implementation](https://github.com/facebookresearch/mae) of the paper [Masked Autoencoders Are Scalable Vision Learners](https://openaccess.thecvf.com/content/CVPR2022/html/He_Masked_Autoencoders_Are_Scalable_Vision_Learners_CVPR_2022_paper.html). 105 | - We use [nnAudio](https://ieeexplore.ieee.org/document/9174990) ([KinWaiCheuk/nnAudio](https://github.com/KinWaiCheuk/nnAudio)) for converting raw audio into log-mel spectrogram. 106 | - We use [Hugging Face Transformers](https://huggingface.co/docs/transformers/index) for the implementation and pre-trained weights of the [HuBERT](https://ieeexplore.ieee.org/document/9585401) model. 107 | 108 | We appreciate these publicly available resources. 109 | 110 | ## References 111 | 112 | If you find our M2D-S useful in your research, please consider citing our paper: 113 | 114 | ```BibTeX 115 | @article{niizumi2023m2d4speech, 116 | title = {{Masked Modeling Duo for Speech: Specializing General-Purpose Audio Representation to Speech using Denoising Distillation}}, 117 | author = {Daisuke Niizumi and Daiki Takeuchi and Yasunori Ohishi and Noboru Harada and Kunio Kashino}, 118 | journal = {to appear at Interspeech}, 119 | year = {2023}, 120 | url = {https://arxiv.org/abs/2305.14079} 121 | } 122 | ``` 123 | 124 | - SUPERB: *[Shu-wen Yang, Po-Han Chi, Yung-Sung Chuang, Cheng-I Jeff Lai, Kushal Lakhotia, Yist Y. Lin, Andy T. Liu, Jiatong Shi, Xuankai Chang, Guan-Ting Lin, Tzu-Hsien Huang, Wei-Cheng Tseng, Ko-tik Lee, Da-Rong Liu, Zili Huang, Shuyan Dong, Shang-Wen Li, Shinji Watanabe, Abdelrahman Mohamed, and Hung-yi Lee, "SUPERB: Speech Processing Universal PERformance Benchmark," Interspeech, 2021](https://arxiv.org/abs/2105.01051).* 125 | - https://github.com/s3prl/s3prl/blob/main/s3prl/downstream/docs/superb.md 126 | - HuBERT: *[W.-N. Hsu, B. Bolte, Y.-H. H. Tsai, K. Lakhotia, R. Salakhutdinov, and A. Mohamed, “HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units,” IEEE/ACM Trans. 
Audio, Speech, Language Process., p.3451–3460, 2021](https://ieeexplore.ieee.org/document/9585401).* 127 | -------------------------------------------------------------------------------- /superb/upstream/m2d/README.md: -------------------------------------------------------------------------------- 1 | # Masked Modeling Duo (M2D) upstream model for SUPERB 2 | 3 | Masked Modeling Duo for Speech: Specializing General-Purpose Audio Representation to Speech using Denoising Distillation 4 | https://arxiv.org/abs/2305.14079 5 | 6 | This is an M2D wrapper for SUPERB, and evaluating M2D on SUPERB involves two steps: 7 | 8 | - Calculating normalization statistics first, as M2D requires the average and standard deviation of the downstream task dataset. 9 | - Evaluating M2D on SUPERB using the calculated statistics. 10 | 11 | ## Installation 12 | 13 | - Copy the `superb/upstream/m2d` folder under your `s3prl/upstream` folder. 14 | - Create a copy of entire M2D repository under your `s3prl/upstream/m2d`, or make a symbolic link instead. The wrapper `expert.py` will find M2D programs in the folder. 15 | - Edit your `s3prl/hub.py` to add `from s3prl.upstream.m2d.hubconf import *`. 16 | 17 | The expected folders/files are as follows: 18 | 19 | your_s3prl/ 20 | s3prl/ 21 | upstream/ 22 | m2d/ 23 | __init__.py 24 | expert.py 25 | hubconf.py 26 | README.md 27 | m2d/ 28 | (all the M2D contents should be here) 29 | hub.py (should have `from s3prl.upstream.m2d.hubconf import *`) 30 | 31 | You might also need to run `pip install -e .` under your `s3prl` folder, so that you install your local SUPERB in your Python environment. 32 | 33 | Here is an example of installing fresh SUPERB under your copy of the M2D repository. 34 | 35 | git clone https://github.com/s3prl/s3prl.git 36 | ln -s ../../../superb/upstream/m2d s3prl/s3prl/upstream/ 37 | ln -s ../../.. s3prl/s3prl/upstream/m2d/m2d 38 | pip install tensorboardX catalyst 39 | cd s3prl/s3prl 40 | (Now edit hub.py to add the following line.) 41 | from s3prl.upstream.m2d.hubconf import * 42 | cd .. (move to your_m2d/s3prl) 43 | pip install -e . 44 | 45 | After these steps, your SUPERB should accept the following evaluation steps. 46 | 47 | ## Step 1. Pre-compute statistics on a downstream task 48 | 49 | We need statistics for each downstream task. 50 | 51 | Use the upstream `m2d_calcnorm` to calculate statistics. Example with a downstream task `voxceleb1` (SID): 52 | 53 | python run_downstream.py -m train -n m2d_calcnorm_1 -u m2d_calcnorm -d voxceleb1 54 | 55 | This will output: 56 | 57 | *** Running Norm has finished updates over 10000 times, using the following stats from now on. *** 58 | mean=-10.571270942687988, std=4.3681135177612305 59 | *** Please use these statistics in your model. EXIT... *** 60 | 61 | These `-10.571270942687988` and `4.3681135177612305` are the statistics for the `voxceleb1` (SID). 62 | 63 | ## Step 2. Run your evaluation on the downstream task 64 | 65 | Use the upstream `m2d` to evaluate your weights with the statistics calculated in the step above. 
66 | Here an example of testing m2d_s_vit_base-80x608p80x2-230220 using `voxceleb1` (SID): 67 | 68 | python run_downstream.py -m train -n m2d_vc1_1 -u m2d -d voxceleb1 -k /your/m2d_s_vit_base-80x608p80x2-230220/checkpoint-1000.pth,-10.571271,4.3681135 69 | python run_downstream.py -m evaluate -e result/downstream/m2d_vc1_1/dev-best.ckpt 70 | 71 | ## Examples 72 | 73 | These are the scripts used for evaluating "[Masked Modeling Duo for Speech: Specializing General-Purpose Audio Representation to Speech using Denoising Distillation](https://arxiv.org/abs/2305.14079)." 74 | 75 | For example, we run the following to evaluate a weight `m2d_s_vit_base-80x608p80x2-230220/checkpoint-1000.pth` on KS. 76 | 77 | ./ks.sh 0 m2d /your/m2d_s_vit_base-80x608p80x2-230220/checkpoint-1000.pth 7 78 | 79 | The `0` is a GPU number, `m2d` is an upstream name, and the last `7` is a random seed. 80 | This command line will run the following two Python commands: 81 | 82 | CUDA_VISIBLE_DEVICES=0 python run_downstream.py -m train -n m2d_s_vit_base-80x608p80x2-230220-checkpoint-1000-KS-lr1e-4-s7 -u m2d -d speech_commands -o config.optimizer.lr=1e-4 -k /your/m2d_s_vit_base-80x608p80x2-230220/checkpoint-1000.pth,-11.506255149841309,4.314857482910156 --seed 7 83 | CUDA_VISIBLE_DEVICES=0 python run_downstream.py -m evaluate -e result/downstream/m2d_s_vit_base-80x608p80x2-230220-checkpoint-1000-KS-lr1e-4-s7/dev-best.ckpt 84 | 85 | 86 | ### ER (er.sh) 87 | 88 | ```sh 89 | gpu=$1 90 | upmodel=$2 91 | ckpt=$3 92 | lr=1e-5 93 | task=ER 94 | seed=$4 95 | 96 | parentpath=$(dirname $ckpt) 97 | parent=$(basename $parentpath) 98 | ckptbase=$(basename $ckpt) 99 | ckptstem=${ckptbase%.*} 100 | expbase=$parent-$ckptstem 101 | 102 | for test_fold in fold1 fold2 fold3 fold4 fold5; 103 | do 104 | expname=$expbase-$task-lr$lr-s$seed-$test_fold 105 | echo $expname 106 | CUDA_VISIBLE_DEVICES=$gpu python run_downstream.py -m train -n $expname -u $upmodel -d emotion -c downstream/emotion/config.yaml -o "config.optimizer.lr=$lr,, config.downstream_expert.datarc.test_fold='$test_fold'" -k $ckpt,-13.037399291992188,3.619741439819336 --seed $seed 107 | CUDA_VISIBLE_DEVICES=$gpu python run_downstream.py -m evaluate -e result/downstream/$expname/dev-best.ckpt 108 | done 109 | ``` 110 | 111 | ### IC (ic.sh) 112 | 113 | ```sh 114 | gpu=$1 115 | upmodel=$2 116 | ckpt=$3 117 | lr=1e-3 118 | task=IC 119 | seed=$4 120 | 121 | parentpath=$(dirname $ckpt) 122 | parent=$(basename $parentpath) 123 | ckptbase=$(basename $ckpt) 124 | ckptstem=${ckptbase%.*} 125 | expbase=$parent-$ckptstem 126 | 127 | expname=$expbase-$task-lr$lr-s$seed 128 | 129 | echo $expname 130 | CUDA_VISIBLE_DEVICES=$gpu python run_downstream.py -m train -n $expname -u $upmodel -d fluent_commands -o "config.optimizer.lr=$lr" -k $ckpt,-13.017439842224121,4.417759895324707 --seed $seed 131 | CUDA_VISIBLE_DEVICES=$gpu python run_downstream.py -m evaluate -e result/downstream/$expname/dev-best.ckpt 132 | ``` 133 | 134 | ### KS (ks.sh) 135 | 136 | ```sh 137 | gpu=$1 138 | upmodel=$2 139 | ckpt=$3 140 | lr=1e-4 141 | task=KS 142 | seed=$4 143 | 144 | parentpath=$(dirname $ckpt) 145 | parent=$(basename $parentpath) 146 | ckptbase=$(basename $ckpt) 147 | ckptstem=${ckptbase%.*} 148 | expbase=$parent-$ckptstem 149 | 150 | expname=$expbase-$task-lr$lr-s$seed 151 | 152 | echo $expname 153 | CUDA_VISIBLE_DEVICES=$gpu python run_downstream.py -m train -n $expname -u $upmodel -d speech_commands -o "config.optimizer.lr=$lr" -k $ckpt,-11.506255149841309,4.314857482910156 --seed $seed 154 | 
CUDA_VISIBLE_DEVICES=$gpu python run_downstream.py -m evaluate -e result/downstream/$expname/dev-best.ckpt 155 | ``` 156 | 157 | ### PR (pr.sh) 158 | 159 | ```sh 160 | gpu=$1 161 | upmodel=$2 162 | ckpt=$3 163 | lr=1e-3 164 | task=PR 165 | seed=$4 166 | 167 | parentpath=$(dirname $ckpt) 168 | parent=$(basename $parentpath) 169 | ckptbase=$(basename $ckpt) 170 | ckptstem=${ckptbase%.*} 171 | expbase=$parent-$ckptstem 172 | 173 | expname=$expbase-$task-lr$lr-s$seed 174 | 175 | echo $expname 176 | CUDA_VISIBLE_DEVICES=$gpu python run_downstream.py -m train -n $expname -u $upmodel -d ctc -c downstream/ctc/libriphone.yaml -o "config.optimizer.lr=$lr" -k $ckpt,-10.43253231048584,4.241369724273682 --seed $seed 177 | CUDA_VISIBLE_DEVICES=$gpu python run_downstream.py -m evaluate -n $expname -d ctc -e result/downstream/$expname/dev-best.ckpt 178 | ``` 179 | 180 | ### SID (sid.sh) 181 | 182 | ```sh 183 | gpu=$1 184 | upmodel=$2 185 | ckpt=$3 186 | lr=1e-3 187 | task=SID 188 | seed=$4 189 | 190 | parentpath=$(dirname $ckpt) 191 | parent=$(basename $parentpath) 192 | ckptbase=$(basename $ckpt) 193 | ckptstem=${ckptbase%.*} 194 | expbase=$parent-$ckptstem 195 | 196 | expname=$expbase-$task-lr$lr-s$seed 197 | 198 | echo $expname 199 | CUDA_VISIBLE_DEVICES=$gpu python run_downstream.py -m train -n $expname -u $upmodel -d voxceleb1 -o "config.optimizer.lr=$lr" -k $ckpt,-10.571271,4.3681135 --seed $seed 200 | CUDA_VISIBLE_DEVICES=$gpu python run_downstream.py -m evaluate -n $expname -d voxceleb1 -e result/downstream/$expname/dev-best.ckpt 201 | ``` 202 | 203 | -------------------------------------------------------------------------------- /Guide_app.md: -------------------------------------------------------------------------------- 1 | # M2D/M2D-X Application Guide (update: May 25, 2024) 2 | 3 | While our papers provide the details of methods, more is needed to guide how to use them in new applications, especially for the pre-training for each purpose. 4 | Here are guides based on the experiences and the information found afterward. 5 | 6 | CAUTION: This guide does not provide complete information covering many use cases because we are not working on many applications. Therefore, it is subject to change according to the new information/experience gained. 7 | 8 | ## 1. Transfer learning only, no pre-training 9 | If you load the pre-trained weight and use the encoder for fine-tuning as a feature extractor, you may choose a weight from ["Pre-trained/Fine-tuned Weights"](README.md#pre-trainedfine-tuned-weights): 10 | 11 | - "M2D-AS fine-tuned on AS2M" or "M2D/0.7 fine-tuned on AS2M" -- If your application setting is closer to the AudioSet ontology, including typical audio tagging (AT), sound event detection (SED), and audio captioning. 12 | - "M2D-AS fine-tuned on AS2M@32kHz" -- If application data needs higher frequency. 13 | - "M2D/0.7", "M2D/0.6", "M2D-AS", or "M2D-AS@32kHz" -- General-purpose weights. If the application domain is far from AudioSet, such as medical or industrial (e.g., factory) sound, or if it is uncertain. 14 | - "M2D-S" -- Weights for speech tasks. 15 | 16 | 17 | ## 2. Pre-training on your data 18 | 19 | ### 2.1 Pre-training strategy choice 20 | 21 | Effective pre-training depends on the available dataset and computing resources. 22 | 23 | ![chart](image-AppGuideChart.png) 24 | 25 | Possible choices: 26 | 27 | - Used the `fL` (AudioSet or LibriSpeech pre-trained weights) as they are -- The provided weights could be effective. 
28 | - Pre-training on `XLd` (a large in-domain dataset) from scratch -- As in speech, in-domain pre-training may be possible.
29 | - Further pre-training on `Xapp` (an application dataset).
30 | - If your `Xapp` is large enough (>1000h), pre-training from scratch on `Xapp` may be effective.
31 | 
32 | ### 2.2 Base weight choice
33 | 
34 | A weight closer to the application domain may be effective.
35 | 
36 | - AudioSet pre-trained weights (M2D pre-training) "M2D/0.7" or "M2D/0.6" -- For general non-speech tasks. A respiratory sound task may be non-speech.
37 |     - e.g., m2d_vit_base-80x608p16x16-221006-mr7
38 | - AudioSet pre-trained weights (M2D-AS pre-training) "M2D-AS" -- For typical audio captioning, audio tagging, and sound event detection tasks.
39 |     - e.g., m2d_as_vit_base-80x608p16x16-240213
40 | - LibriSpeech pre-trained weights "M2D-S" -- For speech tasks. Note that AudioSet weights may be more effective even for some sounds seemingly closer to speech, such as respiratory sounds.
41 |     - e.g., m2d_s_vit_base-80x400p80x2-230201 or m2d_s_vit_base-80x608p80x2-230220; starting with the 80x400 model would make your experiment easier.
42 | 
43 | ### 2.3 Parameter setting
44 | 
45 | #### 2.3.1 Pre-training from scratch
46 | 
47 | Practically, training from scratch may require >100K samples and multiple GPUs. Here's the command line we use to train an M2D model.
48 | 
49 | ```sh
50 | OMP_NUM_THREADS=1 torchrun --nproc_per_node=4 train_audio.py --input_size 80x608 --patch_size 16x16 --epochs 300 --batch_size 512 --accum_iter 1 --save_freq 50 --seed 3 --model m2d_vit_base --csv_main data/files_audioset.csv --data_path /path/to/your/data --loss_off 0.
51 | ```
52 | 
53 | The parameters that specifically matter for your purpose:
54 | 
55 | - `--epochs 300 --batch_size 512 --accum_iter 1` -- The combination of these parameters, the learning rate, and the EMA decay parameters matters. The number of epochs can be adjusted, although longer training does not always yield better results. Set the effective batch size to 2048 according to your GPU resources (this example uses 4 GPUs with a batch size of 512 each). Following these guides, you may not need to change the learning rate and EMA parameters. The successful settings we have used so far:
56 |     - bs=2048 & epochs=300 for AudioSet 2M samples.
57 |     - bs=2048 & epochs=1000 for LibriSpeech 281k samples.
58 | - `--csv_main data/files_audioset.csv` -- You may set your data list here.
59 | - `--data_path /path/to/your/data` -- You may set your data folder. I explicitly set this to a fast storage device.
60 | - `--loss_off 0.` -- No offline loss.
61 | 
62 | 
63 | #### 2.3.2 Further pre-training
64 | 
65 | Further pre-training, which continues pre-training an already pre-trained model on your data, may be the better choice when your data is small, such as <10K samples.
66 | (We have yet to determine how much data is needed to pre-train from scratch rather than do further pre-training.)
67 | 
68 | Here's the command line we use to train an M2D-X model for ICBHI 2017 (see our TASLP paper for the details).
69 | 
70 | ```sh
71 | CUDA_VISIBLE_DEVICES=0 python train_audio.py --epochs 600 --resume m2d_vit_base-80x200p16x4-230529/checkpoint-300.pth --model m2d_x_vit_base --input_size 80x200 --patch_size 16x4 --batch_size 64 --accum_iter 2 --csv_main data/files_icbhi2017.csv --csv_bg_noise data/files_f_s_d_5_0_k.csv --noise_ratio 0.3 --save_freq 100 --eval_after 600 --seed 6 --teacher m2d_vit_base-80x200p16x4-230529/checkpoint-300.pth --blr 3e-4 --loss_off 1.
--min_ds_size 10000 72 | ``` 73 | 74 | The parameters specifically matter for your purpose: 75 | 76 | - `--epochs 600 --batch_size 64 --accum_iter 2` -- The combination of these parameters matters. The epochs could be adjusted. Set the effective batch size to 128 according to your GPU resources (This example uses a GPU with a batch size of 64 and accumulating loss twice). Following these guides, you may not need to change the learning rate and EMA parameters. The successful settings we have used so far: 77 | - bs=128 & epochs=600 for 10k samples. (We virtually increased up to 10k by repeating the list of 1k samples.) 78 | - `--resume m2d_vit_base-80x200p16x4-230529/checkpoint-300.pth ` -- It initializes the online encoder weights using the pre-trained weight. 79 | - `--teacher m2d_vit_base-80x200p16x4-230529/checkpoint-300.pth` -- We also use the model as an offline teacher encoder. 80 | - `--model m2d_x_vit_base` -- Set the pre-training framework as M2D-X. 81 | - `--input_size 80x200 --patch_size 16x4` -- You explicitly need to set them when using non-default parameters. 82 | - `--csv_main data/files_icbhi2017.csv` -- You may set your data list here. 83 | - `--csv_bg_noise data/files_f_s_d_5_0_k.csv` -- Set the BG noise data list here when you set the noise ratio to >0.0. 84 | - `--noise_ratio 0.3` -- Set the mixing ratio of the BG noise. The 0.3 will mix data for main/BG with a proportion of 0.7/0.3. 85 | - `--eval_after 600` -- We skip the evaluation of the checkpoints after the epoch of 600; the intermediate checkpoints will not be tested. 86 | - `--blr 3e-4` -- The default is `3e-4`, so we just set it here in case we want to adjust. 87 | - `--loss_off 1.` -- The offline loss ratio for M2D-X. 88 | - `--min_ds_size 10000` -- We virtually increase the number of samples to 10k by repeating the list of 1k samples. 89 | 90 | #### Example use case: Further pre-training with 2 GPUs, bs=32, 50k data samples 91 | 92 | This is an example command line for two small GPUs that can accommodate a batch size of 32. 93 | 94 | ```sh 95 | CUDA_VISIBLE_DEVICES=0,1 torchrun --nproc_per_node=2 train_audio.py --epochs 600 --warmup_epochs 24 --resume m2d_vit_base-80x608p16x16-221006-mr7/checkpoint-300.pth --model m2d_x_vit_base --batch_size 32 --accum_iter 4 --csv_main __your__.csv --csv_bg_noise data/files_f_s_d_5_0_k.csv --noise_ratio 0.01 --save_freq 100 --eval_after 600 --seed 3 --teacher m2d_vit_base-80x608p16x16-221006-mr7/checkpoint-300.pth --blr 3e-4 --loss_off 1. 96 | ``` 97 | 98 | We set options for the setup. 99 | 100 | - `torchrun --nproc_per_node=2` -- For the distributed training using two GPUs. 101 | - `--batch_size 32 --accum_iter 4` -- For the effective batch size of 128. 102 | - Removed `--min_ds_size 10000` because the number of data samples would be enough to form an epoch. 103 | 104 | ## 3. Notes on pre-training 105 | 106 | #### a. Batch size, learning rate scheduling, and EMA 107 | 108 | The M2D combines the masked prediction pretext task with recent SSL techniques, such as the offline network updated by EMA and the annealing learning rate schedule, making the pre-training settings somewhat tricky. Here are some related notes. 109 | 110 | - How the offline target encoder evolves relates to the effective batch size because the system updates it by EMA every time batch samples are consumed. Thus, the effective batch size (batch size by accum_iter) matters for gaining a useful training signal created by the offline encoder. 
111 | - Other factors that affect the offline target encoder are the number of epochs and the data size. 112 | - In summary, the effective batch size, number of epochs, data size, and EMA parameters control how we get good training signals. 113 | 114 | However, searching for a set of these parameters takes time. Thus, using a similar set of parameters known to be effective is recommended. 115 | 116 | -------------------------------------------------------------------------------- /speech/speech_dataset.py: -------------------------------------------------------------------------------- 1 | """Dataset for Speech 2 | 3 | Masked Modeling Duo for Speech: Specializing General-Purpose Audio Representation to Speech using Denoising Distillation 4 | https://arxiv.org/abs/2305.14079 5 | 6 | ## Data files 7 | 8 | All the data samples used here are expected to be `.npz` preprocessed contents. 9 | Please find the details in `README.md` and preprocessor `extract_offline_ls960.py`. 10 | 11 | """ 12 | 13 | import numpy as np 14 | from pathlib import Path 15 | import torch 16 | 17 | from audio_dataset import SpectrogramDataset, get_files 18 | 19 | 20 | def log_mixup_exp(xa, xb, alpha): 21 | xa = xa.exp() 22 | xb = xb.exp() 23 | x = alpha * xa + (1. - alpha) * xb 24 | return torch.log(torch.max(x, torch.finfo(x.dtype).eps*torch.ones_like(x))) 25 | 26 | 27 | class SpeechHybridDataset(SpectrogramDataset): 28 | def __init__(self, folder, files, crop_size, norm_stats=None, 29 | random_crop=True, n_norm_calc=20000, 30 | patch_len=None): 31 | assert (crop_size[1] % 2) == 0, f'Crop frames has to be multiple of 2 (frames=100Hz vs embeddings=50Hz): {crop_size}' 32 | self.raw_emb_len = crop_size[1] // 2 # frames=100Hz vs embeddings=50Hz 33 | self.emb_len = crop_size[1] // patch_len 34 | self.patch_len = patch_len 35 | 36 | super().__init__(folder=folder, files=files, crop_frames=crop_size[1], norm_stats=norm_stats, 37 | random_crop=random_crop, n_norm_calc=n_norm_calc) 38 | 39 | def get_raw_data(self, index): 40 | filename = self.folder/self.df.file_name.values[index] 41 | try: 42 | hybrid = np.load(str(filename)) 43 | except: 44 | assert False, f'Failed to load: {filename}' 45 | lms = torch.tensor(hybrid['arr_0']) 46 | emb = torch.tensor(hybrid['arr_1']) 47 | raw_emb_len = torch.tensor(hybrid['arr_2']) 48 | 49 | # original sample is shorter than crop duration 50 | if raw_emb_len < self.raw_emb_len: 51 | raw_emb_len = self.raw_emb_len 52 | # emb_len has to be the multiple of patch_len 53 | if (raw_emb_len % self.patch_len) > 0: 54 | raw_emb_len = int(raw_emb_len / self.patch_len) * self.patch_len 55 | assert raw_emb_len >= self.raw_emb_len, f'{raw_emb_len} {self.raw_emb_len}' 56 | 57 | emb = emb[:, :raw_emb_len, :] 58 | lms = lms[:, :, :raw_emb_len * 2] # ensure lms length matches emb length, *2 = frames=100Hz vs embeddings=50Hz 59 | 60 | return lms, emb.transpose(-1, -2) # emb: [1, T, D] -> [1, D, T] to make the same shape with lms. 
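
    # complete_data() below crops and normalizes the spectrogram via the parent class, crops the offline teacher
    # embeddings at the matching position, and, when a patch spans multiple embedding frames, averages them so that
    # one embedding aligns with one patch column.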
61 | 62 | def complete_data(self, lms, emb): 63 | # crop & normalize 64 | x = super().complete_audio(lms) 65 | j = self.last_crop_start 66 | if not hasattr(self, 'norm_stats'): 67 | return x # for norm_stats calculation 68 | 69 | # rescale the cut position j from LMS frame length to embedding length 70 | emb_j = (self.raw_emb_len * j) // self.crop_frames 71 | 72 | # crop embedding 73 | emb = emb[..., emb_j:emb_j + self.raw_emb_len] 74 | 75 | # shrink embeddings to match the patch length only when needed 76 | n_emb_per_patch = self.patch_len // 2 # 20ms per offline embedding 77 | if n_emb_per_patch > 1: 78 | _, D, T = emb.shape 79 | assert (T % n_emb_per_patch) == 0, f'T:{T} self.emb_len:{self.emb_len} n_emb_per_patch:{n_emb_per_patch} emb.shape:{emb.shape}' 80 | new_len = T // n_emb_per_patch 81 | emb = emb.reshape(1, D, new_len, n_emb_per_patch).mean(axis=-1) 82 | if new_len == 0: 83 | print(f'T:{T} self.emb_len:{self.emb_len} n_emb_per_patch:{n_emb_per_patch} emb.shape:{emb.shape}') 84 | 85 | # reshape to make it useful 86 | y = emb.transpose(-1, -2).squeeze(0) # [1, D, T] to [T, D] 87 | 88 | return x, y 89 | 90 | def __getitem__(self, index): 91 | lms, emb = self.get_raw_data(index) 92 | items = self.complete_data(lms, emb) 93 | return items 94 | 95 | 96 | import pandas as pd 97 | class SpeechHybridLabelDataset(SpeechHybridDataset): 98 | def __init__(self, folder, files, crop_size, norm_stats=None, 99 | random_crop=True, n_norm_calc=20000, 100 | label_csv='data/ls960_train_hubert_base_ls960_L9_km500.csv', n_classes=500, 101 | patch_len=None): 102 | super().__init__(folder, files, crop_size, norm_stats, random_crop, n_norm_calc, patch_len) 103 | 104 | df = pd.read_csv(label_csv) 105 | df['id_'] = [x.split('/')[-1][:-5] for x in df.file_name.values] 106 | df = df.sort_values('file_name') 107 | 108 | files_id_ = [f.split('/')[-1][:-4] for f in files] 109 | assert all(files_id_ == df.id_.values), 'Mismatch between LMS files and labels.' 110 | 111 | # convert label text into list of labels 112 | df['labels'] = [[int(x) for x in label.split(' ')] for label in df.labels.values] 113 | df['file_name'] = files 114 | self.df = df 115 | self.label_len = crop_size[1] // patch_len 116 | self.n_classes = n_classes 117 | 118 | def complete_data(self, lms, label): 119 | # crop & normalize 120 | x = super().complete_audio(lms) 121 | j = self.last_crop_start 122 | if not hasattr(self, 'norm_stats'): 123 | return x # for norm_stats calculation 124 | 125 | label_j = (self.label_len * j) // self.crop_frames 126 | assert (self.crop_frames % self.label_len) == 0, f'LMS frame length has to be multiple of label length.' 
127 | # convert label into one-hot encoding and shrink the label length 128 | # repeat the last label for short labels to ensure that the frame length matches the label length 129 | padded_frames = max(lms.shape[-1], self.crop_frames) 130 | n_patches = (padded_frames + self.patch_len - 1) // self.patch_len 131 | n_frames = n_patches * self.patch_len 132 | n_labels = (n_frames + 1) // 2 # frames=100Hz vs labels=50Hz 133 | if len(label) < n_labels: 134 | n_repeat = n_labels - len(label) 135 | label = label + ([label[-1]] * n_repeat) 136 | assert len(label) == n_labels 137 | # shrink labels to match the patch length 138 | onehot = np.eye(self.n_classes)[label] 139 | n_label_per_patch = self.patch_len // 2 140 | cur_len = len(label) 141 | new_len = cur_len // n_label_per_patch 142 | onehot = onehot.T.reshape(-1, new_len, n_label_per_patch).sum(axis=-1).T 143 | onehot = onehot / n_label_per_patch # values in a one-hot label should sum to 1. 144 | # crop label 145 | onehot = onehot[label_j:label_j + self.label_len, :] 146 | 147 | if onehot.shape[0] < self.label_len: 148 | print(onehot.shape, lms.shape, i, j, h, w, n_patches, n_frames, n_labels, cur_len, new_len, label_j, label_j + self.label_len) 149 | return x, torch.tensor(onehot).to(float) 150 | 151 | def __getitem__(self, index): 152 | filename = self.folder/self.df.file_name.values[index] 153 | try: 154 | hybrid = np.load(str(filename)) 155 | except: 156 | assert False, f'Failed to load: {filename}' 157 | lms = torch.tensor(hybrid['arr_0']) 158 | 159 | label = self.df.labels.values[index] if hasattr(self, 'norm_stats') else ['not needed'] 160 | items = self.complete_data(lms, label) 161 | return items 162 | 163 | 164 | class MixedSpeechDataset(torch.utils.data.Dataset): 165 | def __init__(self, base_folder, files_speech, files_bg_noise, crop_size, patch_len, noise_ratio=0.0, 166 | random_crop=True, n_norm_calc=10000, use_label=False) -> None: 167 | super().__init__() 168 | 169 | ds_cls = SpeechHybridLabelDataset if use_label else SpeechHybridDataset 170 | self.ds1 = ds_cls(folder=base_folder, files=files_speech, crop_size=crop_size, 171 | random_crop=random_crop, norm_stats=None, n_norm_calc=n_norm_calc//2, 172 | patch_len=patch_len) 173 | # disable normalizion scaling in the ds1 174 | self.norm_std = self.ds1.norm_stats[1] 175 | self.ds1.norm_stats = (self.ds1.norm_stats[0], 1.0) 176 | 177 | if noise_ratio > 0.0: 178 | self.ds2 = SpectrogramDataset(folder=base_folder, files=files_bg_noise, crop_frames=crop_size[1], 179 | random_crop=random_crop, norm_stats=None, n_norm_calc=n_norm_calc//2, repeat_short=True) 180 | self.ds2.norm_stats = (self.ds2.norm_stats[0], 1.0) # disable normalizion scaling in the ds2 181 | 182 | self.noise_ratio = noise_ratio 183 | self.bg_index = [] 184 | 185 | def __len__(self): 186 | return len(self.ds1) 187 | 188 | def __getitem__(self, index, fixed_noise=False): 189 | # load index sample 190 | sig, label = self.ds1[index] 191 | if self.noise_ratio > 0.0: 192 | # load random noise sample ### , while making noise floor zero 193 | noise = self.ds2[index if fixed_noise else self.get_next_bgidx()][0] 194 | # mix 195 | sig = log_mixup_exp(noise, sig, self.noise_ratio) if self.noise_ratio < 1.0 else noise 196 | # finish normalization. sig and noise were averaged to zero. the following will scale to 1.0 using ds1 std. 
197 | sig = sig / self.norm_std 198 | return sig, label 199 | 200 | 201 | def get_next_bgidx(self): 202 | if len(self.bg_index) == 0: 203 | self.bg_index = torch.randperm(len(self.ds2)).tolist() 204 | # print(f'Refreshed the bg index list with {len(self.bg_index)} items: {self.bg_index[:5]}...') 205 | return self.bg_index.pop(0) 206 | 207 | def __repr__(self): 208 | format_string = self.__class__.__name__ + f'(crop_frames={self.ds1.crop_frames}, ' 209 | format_string += f'folder_sp={self.ds1.df.file_name.values[0].split("/")[0]}, ' 210 | if self.noise_ratio > 0.: format_string += f'folder_bg={self.ds2.df.file_name.values[0].split("/")[0]}, ' 211 | return format_string 212 | 213 | 214 | def build_mixed_speech_dataset(cfg): 215 | ds = MixedSpeechDataset( 216 | base_folder=cfg.data_path, files_speech=get_files(cfg.csv_main), 217 | files_bg_noise=get_files(cfg.csv_bg_noise) if cfg.noise_ratio > 0. else [], 218 | crop_size=cfg.input_size, patch_len=cfg.patch_size[1], 219 | noise_ratio=cfg.noise_ratio, use_label=(cfg.model in ['m2d_s_vit_label_base', 'm2d_s_vit_label_bce_base', 220 | 'm2d_s_vit_label2_base', 'm2d_s_vit_label2_bce_base', 'm2d_s_vit_hubert_base'])) 221 | 222 | val_ds = SpectrogramDataset(folder=cfg.data_path, files=get_files(cfg.csv_val), crop_frames=cfg.input_size[1], random_crop=True) \ 223 | if cfg.csv_val else None 224 | 225 | return ds, val_ds 226 | 227 | 228 | def build_viz_dataset(cfg): 229 | files = [str(f).replace(str(cfg.data_path) + '/', '') for f in sorted(Path(cfg.data_path).glob('vis_speect_samples/*.npy'))] 230 | if len(files) == 0: 231 | return None, [] 232 | norm_stats = cfg.norm_stats if 'norm_stats' in cfg else None 233 | ds = SpectrogramDataset(folder=cfg.data_path, files=files, crop_size=cfg.input_size, norm_stats=norm_stats) 234 | return ds, files 235 | -------------------------------------------------------------------------------- /app/circor/circor_eval.py: -------------------------------------------------------------------------------- 1 | """Main program for the paper: Exploring Pre-trained General-purpose Audio Representations for Heart Murmur Detection 2 | """ 3 | 4 | import sys 5 | sys.path.append('../heart-murmur-detection') 6 | sys.path.append('../heart-murmur-detection/ModelEvaluation') 7 | 8 | from evar.common import (sys, np, pd, kwarg_cfg, Path, 9 | torch, logging, append_to_csv, RESULT_DIR) 10 | import torchaudio 11 | import fire 12 | 13 | from evar.data import create_dataloader 14 | import evar 15 | from lineareval import make_cfg 16 | from finetune import TaskNetwork, finetune_main 17 | 18 | from DataProcessing.find_and_load_patient_files import load_patient_data 19 | from DataProcessing.helper_code import load_recordings 20 | from ModelEvaluation.evaluate_model import evaluate_model 21 | from tqdm import tqdm 22 | 23 | 24 | def infer_and_eval(cfg, model, test_root, eval_mode='follow_prior_work'): 25 | model.eval() 26 | 27 | pids = sorted(list(set([f.stem.split('_')[0] for f in Path(test_root).glob('*.wav')]))) # evaluate_model.py::find_challenge_files -> sorted(os.listdir(label_folder)) 28 | txt_files = [test_root+pid+'.txt' for pid in pids] 29 | print('Test file folder:', test_root) 30 | print('Test files:', pids[:2], txt_files[:2]) 31 | softmax_fn = torch.nn.Softmax(dim=1) 32 | probabilities, wav_probabilities = [], [] 33 | 34 | for txt in tqdm(txt_files): 35 | # Load recordigns 36 | data = load_patient_data(txt) 37 | recordings, frequencies = load_recordings(test_root, data, get_frequencies=True) 38 | recordings = [torch.tensor(r / 
32768.).to(torch.float) for r in recordings] 39 | 40 | # Note: No normalization of raw audio wave. Already normalized in the pipeline. 41 | # recordings[0].max() -> tensor(1.0000) 42 | # recordings[0].min() -> tensor(-1.) 43 | # def normalize(wav): 44 | # return wav / (1.0e-10 + wav.abs().max()) 45 | # recordings = [normalize(r) for r in recordings] 46 | 47 | wavs = [torchaudio.transforms.Resample(f, cfg.sample_rate)(r) for r, f in zip(recordings, frequencies)] 48 | 49 | # Note: *No padding* because sample lengths are very different among recordings, for example: [164608, 150272, 105472, 460544] 50 | # print([len(w) for w in wavs]) 51 | # max_len = max([len(w) for w in wavs]) 52 | # wavs = [(np.pad(w, (0, max_len - len(w)) if len(w) < max_len else w) for w in wavs)] 53 | 54 | # Process per recording (with variable length) 55 | L = cfg.unit_samples # number of samples for 5 sec 56 | logits = [] 57 | for wav in wavs: 58 | if len(wav) < L: 59 | wav = torch.nn.functional.pad(wav, (0, L - len(wav))) 60 | # Split wav into 5-s segments and encode them. 61 | segment_logits = [] 62 | for widx, pos in enumerate(range(0, len(wav) - L + 1, L)): 63 | segment = wav[pos:pos+L] 64 | if len(segment) < L: 65 | continue 66 | with torch.no_grad(): 67 | x = segment.unsqueeze(0) 68 | logit = model(x) 69 | segment_logits.append(logit) # [1, 3] for one chunk 70 | # Logits for one recording wav. 71 | logits.append(torch.stack(segment_logits).mean(0)) 72 | 73 | # Reorder classes from ["Absent", "Present", "Unknown"] -> ["Present", "Unknown", "Absent"] 74 | logits = torch.vstack(logits) 75 | logits = logits[:, [1, 2, 0]] 76 | # Probabilities for each wav 77 | probs = logits.softmax(1).detach().to('cpu') 78 | wav_probabilities.append(probs) 79 | # Probability for the average logits 80 | probs = logits.mean(0, keepdims=True).softmax(1).detach().to('cpu')[0] 81 | probabilities.append(probs) 82 | 83 | probabilities = torch.stack(probabilities) 84 | 85 | def label_decision_rule(wav_probs): 86 | # Following Panah et al. “Exploring Wav2vec 2.0 Model for Heart Murmur Detection.” EUSIPCO, 2023, pp. 1010–14. 87 | cidxs = torch.argmax(wav_probs, dim=1) 88 | PRESENT, UNKNOWN, ABSENT = 0, 1, 2 89 | # - Assign present if at least one recording was classified as present. 90 | if PRESENT in cidxs: 91 | final_label = PRESENT 92 | # - Assign unknown if none of the recordings was classified as present, and at least one recording was classified as unknown. 93 | elif UNKNOWN in cidxs: 94 | final_label = UNKNOWN 95 | # - Assign absent if all recordings were classified as absent. 96 | else: 97 | final_label = ABSENT 98 | return final_label 99 | 100 | if eval_mode is None or eval_mode == 'follow_prior_work': 101 | print('Label decision follows: Panah et al. “Exploring Wav2vec 2.0 Model for Heart Murmur Detection.” EUSIPCO, 2023, pp. 
1010–14.') 102 | cidxs = torch.tensor([label_decision_rule(wav_probs) for wav_probs in wav_probabilities]) 103 | elif eval_mode == 'normal': 104 | print('Label decision is: torch.argmax(probabilities, dim=1)') 105 | cidxs = torch.argmax(probabilities, dim=1) 106 | else: 107 | assert False, f'Unknown eval_mode: {eval_mode}' 108 | labels = torch.nn.functional.one_hot(cidxs, num_classes=3) 109 | 110 | wav_probabilities = [p.numpy() for p in wav_probabilities] 111 | probabilities = probabilities.numpy() 112 | labels = labels.numpy() 113 | return evaluate_model(test_root, probabilities, labels), (wav_probabilities, probabilities) 114 | 115 | 116 | def eval_main(config_file, task, checkpoint, options='', seed=42, lr=None, hidden=(), epochs=None, early_stop_epochs=None, warmup_epochs=None, 117 | mixup=None, freq_mask=None, time_mask=None, rrc=None, training_mask=None, batch_size=None, 118 | optim='sgd', unit_sec=None, verbose=False, data_path='work', eval_mode=None, save_prob=None): 119 | 120 | cfg, n_folds, balanced = make_cfg(config_file, task, options, extras={}, abs_unit_sec=unit_sec) 121 | lr = lr or cfg.ft_lr 122 | cfg.mixup = mixup if mixup is not None else cfg.mixup 123 | cfg.ft_early_stop_epochs = early_stop_epochs if early_stop_epochs is not None else cfg.ft_early_stop_epochs 124 | cfg.warmup_epochs = warmup_epochs if warmup_epochs is not None else cfg.warmup_epochs 125 | cfg.ft_epochs = epochs or cfg.ft_epochs 126 | cfg.ft_freq_mask = freq_mask if freq_mask is not None else cfg.ft_freq_mask 127 | cfg.ft_time_mask = time_mask if time_mask is not None else cfg.ft_time_mask 128 | cfg.ft_rrc = rrc if rrc is not None else (cfg.ft_rrc if 'ft_rrc' in cfg else False) 129 | cfg.training_mask = training_mask if training_mask is not None else (cfg.training_mask if 'training_mask' in cfg else 0.0) 130 | cfg.ft_bs = batch_size or cfg.ft_bs 131 | cfg.optim = optim 132 | cfg.unit_sec = unit_sec 133 | cfg.data_path = data_path 134 | 135 | train_loader, valid_loader, test_loader, multi_label = create_dataloader(cfg, fold=n_folds-1, seed=seed, batch_size=cfg.ft_bs, 136 | always_one_hot=True, balanced_random=balanced) 137 | print('Classes:', train_loader.dataset.classes) 138 | cfg.eval_checkpoint = checkpoint 139 | 140 | cfg.runtime_cfg = kwarg_cfg(lr=lr, seed=seed, hidden=hidden, mixup=cfg.mixup, bs=cfg.ft_bs, 141 | freq_mask=cfg.ft_freq_mask, time_mask=cfg.ft_time_mask, rrc=cfg.ft_rrc, epochs=cfg.ft_epochs, 142 | early_stop_epochs=cfg.ft_early_stop_epochs, n_class=len(train_loader.dataset.classes)) 143 | 144 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 145 | 146 | # Make a fresh model 147 | ar = eval('evar.'+cfg.audio_repr)(cfg).to(device) 148 | if hasattr(train_loader, 'lms_mode') and train_loader.lms_mode: 149 | ar.precompute_lms(device, train_loader) 150 | else: 151 | ar.precompute(device, train_loader) 152 | task_model = TaskNetwork(cfg, ar).to(device) 153 | task_model_dp = torch.nn.DataParallel(task_model).to(device) 154 | # Load checkpoint 155 | print('Using checkpoint', checkpoint) 156 | print(task_model_dp.load_state_dict(torch.load(checkpoint, map_location=device))) 157 | task_model_dp.eval() 158 | 159 | circor_no = task[-1] # ex) '1' of 'circor1' 160 | stratified_data = f'../heart-murmur-detection/data/stratified_data{circor_no}/test_data/' 161 | results, probs = infer_and_eval(cfg, task_model_dp, stratified_data, eval_mode=eval_mode) 162 | ( classes, 163 | auroc, 164 | auprc, 165 | auroc_classes, 166 | auprc_classes, 167 | f_measure, 168 | f_measure_classes, 169 | 
accuracy, 170 | accuracy_classes, 171 | weighted_accuracy, 172 | uar, 173 | ) = results 174 | 175 | name = f'{cfg.id}{"" if cfg.weight_file != "" else "/rnd"}-' 176 | report = f'Finetuning {name} on {task} -> weighted_accuracy: {weighted_accuracy:.5f}, UAR: {uar:.5f}, recall per class: {accuracy_classes}' 177 | report += f', best weight: {checkpoint}, config: {cfg}' 178 | logging.info(report) 179 | 180 | result_df = pd.DataFrame({ 181 | 'representation': [cfg.id.split('_')[-2]], # AR name 182 | 'task': [task], 183 | 'wacc': [weighted_accuracy], 184 | 'uar': [uar], 185 | 'r_Present': [accuracy_classes[0]], 186 | 'r_Unknown': [accuracy_classes[1]], 187 | 'r_Absent': [accuracy_classes[2]], 188 | 'weight_file': [cfg.weight_file], 189 | 'run_id': [cfg.id], 190 | 'report': [report], 191 | }) 192 | csv_name = { 193 | None: 'circor-scores.csv', 194 | 'follow_prior_work': 'circor-scores.csv', 195 | 'normal': 'circor-scores-wo-rule.csv', 196 | }[eval_mode] 197 | append_to_csv(f'{RESULT_DIR}/{csv_name}', result_df) 198 | 199 | if save_prob is not None: 200 | for i, var in zip(['_1', '_2'], probs): 201 | prob_name = Path(save_prob)/str(checkpoint).replace('/', '-').replace('.pth', i + '.npy') 202 | #probs = [p.numpy() for p in probs] 203 | prob_name.parent.mkdir(parents=True, exist_ok=True) 204 | np.save(prob_name, np.array(var, dtype=object)) 205 | print('Probabilities saved as:', prob_name) 206 | 207 | 208 | def finetune_circor(config_file, task, options='', seed=42, lr=None, hidden=(), epochs=None, early_stop_epochs=None, warmup_epochs=None, 209 | mixup=None, freq_mask=None, time_mask=None, rrc=None, training_mask=None, batch_size=None, 210 | optim='sgd', unit_sec=None, verbose=False, data_path='work', eval_only=None, eval_mode=None, save_prob='probs'): 211 | 212 | assert task in [f'circor{n}' for n in range(1, 3+1)] 213 | 214 | # We train a model using the original fine-tuner from the EVAR (finetune_main), and the best_path holds the path of the best weight. 215 | # This part is the same training process as what we have been doing in BYOL-A and M2D. 216 | if eval_only is None: 217 | report, scores, best_path, name, cfg, logpath = finetune_main(config_file, task, options=options, seed=seed, lr=lr, hidden=hidden, epochs=epochs, 218 | early_stop_epochs=early_stop_epochs, warmup_epochs=warmup_epochs, 219 | mixup=mixup, freq_mask=freq_mask, time_mask=time_mask, rrc=rrc, training_mask=training_mask, batch_size=batch_size, 220 | optim=optim, unit_sec=unit_sec, verbose=verbose, data_path=data_path) 221 | del report, scores, name, cfg, logpath 222 | else: 223 | best_path = eval_only 224 | 225 | # Then, we evaluate the trained model specifically for the CirCor problem setting. 226 | return eval_main(config_file, task, best_path, options=options, seed=seed, lr=lr, hidden=hidden, epochs=epochs, 227 | early_stop_epochs=early_stop_epochs, warmup_epochs=warmup_epochs, 228 | mixup=mixup, freq_mask=freq_mask, time_mask=time_mask, rrc=rrc, training_mask=training_mask, batch_size=batch_size, 229 | optim=optim, unit_sec=unit_sec, verbose=verbose, data_path=data_path, eval_mode=eval_mode, save_prob=save_prob) 230 | 231 | 232 | if __name__ == '__main__': 233 | fire.Fire(finetune_circor) 234 | -------------------------------------------------------------------------------- /audio_dataset.py: -------------------------------------------------------------------------------- 1 | """Dataset for Spectrogram Audio. 2 | 3 | ## Data files 4 | All the data samples used here are expected to be `.npy` pre-converted spectrograms. 
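Use `wav_to_lms.py` in the repository root to convert raw audio files into these `.npy` log-mel spectrograms.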
5 | Please find instructions in `README.md`. 6 | 7 | ## Data folder structure 8 | We expect the following data folder structure. 9 | Note that our training pipeline uses samples from the folder `vis_samples` for visualization. 10 | Make a folder named `vis_samples` under the root folder of the dataset, and put some samples for visualization in the `vis_samples`. 11 | 12 | (data root)/(any sub-folder)/(data samples).npy 13 | : 14 | (data root)/vis_samples/(data samples for visualization).npy 15 | : 16 | """ 17 | 18 | import pandas as pd 19 | import numpy as np 20 | from pathlib import Path 21 | import torch 22 | import torch.nn.functional as F 23 | 24 | 25 | class SpectrogramDataset(torch.utils.data.Dataset): 26 | """Spectrogram audio dataset class. 27 | Args: 28 | folder: Root folder that stores audio samples. 29 | files: List of relative path names from the root folder for all samples. 30 | crop_frames: Number of time frames of a data which this class outputs. 31 | norm_stats: Normalization statistics comprising mean and standard deviation. 32 | If None, statistics are calculated at runtime. 33 | If a pathname, the precomputed statistics will be loaded. 34 | tfms: Transform functions for data augmentation. 35 | random_crop: Set True to randomly crop data of length crop_frames, 36 | or always crop from the beginning of a sample. 37 | n_norm_calc: Number of samples to calculate normalization statistics at runtime. 38 | """ 39 | 40 | def __init__(self, folder, files, crop_frames, norm_stats=None, 41 | tfms=None, random_crop=True, n_norm_calc=10000, repeat_short=False): 42 | super().__init__() 43 | self.folder = Path(folder) 44 | self.df = pd.DataFrame({'file_name': files}) 45 | self.crop_frames = crop_frames 46 | self.tfms = tfms 47 | self.random_crop = random_crop 48 | self.repeat_short = repeat_short 49 | 50 | # Norm stats 51 | if norm_stats is None: 52 | # Calculate norm stats runtime 53 | lms_vectors = [self[i][0] for i in np.random.randint(0, len(files), size=n_norm_calc)] 54 | lms_vectors = torch.stack(lms_vectors) 55 | norm_stats = lms_vectors.mean(), lms_vectors.std() + torch.finfo().eps 56 | elif isinstance(norm_stats, (str)): 57 | # Load from a file 58 | if Path(norm_stats).exists(): 59 | norm_stats = torch.FloatTensor(np.load(norm_stats)) 60 | else: 61 | # Create a norm stat file and save it. The created file will be loaded at the next runtime. 
62 | lms_vectors = [self[i][0] for i in np.random.randint(0, len(files), size=n_norm_calc)] 63 | lms_vectors = torch.vstack(lms_vectors) 64 | new_stats = lms_vectors.mean(axis=(0, 2), keepdims=True), lms_vectors.std(axis=(0, 2), keepdims=True) + torch.finfo().eps 65 | np.save(norm_stats, torch.stack(new_stats).numpy()) 66 | norm_stats = new_stats 67 | self.norm_stats = norm_stats 68 | 69 | print(f'Dataset contains {len(self.df)} files with a normalizing stats {self.norm_stats}.') 70 | 71 | def __len__(self): 72 | return len(self.df) 73 | 74 | def get_audio_file(self, filename): 75 | lms = torch.tensor(np.load(filename)) 76 | return lms 77 | 78 | def get_audio(self, index): 79 | filename = self.folder/self.df.file_name.values[index] 80 | return self.get_audio_file(filename) 81 | 82 | def complete_audio(self, lms, dont_tfms=False, org_index=None): 83 | # Repeat if short 84 | l = lms.shape[-1] 85 | if self.repeat_short and l < self.crop_frames: 86 | while l < self.crop_frames: 87 | lms = torch.cat([lms, lms], dim=-1) 88 | l = lms.shape[-1] 89 | # print(f'Repeated short sample (< {self.crop_frames}) at {org_index} as {lms.shape}') 90 | 91 | # Trim or pad 92 | start = 0 93 | if l > self.crop_frames: 94 | start = int(torch.randint(l - self.crop_frames, (1,))[0]) if self.random_crop else 0 95 | lms = lms[..., start:start + self.crop_frames] 96 | # if org_index is not None and org_index % 1000 == 0: 97 | # print(org_index, 'trimmed from', start) 98 | elif l < self.crop_frames: 99 | pad_param = [] 100 | for i in range(len(lms.shape)): 101 | pad_param += [0, self.crop_frames - l] if i == 0 else [0, 0] 102 | lms = F.pad(lms, pad_param, mode='constant', value=0) 103 | self.last_crop_start = start 104 | lms = lms.to(torch.float) 105 | 106 | # Normalize 107 | if hasattr(self, 'norm_stats'): 108 | lms = (lms - self.norm_stats[0]) / self.norm_stats[1] 109 | 110 | # Apply transforms 111 | if self.tfms is not None: 112 | if not dont_tfms: 113 | lms = self.tfms(lms) 114 | 115 | return lms 116 | 117 | def __getitem__(self, index): 118 | lms = self.get_audio(index) 119 | return self.complete_audio(lms, org_index=index) 120 | 121 | def __repr__(self): 122 | format_string = self.__class__.__name__ + f'(crop_frames={self.crop_frames}, random_crop={self.random_crop}, ' 123 | format_string += f'tfms={self.tfms}\n' 124 | return format_string 125 | 126 | 127 | def get_files(dataset_name): 128 | files = pd.read_csv(str(dataset_name)).file_name.values 129 | files = sorted(files) 130 | return files 131 | 132 | 133 | def get_files_no_sort(dataset_name): 134 | return pd.read_csv(str(dataset_name)).file_name.values 135 | 136 | 137 | def build_dataset(cfg): 138 | """The followings configure the training dataset details. 139 | - data_path: Root folder of the training dataset. 140 | - dataset: The _name_ of the training dataset, an stem name of a `.csv` training data list. 141 | - norm_stats: Normalization statistics, a list of [mean, std]. 142 | - input_size: Input size, a list of [# of freq. bins, # of time frames]. 
143 | """ 144 | 145 | transforms = None # Future options: torch.nn.Sequential(*transforms) if transforms else None 146 | norm_stats = cfg.norm_stats if 'norm_stats' in cfg else None 147 | ds = SpectrogramDataset(folder=cfg.data_path, files=get_files(cfg.dataset), crop_frames=cfg.input_size[1], 148 | tfms=transforms, norm_stats=norm_stats) 149 | return ds 150 | 151 | 152 | def build_viz_dataset(cfg): 153 | files = [str(f).replace(str(cfg.data_path) + '/', '') for f in sorted(Path(cfg.data_path).glob('vis_samples/*.npy'))] 154 | if len(files) == 0: 155 | return None, [] 156 | norm_stats = cfg.norm_stats if 'norm_stats' in cfg else None 157 | ds = SpectrogramDataset(folder=cfg.data_path, files=files, crop_frames=cfg.input_size[1], tfms=None, norm_stats=norm_stats) 158 | return ds, files 159 | 160 | 161 | # Mixed dataset 162 | 163 | def log_mixup_exp(xa, xb, alpha): 164 | xa = xa.exp() 165 | xb = xb.exp() 166 | x = alpha * xa + (1. - alpha) * xb 167 | return torch.log(torch.max(x, torch.finfo(x.dtype).eps*torch.ones_like(x))) 168 | 169 | 170 | class MixedSpecDataset(torch.utils.data.Dataset): 171 | def __init__(self, base_folder, files_main, files_bg_noise, crop_size, noise_ratio=0.0, 172 | random_crop=True, n_norm_calc=10000) -> None: 173 | super().__init__() 174 | 175 | self.ds1 = SpectrogramDataset(folder=base_folder, files=files_main, crop_frames=crop_size[1], 176 | random_crop=random_crop, norm_stats=None, 177 | n_norm_calc=n_norm_calc//2) 178 | self.norm_stats = self.ds1.norm_stats # for compatibility with SpectrogramDataset 179 | # disable normalizion scaling in the ds1 180 | self.norm_std = self.ds1.norm_stats[1] 181 | self.ds1.norm_stats = (self.ds1.norm_stats[0], 1.0) 182 | 183 | if noise_ratio > 0.0: 184 | self.ds2 = SpectrogramDataset(folder=base_folder, files=files_bg_noise, crop_frames=crop_size[1], 185 | random_crop=random_crop, norm_stats=None, n_norm_calc=n_norm_calc//2, repeat_short=True) 186 | self.ds2.norm_stats = (self.ds2.norm_stats[0], 1.0) # disable normalizion scaling in the ds2 187 | 188 | self.noise_ratio = noise_ratio 189 | self.bg_index = [] 190 | 191 | def __len__(self): 192 | return len(self.ds1) 193 | 194 | def __getitem__(self, index, fixed_noise=False): 195 | # load index sample 196 | clean = self.ds1[index] 197 | if self.noise_ratio > 0.0: 198 | # load random noise sample ### , while making noise floor zero 199 | noise = self.ds2[index if fixed_noise else self.get_next_bgidx()] 200 | # mix 201 | mixed = log_mixup_exp(noise, clean, self.noise_ratio) if self.noise_ratio < 1.0 else noise 202 | else: 203 | mixed = clean.clone() 204 | # finish normalization. clean and noise were averaged to zero. the following will scale to 1.0 using ds1 std. 
205 | clean = clean / self.norm_std 206 | mixed = mixed / self.norm_std 207 | return clean, mixed 208 | 209 | 210 | def get_next_bgidx(self): 211 | if len(self.bg_index) == 0: 212 | self.bg_index = torch.randperm(len(self.ds2)).tolist() 213 | # print(f'Refreshed the bg index list with {len(self.bg_index)} items: {self.bg_index[:5]}...') 214 | return self.bg_index.pop(0) 215 | 216 | def __repr__(self): 217 | format_string = self.__class__.__name__ + f'(crop_frames={self.ds1.crop_frames}, ' 218 | format_string += f'folder_sp={self.ds1.df.file_name.values[0].split("/")[0]}, ' 219 | if self.noise_ratio > 0.: format_string += f'folder_bg={self.ds2.df.file_name.values[0].split("/")[0]}, ' 220 | return format_string 221 | 222 | 223 | def inflate_files(files, desired_size): 224 | if len(files) == 0: 225 | return files 226 | files = list(files) # make sure `files`` is a list 227 | while len(files) < desired_size: 228 | files = (files + files)[:desired_size] 229 | return files 230 | 231 | 232 | def build_mixed_dataset(cfg): 233 | """The followings configure the training dataset details. 234 | - data_path: Root folder of the training dataset. 235 | - dataset: The _name_ of the training dataset, an stem name of a `.csv` training data list. 236 | - norm_stats: Normalization statistics, a list of [mean, std]. 237 | - input_size: Input size, a list of [# of freq. bins, # of time frames]. 238 | """ 239 | 240 | # get files and inflate the number of files (by repeating the list) if needed 241 | files_main = get_files(cfg.csv_main) 242 | files_bg = get_files(cfg.csv_bg_noise) if cfg.noise_ratio > 0. else [] 243 | desired_min_size = 0 244 | if 'min_ds_size' in cfg and cfg.min_ds_size > 0: 245 | desired_min_size = cfg.min_ds_size 246 | if desired_min_size > 0: 247 | old_sizes = len(files_main), len(files_bg) 248 | files_main, files_bg = inflate_files(files_main, desired_min_size), inflate_files(files_bg, desired_min_size) 249 | print('The numbers of data files are increased from', old_sizes, 'to', (len(files_main), len(files_bg))) 250 | 251 | ds = MixedSpecDataset( 252 | base_folder=cfg.data_path, files_main=files_main, 253 | files_bg_noise=files_bg, 254 | crop_size=cfg.input_size, 255 | noise_ratio=cfg.noise_ratio, 256 | random_crop=True) 257 | if 'weighted' in cfg and cfg.weighted: 258 | assert desired_min_size == 0 259 | ds.weight = pd.read_csv(cfg.csv_main).weight.values 260 | 261 | val_ds = SpectrogramDataset(folder=cfg.data_path, files=get_files(cfg.csv_val), crop_frames=cfg.input_size[1], random_crop=True) \ 262 | if cfg.csv_val else None 263 | 264 | return ds, val_ds 265 | 266 | 267 | def build_mixed_viz_dataset(cfg): 268 | files = [str(f).replace(str(cfg.data_path) + '/', '') for f in sorted(Path(cfg.data_path).glob('vis_samples/*.npy'))] 269 | if len(files) == 0: 270 | return None, [] 271 | norm_stats = cfg.norm_stats if 'norm_stats' in cfg else None 272 | ds = SpectrogramDataset(folder=cfg.data_path, files=files, crop_frames=cfg.input_size[1], tfms=None, norm_stats=norm_stats) 273 | return ds, files 274 | 275 | 276 | if __name__ == '__main__': 277 | # Test 278 | ds = MixedSpecDataset(base_folder='data', files_main=get_files('data/files_gtzan.csv'), 279 | files_bg_noise=get_files('data/files_audioset.csv'), 280 | crop_size=[80, 608], noise_ratio=0.2, random_crop=True, n_norm_calc=10) 281 | for i in range(0, 10): 282 | clean, mixed = ds[i] 283 | print(clean.shape, mixed.shape) 284 | -------------------------------------------------------------------------------- /examples/Example_old4_CLAP2024.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# M2D-CLAP example\n", 8 | "\n", 9 | "This is an example of CLAP part of our Interspeech 2024 paper.\n", 10 | "\n", 11 | "```bibtex\n", 12 | "@InProceedings{\t niizumi2024M2D-CLAP,\n", 13 | " title\t\t= {{M2D-CLAP: Masked Modeling Duo Meets CLAP for Learning General-purpose Audio-Language Representation}},\n", 14 | " author\t= {Daisuke Niizumi and Daiki Takeuchi and Yasunori Ohishi and Noboru Harada and Masahiro Yasuda and Shunsuke Tsubaki and Keisuke Imoto},\n", 15 | " year\t\t= {2024},\n", 16 | " booktitle\t= {Interspeech},\n", 17 | " pages\t\t= {57--61},\n", 18 | " doi\t\t= {10.21437/Interspeech.2024-29},\n", 19 | " issn\t\t= {2958-1796}}\n", 20 | "```" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 1, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "import warnings; warnings.simplefilter('ignore')\n", 30 | "import logging\n", 31 | "logging.basicConfig(level=logging.INFO)\n", 32 | "import sys\n", 33 | "sys.path.append('..')\n", 34 | "import torch\n", 35 | "from pathlib import Path\n", 36 | "import numpy as np" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 2, 42 | "metadata": {}, 43 | "outputs": [ 44 | { 45 | "name": "stdout", 46 | "output_type": "stream", 47 | "text": [ 48 | " using default norm_stats: tensor([-7.1000, 4.2000])\n" 49 | ] 50 | }, 51 | { 52 | "name": "stderr", 53 | "output_type": "stream", 54 | "text": [ 55 | "INFO:root:\n", 56 | "INFO:root:Model input size: [80, 608]\n", 57 | "INFO:root:Using weights: m2d_clap_vit_base-80x608p16x16-240128/checkpoint-300.pth\n", 58 | "INFO:root:Feature dimension: 768\n", 59 | "INFO:root:Norm stats: -7.099999904632568, 4.199999809265137\n", 60 | "INFO:root:Runtime MelSpectrogram(16000, 400, 400, 160, 80, 50, 8000):\n", 61 | "INFO:root:MelSpectrogram(\n", 62 | " Mel filter banks size = (80, 201), trainable_mel=False\n", 63 | " (stft): STFT(n_fft=400, Fourier Kernel size=(201, 1, 400), iSTFT=False, trainable=False)\n", 64 | ")\n" 65 | ] 66 | }, 67 | { 68 | "name": "stdout", 69 | "output_type": "stream", 70 | "text": [ 71 | " using 155 parameters, while dropped 251 out of 406 parameters from m2d_clap_vit_base-80x608p16x16-240128/checkpoint-300.pth\n", 72 | " (dropped: ['mask_token', 'decoder_pos_embed', 'logit_scale', 'decoder_embed.weight', 'decoder_embed.bias'] ...)\n", 73 | "\n" 74 | ] 75 | } 76 | ], 77 | "source": [ 78 | "from portable_m2d import PortableM2D\n", 79 | "weight = 'm2d_clap_vit_base-80x608p16x16-240128/checkpoint-300.pth'\n", 80 | "model = PortableM2D(weight_file=weight, flat_features=True)\n" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 3, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "files = ['AudioSetWav16k/eval_segments/-22tna7KHzI_28.000.wav',\n", 90 | " 'AudioSetWav16k/eval_segments/-0xzrMun0Rs_30.000.wav',\n", 91 | " 'AudioSetWav16k/eval_segments/3tUlhM80ObM_0.000.wav',\n", 92 | " 'AudioSetWav16k/eval_segments/-1nilez17Dg_30.000.wav',\n", 93 | " 'AudioSetWav16k/eval_segments/--U7joUcTCo_0.000.wav',\n", 94 | " 'AudioSetWav16k/eval_segments/5hlsVoxJPNI_30.000.wav',]\n", 95 | "captions = ['The sound of Explosion.',\n", 96 | " 'The sound of Stomach rumble, and Music.',\n", 97 | " 'The sound of Knock.',\n", 98 | " 'The sound of Heart murmur, and Speech.',\n", 99 | " \"A man's laughter abruptly interrupts as someone sneezes, suggesting a 
casual gathering or social event.\",\n", 100 | " \"The sound of Christmas music, Music, and Speech.\",]" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": 4, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "import librosa\n", 110 | "\n", 111 | "with torch.no_grad():\n", 112 | " audios = [librosa.load(f, sr=16000)[0] for f in files]\n", 113 | " audios = [np.pad(a, (0, 16000 * 10 - a.shape[-1])) for a in audios] # Make sure all files are 10-s.\n", 114 | " audios = torch.tensor(audios)\n", 115 | " audio_embs = model.encode_clap_audio(audios)\n", 116 | " text_embs = model.encode_clap_text(captions)" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 5, 122 | "metadata": {}, 123 | "outputs": [ 124 | { 125 | "data": { 126 | "text/plain": [ 127 | "(torch.Size([6, 768]), torch.Size([6, 768]))" 128 | ] 129 | }, 130 | "execution_count": 5, 131 | "metadata": {}, 132 | "output_type": "execute_result" 133 | } 134 | ], 135 | "source": [ 136 | "audio_embs.shape, text_embs.shape" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 6, 142 | "metadata": {}, 143 | "outputs": [ 144 | { 145 | "name": "stdout", 146 | "output_type": "stream", 147 | "text": [ 148 | "[[ 0.08543 0.08029 0.02879 0.01966 0.00866 -0.00212]\n", 149 | " [-0.0049 0.06878 0.02576 -0.00711 -0.02833 0.01233]\n", 150 | " [ 0.04179 0.01696 0.13246 -0.00467 0.01645 -0.00098]\n", 151 | " [-0.00217 0.0425 -0.00594 0.10569 -0.00474 0.00028]\n", 152 | " [ 0.05769 0.02339 0.04664 0.01432 0.08724 0.02567]\n", 153 | " [-0.04205 -0.00013 -0.04844 0.00155 -0.02319 0.04316]]\n" 154 | ] 155 | }, 156 | { 157 | "data": { 158 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZgAAAGdCAYAAAAv9mXmAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8g+/7EAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAUWklEQVR4nO3df4yUhb3v8e+y6w7+WFZR+bFlQT1WDHqWRhQusVYrVM/GEG1OGkJISmhPe9osjYR402xuUvSPZsnNTaO3cinpL/4pQdsETUyFUlogTaXCEnLBpka8Nq7hV+1Nd5dNHHF37h833XM4iocBvvM4O69X8iTdyTM8nye0vDsz7NJUqVQqAQCX2aSiBwAwMQkMACkEBoAUAgNACoEBIIXAAJBCYABIITAApGip9QXHxsbi+PHj0dbWFk1NTbW+PACXoFKpxPDwcHR0dMSkSR//GqXmgTl+/Hh0dnbW+rIAXEYDAwMxa9asjz2n5oFpa2uLiIhlL6yIK65urfXlCzV0dnLRE2ruz3vnFD2hEFecKXpBMVoHG+8nT5WGxoqeUFOjZ9+L/pe/O/5n+cepeWD+/rbYFVe3NlxgrjjbWPcbEdFcaryoRkQ0ny16QTGaWxsvMC1XNFZg/u5CPuLwIT8AKQQGgBQCA0AKgQEghcAAkEJgAEghMACkEBgAUggMACkEBoAUAgNACoEBIIXAAJBCYABIITAApBAYAFIIDAApBAaAFAIDQAqBASCFwACQQmAASCEwAKS4qMBs3Lgxbrrpppg8eXIsWrQoXn311cu9C4A6V3VgnnvuuVi3bl2sX78+Dh06FPPnz4+HH344Tp8+nbEPgDpVdWC+973vxde+9rVYvXp1zJs3L37wgx/EVVddFT/5yU8y9gFQp6oKzPvvvx/9/f2xdOnSf/sFJk2KpUuXxiuvvPKRzymXyzE0NHTOAcDEV1Vg3n333RgdHY3p06ef8/j06dPj5MmTH/mcvr6+aG9vHz86Ozsvfi0AdSP9b5H19vbG4ODg+DEwMJB9SQA+AVqqOfmGG26I5ubmOHXq1DmPnzp1KmbMmPGRzymVSlEqlS5+IQB1qapXMK2trbFgwYLYvXv3+GNjY2Oxe/fuWLx48WUfB0D9quoVTETEunXrYtWqVXH33XfHwoUL4+mnn46RkZFYvXp1xj4A6lTVgVm+fHn85S9/ie985ztx8uTJ+MxnPhM7duz40Af/ADS2qgMTEbFmzZpYs2bN5d4CwATiZ5EBkEJgAEghMACkEBgAUggMACkEBoAUAgNACoEBIIXAAJBCYABIITAApBAYAFIIDAApBAaAFAIDQAqBASCFwACQQmAASCEwAKQQGABSCAwAKQQGgBQCA0AKgQEgRUtRFz7+vX+IlismF3X5Qgz9y1DRE2qu6YOiFxTjutfPFj2hEFcd+79FT6i5Uw9MK3pCTY2+f+GvS7yCASCFwACQQmAASCEwAKQQGABSCAwAKQQGgBQCA0AKgQEghcAAkEJgAEghMACkEBgAUggMACkEBoAUAgNACoEBIIXAAJBCYABIITAApBAYAFIIDAApBAaAFAIDQAqBASCFwACQourA7Nu3L5YtWxYdHR3R1NQUL7zwQsIsAOpd1YEZGRmJ+fPnx8aNGzP2ADBBtFT7hO7u7uju7s7YAsAEUnVgqlUul6NcLo9/PTQ0lH1JAD4B0j/k7+vri/b29vGjs7Mz+5IAfAKkB6a3tzcGBwfHj4GBgexLAvAJkP4WWalUilKplH0ZAD5hfB8MACmqfgVz5syZOHbs2PjXb731Vhw+fDimTp0as2fPvqzjAKhfVQfm4MGD8fnPf37863Xr1kVExKpVq2LLli2XbRgA9
a3qwDzwwANRqVQytgAwgfgMBoAUAgNACoEBIIXAAJBCYABIITAApBAYAFIIDAApBAaAFAIDQAqBASCFwACQQmAASCEwAKQQGABSCAwAKQQGgBQCA0AKgQEghcAAkEJgAEghMACkEBgAUrQUdeHjn5sUkyY3Vt/afj216Ak191rv/yp6QiHu/9evFz2hECceml70hJq7+uRo0RNq6oOzYxd8bmP9CQ9AzQgMACkEBoAUAgNACoEBIIXAAJBCYABIITAApBAYAFIIDAApBAaAFAIDQAqBASCFwACQQmAASCEwAKQQGABSCAwAKQQGgBQCA0AKgQEghcAAkEJgAEghMACkEBgAUggMACkEBoAUVQWmr68v7rnnnmhra4tp06bFY489Fq+//nrWNgDqWFWB2bt3b/T09MT+/ftj165dcfbs2XjooYdiZGQkax8AdaqlmpN37NhxztdbtmyJadOmRX9/f3zuc5+7rMMAqG9VBeY/GhwcjIiIqVOnnveccrkc5XJ5/OuhoaFLuSQAdeKiP+QfGxuLtWvXxr333ht33nnnec/r6+uL9vb28aOzs/NiLwlAHbnowPT09MTRo0dj27ZtH3teb29vDA4Ojh8DAwMXe0kA6shFvUW2Zs2aeOmll2Lfvn0xa9asjz23VCpFqVS6qHEA1K+qAlOpVOJb3/pWbN++Pfbs2RM333xz1i4A6lxVgenp6YmtW7fGiy++GG1tbXHy5MmIiGhvb48rr7wyZSAA9amqz2A2bdoUg4OD8cADD8TMmTPHj+eeey5rHwB1quq3yADgQvhZZACkEBgAUggMACkEBoAUAgNACoEBIIXAAJBCYABIITAApBAYAFIIDAApBAaAFAIDQAqBASCFwACQQmAASCEwAKQQGABSCAwAKQQGgBQCA0AKgQEghcAAkKKlqAvPeKUSLVdUirp8IY7fN1b0hJq7/+tfL3pCIVb/jxeKnlCIn/7Xx4qeUHMfTG4qekJNVaq4Xa9gAEghMACkEBgAUggMACkEBoAUAgNACoEBIIXAAJBCYABIITAApBAYAFIIDAApBAaAFAIDQAqBASCFwACQQmAASCEwAKQQGABSCAwAKQQGgBQCA0AKgQEghcAAkEJgAEghMACkqCowmzZtiq6urpgyZUpMmTIlFi9eHC+//HLWNgDqWFWBmTVrVmzYsCH6+/vj4MGD8eCDD8ajjz4ar732WtY+AOpUSzUnL1u27Jyvv/vd78amTZti//79cccdd1zWYQDUt6oC8++Njo7Gz3/+8xgZGYnFixef97xyuRzlcnn866GhoYu9JAB1pOoP+Y8cORLXXHNNlEql+MY3vhHbt2+PefPmnff8vr6+aG9vHz86OzsvaTAA9aHqwMydOzcOHz4cf/jDH+Kb3/xmrFq1Kv74xz+e9/ze3t4YHBwcPwYGBi5pMAD1oeq3yFpbW+PWW2+NiIgFCxbEgQMH4plnnonNmzd/5PmlUilKpdKlrQSg7lzy98GMjY2d8xkLAERU+Qqmt7c3uru7Y/bs2TE8PBxbt26NPXv2xM6dO7P2AVCnqgrM6dOn48tf/nKcOHEi2tvbo6urK3bu3Blf+MIXsvYBUKeqCsyPf/zjrB0ATDB+FhkAKQQGgBQCA0AKgQEghcAAkEJgAEghMACkEBgAUggMACkEBoAUAgNACoEBIIXAAJBCYABIITAApBAYAFIIDAApBAaAFAIDQAqBASCFwACQQmAASCEwAKQQGABStBR14ZPL3o9JVzVW367631cWPaHmBrorRU8oxP985p+LnlCI+f/taNETau7VF/+x6Ak1NVpuvuBzG+tPeABqRmAASCEwAKQQGABSCAwAKQQGgBQCA0AKgQEghcAAkEJgAEghMACkEBgAUggMACkEBoAUAgNACoEBIIXAAJBCYABIITAApBAYAFIIDAApBAaAFAIDQAqBASCFwACQQmAASHFJgdmwYUM0NTXF2rVrL9McACaKiw7MgQMHYvPmzdHV1XU59wAwQVxUYM6cORMrV66MH/7wh3Hddddd7k0ATAAXFZienp545JFHYunSpf/pueVyOYaGhs45AJj4Wqp9wrZt2+LQoUNx4MCBCzq/r68vnnrqqaqHAVDfqnoFMzAwEI8//nj87Gc/i8mTJ1/Qc3p7e2NwcHD8GBgYuKihANSXql7B9Pf3x+nTp+Ouu+4af2x0dDT27dsXzz77bJTL5Whubj7nOaVSKUql0uVZC0DdqCowS5YsiSNHjpzz2OrVq+P222+Pb3/72x+KCwCNq6rAtLW1xZ133nnOY1dffXVcf/31H3ocgMbmO/kBSFH13yL7j/bs2XMZZgAw0XgFA0AKgQEghcAAkEJgAEghMACkEBgAUggMACkEBoAUAgNACoEBIIXAAJBCYABIITAApBAYAFIIDAApBAaAFAIDQAqBASCFwACQQmAASCEwAKQQGABSCAwAKVqKuvCt6/5PtDS1FnX5Qgz907yiJ9Rcx3//Q9ETCjG8/L8UPaEQJ/91VtETau691WNFT6ipsfcu/H69ggEghcAAkEJgAEghMACkEBgAUggMACkEBoAUAgNACoEBIIXAAJBCYABIITAApBAYAFIIDAApBAaAFAIDQAqBASCFwACQQmAASCEwAKQQGABSCAwAKQQGgBQCA0AKgQEghcAAkKKqwDz55JPR1NR0znH77bdnbQOgjrVU+4Q77rgjfv3rX//bL9BS9S8BQAOoug4tLS0xY8aMjC0ATCBVfwbzxhtvREdHR9xyyy2xcuXKePvttz/2/HK5HENDQ+ccAEx8VQVm0aJFsWXLltixY0ds2rQp3nrrrbjvvvtieHj4vM/p6+uL9vb28aOzs/OSRwPwyVdVYLq7u+NLX/pSdHV1xcMPPxy//OUv429/+1s8//zz531Ob29vDA4Ojh8DAwOXPBqAT75L+oT+2muvjdtuuy2OHTt23nNKpVKUSqVLuQwAdeiSvg/mzJkz8eabb8bMmTMv1x4AJoiqAvPEE0/E3r17489//nP8/ve/jy9+8YvR3NwcK1asyNoHQJ2q6i2yd955J1asWBF//etf48Ybb4zPfvazsX///rjxxhuz9gFQp6oKzLZt27J2ADDB+FlkAKQQGABSCAwAKQQGgBQCA0AKgQEghcAAkEJgAEghMACkEBgAUggMACkEBoAUAgNACoEBIIXAAJBCYABIITAApBAYAFIIDAApBAaAFAIDQAqBASCFwACQoqXWF6xUKhER8UHlbK0vXbgPzr5X9ISaa8Tf54jG/L2OiPhgtFz0hJobe6+xfq//fr9//7P84zRVLuSsy+idd96Jzs7OWl4SgMtsYGAgZs2a9bHn1DwwY2Njcfz48Whra4umpqaaXXdoaCg6OztjYGAgpkyZUrPrFs19N859N+I9RzTmfRd5z5VKJYaHh6OjoyMmTfr4T1lq/hbZpEmT/tPqZZoyZUrD/Jfw33PfjaMR7zmiMe+7qHtub2+/oPN8yA9ACoEBIEXDBKZUKsX69eujVCoVPaWm3Hfj3Hcj3nNEY953vdxzzT/kB6AxNMwrGABq
S2AASCEwAKQQGABSNExgNm7cGDfddFNMnjw5Fi1aFK+++mrRk1Lt27cvli1bFh0dHdHU1BQvvPBC0ZPS9fX1xT333BNtbW0xbdq0eOyxx+L1118vela6TZs2RVdX1/g33S1evDhefvnlomfV1IYNG6KpqSnWrl1b9JRUTz75ZDQ1NZ1z3H777UXPOq+GCMxzzz0X69ati/Xr18ehQ4di/vz58fDDD8fp06eLnpZmZGQk5s+fHxs3bix6Ss3s3bs3enp6Yv/+/bFr1644e/ZsPPTQQzEyMlL0tFSzZs2KDRs2RH9/fxw8eDAefPDBePTRR+O1114relpNHDhwIDZv3hxdXV1FT6mJO+64I06cODF+/O53vyt60vlVGsDChQsrPT0941+Pjo5WOjo6Kn19fQWuqp2IqGzfvr3oGTV3+vTpSkRU9u7dW/SUmrvuuusqP/rRj4qekW54eLjy6U9/urJr167K/fffX3n88ceLnpRq/fr1lfnz5xc944JN+Fcw77//fvT398fSpUvHH5s0aVIsXbo0XnnllQKXkW1wcDAiIqZOnVrwktoZHR2Nbdu2xcjISCxevLjoOel6enrikUceOed/3xPdG2+8ER0dHXHLLbfEypUr4+233y560nnV/Idd1tq7774bo6OjMX369HMenz59evzpT38qaBXZxsbGYu3atXHvvffGnXfeWfScdEeOHInFixfHe++9F9dcc01s37495s2bV/SsVNu2bYtDhw7FgQMHip5SM4sWLYotW7bE3Llz48SJE/HUU0/FfffdF0ePHo22trai533IhA8MjamnpyeOHj36yX5/+jKaO3duHD58OAYHB+MXv/hFrFq1Kvbu3TthIzMwMBCPP/547Nq1KyZPnlz0nJrp7u4e/89dXV2xaNGimDNnTjz//PPx1a9+tcBlH23CB+aGG26I5ubmOHXq1DmPnzp1KmbMmFHQKjKtWbMmXnrppdi3b1+h/zRELbW2tsatt94aERELFiyIAwcOxDPPPBObN28ueFmO/v7+OH36dNx1113jj42Ojsa+ffvi2WefjXK5HM3NzQUurI1rr702brvttjh27FjRUz7ShP8MprW1NRYsWBC7d+8ef2xsbCx2797dEO9RN5JKpRJr1qyJ7du3x29+85u4+eabi55UmLGxsSiXJ+4/X7xkyZI4cuRIHD58ePy4++67Y+XKlXH48OGGiEtExJkzZ+LNN9+MmTNnFj3lI034VzAREevWrYtVq1bF3XffHQsXLoynn346RkZGYvXq1UVPS3PmzJlz/l/NW2+9FYcPH46pU6fG7NmzC1yWp6enJ7Zu3RovvvhitLW1xcmTJyPi///jSFdeeWXB6/L09vZGd3d3zJ49O4aHh2Pr1q2xZ8+e2LlzZ9HT0rS1tX3os7Wrr746rr/++gn9mdsTTzwRy5Ytizlz5sTx48dj/fr10dzcHCtWrCh62kcr+q+x1cr3v//9yuzZsyutra2VhQsXVvbv31/0pFS//e1vKxHxoWPVqlVFT0vzUfcbEZWf/vSnRU9L9ZWvfKUyZ86cSmtra+XGG2+sLFmypPKrX/2q6Fk11wh/TXn58uWVmTNnVlpbWyuf+tSnKsuXL68cO3as6Fnn5cf1A5Biwn8GA0AxBAaAFAIDQAqBASCFwACQQmAASCEwAKQQGABSCAwAKQQGgBQCA0AKgQEgxf8DSMGKMe/DP+0AAAAASUVORK5CYII=", 159 | "text/plain": [ 160 | "
" 161 | ] 162 | }, 163 | "metadata": {}, 164 | "output_type": "display_data" 165 | } 166 | ], 167 | "source": [ 168 | "from sklearn.metrics.pairwise import cosine_similarity\n", 169 | "import matplotlib.pyplot as plt\n", 170 | "\n", 171 | "H = cosine_similarity(audio_embs, text_embs)\n", 172 | "plt.imshow(H, interpolation='none')\n", 173 | "np.set_printoptions(precision=5, suppress=True)\n", 174 | "print(H)" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "metadata": {}, 181 | "outputs": [], 182 | "source": [] 183 | } 184 | ], 185 | "metadata": { 186 | "kernelspec": { 187 | "display_name": "ar", 188 | "language": "python", 189 | "name": "python3" 190 | }, 191 | "language_info": { 192 | "codemirror_mode": { 193 | "name": "ipython", 194 | "version": 3 195 | }, 196 | "file_extension": ".py", 197 | "mimetype": "text/x-python", 198 | "name": "python", 199 | "nbconvert_exporter": "python", 200 | "pygments_lexer": "ipython3", 201 | "version": "3.9.18" 202 | } 203 | }, 204 | "nbformat": 4, 205 | "nbformat_minor": 2 206 | } 207 | -------------------------------------------------------------------------------- /examples/Example_4_CLAP2025.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# M2D-CLAP example\n", 8 | "\n", 9 | "This is an example of the CLAP features from M2D-CLAP $_{2025}$, the journal paper version.\n", 10 | "\n", 11 | "Download and prepare the wweight [`m2d_clap_vit_base-80x1001p16x16p16kpBpTI-2025`](https://github.com/nttcslab/m2d/releases/download/v0.5.0/m2d_clap_vit_base-80x1001p16x16p16kpBpTI-2025.zip) before you run.\n", 12 | "\n", 13 | "```bibtex\n", 14 | "@article{niizumi2025m2d-clap,\n", 15 | " title = {{M2D-CLAP: Exploring General-purpose Audio-Language Representations Beyond CLAP}},\n", 16 | " author = {Daisuke Niizumi and Daiki Takeuchi and Masahiro Yasuda and Binh Thien Nguyen and Yasunori Ohishi and Noboru Harada},\n", 17 | " journal = {IEEE Access},\n", 18 | " year = {2025},\n", 19 | " url = {https://ieeexplore.ieee.org/document/11168481}}\n", 20 | "```" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 1, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "import warnings; warnings.simplefilter('ignore')\n", 30 | "import logging\n", 31 | "logging.basicConfig(level=logging.INFO)\n", 32 | "import torch\n", 33 | "from pathlib import Path\n", 34 | "import numpy as np" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [ 42 | { 43 | "name": "stderr", 44 | "output_type": "stream", 45 | "text": [ 46 | "INFO:root:\n", 47 | "INFO:root:Model input size: [80, 1001]\n", 48 | "INFO:root:Using weights: m2d_clap_vit_base-80x1001p16x16p16kpBpTI-2025/checkpoint-30.pth\n", 49 | "INFO:root:Feature dimension: 768\n", 50 | "INFO:root:Norm stats: -7.261779308319092, 4.3511505126953125\n", 51 | "INFO:root:Runtime MelSpectrogram(16000, 400, 400, 160, 80, 50, 8000):\n", 52 | "INFO:root:MelSpectrogram(\n", 53 | " Mel filter banks size = (80, 201), trainable_mel=False\n", 54 | " (stft): STFT(n_fft=400, Fourier Kernel size=(201, 1, 400), iSTFT=False, trainable=False)\n", 55 | ")\n" 56 | ] 57 | }, 58 | { 59 | "name": "stdout", 60 | "output_type": "stream", 61 | "text": [ 62 | " using 166 parameters from m2d_clap_vit_base-80x1001p16x16p16kpBpTI-2025/checkpoint-30.pth\n", 63 | " (included audio_proj params: 
['audio_proj.sem_token', 'audio_proj.sem_blocks.0.norm1.weight', 'audio_proj.sem_blocks.0.norm1.bias', 'audio_proj.sem_blocks.0.attn.qkv.weight', 'audio_proj.sem_blocks.0.attn.qkv.bias']\n", 64 | " (included text_proj params: []\n", 65 | " (dropped: [] )\n", 66 | "\n", 67 | " using norm_stats: -7.261779308319092, 4.3511505126953125\n" 68 | ] 69 | } 70 | ], 71 | "source": [ 72 | "from portable_m2d import PortableM2D\n", 73 | "weight = 'm2d_clap_vit_base-80x1001p16x16p16kpBpTI-2025/checkpoint-30.pth'\n", 74 | "# Use flat_features=True for CLAP features only. For conventional audio features, flat_features should be False.\n", 75 | "model = PortableM2D(weight_file=weight, flat_features=True)" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 3, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "files = ['AudioSetWav16k/eval_segments/-22tna7KHzI_28.000.wav',\n", 85 | " 'AudioSetWav16k/eval_segments/-0xzrMun0Rs_30.000.wav',\n", 86 | " 'AudioSetWav16k/eval_segments/3tUlhM80ObM_0.000.wav',\n", 87 | " 'AudioSetWav16k/eval_segments/-1nilez17Dg_30.000.wav',\n", 88 | " 'AudioSetWav16k/eval_segments/--U7joUcTCo_0.000.wav',\n", 89 | " 'AudioSetWav16k/eval_segments/5hlsVoxJPNI_30.000.wav',]\n", 90 | "captions = ['The sound of Explosion.',\n", 91 | " 'The sound of Stomach rumble, and Music.',\n", 92 | " 'The sound of Knock.',\n", 93 | " 'The sound of Heart murmur, and Speech.',\n", 94 | " \"A man's laughter abruptly interrupts as someone sneezes, suggesting a casual gathering or social event.\",\n", 95 | " \"The sound of Christmas music, Music, and Speech.\",]" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 4, 101 | "metadata": {}, 102 | "outputs": [ 103 | { 104 | "name": "stderr", 105 | "output_type": "stream", 106 | "text": [ 107 | "INFO:root: using text encoder: BERT base\n" 108 | ] 109 | }, 110 | { 111 | "name": "stdout", 112 | "output_type": "stream", 113 | "text": [ 114 | " using model.text_encoder from m2d_clap_vit_base-80x1001p16x16p16kpBpTI-2025/checkpoint-30.pth\n" 115 | ] 116 | } 117 | ], 118 | "source": [ 119 | "import librosa\n", 120 | "\n", 121 | "with torch.no_grad():\n", 122 | " audios = [librosa.load(f, sr=16000)[0] for f in files]\n", 123 | " audios = [np.pad(a, (0, 16000 * 10 - a.shape[-1])) for a in audios] # Make sure all files are 10-s.\n", 124 | " audios = torch.tensor(audios)\n", 125 | " audio_embs = model.encode_clap_audio(audios)\n", 126 | " text_embs = model.encode_clap_text(captions)" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 5, 132 | "metadata": {}, 133 | "outputs": [ 134 | { 135 | "data": { 136 | "text/plain": [ 137 | "(torch.Size([6, 768]), torch.Size([6, 768]))" 138 | ] 139 | }, 140 | "execution_count": 5, 141 | "metadata": {}, 142 | "output_type": "execute_result" 143 | } 144 | ], 145 | "source": [ 146 | "audio_embs.shape, text_embs.shape" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": 6, 152 | "metadata": {}, 153 | "outputs": [ 154 | { 155 | "name": "stdout", 156 | "output_type": "stream", 157 | "text": [ 158 | "[[0.28606 0.2381 0.23852 0.21239 0.11076 0.13236]\n", 159 | " [0.22148 0.37747 0.2139 0.18893 0.12501 0.21492]\n", 160 | " [0.26712 0.24247 0.37288 0.21389 0.10618 0.15825]\n", 161 | " [0.18678 0.22834 0.20472 0.39384 0.10315 0.22582]\n", 162 | " [0.25185 0.13545 0.23883 0.18921 0.32387 0.1312 ]\n", 163 | " [0.19546 0.24592 0.1791 0.23728 0.0799 0.31999]]\n" 164 | ] 165 | }, 166 | { 167 | "data": { 168 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAZgAAAGdCAYAAAAv9mXmAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8g+/7EAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAUaklEQVR4nO3df4yVhZ3v8e8ws3PGH8MoCsiUwR+xaoAMvYIQYm2tUL3EEO0fhmtISmjTpN2hgbBmm/mnaHKbIbnZRlNZSmpb9o8StE3Qu6ZqKS2QplLHIdwFuzXi0jiGX9W9nRkm6wFmzv6x2dllFZcDfM/jmXm9kifpnDyH53Oi+O45DzM0VCqVSgDAZTap6AEAjE8CA0AKgQEghcAAkEJgAEghMACkEBgAUggMACmaan3B0dHROHr0aLS2tkZDQ0OtLw/AJahUKjE0NBTt7e0xadLHv0epeWCOHj0aHR0dtb4sAJdRf39/zJw582PPqXlgWltbIyJi7t+ticYrS7W+fKH+/z9dW/QEaqT5/Yn56fNNPz1W9ISaO3t9a9ETaursSDl+s/9vxv5b/nFqHph//1is8crShAvMpJaWoidQI42liRmYpkkT6/d0REQ0Tczf1xdyi2Ni/i4AIJ3AAJBCYABIITAApBAYAFIIDAApBAaAFAIDQAqBASCFwACQQmAASCEwAKQQGABSCAwAKQQGgBQCA0AKgQEghcAAkEJgAEghMACkEBgAUggMACkEBoAUFxWYTZs2xU033RQtLS2xaNGieO211y73LgDqXNWBefbZZ2P9+vWxYcOG2L9/f8ybNy8eeOCBOHnyZMY+AOpU1YH57ne/G1/72tdi9erVMXv27Pj+978fV155ZfzoRz/K2AdAnaoqMKdPn46+vr5YunTpf/wCkybF0qVL49VXX/3I55TL5RgcHDznAGD8qyow7733XoyMjMT06dPPeXz69Olx/Pjxj3xOT09PtLW1jR0dHR0XvxaAupH+p8i6u7tjYGBg7Ojv78++JACfAE3VnHz99ddHY2NjnDhx4pzHT5w4ETfccMNHPqdUKkWpVLr4hQDUparewTQ3N8f8+fNj165dY4+Njo7Grl27YvHixZd9HAD1q6p3MBER69evj1WrVsWCBQti4cKF8eSTT8bw8HCsXr06Yx8AdarqwKxYsSL+9Kc/xbe//e04fvx4fOYzn4mXX375Qzf+AZjYqg5MRMSaNWtizZo1l3sLAOOIn0UGQAqBASCFwACQQmAASCEwAKQQGABSCAwAKQQGgBQCA0AKgQEghcAAkEJgAEghMACkEBgAUggMACkEBoAUAgNACoEBIIXAAJBCYABIITAApBAYAFIIDAApBAaAFE1FXXjo/02JSS0tRV2+EL/88v8pekLNPfjMXxc9oRAt71eKnlCI0clXFj2h5t65/+qiJ9TUSLkpovfCzvUOBoAUAgNACoEBIIXAAJBCYABIITAApBAYAFIIDAApBAaAFAIDQAqBASCFwACQQmAASCEwAKQQGABSCAwAKQQGgBQCA0AKgQEghcAAkEJgAEghMACkEBgAUggMACkEBoAUAgNAiqoDs3fv3li+fHm0t7dHQ0NDPP/88wmzAKh3VQdmeHg45s2bF5s2bcrYA8A40VTtE5YtWxbLli3L2ALAOFJ1YKpVLpejXC6PfT04OJh9SQA+AdJv8vf09ERbW9vY0dHRkX1JAD4B0gPT3d0dAwMDY0d/f3/2JQH4BEj/iKxUKkWpVMq+DACfML4PBoAUVb+DOXXqVBw+fHjs6yNHjsSBAwdiypQpMWvWrMs6DoD6VXVgXn/99fjCF74w9vX69esjImLVqlWxdevWyzYMgPpWdWDuvffeqFQqGVsAGEfcgwEghcAAkEJgAEghMACkEBgAUggMACkEBoAUAgNACoEBIIXAAJBCYABIITAApBAYAFIIDAApBAaAFAIDQAqBASCFwACQQmAASCEwAKQQGABSCAwAKQQGgBRNRV145OZ/icqVlaIuX4gl//evip5Qc3//1b8pekIhHvnhxPtnHREx6f3BoifU3NR/uLroCTV19szZePsCz/UOBoAUAgNACoEBIIXAAJBCYABIITAApBAYAFIIDAApBAaAFAIDQAqBASCFwACQQmAASCEwAKQQGABSCAwAKQQGgBQCA0AKgQEghcAAkEJgAEghMACkEBgAUggMACkEBoAUAgNACoEBIEVVgenp6Ym77rorWltbY9q0afHwww/Hm2++mbUNgDpWVWD27NkTXV1dsW/fvti5c2ecOXMm7r///hgeHs7aB0Cdaqrm5Jdffvmcr7du3RrTpk2Lvr6++NznPndZhwFQ36oKzH81MDAQERFTpkw57znlcjnK5fLY14ODg5dySQDqxEXf5B8dHY1169bF3XffHXPnzj3veT09PdHW1jZ2dHR0XOwlAagjFx2Yrq6uOHToUGzfvv1jz+vu7o6BgYGxo7+//2IvCUAduaiPyNasWRMvvvhi7N27N2bOnPmx55ZKpSiVShc1DoD6VVVgKpVKfPOb34wdO3bE7t274+abb87aBUCdqyowXV1dsW3btnjhhReitbU1jh8/HhERbW1tccUVV6QMBKA+VXUPZvPmzTEwMBD33ntvzJgxY+x49tlns/YBUKeq/ogMAC6En0UGQAqBASCFwACQQmAASCEwAKQQGABSCAwAKQQGgBQCA0AKgQEghcAAkEJgAEghMACkEBgAUggMACkEBoAUAgNACoEBIIXAAJBCYABIITAApBAYAFIIDAApmoq6cOtvr4jG5paiLl+IoZuKXlB7/+tv/6roCYX4/bq/LXpCIR585n8WPaHmji9sLHpCTY1+0Bjx4oWd6x0MACkEBoAUAgNACoEBIIXAAJBCYABIITAApBAYAFIIDAApBAaAFAIDQAqBASCFwACQQmAASCEwAKQQGABSCAwAKQQGgBQCA0AKgQEghcAAkEJgAEghMACkEBgAUggMACkEBoAUVQVm8+bN0dnZGZMnT47JkyfH4sWL46WXXsraBkAdqyowM2fOjI0bN0ZfX1+8/vrrcd9998VDDz0Ub7zxRtY+AOpUUzUnL1++/Jyvv/Od78TmzZtj3759MWfOnMs6DID6VlVg/rORkZH46U9/GsPDw7F48eLznlcul6NcLo99PTg4eLGXBKCOVH2T/+DBg3H11VdHqVSKr3/967Fjx46YPXv2ec/v6emJtra2saOjo+OSBgNQH6oOzO233x4HDhyI3/3ud/GNb3wjVq1aFb///e/Pe353d3cMDAyMHf39/Zc0GID6UPVHZM3NzXHrrbdGRMT8+fOjt7c3nnrqqdiyZctHnl8qlaJUKl3aSgDqziV/H8zo6Og591gAIKLKdzDd3d2xbNmymDVrVgwNDcW2bdti9+7d8corr2TtA6BOVRWYkydPxpe//OU4duxYtLW1RWdnZ7zyyivxxS9+MWsfAHWqqsD88Ic/zNoBwDjjZ5EBkEJgAEghMACkEBgAUggMACkEBoAUAgNACoEBIIXAAJBCYABIITAApBAYAFIIDAApBAaAFA
IDQAqBASCFwACQQmAASCEwAKQQGABSCAwAKQQGgBQCA0AKgQEgRVNRF/7zvLMx6YqzRV2+EJ/+uzNFT6i5tx9pKXpCIf7H//7LoicU4p6/7y16Qs2d/cupRU+oqbNnP4gjF3iudzAApBAYAFIIDAApBAaAFAIDQAqBASCFwACQQmAASCEwAKQQGABSCAwAKQQGgBQCA0AKgQEghcAAkEJgAEghMACkEBgAUggMACkEBoAUAgNACoEBIIXAAJBCYABIITAApBAYAFJcUmA2btwYDQ0NsW7duss0B4Dx4qID09vbG1u2bInOzs7LuQeAceKiAnPq1KlYuXJl/OAHP4hrr732cm8CYBy4qMB0dXXFgw8+GEuXLv1vzy2XyzE4OHjOAcD411TtE7Zv3x779++P3t7eCzq/p6cnnnjiiaqHAVDfqnoH09/fH2vXro2f/OQn0dLSckHP6e7ujoGBgbGjv7//ooYCUF+qegfT19cXJ0+ejDvvvHPssZGRkdi7d288/fTTUS6Xo7Gx8ZznlEqlKJVKl2ctAHWjqsAsWbIkDh48eM5jq1evjjvuuCO+9a1vfSguAExcVQWmtbU15s6de85jV111VVx33XUfehyAic138gOQouo/RfZf7d69+zLMAGC88Q4GgBQCA0AKgQEghcAAkEJgAEghMACkEBgAUggMACkEBoAUAgNACoEBIIXAAJBCYABIITAApBAYAFIIDAApBAaAFAIDQAqBASCFwACQQmAASCEwAKQQGABSNBV14dZ//ItoLP1FUZcvxFsrG4qeUHPX90681xwR8c+do0VPKMSbC84UPaHmbnrtcNETaur0qdMRX7iwc72DASCFwACQQmAASCEwAKQQGABSCAwAKQQGgBQCA0AKgQEghcAAkEJgAEghMACkEBgAUggMACkEBoAUAgNACoEBIIXAAJBCYABIITAApBAYAFIIDAApBAaAFAIDQAqBASCFwACQoqrAPP7449HQ0HDOcccdd2RtA6CONVX7hDlz5sQvf/nL//gFmqr+JQCYAKquQ1NTU9xwww0ZWwAYR6q+B/PWW29Fe3t73HLLLbFy5cp45513Pvb8crkcg4OD5xwAjH9VBWbRokWxdevWePnll2Pz5s1x5MiRuOeee2JoaOi8z+np6Ym2traxo6Oj45JHA/DJV1Vgli1bFo888kh0dnbGAw88ED//+c/jz3/+czz33HPnfU53d3cMDAyMHf39/Zc8GoBPvku6Q3/NNdfEbbfdFocPHz7vOaVSKUql0qVcBoA6dEnfB3Pq1Kl4++23Y8aMGZdrDwDjRFWBeeyxx2LPnj3xxz/+MX7729/Gl770pWhsbIxHH300ax8Adaqqj8jefffdePTRR+P999+PqVOnxmc/+9nYt29fTJ06NWsfAHWqqsBs3749awcA44yfRQZACoEBIIXAAJBCYABIITAApBAYAFIIDAApBAaAFAIDQAqBASCFwACQQmAASCEwAKQQGABSCAwAKQQGgBQCA0AKgQEghcAAkEJgAEghMACkEBgAUggMACmaan3BSqUSEREjpz+o9aULN/ovZ4ueUHMjpxuLnlCI0Q8qRU8oxNnKmaIn1NzpUxPr3/HTw//2z/jf/1v+cRoqF3LWZfTuu+9GR0dHLS8JwGXW398fM2fO/Nhzah6Y0dHROHr0aLS2tkZDQ0PNrjs4OBgdHR3R398fkydPrtl1i+Z1T5zXPRFfc8TEfN1FvuZKpRJDQ0PR3t4ekyZ9/F2Wmn9ENmnSpP+2epkmT548Yf4l/M+87oljIr7miIn5uot6zW1tbRd0npv8AKQQGABSTJjAlEql2LBhQ5RKpaKn1JTXPXFe90R8zRET83XXy2uu+U1+ACaGCfMOBoDaEhgAUggMACkEBoAUEyYwmzZtiptuuilaWlpi0aJF8dprrxU9KdXevXtj+fLl0d7eHg0NDfH8888XPSldT09P3HXXXdHa2hrTpk2Lhx9+ON58882iZ6XbvHlzdHZ2jn3T3eLFi+Oll14qelZNbdy4MRoaGmLdunVFT0n1+OOPR0NDwznHHXfcUfSs85oQgXn22Wdj/fr1sWHDhti/f3/MmzcvHnjggTh58mTR09IMDw/HvHnzYtOmTUVPqZk9e/ZEV1dX7Nu3L3bu3BlnzpyJ+++/P4aHh4uelmrmzJmxcePG6Ovri9dffz3uu+++eOihh+KNN94oelpN9Pb2xpYtW6Kzs7PoKTUxZ86cOHbs2Njxm9/8puhJ51eZABYuXFjp6uoa+3pkZKTS3t5e6enpKXBV7UREZceOHUXPqLmTJ09WIqKyZ8+eoqfU3LXXXlt55plnip6RbmhoqPLpT3+6snPnzsrnP//5ytq1a4uelGrDhg2VefPmFT3jgo37dzCnT5+Ovr6+WLp06dhjkyZNiqVLl8arr75a4DKyDQwMRETElClTCl5SOyMjI7F9+/YYHh6OxYsXFz0nXVdXVzz44IPn/P4e7956661ob2+PW265JVauXBnvvPNO0ZPOq+Y/7LLW3nvvvRgZGYnp06ef8/j06dPjD3/4Q0GryDY6Ohrr1q2Lu+++O+bOnVv0nHQHDx6MxYsXxwcffBBXX3117NixI2bPnl30rFTbt2+P/fv3R29vb9FTambRokWxdevWuP322+PYsWPxxBNPxD333BOHDh2K1tbWoud9yLgPDBNTV1dXHDp06JP9+fRldPvtt8eBAwdiYGAgfvazn8WqVatiz5494zYy/f39sXbt2ti5c2e0tLQUPadmli1bNva/Ozs7Y9GiRXHjjTfGc889F1/96lcLXPbRxn1grr/++mhsbIwTJ06c8/iJEyfihhtuKGgVmdasWRMvvvhi7N27t9C/GqKWmpub49Zbb42IiPnz50dvb2889dRTsWXLloKX5ejr64uTJ0/GnXfeOfbYyMhI7N27N55++ukol8vR2Dj+/6bJa665Jm677bY4fPhw0VM+0ri/B9Pc3Bzz58+PXbt2jT02Ojoau3btmhCfUU8klUol1qxZEzt27Ihf/epXcfPNNxc9qTCjo6NRLpeLnpFmyZIlcfDgwThw4MDYsWDBgli5cmUcOHBgQsQlIuLUqVPx9ttvx4wZM4qe8pHG/TuYiIj169fHqlWrYsGCBbFw4cJ48sknY3h4OFavXl30tDSnTp065//VHDlyJA4cOBBTpkyJWbNmFbgsT1dXV2zbti1eeOGFaG1tjePHj0fEv/3lSFdccUXB6/J0d3fHsmXLYtasWTE0NBTbtm2L3bt3xyuvvFL0tDStra0furd21VVXxXXXXTeu77k99thjsXz58rjxxhvj6NGjsWHDhmhsbIxHH3206Gkfreg/xlYr3/ve9yqzZs2qNDc3VxYuXFjZt29f0ZNS/frXv65ExIeOVatWFT0tzUe93oio/PjHPy56WqqvfOUrlRtvvLHS3NxcmTp1amXJkiWVX/ziF0XPqrmJ8MeUV6xYUZkxY0alubm58qlPfaqyYsWKyuHDh4uedV5+XD8AKcb9PRgAiiEwAKQQGABSCAwAKQQGgBQCA0AKgQEgh
cAAkEJgAEghMACkEBgAUggMACn+FVFAhnoLRBuZAAAAAElFTkSuQmCC", 169 | "text/plain": [ 170 | "
" 171 | ] 172 | }, 173 | "metadata": {}, 174 | "output_type": "display_data" 175 | } 176 | ], 177 | "source": [ 178 | "from sklearn.metrics.pairwise import cosine_similarity\n", 179 | "import matplotlib.pyplot as plt\n", 180 | "\n", 181 | "H = cosine_similarity(audio_embs, text_embs)\n", 182 | "plt.imshow(H, interpolation='none')\n", 183 | "np.set_printoptions(precision=5, suppress=True)\n", 184 | "print(H)" 185 | ] 186 | } 187 | ], 188 | "metadata": { 189 | "kernelspec": { 190 | "display_name": "ar", 191 | "language": "python", 192 | "name": "python3" 193 | }, 194 | "language_info": { 195 | "codemirror_mode": { 196 | "name": "ipython", 197 | "version": 3 198 | }, 199 | "file_extension": ".py", 200 | "mimetype": "text/x-python", 201 | "name": "python", 202 | "nbconvert_exporter": "python", 203 | "pygments_lexer": "ipython3", 204 | "version": "3.9.18" 205 | } 206 | }, 207 | "nbformat": 4, 208 | "nbformat_minor": 2 209 | } 210 | -------------------------------------------------------------------------------- /app/icbhi_sprs/README_ICBHI_SPRS.md: -------------------------------------------------------------------------------- 1 | # M2D-X Applied on Respiratory Sound Tasks: ICBHI2017 & SPRSound 2 | 3 | This sub-repository provides application examples in a realistic setting described in our [TASLP paper](https://ieeexplore.ieee.org/document/10502167). 4 | 5 | ```BibTeX 6 | @article{niizumi2024m2dx, 7 | title = {{Masked Modeling Duo: Towards a Universal Audio Pre-training Framework}}, 8 | author = {Daisuke Niizumi and Daiki Takeuchi and Yasunori Ohishi and Noboru Harada and Kunio Kashino}, 9 | journal = {IEEE/ACM Trans. Audio, Speech, Language Process.}, 10 | year = {2024}, 11 | volume = {32}, 12 | pages = {2391-2406}, 13 | url = {https://ieeexplore.ieee.org/document/10502167}, 14 | doi = {10.1109/TASLP.2024.3389636}} 15 | ``` 16 | 17 | We applied the pre-trained models to respiratory sound tasks. While usually we fine-tune these models, we also further pre-train models on the application data that enhances the final performance (the step 2 below). The example contains data and test environmental setup, further pre-training steps, and fine-tuning steps. 18 | 19 |
20 | A schematic illustration of M2D-X further pre-training 21 |
22 | 23 | Notably, the example follows a test setup compatible with previous studies, enabling comparison with the SOTA. In addition, it runs on a smaller GPU, an RTX 3090 Ti (24 GB). 24 | 25 | # ICBHI2017 26 | 27 | NEWS: The weight of the best model, `M2D-X/0.7 (η= 0.3)`, is available in the release [v0.2.0](https://github.com/nttcslab/m2d/releases/tag/v0.2.0). 28 | 29 | ## 1. Data and test setup 30 | 31 | ### 1-1. Set up the application files 32 | 33 | In the `app/icbhi_sprs` folder, running the following steps will download and set up the application program files. 34 | 35 | ```sh 36 | pip install torchinfo 37 | git clone https://github.com/ilyassmoummad/scl_icbhi2017.git 38 | cd scl_icbhi2017 39 | git reset --hard 915c1120719a9357d662c5fe484bce7fbe845139 40 | mv dataset.py augmentations.py utils.py losses.py args.py .. 41 | mv data .. 42 | mv main.py ../app_main.py 43 | mv ce.py models.py .. 44 | cd .. 45 | patch -p2 < patch_scl_icbhi2017.diff 46 | ``` 47 | 48 | When you finish these steps, you will find many .py files and a folder: 49 | - Program files: app/icbhi_sprs/{app_main.py, args.py, augmentations.py, ce.py, dataset.py, losses.py, models.py, utils.py} 50 | - Data folder: app/icbhi_sprs/data 51 | 52 | ### 1-2. Download the ICBHI2017 data 53 | 54 | In the `app/icbhi_sprs` folder, running the following steps will download and set up the ICBHI2017 data. The last step converts the raw audio files into spectrograms. 55 | 56 | ```sh 57 | wget https://bhichallenge.med.auth.gr/sites/default/files/ICBHI_final_database/ICBHI_final_database.zip --no-check-certificate 58 | 59 | unzip ICBHI_final_database.zip | awk 'BEGIN {ORS=" "} {if(NR%10==0)print "."}' 60 | mv ICBHI_final_database/* data/ICBHI 61 | rmdir ICBHI_final_database 62 | 63 | python ../../wav_to_lms.py data/ICBHI ../../data/icbhi2017_lms 64 | cp files_icbhi2017.csv ../../data/files_icbhi2017.csv 65 | ``` 66 | 67 | When you finish these steps, you will find the following: 68 | - app/icbhi_sprs/data/ICBHI -- For fine-tuning, the original data files. 69 | - data/icbhi2017_lms -- For further pre-training, the log-mel spectrogram (LMS) files. 70 | - data/files_icbhi2017.csv -- For further pre-training, the list of LMS files. 71 | 72 | ### 1-3. Download FSD50K and set up the data 73 | 74 | We use FSD50K as the background noise for the further pre-training. 75 | 76 | To create the log-mel spectrogram FSD50K files, follow the [steps in the main README](../../README.md#3-1-preparing-pre-training-data-samples). 77 | When you finish, you will have the following: 78 | - data/fsd50k_lms -- For further pre-training, the log-mel spectrogram (LMS) files. 79 | - data/files_f_s_d_5_0_k.csv -- For further pre-training, the list of LMS files. 80 | 81 | ### 1-4. Download the pre-trained weight 82 | 83 | We use an M2D weight with an input size of 80x200 and a patch size of 16x4. Be sure to download the weight to your copy's M2D root folder. 84 | 85 | ```sh 86 | cd (your M2D root folder) 87 | wget https://github.com/nttcslab/m2d/releases/download/v0.1.0/m2d_vit_base-80x200p16x4-230529.zip 88 | unzip m2d_vit_base-80x200p16x4-230529.zip 89 | ``` 90 | 91 | You will find `(your M2D root folder)/m2d_vit_base-80x200p16x4-230529/checkpoint-300.pth`. 92 | 93 | 94 | ## 2. Further pre-training 95 | 96 | We further pre-train the pre-trained model to make it more suitable for the data distribution of the target application.
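If you want to check what this step feeds the model before launching the full run below, the short sketch here builds the same ICBHI2017 + FSD50K mixture directly with `build_mixed_dataset` from `audio_dataset.py` in this repository. This is only an illustration: we assume `train_audio.py` builds its dataset this way, and the `EasyDict` config keys simply mirror the command-line options of the command that follows.

```python
# A minimal, illustrative sketch -- not part of the training pipeline itself.
from easydict import EasyDict
from audio_dataset import build_mixed_dataset

cfg = EasyDict(
    data_path='data',                           # root folder of the .npy log-mel spectrograms
    csv_main='data/files_icbhi2017.csv',        # main (application) data list
    csv_bg_noise='data/files_f_s_d_5_0_k.csv',  # FSD50K background-noise list
    csv_val='',                                 # skip the validation split for this quick check
    noise_ratio=0.3,                            # mirrors --noise_ratio 0.3
    input_size=[80, 200],                       # mirrors --input_size 80x200
    min_ds_size=10000,                          # mirrors --min_ds_size 10000
)

# Building the dataset also estimates normalization statistics from random samples,
# so the first run takes a while.
ds, _ = build_mixed_dataset(cfg)
clean, mixed = ds[0]             # a (clean, noise-mixed) log-mel spectrogram pair
print(clean.shape, mixed.shape)  # expected: torch.Size([1, 80, 200]) each
```

Each pair holds the original ICBHI2017 spectrogram crop and the same crop mixed with FSD50K background noise at the specified `noise_ratio`.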
97 | 98 | ** *Be sure to move to your copy's M2D root folder before you run the following.* ** 99 | 100 | ```sh 101 | cd (your M2D root folder) 102 | 103 | CUDA_VISIBLE_DEVICES=0 python train_audio.py --epochs 600 --resume m2d_vit_base-80x200p16x4-230529/checkpoint-300.pth --model m2d_x_vit_base --input_size 80x200 --patch_size 16x4 --batch_size 64 --accum_iter 2 --csv_main data/files_icbhi2017.csv --csv_bg_noise data/files_f_s_d_5_0_k.csv --noise_ratio 0.3 --save_freq 100 --eval_after 600 --seed 6 --teacher m2d_vit_base-80x200p16x4-230529/checkpoint-300.pth --blr 3e-4 --loss_off 1. --min_ds_size 10000 104 | ``` 105 | 106 | When you finish, you will find the further pre-trained model folder named `m2d_x_vit_base-80x200p16x4p16k-240514-MdfiDdffsd50ks6bs128a2lo1nr.3-e600` (`240514` will be the date you ran it). 107 | The folder contains six checkpoint files, one for every 100 epochs. 108 | 109 | In this step, the source `m2d_vit_base-80x200p16x4-230529` model pre-trained on AudioSet is further pre-trained using the files listed in `data/files_icbhi2017.csv`, making it more effective for solving ICBHI2017. 110 | 111 | 112 | ## 3. Fine-tuning 113 | 114 | We are almost done. The last step is fine-tuning in the app/icbhi_sprs folder. 115 | Use your further pre-trained weight, such as `m2d_x_vit_base-80x200p16x4p16k-240514-MdfiDdffsd50ks6bs128a2lo1nr.3-e600/checkpoint-600.pth`: 116 | 117 | ** *Be sure to change the folder name to yours.* ** 118 | 119 | ** *Be sure to move to your copy's M2D root folder before you run the following.* ** 120 | 121 | ```sh 122 | cd (your M2D root folder)/app/icbhi_sprs 123 | 124 | CUDA_VISIBLE_DEVICES=0 python app_main.py --method sl --backbone m2d --epochs 150 --bs 64 --lr 5e-5 --freeze_embed --split_iter 4 --weightspath ../../m2d_x_vit_base-80x200p16x4p16k-240514-MdfiDdffsd50ks6bs128a2lo1nr.3-e600/checkpoint-600.pth 125 | ``` 126 | 127 | We appreciate the codebase [ilyassmoummad/scl_icbhi2017](https://github.com/ilyassmoummad/scl_icbhi2017) from the following paper. We customized the code to load and fine-tune the M2D models. 128 | 129 | - [Moummad and Farrugia, "Pretraining Respiratory Sound Representations using Metadata and Contrastive Learning," in WASPAA, 2023](https://arxiv.org/abs/2210.16192) 130 | 131 | To repeat the fine-tuning and obtain statistical results, we actually used a shell script instead of a raw command line. 132 | 133 | ```sh 134 | bash eval_icbhi.sh ../../m2d_x_vit_base-80x200p16x4p16k-240514-MdfiDdffsd50ks6bs128a2lo1nr.3-e600/checkpoint-600.pth 5 135 | ``` 136 | 137 | The last `5` denotes the number of fine-tuning iterations; see the score aggregation sketch after the results figure below. 138 | 139 | The results in the paper: 140 |
141 | ICBHI results 142 |
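For reference, each fine-tuning run appends a row with its best SE, SP, and ICBHI scores to `results/icbhi-scores.csv` under `app/icbhi_sprs` (written by the patched `app_main.py`; the SPRSound runs write `results/sprs-scores.csv` in the same way). Below is a minimal sketch for summarizing the repeated runs; it is not part of the provided scripts and assumes it is run from `app/icbhi_sprs` after the iterations have finished.

```py
# Minimal sketch (assumption: run from app/icbhi_sprs after eval_icbhi.sh has finished).
# results/icbhi-scores.csv holds one row per fine-tuning run, including the columns
# model, best_se, best_sp, and best_icbhi_score.
import pandas as pd

df = pd.read_csv('results/icbhi-scores.csv')
summary = df.groupby('model')[['best_se', 'best_sp', 'best_icbhi_score']].agg(['mean', 'std'])
print(summary)
```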
143 | 144 | # SPRSound 145 | 146 | ## 4. SPRSound 147 | 148 | For SPRSound, we use the same code; we only need to set up the SPRSound data. 149 | 150 | ### 4-1. Download SPRSound and set up the data 151 | 152 | In the `app/icbhi_sprs` folder, running the following steps will download the SPRSound data. 153 | 154 | ```sh 155 | git clone https://github.com/SJTU-YONGFU-RESEARCH-GRP/SPRSound.git 156 | (cd SPRSound && git reset --hard 45b0d5d435ff320c46585762fa1090afd0ebb318) 157 | ``` 158 | 159 | ```sh 160 | cp -r SPRSound/train_wav SPRSound/test_wav data/SPRS/ 161 | cp files_sprs.csv ../../data 162 | python cut_data_sprs.py 163 | ``` 164 | 165 | When you finish these steps, you will find the following: 166 | - app/icbhi_sprs/data/SPRS -- For fine-tuning, the original data files. 167 | - data/sprsound_lms -- For further pre-training, the log-mel spectrogram (LMS) files. 168 | - data/files_sprs.csv -- For further pre-training, the list of LMS files. 169 | 170 | ### 4-2. Further pre-training 171 | 172 | ** *Be sure to move to your copy's M2D root folder before you run the following.* ** 173 | 174 | ```sh 175 | cd (your M2D root folder) 176 | 177 | CUDA_VISIBLE_DEVICES=0 python train_audio.py --epochs 600 --resume m2d_vit_base-80x200p16x4-230529/checkpoint-300.pth --model m2d_x_vit_base --input_size 80x200 --patch_size 16x4 --batch_size 64 --accum_iter 2 --csv_main data/files_sprs.csv --csv_bg_noise data/files_f_s_d_5_0_k.csv --noise_ratio 0.01 --save_freq 100 --eval_after 600 --seed 3 --teacher m2d_vit_base-80x200p16x4-230529/checkpoint-300.pth --blr 3e-4 --loss_off 1. --min_ds_size 0 178 | ``` 179 | 180 | ### 4-3. Fine-tuning 181 | 182 | To repeat the fine-tuning and obtain statistical results, we used a shell script. Be sure to replace the folder name below with your further pre-trained weight from step 4-2. 183 | 184 | ```sh 185 | bash eval_sprs.sh ../../m2d_x_vit_base-80x200p16x4p16k-240514-MdfiDdffsd50ks6bs128a2lo1nr.3-e600/checkpoint-600.pth 5 186 | ``` 187 | 188 | The last `5` denotes the number of fine-tuning iterations. 189 | 190 | The results in the paper: 191 |
192 | SPRSound results 193 |
194 | 195 | 196 | ## Complete command lines 197 | 198 | The command lines for reproduction follow. 199 | 200 | ### ICBHI2017 201 | 202 | #### Further pre-training 203 | 204 | We explain the details in [Guide_app.md](../../Guide_app.md). 205 | 206 | ```sh 207 | # M2D-X, noise_ratio 0.3 208 | CUDA_VISIBLE_DEVICES=0 python train_audio.py --epochs 600 --resume m2d_vit_base-80x200p16x4-230529/checkpoint-300.pth --model m2d_x_vit_base --input_size 80x200 --patch_size 16x4 --batch_size 64 --accum_iter 2 --csv_main data/files_icbhi2017.csv --csv_bg_noise data/files_f_s_d_5_0_k.csv --noise_ratio 0.3 --save_freq 100 --eval_after 600 --seed 6 --teacher m2d_vit_base-80x200p16x4-230529/checkpoint-300.pth --blr 3e-4 --loss_off 1. --min_ds_size 10000 209 | # M2D-X, noise_ratio 0.0 210 | CUDA_VISIBLE_DEVICES=0 python train_audio.py --epochs 600 --resume m2d_vit_base-80x200p16x4-230529/checkpoint-300.pth --model m2d_x_vit_base --input_size 80x200 --patch_size 16x4 --batch_size 64 --accum_iter 2 --csv_main data/files_icbhi2017.csv --csv_bg_noise data/files_f_s_d_5_0_k.csv --noise_ratio 0.0 --save_freq 100 --eval_after 600 --seed 6 --teacher m2d_vit_base-80x200p16x4-230529/checkpoint-300.pth --blr 3e-4 --loss_off 1. --min_ds_size 10000 211 | 212 | # We show the command lines for a single random seed only; we ran them for all four seeds (3 to 6). 213 | 214 | # M2D, noise_ratio 0.3 215 | CUDA_VISIBLE_DEVICES=0 python train_audio.py --epochs 600 --resume m2d_vit_base-80x200p16x4-230529/checkpoint-300.pth --model m2d_vit_base --input_size 80x200 --patch_size 16x4 --batch_size 64 --accum_iter 2 --csv_main data/files_icbhi2017.csv --csv_bg_noise data/files_f_s_d_5_0_k.csv --noise_ratio 0.3 --save_freq 100 --eval_after 600 --seed 3 --blr 3e-4 --loss_off 0. --min_ds_size 10000 216 | # M2D, noise_ratio 0.0 217 | CUDA_VISIBLE_DEVICES=0 python train_audio.py --epochs 600 --resume m2d_vit_base-80x200p16x4-230529/checkpoint-300.pth --model m2d_vit_base --input_size 80x200 --patch_size 16x4 --batch_size 64 --accum_iter 2 --csv_main data/files_icbhi2017.csv --csv_bg_noise data/files_f_s_d_5_0_k.csv --noise_ratio 0.0 --save_freq 100 --eval_after 600 --seed 3 --blr 3e-4 --loss_off 0. --min_ds_size 10000 218 | ``` 219 | 220 | An example log for M2D-X with noise_ratio 0.3 is available in [example_logs.zip](https://github.com/nttcslab/m2d/releases/download/v0.1.0/example_logs.zip). 221 | Find `examples/logs/log_m2d_x_vit_base-80x200p16x4-230814-Ddffsd50ks5blr0003bs128a2nr.3-e600.out` in the zip file. 222 | 223 | 224 | #### Fine-tuning 225 | 226 | We show only the command lines for checkpoint-600.pth; we ran them for all six checkpoints.
227 | 228 | ```sh 229 | # M2D-X, noise_ratio 0.3, random seeds 3 to 6 230 | bash eval_icbhi.sh ../../m2d_x_vit_base-80x200p16x4-230814-Ddffsd50ks3blr0003bs128a2nr.3-e600/checkpoint-600.pth 5 231 | bash eval_icbhi.sh ../../m2d_x_vit_base-80x200p16x4-230814-Ddffsd50ks4blr0003bs128a2nr.3-e600/checkpoint-600.pth 5 232 | bash eval_icbhi.sh ../../m2d_x_vit_base-80x200p16x4-230814-Ddffsd50ks5blr0003bs128a2nr.3-e600/checkpoint-600.pth 5 233 | bash eval_icbhi.sh ../../m2d_x_vit_base-80x200p16x4-230814-Ddffsd50ks6blr0003bs128a2nr.3-e600/checkpoint-600.pth 5 234 | # M2D-X, noise_ratio 0.0, random seeds 3 to 6 235 | bash eval_icbhi.sh ../../m2d_x_vit_base-80x200p16x4-230818-Ddffsd50ks3blr0003bs128a2nr0-e600/checkpoint-600.pth 5 236 | bash eval_icbhi.sh ../../m2d_x_vit_base-80x200p16x4-230818-Ddffsd50ks4blr0003bs128a2nr0-e600/checkpoint-600.pth 5 237 | bash eval_icbhi.sh ../../m2d_x_vit_base-80x200p16x4-230818-Ddffsd50ks5blr0003bs128a2nr0-e600/checkpoint-600.pth 5 238 | bash eval_icbhi.sh ../../m2d_x_vit_base-80x200p16x4-230818-Ddffsd50ks6blr0003bs128a2nr0-e600/checkpoint-600.pth 5 239 | 240 | # M2D, noise_ratio 0.3, random seeds 3 to 6 241 | bash eval_icbhi.sh ../../m2d_vit_base-80x200p16x4-mix-230814-s3blr0003bs128a2MdfiDdffsd50knr.3-e600/checkpoint-600.pth 5 242 | bash eval_icbhi.sh ../../m2d_vit_base-80x200p16x4-mix-230814-s4blr0003bs128a2MdfiDdffsd50knr.3-e600/checkpoint-600.pth 5 243 | bash eval_icbhi.sh ../../m2d_vit_base-80x200p16x4-mix-230814-s5blr0003bs128a2MdfiDdffsd50knr.3-e600/checkpoint-600.pth 5 244 | bash eval_icbhi.sh ../../m2d_vit_base-80x200p16x4-mix-230814-s6blr0003bs128a2MdfiDdffsd50knr.3-e600/checkpoint-600.pth 5 245 | # M2D, noise_ratio 0.0, random seeds 3 to 6 246 | bash eval_icbhi.sh ../../m2d_vit_base-80x200p16x4-mix-230814-s3blr0003bs128a2MdfiDdffsd50knr0-e600/checkpoint-600.pth 5 247 | bash eval_icbhi.sh ../../m2d_vit_base-80x200p16x4-mix-230814-s4blr0003bs128a2MdfiDdffsd50knr0-e600/checkpoint-600.pth 5 248 | bash eval_icbhi.sh ../../m2d_vit_base-80x200p16x4-mix-230814-s5blr0003bs128a2MdfiDdffsd50knr0-e600/checkpoint-600.pth 5 249 | bash eval_icbhi.sh ../../m2d_vit_base-80x200p16x4-mix-230814-s6blr0003bs128a2MdfiDdffsd50knr0-e600/checkpoint-600.pth 5 250 | ``` 251 | ### SPRSound 252 | 253 | #### Further pre-training 254 | 255 | We show only the command lines for random seed 3; we ran them for all four seeds (3 to 6). 256 | 257 | ```sh 258 | # M2D-X, noise_ratio 0.01 259 | CUDA_VISIBLE_DEVICES=0 python train_audio.py --epochs 600 --resume m2d_vit_base-80x200p16x4-230529/checkpoint-300.pth --model m2d_x_vit_base --input_size 80x200 --patch_size 16x4 --batch_size 64 --accum_iter 2 --csv_main data/files_sprs.csv --csv_bg_noise data/files_f_s_d_5_0_k.csv --noise_ratio 0.01 --save_freq 100 --eval_after 600 --seed 3 --teacher m2d_vit_base-80x200p16x4-230529/checkpoint-300.pth --blr 3e-4 --loss_off 1. --min_ds_size 0 260 | # M2D-X, noise_ratio 0.0 261 | CUDA_VISIBLE_DEVICES=0 python train_audio.py --epochs 600 --resume m2d_vit_base-80x200p16x4-230529/checkpoint-300.pth --model m2d_x_vit_base --input_size 80x200 --patch_size 16x4 --batch_size 64 --accum_iter 2 --csv_main data/files_sprs.csv --csv_bg_noise data/files_f_s_d_5_0_k.csv --noise_ratio 0.0 --save_freq 100 --eval_after 600 --seed 3 --teacher m2d_vit_base-80x200p16x4-230529/checkpoint-300.pth --blr 3e-4 --loss_off 1. --min_ds_size 0 262 | ``` 263 | 264 | #### Fine-tuning 265 | 266 | We show only the command lines for checkpoint-600.pth; we ran them for all six checkpoints.
267 | 268 | ```sh 269 | # M2D-X, noise_ratio 0.01, random seeds 3 to 6 270 | bash eval_sprs.sh ../../m2d_x_vit_base-80x200p16x4-240222-MdfsDdffsd50ks3blr0003bs128a2lo0nr01dn0-e600/checkpoint-600.pth 5 271 | bash eval_sprs.sh ../../m2d_x_vit_base-80x200p16x4-240222-MdfsDdffsd50ks4blr0003bs128a2lo0nr01dn0-e600/checkpoint-600.pth 5 272 | bash eval_sprs.sh ../../m2d_x_vit_base-80x200p16x4-240222-MdfsDdffsd50ks5blr0003bs128a2lo0nr01dn0-e600/checkpoint-600.pth 5 273 | bash eval_sprs.sh ../../m2d_x_vit_base-80x200p16x4-240222-MdfsDdffsd50ks6blr0003bs128a2lo0nr01dn0-e600/checkpoint-600.pth 5 274 | # M2D-X, noise_ratio 0.0, random seeds 3 to 6 275 | bash eval_sprs.sh ../../m2d_x_vit_base-80x200p16x4-240223-MdfsDdffsd50ks3blr0003bs128a2lo0nr0dn0-e600/checkpoint-600.pth 5 276 | bash eval_sprs.sh ../../m2d_x_vit_base-80x200p16x4-240223-MdfsDdffsd50ks4blr0003bs128a2lo0nr0dn0-e600/checkpoint-600.pth 5 277 | bash eval_sprs.sh ../../m2d_x_vit_base-80x200p16x4-240223-MdfsDdffsd50ks5blr0003bs128a2lo0nr0dn0-e600/checkpoint-600.pth 5 278 | bash eval_sprs.sh ../../m2d_x_vit_base-80x200p16x4-240223-MdfsDdffsd50ks6blr0003bs128a2lo0nr0dn0-e600/checkpoint-600.pth 5 279 | ``` 280 | -------------------------------------------------------------------------------- /clap/Note-ACalt4_GTEbase.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Preparing Caption Embeddings for AudioCaps Alternative 4 Captions (ACalt4)\n", 8 | "\n", 9 | "Our implementation does not convert texts into sentence (semantic) embeddings on the fly. Instead, we convert them into embeddings in advance in an offline fashion.\n", 10 | "\n", 11 | "- Download ACalt4 as `../data/audiocaps_alternative_4.csv` in advance from the external website DOSHISHA.\n", 12 | "- The following will create `../data/capemb_GTEbase_AC_BLIP_Aug.npy` using the GTE base sentence embedding encoder model." 
13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 1, 18 | "metadata": {}, 19 | "outputs": [ 20 | { 21 | "name": "stderr", 22 | "output_type": "stream", 23 | "text": [ 24 | "INFO:numexpr.utils:Note: detected 80 virtual cores but NumExpr set to maximum of 64, check \"NUMEXPR_MAX_THREADS\" environment variable.\n", 25 | "INFO:numexpr.utils:Note: NumExpr detected 80 cores but \"NUMEXPR_MAX_THREADS\" not set, so enforcing safe limit of 8.\n", 26 | "INFO:numexpr.utils:NumExpr defaulting to 8 threads.\n" 27 | ] 28 | } 29 | ], 30 | "source": [ 31 | "import warnings; warnings.simplefilter('ignore')\n", 32 | "import logging; logging.basicConfig(level=logging.INFO)\n", 33 | "import numpy as np\n", 34 | "import pandas as pd\n", 35 | "import torch" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 2, 41 | "metadata": {}, 42 | "outputs": [ 43 | { 44 | "name": "stdout", 45 | "output_type": "stream", 46 | "text": [ 47 | "[[69.65808868408203, 88.03551483154297, 68.79684448242188]]\n" 48 | ] 49 | } 50 | ], 51 | "source": [ 52 | "# https://huggingface.co/thenlper/gte-base\n", 53 | "\n", 54 | "import torch.nn.functional as F\n", 55 | "from torch import Tensor\n", 56 | "from transformers import AutoTokenizer, AutoModel\n", 57 | "\n", 58 | "def average_pool(last_hidden_states: Tensor,\n", 59 | " attention_mask: Tensor) -> Tensor:\n", 60 | " last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)\n", 61 | " return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]\n", 62 | "\n", 63 | "input_texts = [\n", 64 | " \"what is the capital of China?\",\n", 65 | " \"how to implement quick sort in python?\",\n", 66 | " \"Beijing\",\n", 67 | " \"sorting algorithms\"\n", 68 | "]\n", 69 | "\n", 70 | "tokenizer = AutoTokenizer.from_pretrained(\"thenlper/gte-base\")\n", 71 | "model = AutoModel.from_pretrained(\"thenlper/gte-base\")\n", 72 | "\n", 73 | "# Tokenize the input texts\n", 74 | "batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt')\n", 75 | "\n", 76 | "outputs = model(**batch_dict)\n", 77 | "embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])\n", 78 | "\n", 79 | "# (Optionally) normalize embeddings\n", 80 | "embeddings = F.normalize(embeddings, p=2, dim=1)\n", 81 | "scores = (embeddings[:1] @ embeddings[1:].T) * 100\n", 82 | "print(scores.tolist())" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 3, 88 | "metadata": {}, 89 | "outputs": [ 90 | { 91 | "data": { 92 | "text/html": [ 93 | "
\n", 94 | "\n", 107 | "\n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | "
caption1caption2caption3caption4
youtube_id
---1_cCGK4MA train is moving along the tracks with the rh...A train swiftly moving along the tracks, accom...A train horn blaring in the distance, blending...The unmistakable sound of a train, with the cl...
---lTs1dxhUA racing car speeding past in a virtual raceA car zooming around a track in a video gameThe fast-paced sound of a car zooming along a ...A dynamic sound of a vehicle racing on a track...
--0PQM4-hqgWater flowing through a river with a gurgling ...A waterfall cascading down with a rush of waterGurgling water flowing through a peaceful land...Natures symphony includes the gentle gurgling ...
--299m5_DdEExcitement fills the indoor water park as chil...The joyful sounds of children playing fill the...Gurgling water and a waterfall fill the indoor...The air in an indoor water park is filled with...
--2XRMjyizoBird vocalizations, with chirps and tweets, fi...Two police officers standing in front of a mapBirds chirping and tweeting in the backgroundAmidst the scene of two police officers studyi...
...............
zzlfP-snUeYA bulldozer idling in a rural areaA bulldozer idles and its engine rumbles softl...An idling engine of a vehicle in an outdoor se...The engine of a parked bulldozer purrs quietly...
zzm3dwoXY8YBirds chirping and cooing in a natural outdoor...Birds chirping and cooing in an outdoor settingA soft cooing sound coming from a group of bir...The cooing of pigeons in an outdoor environment
zzvWbSyZfr0The snoring in this image is occasionally inte...There is snoring and occasional speech coming ...A young girl is peacefully sleeping on a bed i...In the background, there is a gentle snoring s...
zzwBazlj0OcThe soft sound of pigeons cooing in a confined...Birds cooing softly in a confined spacePigeons cooing softly in a confined spacePigeons cooing softly in a small room
zzznDcamMpwDucks quacking and people speaking can be hear...The echoes of ducks and people talking can be ...The echoes of ducks and people talking can be ...The echoes of ducks and people talking can be ...
\n", 204 | "

41785 rows × 4 columns

\n", 205 | "
" 206 | ], 207 | "text/plain": [ 208 | " caption1 \\\n", 209 | "youtube_id \n", 210 | "---1_cCGK4M A train is moving along the tracks with the rh... \n", 211 | "---lTs1dxhU A racing car speeding past in a virtual race \n", 212 | "--0PQM4-hqg Water flowing through a river with a gurgling ... \n", 213 | "--299m5_DdE Excitement fills the indoor water park as chil... \n", 214 | "--2XRMjyizo Bird vocalizations, with chirps and tweets, fi... \n", 215 | "... ... \n", 216 | "zzlfP-snUeY A bulldozer idling in a rural area \n", 217 | "zzm3dwoXY8Y Birds chirping and cooing in a natural outdoor... \n", 218 | "zzvWbSyZfr0 The snoring in this image is occasionally inte... \n", 219 | "zzwBazlj0Oc The soft sound of pigeons cooing in a confined... \n", 220 | "zzznDcamMpw Ducks quacking and people speaking can be hear... \n", 221 | "\n", 222 | " caption2 \\\n", 223 | "youtube_id \n", 224 | "---1_cCGK4M A train swiftly moving along the tracks, accom... \n", 225 | "---lTs1dxhU A car zooming around a track in a video game \n", 226 | "--0PQM4-hqg A waterfall cascading down with a rush of water \n", 227 | "--299m5_DdE The joyful sounds of children playing fill the... \n", 228 | "--2XRMjyizo Two police officers standing in front of a map \n", 229 | "... ... \n", 230 | "zzlfP-snUeY A bulldozer idles and its engine rumbles softl... \n", 231 | "zzm3dwoXY8Y Birds chirping and cooing in an outdoor setting \n", 232 | "zzvWbSyZfr0 There is snoring and occasional speech coming ... \n", 233 | "zzwBazlj0Oc Birds cooing softly in a confined space \n", 234 | "zzznDcamMpw The echoes of ducks and people talking can be ... \n", 235 | "\n", 236 | " caption3 \\\n", 237 | "youtube_id \n", 238 | "---1_cCGK4M A train horn blaring in the distance, blending... \n", 239 | "---lTs1dxhU The fast-paced sound of a car zooming along a ... \n", 240 | "--0PQM4-hqg Gurgling water flowing through a peaceful land... \n", 241 | "--299m5_DdE Gurgling water and a waterfall fill the indoor... \n", 242 | "--2XRMjyizo Birds chirping and tweeting in the background \n", 243 | "... ... \n", 244 | "zzlfP-snUeY An idling engine of a vehicle in an outdoor se... \n", 245 | "zzm3dwoXY8Y A soft cooing sound coming from a group of bir... \n", 246 | "zzvWbSyZfr0 A young girl is peacefully sleeping on a bed i... \n", 247 | "zzwBazlj0Oc Pigeons cooing softly in a confined space \n", 248 | "zzznDcamMpw The echoes of ducks and people talking can be ... \n", 249 | "\n", 250 | " caption4 \n", 251 | "youtube_id \n", 252 | "---1_cCGK4M The unmistakable sound of a train, with the cl... \n", 253 | "---lTs1dxhU A dynamic sound of a vehicle racing on a track... \n", 254 | "--0PQM4-hqg Natures symphony includes the gentle gurgling ... \n", 255 | "--299m5_DdE The air in an indoor water park is filled with... \n", 256 | "--2XRMjyizo Amidst the scene of two police officers studyi... \n", 257 | "... ... \n", 258 | "zzlfP-snUeY The engine of a parked bulldozer purrs quietly... \n", 259 | "zzm3dwoXY8Y The cooing of pigeons in an outdoor environment \n", 260 | "zzvWbSyZfr0 In the background, there is a gentle snoring s... \n", 261 | "zzwBazlj0Oc Pigeons cooing softly in a small room \n", 262 | "zzznDcamMpw The echoes of ducks and people talking can be ... 
\n", 263 | "\n", 264 | "[41785 rows x 4 columns]" 265 | ] 266 | }, 267 | "execution_count": 3, 268 | "metadata": {}, 269 | "output_type": "execute_result" 270 | } 271 | ], 272 | "source": [ 273 | "df = pd.read_csv('../data/audiocaps_alternative_4.csv').set_index('youtube_id')\n", 274 | "df" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": 4, 280 | "metadata": {}, 281 | "outputs": [], 282 | "source": [ 283 | "def chunks(lst, n):\n", 284 | " \"\"\"Yield successive n-sized chunks from lst.\"\"\"\n", 285 | " for i in range(0, len(lst), n):\n", 286 | " yield lst[i:i + n]\n", 287 | "\n", 288 | "cap_chunks = [c for c in chunks(list(df.values), 64)]" 289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "execution_count": 5, 294 | "metadata": {}, 295 | "outputs": [ 296 | { 297 | "name": "stderr", 298 | "output_type": "stream", 299 | "text": [ 300 | "100%|██████████| 653/653 [01:25<00:00, 7.62it/s]\n" 301 | ] 302 | } 303 | ], 304 | "source": [ 305 | "from tqdm import tqdm\n", 306 | "\n", 307 | "model = model.to('cuda:0')\n", 308 | "\n", 309 | "emb_chunks = []\n", 310 | "for i, caps in enumerate(tqdm(cap_chunks)):\n", 311 | " flat_caps = []\n", 312 | " for cap4 in caps:\n", 313 | " assert len(cap4) == 4 # asserts 4 captions each\n", 314 | " for cap in cap4:\n", 315 | " flat_caps.append(cap)\n", 316 | "\n", 317 | " with torch.no_grad():\n", 318 | " batch_dict = tokenizer(flat_caps, max_length=512, padding=True, truncation=True, return_tensors='pt')\n", 319 | " batch_dict['input_ids'] = batch_dict['input_ids'].to('cuda:0')\n", 320 | " batch_dict['token_type_ids'] = batch_dict['token_type_ids'].to('cuda:0')\n", 321 | " batch_dict['attention_mask'] = batch_dict['attention_mask'].to('cuda:0')\n", 322 | " outputs = model(**batch_dict)\n", 323 | " embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask']).detach().cpu()\n", 324 | " embeddings = embeddings.reshape(-1, 4, embeddings.shape[-1])\n", 325 | " emb_chunks.append(embeddings)\n" 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": 6, 331 | "metadata": {}, 332 | "outputs": [ 333 | { 334 | "data": { 335 | "text/plain": [ 336 | "(41785, 4, 768)" 337 | ] 338 | }, 339 | "execution_count": 6, 340 | "metadata": {}, 341 | "output_type": "execute_result" 342 | } 343 | ], 344 | "source": [ 345 | "embs = torch.cat(emb_chunks, dim=0).numpy().astype(np.float16)\n", 346 | "embs.shape" 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": 7, 352 | "metadata": {}, 353 | "outputs": [], 354 | "source": [ 355 | "embdic = {y: c for y, c in zip(df.index.values, embs)}\n", 356 | "np.save('../data/capemb_GTEbase_AC_BLIP_Aug.npy', embdic)" 357 | ] 358 | }, 359 | { 360 | "cell_type": "code", 361 | "execution_count": 8, 362 | "metadata": {}, 363 | "outputs": [ 364 | { 365 | "data": { 366 | "text/plain": [ 367 | "((4, 768),\n", 368 | " array([[-0.1776 , -0.2524 , 0.2241 , ..., 0.568 , 0.501 , -0.3445 ],\n", 369 | " [-0.1724 , -0.3872 , 0.0874 , ..., 0.247 , 0.6016 , -0.3633 ],\n", 370 | " [ 0.1284 , -0.0255 , 0.1407 , ..., 0.4292 , 0.4458 , -0.1812 ],\n", 371 | " [-0.04327, -0.3618 , 0.4766 , ..., 0.3176 , 0.2566 , -0.4915 ]],\n", 372 | " dtype=float16))" 373 | ] 374 | }, 375 | "execution_count": 8, 376 | "metadata": {}, 377 | "output_type": "execute_result" 378 | } 379 | ], 380 | "source": [ 381 | "embdic['---1_cCGK4M'].shape, embdic['---1_cCGK4M']" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": null, 387 | "metadata": {}, 388 | "outputs": [], 
389 | "source": [] 390 | } 391 | ], 392 | "metadata": { 393 | "kernelspec": { 394 | "display_name": "ar", 395 | "language": "python", 396 | "name": "python3" 397 | }, 398 | "language_info": { 399 | "codemirror_mode": { 400 | "name": "ipython", 401 | "version": 3 402 | }, 403 | "file_extension": ".py", 404 | "mimetype": "text/x-python", 405 | "name": "python", 406 | "nbconvert_exporter": "python", 407 | "pygments_lexer": "ipython3", 408 | "version": "3.9.18" 409 | } 410 | }, 411 | "nbformat": 4, 412 | "nbformat_minor": 2 413 | } 414 | -------------------------------------------------------------------------------- /app/icbhi_sprs/patch_scl_icbhi2017.diff: -------------------------------------------------------------------------------- 1 | --- _org/app/app_main.py 2024-05-14 09:58:27.909947715 +0900 2 | +++ app/icbhi_sprs/app_main.py 2024-05-14 10:08:03.645092839 +0900 3 | @@ -1,16 +1,18 @@ 4 | -import os 5 | +import os 6 | import torch 7 | import torch.nn as nn 8 | +from pathlib import Path 9 | +import pandas as pd 10 | from torchaudio import transforms as T 11 | import torch.nn.functional as F 12 | from torchinfo import summary 13 | from augmentations import SpecAugment 14 | -from models import CNN6, CNN10, CNN14, Projector, LinearClassifier 15 | +from models import CNN6, CNN10, CNN14, Projector, LinearClassifier, RT_LMS_M2D 16 | from dataset import ICBHI, SPRS 17 | from utils import Normalize, Standardize 18 | from losses import SupConLoss, SupConCELoss 19 | from ce import train_ce 20 | -from hybrid import train_supconce 21 | +# from hybrid import train_supconce 22 | from args import args 23 | if args.method == 'scl': 24 | from scl import train_scl, linear_scl 25 | @@ -26,12 +28,13 @@ 26 | elif args.dataset == 'SPRS': 27 | DEFAULT_NUM_CLASSES = 7 28 | DEFAULT_OUT_DIM = 128 #for ssl embedding space dimension 29 | -DEFAULT_NFFT = 1024 30 | -DEFAULT_NMELS = 64 31 | -DEFAULT_WIN_LENGTH = 1024 32 | -DEFAULT_HOP_LENGTH = 512 33 | +DEFAULT_NFFT = 400 34 | +DEFAULT_NMELS = 80 35 | +DEFAULT_WIN_LENGTH = 400 36 | +DEFAULT_HOP_LENGTH = 160 37 | DEFAULT_FMIN = 50 38 | -DEFAULT_FMAX = 2000 39 | +DEFAULT_FMAX = 8000 40 | +args.backbone = 'm2d' 41 | 42 | # Model definition 43 | if args.method == 'sl': 44 | @@ -52,6 +55,9 @@ 45 | elif args.backbone == 'cnn14': 46 | PATH_TO_WEIGHTS = os.path.join(args.weightspath, 'Cnn14_mAP=0.431.pth') 47 | model = CNN14(num_classes=DEFAULT_NUM_CLASSES, do_dropout=args.dropout, embed_only=embed_only, from_scratch=args.scratch, path_to_weights=PATH_TO_WEIGHTS, device=args.device) 48 | +elif args.backbone == 'm2d': 49 | + model = RT_LMS_M2D(num_classes=DEFAULT_NUM_CLASSES, embed_only=embed_only, weight_file=args.weightspath, training_mask=0.0, freeze_embed=args.freeze_embed, adjust_pos=args.adjust_pos) 50 | + model = model.to(args.device) 51 | s = summary(model, device=args.device) 52 | nparams = s.trainable_params 53 | 54 | @@ -59,7 +65,13 @@ 55 | melspec = T.MelSpectrogram(n_fft=DEFAULT_NFFT, n_mels=DEFAULT_NMELS, win_length=DEFAULT_WIN_LENGTH, hop_length=DEFAULT_HOP_LENGTH, f_min=DEFAULT_FMIN, f_max=DEFAULT_FMAX).to(args.device) 56 | normalize = Normalize() 57 | melspec = torch.nn.Sequential(melspec, normalize) 58 | -standardize = Standardize(device=args.device) 59 | +if True: ## Switch to False for calculating statistics 60 | + stat_mean, stat_std = [0.3671, 0.2391] if args.dataset == 'ICBHI' else [0.2000, 0.2094] 61 | +else: 62 | + print('**** FOR STATS CALCULATION ONLY ****') 63 | + stat_mean, stat_std = 0., 1. 
64 | +print(f'** Using T.MelSpectrogram & Standardize({stat_mean}, std={stat_std}) **') 65 | +standardize = Standardize(mean=stat_mean, std=stat_std, device=args.device) 66 | 67 | # Data transformations 68 | specaug = SpecAugment(freq_mask=args.freqmask, time_mask=args.timemask, freq_stripes=args.freqstripes, time_stripes=args.timestripes).to(args.device) 69 | @@ -72,13 +84,29 @@ 70 | val_ds = ICBHI(data_path=args.datapath, metadatafile=args.metadata, duration=args.duration, split='test', device=args.device, samplerate=args.samplerate, pad_type=args.pad, meta_label=args.metalabel) 71 | elif args.dataset == 'SPRS': 72 | train_ds = SPRS(data_path=args.datapath, metadatafile=args.metadata, duration=args.duration, split='train', device="cpu", samplerate=args.samplerate, pad_type=args.pad, meta_label=args.metalabel) 73 | - if args.mode == 'intra': 74 | + if args.appmode == 'intra': 75 | val_ds = SPRS(data_path=args.datapath, metadatafile=args.metadata, duration=args.duration, split='intra_test', device="cpu", samplerate=args.samplerate, pad_type=args.pad, meta_label=args.metalabel) 76 | - elif args.mode == 'inter': 77 | + elif args.appmode == 'inter': 78 | val_ds = SPRS(data_path=args.datapath, metadatafile=args.metadata, duration=args.duration, split='inter_test', device="cpu", samplerate=args.samplerate, pad_type=args.pad, meta_label=args.metalabel) 79 | train_loader = torch.utils.data.DataLoader(train_ds, batch_size=args.bs, shuffle=True) 80 | val_loader = torch.utils.data.DataLoader(val_ds, batch_size=args.bs, shuffle=False) 81 | 82 | +# ***** Calculating statistics of your dataset ***** 83 | +# 1. Change True to False in "if True:" above. 84 | +# 2. Change the following False to True 85 | +# 3. Run: python app_main.py --dataset ICBHI --datapath data/ICBHI --weightspath ../m2d_vit_base-80x200p16x4-230529/random 86 | +# or python app_main.py --dataset SPRS --datapath data/SPRS --weightspath ../m2d_vit_base-80x200p16x4-230529/random 87 | +if False: 88 | + Xs = [] 89 | + for X, *_ in train_loader: 90 | + with torch.no_grad(): 91 | + X = train_transform(X.to('cuda')) 92 | + Xs.append(X.cpu()) 93 | + X = torch.vstack(Xs) 94 | + print(X.mean(), X.std()) 95 | + import pdb; pdb.set_trace() 96 | + exit(0) 97 | + 98 | ### Optimizer 99 | if METHOD == 'sl': 100 | optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.wd) 101 | @@ -105,8 +133,9 @@ 102 | else: 103 | criterion_ce = nn.CrossEntropyLoss() 104 | 105 | +print(args) 106 | if METHOD == 'sl': 107 | - history = train_ce(model, train_loader, val_loader, train_transform, val_transform, criterion_ce, optimizer, args.epochs, scheduler) 108 | + history = train_ce(model, train_loader, val_loader, train_transform, val_transform, criterion_ce, optimizer, args.epochs, scheduler, DEFAULT_NUM_CLASSES, args.split_iter) 109 | del model 110 | 111 | elif METHOD == 'scl': 112 | @@ -126,5 +155,38 @@ 113 | history = train_supconce(model, projector, classifier, train_loader, val_loader, train_transform, val_transform, criterion, criterion_ce, optimizer, args.epochs, scheduler) 114 | del model; del projector; del classifier 115 | 116 | +report, (best_sp, best_se, best_icbhi_score, best_weight), train_losses, val_losses, train_se_scores, train_sp_scores, train_icbhi_scores, train_acc_scores, val_se_scores, val_sp_scores, val_icbhi_scores, val_acc_scores = history 117 | +scores_csv = Path('results')/(str(args.dataset).lower() + '-scores.csv') 118 | +scores_csv.parent.mkdir(parents=True, exist_ok=True) 119 | +weight_name = 
Path(args.weightspath).parent.name + '_' + Path(args.weightspath).stem 120 | +model_name = f'{args.backbone}-{METHOD}-{weight_name}-lr{args.lr}-bs{args.bs}' 121 | +if args.split_iter > 1: model_name += f's{args.split_iter}' 122 | +if args.freeze_embed: model_name += 'Z' 123 | +if args.adjust_pos: model_name += 'P' 124 | +text_all_args = str(dict(mode=model_name, **dict(vars(args)))) 125 | +report = f'{model_name}: {report}' 126 | +print(report) 127 | + 128 | +weight_path = Path('results/checkpoints') 129 | +weight_path.mkdir(parents=True, exist_ok=True) 130 | +torch.save(best_weight, weight_path/(model_name + '.pth')) 131 | + 132 | +# scores 133 | +try: 134 | + dforg = pd.read_csv(scores_csv) 135 | +except: 136 | + print(f'Create a new {scores_csv}') 137 | + dforg = pd.DataFrame() 138 | +df = pd.DataFrame(dict(model=[model_name], best_sp=[best_sp], best_se=[best_se], best_icbhi_score=[best_icbhi_score], report=[report], args=[text_all_args])) 139 | +pd.concat([dforg, df]).to_csv(scores_csv, index=None) 140 | + 141 | +# logs 142 | +epoch_logs = dict(train_losses=train_losses, val_losses=val_losses, train_se_scores=train_se_scores, train_sp_scores=train_sp_scores, 143 | + train_icbhi_scores=train_icbhi_scores, train_acc_scores=train_acc_scores, val_se_scores=val_se_scores, 144 | + val_sp_scores=val_sp_scores, val_icbhi_scores=val_icbhi_scores, val_acc_scores=val_acc_scores) 145 | +df = pd.DataFrame(epoch_logs) 146 | +Path('results/logs').mkdir(parents=True, exist_ok=True) 147 | +df.to_csv(f'results/logs/{weight_name}.csv') 148 | + 149 | del train_ds; del val_ds 150 | -del train_loader; del val_loader 151 | \ ファイル末尾に改行がありません 152 | +del train_loader; del val_loader 153 | --- _org/app/models.py 2024-05-14 10:06:11.612480963 +0900 154 | +++ app/icbhi_sprs/models.py 2024-05-14 10:01:31.634951532 +0900 155 | @@ -1,3 +1,5 @@ 156 | +import sys 157 | +sys.path.append('../..') 158 | import torch 159 | import torch.nn as nn 160 | import torch.nn.functional as F 161 | @@ -288,8 +290,43 @@ 162 | def cnn14(**kwargs): 163 | return CNN14(**kwargs) 164 | 165 | + 166 | +from m2d.runtime_audio import RuntimeM2D, Config 167 | +class RT_LMS_M2D(RuntimeM2D): 168 | + def __init__(self, num_classes=4, embed_only=False, training_mask=0.0, weight_file='m2d_vit_base-80x608p16x16-220930-mr7/checkpoint-300.pth', freeze_embed=None, adjust_pos=False): 169 | + cfg = Config() 170 | + if adjust_pos: 171 | + cfg.dur_frames = 801 172 | + super().__init__(cfg=cfg, weight_file=weight_file, training_mask=training_mask, encoder_only=True, freeze_embed=freeze_embed) 173 | + self.embed_only = embed_only 174 | + if not embed_only: 175 | + self.linear = nn.Linear(self.cfg.feature_d, num_classes, bias=True) 176 | + # remove unneeded modules for encoding audio 177 | + #del self.backbone.decoder_blocks 178 | + #del self.backbone.target_blocks 179 | + self.accum_mean, self.accum_std = 0., 1. 
180 | + 181 | + def forward(self, features): 182 | + # def ema(old, new): 183 | + # alpha = 0.999 184 | + # return alpha*old + (1 - alpha)*new 185 | + # _mean, _std = features.mean(), features.std() 186 | + # self.accum_mean, self.accum_std = ema(self.accum_mean, _mean), ema(self.accum_std, _std) 187 | + # print(_mean, _std, self.accum_mean, self.accum_std) 188 | + x = self.encode_lms(features) # [128, 51, 3840] 189 | + x = torch.mean(x, dim=1) # [128, 768] 190 | + if self.embed_only: 191 | + return x 192 | + return self.linear(x) # [128, num_classes] 193 | + 194 | + 195 | +def m2d(**kwargs): 196 | + return RT_LMS_M2D(**kwargs) 197 | + 198 | + 199 | model_dict = { 200 | 'cnn6' : [cnn6, 512], 201 | 'cnn10' : [cnn10, 512], 202 | 'cnn14' : [cnn14, 2048], 203 | + 'm2d': [m2d, 768*5] 204 | } 205 | --- _org/app/args.py 2024-05-14 10:34:44.317784486 +0900 206 | +++ app/icbhi_sprs/args.py 2024-05-14 10:35:13.997945151 +0900 207 | @@ -19,7 +19,7 @@ 208 | 209 | #Data 210 | parser.add_argument("--dataset", type=str, default='ICBHI') # which dataset to use ['ICBHI', 'SPRS'] 211 | -parser.add_argument("--mode", type=str, default='inter') # for SPRS dataset, there are two test splits ['inter', 'intra'] 212 | +parser.add_argument("--appmode", type=str, default='inter') # for SPRS dataset, there are two test splits ['inter', 'intra'] 213 | parser.add_argument("--datapath", type=str, default='data/ICBHI') # path of the dataset files 214 | parser.add_argument("--metadata", type=str, default='metadata.csv') #metadata file 215 | parser.add_argument("--metalabel", type=str, default='sa') #meta label used for mscl, 's' stands for sex, 'a' for age, and 'c' for respiratory class 216 | @@ -44,4 +44,10 @@ 217 | parser.add_argument("--alpha", type=float, default=0.5) #tradeoff between cross entropy and nt xent 218 | parser.add_argument("--lam", type=float, default=0.75) #tradeoff between scl label and scl metadata 219 | 220 | -args = parser.parse_args() 221 | \ ファイル末尾に改行がありません 222 | +#M2D 223 | +parser.add_argument("--freeze_embed", action='store_true') #freeze ViT embedding layer 224 | +parser.add_argument("--adjust_pos", action='store_true') #adjust positional embedding length 225 | +parser.add_argument("--split_iter", type=int, default=1) #for a low-memory run, split actual batch size by this number 226 | + 227 | + 228 | +args = parser.parse_args() 229 | --- _org/app/ce.py 2024-05-14 10:34:44.317784486 +0900 230 | +++ app/icbhi_sprs/ce.py 2024-05-14 10:35:17.561964443 +0900 231 | @@ -2,35 +2,42 @@ 232 | import torch 233 | from args import args 234 | 235 | -def train_epoch(model, train_loader, train_transform, criterion, optimizer, scheduler): 236 | +def train_epoch(model, train_loader, train_transform, criterion, optimizer, scheduler, n_classes, K=1): 237 | 238 | - TP = [0, 0, 0 ,0] 239 | - GT = [0, 0, 0, 0] 240 | + TP = [0 for _ in range(n_classes)] 241 | + GT = [0 for _ in range(n_classes)] 242 | 243 | epoch_loss = 0.0 244 | 245 | model.train() 246 | 247 | - for data, target, _ in train_loader: 248 | - data, target = data.to(args.device), target.to(args.device) 249 | + for batch_data, batch_target, _ in train_loader: 250 | + batch_data, batch_target = batch_data.to(args.device), batch_target.to(args.device) 251 | 252 | with torch.no_grad(): 253 | - data_t = train_transform(data) 254 | + batch_data_t = train_transform(batch_data) 255 | 256 | optimizer.zero_grad() 257 | 258 | - output = model(data_t) 259 | - loss = criterion(output, target) 260 | + L = len(batch_data_t) 261 | + D = L // K 262 | + for i in 
range(K): 263 | + data = batch_data_t[i*D:(i+1)*D] 264 | + target = batch_target[i*D:(i+1)*D] 265 | + 266 | + output = model(data) 267 | + loss = criterion(output, target) 268 | 269 | - epoch_loss += loss.item() 270 | + epoch_loss += loss.item() 271 | 272 | - _, labels_predicted = torch.max(output, dim=1) 273 | + _, labels_predicted = torch.max(output, dim=1) 274 | 275 | - for idx in range(len(TP)): 276 | - TP[idx] += torch.logical_and((labels_predicted==idx),(target==idx)).sum().item() 277 | - GT[idx] += (target==idx).sum().item() 278 | + for idx in range(len(TP)): 279 | + TP[idx] += torch.logical_and((labels_predicted==idx),(target==idx)).sum().item() 280 | + GT[idx] += (target==idx).sum().item() 281 | 282 | - loss.backward() 283 | + loss.backward() 284 | + 285 | optimizer.step() 286 | 287 | scheduler.step() 288 | @@ -43,10 +50,10 @@ 289 | 290 | return epoch_loss, se, sp, icbhi_score, acc 291 | 292 | -def val_epoch(model, val_loader, val_transform, criterion): 293 | +def val_epoch(model, val_loader, val_transform, criterion, n_classes, K=1): 294 | 295 | - TP = [0, 0, 0 ,0] 296 | - GT = [0, 0, 0, 0] 297 | + TP = [0 for _ in range(n_classes)] 298 | + GT = [0 for _ in range(n_classes)] 299 | 300 | epoch_loss = 0.0 301 | 302 | @@ -54,18 +61,24 @@ 303 | 304 | with torch.no_grad(): 305 | 306 | - for data, target, _ in val_loader: 307 | - data, target = data.to(args.device), target.to(args.device) 308 | + for batch_data, batch_target, _ in val_loader: 309 | + batch_data, batch_target = batch_data.to(args.device), batch_target.to(args.device) 310 | 311 | - output = model(val_transform(data)) 312 | - loss = criterion(output, target) 313 | - epoch_loss += loss.item() 314 | - 315 | - _, labels_predicted = torch.max(output, dim=1) 316 | - 317 | - for idx in range(len(TP)): 318 | - TP[idx] += torch.logical_and((labels_predicted==idx),(target==idx)).sum().item() 319 | - GT[idx] += (target==idx).sum().item() 320 | + L = len(batch_data) 321 | + D = L // K 322 | + for i in range(K): 323 | + data = batch_data[i*D:(i+1)*D] 324 | + target = batch_target[i*D:(i+1)*D] 325 | + 326 | + output = model(val_transform(data)) 327 | + loss = criterion(output, target) 328 | + epoch_loss += loss.item() 329 | + 330 | + _, labels_predicted = torch.max(output, dim=1) 331 | + 332 | + for idx in range(len(TP)): 333 | + TP[idx] += torch.logical_and((labels_predicted==idx),(target==idx)).sum().item() 334 | + GT[idx] += (target==idx).sum().item() 335 | 336 | 337 | epoch_loss = epoch_loss / len(val_loader) 338 | @@ -76,7 +89,7 @@ 339 | 340 | return epoch_loss, se, sp, icbhi_score, acc 341 | 342 | -def train_ce(model, train_loader, val_loader, train_transform, val_transform, criterion, optimizer, epochs, scheduler): 343 | +def train_ce(model, train_loader, val_loader, train_transform, val_transform, criterion, optimizer, epochs, scheduler, n_classes, K=1): 344 | 345 | train_losses = []; val_losses = []; train_se_scores = []; train_sp_scores = []; train_icbhi_scores = []; train_acc_scores = []; val_se_scores = []; val_sp_scores = []; val_icbhi_scores = []; val_acc_scores = [] 346 | 347 | @@ -86,16 +99,17 @@ 348 | best_sp = 0 349 | best_epoch_acc = 0 350 | best_epoch_icbhi = 0 351 | + best_weight = None 352 | 353 | for i in range(1, epochs+1): 354 | 355 | print(f"Epoch {i}") 356 | 357 | - train_loss, train_se, train_sp, train_icbhi_score, train_acc = train_epoch(model, train_loader, train_transform, criterion, optimizer, scheduler) 358 | + train_loss, train_se, train_sp, train_icbhi_score, train_acc = train_epoch(model, 
train_loader, train_transform, criterion, optimizer, scheduler, n_classes, K) 359 | train_losses.append(train_loss); train_se_scores.append(train_se); train_sp_scores.append(train_sp); train_icbhi_scores.append(train_icbhi_score); train_acc_scores.append(train_acc) 360 | print(f"Train loss : {format(train_loss, '.4f')}\tTrain SE : {format(train_se, '.4f')}\tTrain SP : {format(train_sp, '.4f')}\tTrain Score : {format(train_icbhi_score, '.4f')}\tTrain Acc : {format(train_acc, '.4f')}") 361 | 362 | - val_loss, val_se, val_sp, val_icbhi_score, val_acc = val_epoch(model, val_loader, val_transform, criterion) 363 | + val_loss, val_se, val_sp, val_icbhi_score, val_acc = val_epoch(model, val_loader, val_transform, criterion, n_classes, K) 364 | val_losses.append(val_loss); val_se_scores.append(val_se); val_sp_scores.append(val_sp); val_icbhi_scores.append(val_icbhi_score); val_acc_scores.append(val_acc) 365 | print(f"Val loss : {format(val_loss, '.4f')}\tVal SE : {format(val_se, '.4f')}\tVal SP : {format(val_sp, '.4f')}\tVal Score : {format(val_icbhi_score, '.4f')}\tVal Acc : {format(val_acc, '.4f')}") 366 | 367 | @@ -112,11 +126,15 @@ 368 | best_icbhi_score = val_icbhi_score 369 | best_se = val_se 370 | best_sp = val_sp 371 | + best_weight = {k: v.cpu() for k, v in model.state_dict().items()} 372 | 373 | if best_val_acc < val_acc: 374 | best_epoch_acc = i 375 | best_val_acc = val_acc 376 | 377 | - print(f"best icbhi score is {format(best_icbhi_score, '.4f')} (se:{format(best_se, '.4f')} sp:{format(best_sp, '.4f')}) at epoch {best_epoch_icbhi}") 378 | + print(f"Val loss : {format(val_loss, '.4f')}\tVal SE : {format(val_se, '.4f')}\tVal SP : {format(val_sp, '.4f')}\tVal Score : {format(val_icbhi_score, '.4f')}\tVal Acc : {format(val_acc, '.4f')} best_icbhi_score so far: {format(best_icbhi_score, '.4f')}") 379 | + 380 | + report = f"best icbhi score is {format(best_icbhi_score, '.4f')} (se:{format(best_se, '.4f')} sp:{format(best_sp, '.4f')}) at epoch {best_epoch_icbhi}" 381 | + print(report) 382 | 383 | - return train_losses, val_losses, train_se_scores, train_sp_scores, train_icbhi_scores, train_acc_scores, val_se_scores, val_sp_scores, val_icbhi_scores, val_acc_scores 384 | \ ファイル末尾に改行がありません 385 | + return report, (best_sp, best_se, best_icbhi_score, best_weight), train_losses, val_losses, train_se_scores, train_sp_scores, train_icbhi_scores, train_acc_scores, val_se_scores, val_sp_scores, val_icbhi_scores, val_acc_scores 386 | --------------------------------------------------------------------------------