├── superb └── upstream │ └── m2d │ ├── __init__.py │ ├── hubconf.py │ ├── expert.py │ └── README.md ├── LICENSE.pdf ├── clap ├── image_figure2.jpg ├── image-ESC10-Viz.jpg ├── image_Table3_CLAP_LE.png ├── image_Table4_CLAP_FT.png ├── image_Table5_CLAP_ZS.png ├── README.md └── Note-ACalt4_GTEbase.ipynb ├── LICENSE ├── image-AppGuideChart.png ├── image-key-visual-m2d.jpg ├── speech ├── figure-github.jpg ├── figure2system-s.jpg ├── bat │ ├── asv.sh │ ├── ic.sh │ ├── ks.sh │ ├── sid.sh │ ├── pr.sh │ ├── ss.sh │ └── er.sh ├── extract_offline_ls960.py ├── README.md └── speech_dataset.py ├── app ├── circor │ ├── EMBC_TableII.png │ ├── bat │ │ ├── m2d_ftcircor_rand.sh │ │ ├── m2d_ftcircor.sh │ │ ├── ast_ftcircor.sh │ │ ├── ast_ftcircor_noaug.sh │ │ ├── cnn14_ftcircor.sh │ │ ├── cnn14_ftcircor_noaug.sh │ │ ├── byola_ftcircor.sh │ │ └── byola_ftcircor_noaug.sh │ ├── diff-heart-murmur-detection.patch │ ├── diff-evar.patch │ ├── README.md │ └── circor_eval.py └── icbhi_sprs │ ├── eval_icbhi.sh │ ├── eval_sprs.sh │ ├── cut_data_sprs.py │ ├── README_ICBHI_SPRS.md │ └── patch_scl_icbhi2017.diff ├── image-key-vis-m2d-clap.jpg ├── audioset ├── table-V-M2D-AS-le.png ├── table-VI-M2D-AS-ft.png └── README.md ├── requirements.txt ├── data ├── files_audioset.csv └── README.md ├── util ├── ft-spc.sh ├── ft-as20k.sh ├── ft-esc50.sh ├── ft-vc1.sh ├── to_encoder_only_weight.py ├── ft-as2m.sh └── make_as_weighted_list.py ├── quick_eval.sh ├── all_eval.sh ├── .gitignore ├── common.py ├── examples ├── Example_1.ipynb ├── Example_old4_CLAP2024.ipynb └── Example_4_CLAP2025.ipynb ├── wav_to_lms.py ├── Guide_app.md └── audio_dataset.py /superb/upstream/m2d/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /LICENSE.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nttcslab/m2d/HEAD/LICENSE.pdf -------------------------------------------------------------------------------- /clap/image_figure2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nttcslab/m2d/HEAD/clap/image_figure2.jpg -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Please find the LICENSE at https://github.com/nttcslab/m2d/blob/master/LICENSE.pdf 2 | -------------------------------------------------------------------------------- /clap/image-ESC10-Viz.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nttcslab/m2d/HEAD/clap/image-ESC10-Viz.jpg -------------------------------------------------------------------------------- /image-AppGuideChart.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nttcslab/m2d/HEAD/image-AppGuideChart.png -------------------------------------------------------------------------------- /image-key-visual-m2d.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nttcslab/m2d/HEAD/image-key-visual-m2d.jpg -------------------------------------------------------------------------------- /speech/figure-github.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/nttcslab/m2d/HEAD/speech/figure-github.jpg -------------------------------------------------------------------------------- /app/circor/EMBC_TableII.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nttcslab/m2d/HEAD/app/circor/EMBC_TableII.png -------------------------------------------------------------------------------- /image-key-vis-m2d-clap.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nttcslab/m2d/HEAD/image-key-vis-m2d-clap.jpg -------------------------------------------------------------------------------- /speech/figure2system-s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nttcslab/m2d/HEAD/speech/figure2system-s.jpg -------------------------------------------------------------------------------- /audioset/table-V-M2D-AS-le.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nttcslab/m2d/HEAD/audioset/table-V-M2D-AS-le.png -------------------------------------------------------------------------------- /clap/image_Table3_CLAP_LE.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nttcslab/m2d/HEAD/clap/image_Table3_CLAP_LE.png -------------------------------------------------------------------------------- /clap/image_Table4_CLAP_FT.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nttcslab/m2d/HEAD/clap/image_Table4_CLAP_FT.png -------------------------------------------------------------------------------- /clap/image_Table5_CLAP_ZS.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nttcslab/m2d/HEAD/clap/image_Table5_CLAP_ZS.png -------------------------------------------------------------------------------- /audioset/table-VI-M2D-AS-ft.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nttcslab/m2d/HEAD/audioset/table-VI-M2D-AS-ft.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | numpy 3 | matplotlib 4 | torch 5 | torchaudio 6 | torchvision 7 | tensorboard 8 | fire 9 | tqdm 10 | scikit-learn 11 | librosa 12 | nnAudio 13 | timm==0.4.5 14 | transformers 15 | einops 16 | easydict 17 | torchlibrosa 18 | -------------------------------------------------------------------------------- /app/icbhi_sprs/eval_icbhi.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ $# -lt 2 ]; then 4 | n_iter=3 5 | else 6 | n_iter=$2 7 | fi 8 | 9 | if [ $# -lt 3 ]; then 10 | lr_prm=5e-5 11 | else 12 | lr_prm=$3 13 | fi 14 | 15 | echo Repeating $n_iter times... 
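# Arguments: $1 = M2D weight passed to --weightspath, $2 = number of repeats (default 3), $3 = learning rate (default 5e-5).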
16 | 17 | for i in $(seq $n_iter); do 18 | cmdline="CUDA_VISIBLE_DEVICES=0 python app_main.py --method sl --backbone m2d --epochs 150 --bs 64 --weightspath $1 --lr $lr_prm --freeze_embed --split_iter 4" 19 | echo $cmdline 20 | eval $cmdline 21 | done 22 | -------------------------------------------------------------------------------- /data/files_audioset.csv: -------------------------------------------------------------------------------- 1 | file_name 2 | audioset_lms/balanced_train_segments/--aE2O5G5WE_0.000.npy 3 | audioset_lms/balanced_train_segments/--cB2ZVjpnA_30.000.npy 4 | audioset_lms/balanced_train_segments/--aaILOrkII_200.000.npy 5 | audioset_lms/balanced_train_segments/--ZhevVpy1s_50.000.npy 6 | audioset_lms/balanced_train_segments/--aO5cdqSAg_30.000.npy 7 | audioset_lms/balanced_train_segments/--PJHxphWEs_30.000.npy 8 | audioset_lms/balanced_train_segments/--ekDLDTUXA_30.000.npy 9 | ** please replace this sample with yours ** 10 | -------------------------------------------------------------------------------- /app/icbhi_sprs/eval_sprs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ $# -lt 2 ]; then 4 | n_iter=3 5 | else 6 | n_iter=$2 7 | fi 8 | 9 | if [ $# -lt 3 ]; then 10 | lr_prm=5e-6 11 | else 12 | lr_prm=$3 13 | fi 14 | 15 | echo Repeating $n_iter times... 16 | 17 | for i in $(seq $n_iter); do 18 | cmdline="CUDA_VISIBLE_DEVICES=0 python app_main.py --dataset SPRS --datapath data/SPRS --method sl --backbone m2d --epochs 50 --bs 64 --weightspath $1 --lr $lr_prm --freeze_embed --split_iter 4" 19 | echo $cmdline 20 | eval $cmdline 21 | done 22 | -------------------------------------------------------------------------------- /speech/bat/asv.sh: -------------------------------------------------------------------------------- 1 | gpu=$1 2 | upmodel=$2 3 | ckpt=$3 4 | lr=5e-5 5 | task=ASV 6 | seed=$4 7 | 8 | parentpath=$(dirname $ckpt) 9 | parent=$(basename $parentpath) 10 | ckptbase=$(basename $ckpt) 11 | ckptstem=${ckptbase%.*} 12 | expbase=$parent-$ckptstem 13 | 14 | expname=$expbase-$task-lr$lr-s$seed 15 | 16 | echo $expname 17 | CUDA_VISIBLE_DEVICES=$gpu python run_downstream.py -m train -n $expname -u $upmodel -d sv_voxceleb1 -o "config.optimizer.lr=$lr" -k $ckpt,-11.070931,4.1807961 --seed $seed 18 | CUDA_VISIBLE_DEVICES=$gpu ./downstream/sv_voxceleb1/test_expdir.sh result/downstream/$expname /lab/data/superb/voxceleb1 19 | -------------------------------------------------------------------------------- /speech/bat/ic.sh: -------------------------------------------------------------------------------- 1 | gpu=$1 2 | upmodel=$2 3 | ckpt=$3 4 | lr=1e-3 5 | task=IC 6 | seed=$4 7 | 8 | parentpath=$(dirname $ckpt) 9 | parent=$(basename $parentpath) 10 | ckptbase=$(basename $ckpt) 11 | ckptstem=${ckptbase%.*} 12 | expbase=$parent-$ckptstem 13 | 14 | expname=$expbase-$task-lr$lr-s$seed 15 | 16 | echo $expname 17 | CUDA_VISIBLE_DEVICES=$gpu python run_downstream.py -m train -n $expname -u $upmodel -d fluent_commands -o "config.optimizer.lr=$lr" -k $ckpt,-13.017439842224121,4.417759895324707 --seed $seed 18 | CUDA_VISIBLE_DEVICES=$gpu python run_downstream.py -m evaluate -e result/downstream/$expname/dev-best.ckpt 19 | -------------------------------------------------------------------------------- /speech/bat/ks.sh: -------------------------------------------------------------------------------- 1 | gpu=$1 2 | upmodel=$2 3 | ckpt=$3 4 | lr=1e-4 5 | task=KS 6 | seed=$4 7 | 8 | parentpath=$(dirname $ckpt) 9 | 
parent=$(basename $parentpath) 10 | ckptbase=$(basename $ckpt) 11 | ckptstem=${ckptbase%.*} 12 | expbase=$parent-$ckptstem 13 | 14 | expname=$expbase-$task-lr$lr-s$seed 15 | 16 | echo $expname 17 | CUDA_VISIBLE_DEVICES=$gpu python run_downstream.py -m train -n $expname -u $upmodel -d speech_commands -o "config.optimizer.lr=$lr" -k $ckpt,-11.506255149841309,4.314857482910156 --seed $seed 18 | CUDA_VISIBLE_DEVICES=$gpu python run_downstream.py -m evaluate -e result/downstream/$expname/dev-best.ckpt 19 | -------------------------------------------------------------------------------- /app/circor/bat/m2d_ftcircor_rand.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | split=$2 4 | n_iter=$3 5 | seed=$4 6 | #lr_prm=0.001 for bs128 7 | lr_prm=0.00025 8 | bs=32 9 | gpu=0 10 | 11 | echo Repeating $n_iter times... 12 | 13 | for i in $(seq $n_iter); do 14 | weight=$1/random 15 | seed=$((seed + 1)) 16 | cmdline="CUDA_VISIBLE_DEVICES=$gpu python circor_eval.py config/m2d.yaml circor$split weight_file=$weight,encoder_only=True,freeze_embed=True --lr=$lr_prm --freq_mask 0 --time_mask 0 --training_mask 0.2 --mixup 0.0 --rrc False --epochs 50 --warmup_epochs 5 --seed $seed --batch_size $bs" 17 | echo $cmdline 18 | eval $cmdline 19 | done 20 | -------------------------------------------------------------------------------- /speech/bat/sid.sh: -------------------------------------------------------------------------------- 1 | gpu=$1 2 | upmodel=$2 3 | ckpt=$3 4 | lr=1e-3 5 | task=SID 6 | seed=$4 7 | 8 | parentpath=$(dirname $ckpt) 9 | parent=$(basename $parentpath) 10 | ckptbase=$(basename $ckpt) 11 | ckptstem=${ckptbase%.*} 12 | expbase=$parent-$ckptstem 13 | 14 | expname=$expbase-$task-lr$lr-s$seed 15 | 16 | echo $expname 17 | CUDA_VISIBLE_DEVICES=$gpu python run_downstream.py -m train -n $expname -u $upmodel -d voxceleb1 -o "config.optimizer.lr=$lr" -k $ckpt,-10.571271,4.3681135 --seed $seed 18 | CUDA_VISIBLE_DEVICES=$gpu python run_downstream.py -m evaluate -n $expname -d voxceleb1 -e result/downstream/$expname/dev-best.ckpt 19 | -------------------------------------------------------------------------------- /speech/bat/pr.sh: -------------------------------------------------------------------------------- 1 | gpu=$1 2 | upmodel=$2 3 | ckpt=$3 4 | lr=1e-3 5 | task=PR 6 | seed=$4 7 | 8 | parentpath=$(dirname $ckpt) 9 | parent=$(basename $parentpath) 10 | ckptbase=$(basename $ckpt) 11 | ckptstem=${ckptbase%.*} 12 | expbase=$parent-$ckptstem 13 | 14 | expname=$expbase-$task-lr$lr-s$seed 15 | 16 | echo $expname 17 | CUDA_VISIBLE_DEVICES=$gpu python run_downstream.py -m train -n $expname -u $upmodel -d ctc -c downstream/ctc/libriphone.yaml -o "config.optimizer.lr=$lr" -k $ckpt,-10.43253231048584,4.241369724273682 --seed $seed 18 | CUDA_VISIBLE_DEVICES=$gpu python run_downstream.py -m evaluate -n $expname -d ctc -e result/downstream/$expname/dev-best.ckpt 19 | -------------------------------------------------------------------------------- /app/circor/bat/m2d_ftcircor.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | split=$2 4 | n_iter=$3 5 | seed=$4 6 | #lr_prm=0.001 for bs128 7 | lr_prm=0.00025 8 | bs=32 9 | gpu=0 10 | 11 | echo Repeating $n_iter times... 
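# Arguments: $1 = folder containing checkpoint-*.pth, $2 = CirCor split (1/2/3), $3 = number of repeats, $4 = base seed, $5.. = checkpoint epochs to evaluate.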
12 | 13 | for i in $(seq $n_iter); do 14 | for w in ${@:5}; do 15 | weight=$1/checkpoint-$w.pth 16 | seed=$((seed + 1)) 17 | cmdline="CUDA_VISIBLE_DEVICES=$gpu python circor_eval.py config/m2d.yaml circor$split weight_file=$weight,encoder_only=True,freeze_embed=True --lr=$lr_prm --freq_mask 0 --time_mask 0 --training_mask 0.0 --mixup 0.0 --rrc False --epochs 50 --warmup_epochs 5 --seed $seed --batch_size $bs" 18 | echo $cmdline 19 | eval $cmdline 20 | done 21 | done 22 | -------------------------------------------------------------------------------- /app/circor/bat/ast_ftcircor.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # CUDA_VISIBLE_DEVICES=0 python circor_eval.py config/ast.yaml circor1 --lr=1e-5 --freq_mask 40 --time_mask 100 --mixup 0.0 --rrc False --epochs 50 --warmup_epochs 5 --seed 7 --batch_size 64 3 | 4 | split=$1 5 | n_iter=$2 6 | seed=$3 7 | lr_prm=0.00003 8 | bs=64 9 | gpu=0 10 | 11 | echo Repeating $n_iter times... 12 | 13 | for i in $(seq $n_iter); do 14 | seed=$((seed + 1)) 15 | cmdline="CUDA_VISIBLE_DEVICES=$gpu python circor_eval.py config/ast.yaml circor$split --lr=$lr_prm --freq_mask 40 --time_mask 100 --mixup 0.0 --rrc False --epochs 50 --warmup_epochs 5 --seed $seed --batch_size $bs" 16 | echo $cmdline 17 | eval $cmdline 18 | done 19 | -------------------------------------------------------------------------------- /app/circor/bat/ast_ftcircor_noaug.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # CUDA_VISIBLE_DEVICES=0 python circor_eval.py config/ast.yaml circor1 --lr=1e-5 --freq_mask 0 --time_mask 0 --mixup 0.0 --rrc False --epochs 50 --warmup_epochs 5 --seed 7 --batch_size 64 3 | 4 | split=$1 5 | n_iter=$2 6 | seed=$3 7 | lr_prm=0.00003 8 | bs=64 9 | gpu=0 10 | 11 | echo Repeating $n_iter times... 12 | 13 | for i in $(seq $n_iter); do 14 | seed=$((seed + 1)) 15 | cmdline="CUDA_VISIBLE_DEVICES=$gpu python circor_eval.py config/ast.yaml circor$split --lr=$lr_prm --freq_mask 0 --time_mask 0 --mixup 0.0 --rrc False --epochs 50 --warmup_epochs 5 --seed $seed --batch_size $bs" 16 | echo $cmdline 17 | eval $cmdline 18 | done 19 | -------------------------------------------------------------------------------- /app/circor/bat/cnn14_ftcircor.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # CUDA_VISIBLE_DEVICES=0 python circor_eval.py config/cnn14.yaml circor1 --lr=1e-5 --freq_mask 20 --time_mask 200 -mixup 0.5 --rrc False --epochs 50 --warmup_epochs 5 --seed 8 --batch_size 256 3 | 4 | split=$1 5 | n_iter=$2 6 | seed=$3 7 | lr_prm=1e-3 8 | bs=256 9 | gpu=0 10 | 11 | echo Repeating $n_iter times... 
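# Arguments: $1 = CirCor split (1/2/3), $2 = number of repeats, $3 = base seed.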
12 | 13 | for i in $(seq $n_iter); do 14 | seed=$((seed + 1)) 15 | cmdline="CUDA_VISIBLE_DEVICES=$gpu python circor_eval.py config/cnn14.yaml circor$split --lr=$lr_prm --freq_mask 20 --time_mask 200 -mixup 0.0 --rrc False --epochs 50 --warmup_epochs 5 --seed $seed --batch_size $bs" 16 | echo $cmdline 17 | eval $cmdline 18 | done 19 | -------------------------------------------------------------------------------- /app/circor/bat/cnn14_ftcircor_noaug.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # CUDA_VISIBLE_DEVICES=0 python circor_eval.py config/cnn14.yaml circor1 --lr=1e-5 --freq_mask 0 --time_mask 0 -mixup 0.0 --rrc False --epochs 50 --warmup_epochs 5 --seed 8 --batch_size 256 3 | 4 | split=$1 5 | n_iter=$2 6 | seed=$3 7 | lr_prm=1e-3 8 | bs=256 9 | gpu=0 10 | 11 | echo Repeating $n_iter times... 12 | 13 | for i in $(seq $n_iter); do 14 | seed=$((seed + 1)) 15 | cmdline="CUDA_VISIBLE_DEVICES=$gpu python circor_eval.py config/cnn14.yaml circor$split --lr=$lr_prm --freq_mask 0 --time_mask 0 -mixup 0.0 --rrc False --epochs 50 --warmup_epochs 5 --seed $seed --batch_size $bs" 16 | echo $cmdline 17 | eval $cmdline 18 | done 19 | -------------------------------------------------------------------------------- /speech/bat/ss.sh: -------------------------------------------------------------------------------- 1 | gpu=$1 2 | upmodel=$2 3 | ckpt=$3 4 | lr=1e-3 5 | task=SS 6 | seed=$4 7 | 8 | parentpath=$(dirname $ckpt) 9 | parent=$(basename $parentpath) 10 | ckptbase=$(basename $ckpt) 11 | ckptstem=${ckptbase%.*} 12 | expbase=$parent-$ckptstem 13 | 14 | expname=$expbase-$task-lr$lr-s$seed 15 | 16 | echo $expname 17 | CUDA_VISIBLE_DEVICES=$gpu python run_downstream.py -m train -n $expname -u $upmodel -d separation_stft2 -o "config.optimizer.lr=$lr" -k $ckpt,-9.58743667602539,4.168412208557129 --seed $seed -c downstream/separation_stft2/configs/cfg.yaml 18 | CUDA_VISIBLE_DEVICES=$gpu python run_downstream.py -m evaluate -n $expname -d separation_stft2 -e result/downstream/$expname/best-states-dev.ckpt 19 | -------------------------------------------------------------------------------- /speech/bat/er.sh: -------------------------------------------------------------------------------- 1 | gpu=$1 2 | upmodel=$2 3 | ckpt=$3 4 | lr=1e-5 5 | task=ER 6 | seed=$4 7 | 8 | parentpath=$(dirname $ckpt) 9 | parent=$(basename $parentpath) 10 | ckptbase=$(basename $ckpt) 11 | ckptstem=${ckptbase%.*} 12 | expbase=$parent-$ckptstem 13 | 14 | for test_fold in fold1 fold2 fold3 fold4 fold5; 15 | do 16 | expname=$expbase-$task-lr$lr-s$seed-$test_fold 17 | echo $expname 18 | CUDA_VISIBLE_DEVICES=$gpu python run_downstream.py -m train -n $expname -u $upmodel -d emotion -c downstream/emotion/config.yaml -o "config.optimizer.lr=$lr,, config.downstream_expert.datarc.test_fold='$test_fold'" -k $ckpt,-13.037399291992188,3.619741439819336 --seed $seed 19 | CUDA_VISIBLE_DEVICES=$gpu python run_downstream.py -m evaluate -e result/downstream/$expname/dev-best.ckpt 20 | done 21 | -------------------------------------------------------------------------------- /util/ft-spc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Usage: 4 | # bash (your m2d)/util/ft-spc.sh <# of iteration> 5 | # 6 | # Example: The parameter `300` will test the checkpoint-300.pth 7 | # cd evar 8 | # bash (your m2d)/util/ft-spc.sh (your m2d)/m2d_vit_base-80x608p16x16-221006-mr7 3 42 300 9 | 10 | n_iter=$2 11 | seed=$3 12 | 13 | echo 
**SPCV2** Repeating $n_iter times... 14 | 15 | for i in $(seq $n_iter); do 16 | for w in ${@:4}; do 17 | weight=$1/checkpoint-$w.pth 18 | seed=$((seed + 1)) 19 | cmdline="python finetune.py config/m2d.yaml spcv2 weight_file=$weight,encoder_only=True --lr=0.5 --freq_mask 30 --time_mask 48 --training_mask 0.5 --mixup 0.3 --rrc True --seed $seed" 20 | echo $cmdline 21 | eval $cmdline 22 | done 23 | done -------------------------------------------------------------------------------- /util/ft-as20k.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Usage: 4 | # bash (your m2d)/util/ft-as20k.sh <# of iteration> 5 | # 6 | # Example: The parameter `300` will test the checkpoint-300.pth 7 | # cd evar 8 | # bash (your m2d)/util/ft-as20k.sh (your m2d)/m2d_vit_base-80x608p16x16-221006-mr7 3 42 300 9 | 10 | n_iter=$2 11 | seed=$3 12 | 13 | echo **AS20K** Repeating $n_iter times... 14 | 15 | for i in $(seq $n_iter); do 16 | for w in ${@:4}; do 17 | weight=$1/checkpoint-$w.pth 18 | seed=$((seed + 1)) 19 | cmdline="python finetune.py config/m2d.yaml as20k weight_file=$weight,encoder_only=True,dur_frames=1001 --lr=0.5 --freq_mask 30 --time_mask 192 --training_mask 0.5 --mixup 0.3 --rrc True --batch_size 64 --seed $seed" 20 | echo $cmdline 21 | eval $cmdline 22 | done 23 | done -------------------------------------------------------------------------------- /util/ft-esc50.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Usage: 4 | # bash (your m2d)/util/ft-esc50.sh <# of iteration> 5 | # 6 | # Example: The parameter `300` will test the checkpoint-300.pth 7 | # cd evar 8 | # bash (your m2d)/util/ft-esc50.sh (your m2d)/m2d_vit_base-80x608p16x16-221006-mr7 3 42 300 9 | 10 | n_iter=$2 11 | seed=$3 12 | 13 | echo **ESC-50** Repeating $n_iter times... 14 | 15 | for i in $(seq $n_iter); do 16 | for w in ${@:4}; do 17 | weight=$1/checkpoint-$w.pth 18 | seed=$((seed + 1)) 19 | cmdline="python finetune.py config/m2d.yaml esc50 weight_file=$weight,encoder_only=True,dur_frames=501,freeze_embed=True --lr=0.5 --freq_mask 15 --time_mask 48 --training_mask 0.5 --mixup 0.0 --rrc True --seed $seed" 20 | echo $cmdline 21 | eval $cmdline 22 | done 23 | done -------------------------------------------------------------------------------- /util/ft-vc1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Usage: 4 | # bash (your m2d)/util/ft-vc1.sh <# of iteration> 5 | # 6 | # Example: The parameter `300` will test the checkpoint-300.pth 7 | # cd evar 8 | # bash (your m2d)/util/ft-vc1.sh (your m2d)/m2d_vit_base-80x608p16x16-221006-mr7 3 42 300 9 | 10 | n_iter=$2 11 | seed=$3 12 | 13 | echo **VC1** Repeating $n_iter times... 
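# Arguments: $1 = checkpoint folder, $2 = number of repeats, $3 = base seed, $4.. = checkpoint epochs to test (e.g., 300 evaluates checkpoint-300.pth).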
14 | 15 | for i in $(seq $n_iter); do 16 | for w in ${@:4}; do 17 | weight=$1/checkpoint-$w.pth 18 | seed=$((seed + 1)) 19 | cmdline="python finetune.py config/m2d.yaml vc1 weight_file=$weight,encoder_only=True,dur_frames=821 --optim adamw --lr=0.0005 --freq_mask 30 --time_mask 48 --training_mask 0.0 --mixup 0.0 --rrc False --epochs 50 --batch_size 64 --seed $seed" 20 | echo $cmdline 21 | eval $cmdline 22 | done 23 | done -------------------------------------------------------------------------------- /app/circor/bat/byola_ftcircor.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # CUDA_VISIBLE_DEVICES=0 python circor_eval.py config/byola.yaml circor1 weight_file=external/byol_a/pretrained_weights/AudioNTT2020-BYOLA-64x96d2048.pth --lr=3e-5 --freq_mask 20 --time_mask 50 -mixup 0.0 --rrc False --epochs 50 --warmup_epochs 5 --seed 7 --batch_size 256 3 | 4 | split=$1 5 | n_iter=$2 6 | seed=$3 7 | lr_prm=0.001 8 | bs=256 9 | gpu=0 10 | 11 | echo Repeating $n_iter times... 12 | 13 | for i in $(seq $n_iter); do 14 | weight="external/byol_a/pretrained_weights/AudioNTT2020-BYOLA-64x96d2048.pth" 15 | seed=$((seed + 1)) 16 | cmdline="CUDA_VISIBLE_DEVICES=$gpu python circor_eval.py config/byola.yaml circor$split weight_file=$weight --lr=$lr_prm --freq_mask 20 --time_mask 50 -mixup 0.0 --rrc False --epochs 50 --warmup_epochs 5 --seed $seed --batch_size $bs" 17 | echo $cmdline 18 | eval $cmdline 19 | done 20 | -------------------------------------------------------------------------------- /app/circor/bat/byola_ftcircor_noaug.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # CUDA_VISIBLE_DEVICES=0 python circor_eval.py config/byola.yaml circor1 weight_file=external/byol_a/pretrained_weights/AudioNTT2020-BYOLA-64x96d2048.pth --lr=3e-5 --freq_mask 0 --time_mask 0 -mixup 0.0 --rrc False --epochs 50 --warmup_epochs 5 --seed 7 --batch_size 256 3 | 4 | split=$1 5 | n_iter=$2 6 | seed=$3 7 | lr_prm=0.001 8 | bs=256 9 | gpu=0 10 | 11 | echo Repeating $n_iter times... 12 | 13 | for i in $(seq $n_iter); do 14 | weight="external/byol_a/pretrained_weights/AudioNTT2020-BYOLA-64x96d2048.pth" 15 | seed=$((seed + 1)) 16 | cmdline="CUDA_VISIBLE_DEVICES=$gpu python circor_eval.py config/byola.yaml circor$split weight_file=$weight --lr=$lr_prm --freq_mask 0 --time_mask 0 -mixup 0.0 --rrc False --epochs 50 --warmup_epochs 5 --seed $seed --batch_size $bs" 17 | echo $cmdline 18 | eval $cmdline 19 | done 20 | -------------------------------------------------------------------------------- /util/to_encoder_only_weight.py: -------------------------------------------------------------------------------- 1 | """A utility for M2D pre-trained weight files. 2 | This script converts an M2D weight to an encoder-only weight, resulting in a much smaller weight (1.6G to 326M). 3 | 4 | Usage: python [this script] [source checkpoint file] [output checkpoint file] 5 | """ 6 | 7 | import torch 8 | from pathlib import Path 9 | import sys 10 | sys.path.append('examples') 11 | from portable_m2d import PortableM2D 12 | 13 | src_file = sys.argv[1] 14 | dest_file = sys.argv[2] 15 | 16 | if not Path(src_file).stem.startswith('checkpoint'): 17 | print(f' **WARNING** Do not use this converter for the fine-tuned weights. HEAD WEIGHTS WILL BE LOST.') 18 | 19 | # Load the weight. All the parameters not used in the encoder-only model will be deleted. 20 | # The parameter `norm_stats` will be added if the weight does not have it. 
i.e., Old weights. 21 | model = PortableM2D(src_file) 22 | 23 | # Save the weights. 24 | Path(dest_file).parent.mkdir(exist_ok=True, parents=True) 25 | torch.save(model.backbone.state_dict(), dest_file) 26 | print(f'Saved {dest_file}.') 27 | -------------------------------------------------------------------------------- /util/ft-as2m.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Usage: 4 | # bash (your m2d)/util/ft-as2m.sh <# of iteration> 5 | # 6 | # Example: The parameter `300` will test the checkpoint-300.pth 7 | # cd evar 8 | # bash (your m2d)/util/ft-as2m.sh (your m2d)/m2d_vit_base-80x608p16x16-221006-mr7 3 42 300 9 | 10 | # CONFIGURE HERE: Set your the same data path as pre-training here 11 | # Fine-tuning on AS2M requires the log-mel spectrogram audio files. 12 | # Prepare data/audioset_lms according to the [Example preprocessing steps (AudioSet)](../data/README.md#example-preprocessing-steps-audioset). 13 | datapath=../data/audioset_lms 14 | 15 | # Fine-tuning steps follow 16 | n_iter=$2 17 | seed=$3 18 | 19 | echo **AS2M** Repeating $n_iter times... 20 | 21 | for i in $(seq $n_iter); do 22 | for w in ${@:4}; do 23 | weight=$1/checkpoint-$w.pth 24 | seed=$((seed + 1)) 25 | cmdline="python finetune.py config/m2d.yaml as weight_file=$weight,encoder_only=True,dur_frames=1001 --lr=2.0 --freq_mask 30 --time_mask 192 --training_mask 0.5 --mixup 0.5 --rrc False --epochs 70 --warmup_epochs 15 --optim lars --batch_size 64 --data_path $datapath --seed $seed" 26 | echo $cmdline 27 | eval $cmdline 28 | done 29 | done -------------------------------------------------------------------------------- /app/icbhi_sprs/cut_data_sprs.py: -------------------------------------------------------------------------------- 1 | """Data cutter. 2 | 3 | Run under the app/icbhi_sprs folder. 
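Converts SPRS dataset samples into log-mel spectrograms and saves them under ../../data/sprsound_lms/{train,val} as 0000.npy-style files (see the convert() defaults below).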
4 | """ 5 | 6 | import sys 7 | import os 8 | import fire 9 | import torch 10 | import torchaudio 11 | import librosa 12 | from pathlib import Path 13 | import pandas as pd 14 | import numpy as np 15 | sys.path.append('../..') 16 | 17 | from dataset import SPRS 18 | from m2d.runtime_audio import RuntimeM2D, Config 19 | 20 | args_device = 'cuda' 21 | args_metalabel ='sa' 22 | args_samplerate = 16000 23 | args_duration = 8 24 | args_pad = 'circular' 25 | 26 | 27 | def convert(to_dir='../../data/sprsound_lms', data_dir='./data/SPRS', metadata_csv='metadata.csv'): 28 | rt = RuntimeM2D(weight_file='m2d_vit_base-80x100p16x4-random') 29 | train_ds = SPRS(data_path=data_dir, metadatafile=metadata_csv, duration=args_duration, split='train', device="cpu", samplerate=args_samplerate, pad_type=args_pad, meta_label=args_metalabel) 30 | val_ds = SPRS(data_path=data_dir, metadatafile=metadata_csv, duration=args_duration, split='inter_test', device="cpu", samplerate=args_samplerate, pad_type=args_pad, meta_label=args_metalabel) 31 | to_dir = Path(to_dir) 32 | 33 | for split, ds in [('val', val_ds), ('train', train_ds)]: 34 | print(split) 35 | to_split = to_dir/split 36 | to_split.mkdir(parents=True, exist_ok=True) 37 | for i in range(len(ds)): 38 | sample, *_ = ds[i] 39 | with torch.no_grad(): 40 | lms = rt.to_feature(sample).cpu().numpy()[0] # 1,1,80,801 -> 1,80,801 41 | np.save(to_split/f'{i:04d}.npy', lms) 42 | print('.', end=' ') 43 | print(i) 44 | 45 | 46 | fire.Fire(convert) 47 | 48 | -------------------------------------------------------------------------------- /app/circor/diff-heart-murmur-detection.patch: -------------------------------------------------------------------------------- 1 | --- org/heart-murmur-detection/ModelEvaluation/evaluate_model.py 2024-01-12 15:29:10.126397375 +0900 2 | +++ /heart-murmur-detection/ModelEvaluation/evaluate_model.py 2023-11-15 16:47:47.351524689 +0900 3 | @@ -59,6 +59,10 @@ 4 | murmur_weighted_accuracy = compute_weighted_accuracy( 5 | murmur_labels, output_labels, murmur_classes 6 | ) # This is the murmur scoring metric. 7 | + 8 | + # UAR 9 | + murmur_uar = murmur_accuracy_classes.mean() 10 | + 11 | murmur_scores = ( 12 | murmur_classes, 13 | murmur_auroc, 14 | @@ -70,6 +74,7 @@ 15 | murmur_accuracy, 16 | murmur_accuracy_classes, 17 | murmur_weighted_accuracy, 18 | + murmur_uar, 19 | ) 20 | 21 | ( 22 | @@ -83,11 +88,12 @@ 23 | accuracy, 24 | accuracy_classes, 25 | weighted_accuracy, 26 | + uar, 27 | ) = murmur_scores 28 | murmur_output_string = ( 29 | - "AUROC,AUPRC,F-measure,Accuracy,Weighted Accuracy" 30 | - "\n{:.3f},{:.3f},{:.3f},{:.3f},{:.3f}\n".format( 31 | - auroc, auprc, f_measure, accuracy, weighted_accuracy 32 | + "AUROC,AUPRC,F-measure,Accuracy,Weighted Accuracy,UAR" 33 | + "\n{:.3f},{:.3f},{:.3f},{:.3f},{:.3f},{:.3f}\n".format( 34 | + auroc, auprc, f_measure, accuracy, weighted_accuracy, uar 35 | ) 36 | ) 37 | murmur_class_output_string = ( 38 | @@ -109,8 +115,10 @@ 39 | + murmur_class_output_string 40 | ) 41 | 42 | + print(output_string) 43 | + 44 | # Return the results. 45 | - return output_string 46 | + return murmur_scores 47 | 48 | 49 | # Find Challenge files. 
50 | -------------------------------------------------------------------------------- /quick_eval.sh: -------------------------------------------------------------------------------- 1 | cd evar 2 | GPU=0 3 | 4 | if [[ "$1" == *'p32k-'* ]]; then 5 | cfg='config/m2d_32k.yaml' 6 | cfg_clap='config/m2d_clap_32k.yaml' 7 | else 8 | cfg='config/m2d.yaml' 9 | cfg_clap='config/m2d_clap.yaml' 10 | fi 11 | 12 | if [[ "$1" == *'_clap'* ]]; then 13 | zs_opt=',flat_features=True' 14 | fi 15 | 16 | CUDA_VISIBLE_DEVICES=$GPU python lineareval.py $cfg cremad batch_size=16,weight_file=$1 17 | CUDA_VISIBLE_DEVICES=$GPU python lineareval.py $cfg gtzan batch_size=16,weight_file=$1 18 | CUDA_VISIBLE_DEVICES=$GPU python lineareval.py $cfg spcv2 batch_size=64,weight_file=$1 19 | CUDA_VISIBLE_DEVICES=$GPU python lineareval.py $cfg esc50 batch_size=64,weight_file=$1 20 | 21 | if [[ "$1" == *'_clap'* ]]; then 22 | echo 'Zero-shot evaluation' 23 | CUDA_VISIBLE_DEVICES=$GPU python zeroshot.py $cfg_clap cremad batch_size=16,weight_file=$1$zs_opt 24 | CUDA_VISIBLE_DEVICES=$GPU python zeroshot.py $cfg_clap gtzan batch_size=16,weight_file=$1$zs_opt 25 | CUDA_VISIBLE_DEVICES=$GPU python zeroshot.py $cfg_clap nsynth batch_size=64,weight_file=$1$zs_opt 26 | CUDA_VISIBLE_DEVICES=$GPU python zeroshot.py $cfg_clap esc50 batch_size=64,weight_file=$1$zs_opt 27 | CUDA_VISIBLE_DEVICES=$GPU python zeroshot.py $cfg_clap us8k batch_size=64,weight_file=$1$zs_opt 28 | CUDA_VISIBLE_DEVICES=$GPU python zeroshot.py $cfg_clap fsd50k batch_size=64,weight_file=$1$zs_opt 29 | CUDA_VISIBLE_DEVICES=$GPU python zeroshot.py $cfg_clap as batch_size=64,weight_file=$1$zs_opt 30 | fi 31 | 32 | if [[ "$1" == *'_clap'* ]]; then 33 | echo 'Audio-text retrieval evaluation' 34 | CUDA_VISIBLE_DEVICES=$GPU python retr_a2t_t2a.py $cfg_clap audiocaps batch_size=64,weight_file=$1$zs_opt 35 | CUDA_VISIBLE_DEVICES=$GPU python retr_a2t_t2a.py $cfg_clap clotho batch_size=64,weight_file=$1$zs_opt 36 | fi 37 | 38 | python summarize.py $1 39 | -------------------------------------------------------------------------------- /superb/upstream/m2d/hubconf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- # 2 | """*********************************************************************************************""" 3 | # FileName [ upstream/m2d/hubconf.py ] 4 | # Synopsis [ the M2D torch hubconf ] 5 | """*********************************************************************************************""" 6 | 7 | import os 8 | 9 | from .expert import UpstreamExpert as _UpstreamExpert 10 | 11 | 12 | def m2d_local(ckpt, model_config=None, *args, **kwargs): 13 | assert os.path.isfile(ckpt) 14 | if model_config is not None: 15 | assert os.path.isfile(model_config) 16 | if 'feature_d' not in kwargs: 17 | kwargs["feature_d"] = None 18 | return _UpstreamExpert(ckpt, model_config, *args, **kwargs) 19 | 20 | 21 | def m2d_calcnorm(refresh=False, *args, **kwargs): 22 | """Upstream model entry for calculating normalization statistics for M2D on Superb. 23 | """ 24 | 25 | if kwargs['ckpt'] is None: 26 | print('Set -i your-checkpoint. Exit now.') 27 | exit(-1) 28 | 29 | kwargs['ckpt'] = kwargs['ckpt'].split(',')[0] 30 | return m2d_local(*args, **kwargs) 31 | 32 | 33 | def m2d(refresh=False, *args, **kwargs): 34 | """Upstream model entry for running M2D on Superb. 35 | Note: 36 | kwargs['ckpt']: "path-name-of-your-ckpt,dataset-mean,dataset-std". 
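Example: "-k your-checkpoint.pth,-11.07,4.18" (illustrative mean/std values; compute your dataset's stats with the m2d_calcnorm entry above).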
37 | """ 38 | 39 | if kwargs['ckpt'] is None: 40 | print('Set "-k your-checkpoint.pth,dataset-mean,dataset-std". Exit now.') 41 | exit(-1) 42 | try: 43 | ckpt, norm_mean, norm_std = kwargs['ckpt'].split(',') 44 | except: 45 | print(f'Confirm your `ckpt`: {kwargs["ckpt"]}') 46 | exit(-1) 47 | 48 | kwargs['ckpt'] = ckpt 49 | norm_mean, norm_std = float(norm_mean), float(norm_std) 50 | print(' using checkpoint:', ckpt) 51 | print(' norm stats:', norm_mean, norm_std) 52 | return m2d_local(*args, norm_mean=norm_mean, norm_std=norm_std, **kwargs) 53 | -------------------------------------------------------------------------------- /app/circor/diff-evar.patch: -------------------------------------------------------------------------------- 1 | diff --git a/evar/ds_tasks.py b/evar/ds_tasks.py 2 | index 14576f2..b717425 100644 3 | --- a/evar/ds_tasks.py 4 | +++ b/evar/ds_tasks.py 5 | @@ -19,6 +19,9 @@ _defs = { 6 | 'voxforge': [1, 5.8, None, False], 7 | 'as20k': [1, 10.0, 'as', False], 8 | 'as': [1, 10.0, 'as', True], 9 | + 'circor1': [1, 5.0, None, False], 10 | + 'circor2': [1, 5.0, None, False], 11 | + 'circor3': [1, 5.0, None, False], 12 | } 13 | 14 | _fs_table = { 15 | diff --git a/finetune.py b/finetune.py 16 | index e196538..a32cf0d 100644 17 | --- a/finetune.py 18 | +++ b/finetune.py 19 | @@ -126,6 +126,18 @@ def loss_bce(logits, gts): 20 | return F.binary_cross_entropy_with_logits(logits, gts) # no need to apply F.sigmoid(logits) 21 | 22 | 23 | +class WeightedCE: 24 | + def __init__(self, labels, device) -> None: 25 | + weights = utils.class_weight.compute_class_weight('balanced', classes=np.unique(labels), y=labels) 26 | + self.celoss = torch.nn.CrossEntropyLoss(weight=torch.tensor(weights).to(device)) 27 | + self.__name__ = f'CrossEntropyLoss(weight={weights})' 28 | + 29 | + def __call__(self, logits, gts): 30 | + preds = F.softmax(logits, dim=-1) 31 | + loss = self.celoss(preds, gts) 32 | + return loss 33 | + 34 | + 35 | def eval_map(y_score, y_true, classes): 36 | average_precision = metrics.average_precision_score( 37 | y_true, y_score, average=None) 38 | @@ -211,8 +223,8 @@ def arg_conf_str(args, defaults={ 39 | 40 | def _train(cfg, ar_model, device, logpath, train_loader, valid_loader, test_loader, multi_label, seed, lr, balanced, verbose): 41 | classes = train_loader.dataset.classes 42 | - 43 | - loss_fn = loss_bce if multi_label else loss_nll 44 | + labels = np.argmax(train_loader.dataset.labels, axis=1) # For app/circor, OH to numbers 45 | + loss_fn = WeightedCE(labels.numpy(), device) # For app/circor, using class-weighted CE loss 46 | eval_fn = eval_map if multi_label else eval_acc 47 | crit_str = 'mAP' if eval_fn == eval_map else 'acc' 48 | optimizer = { 49 | -------------------------------------------------------------------------------- /data/README.md: -------------------------------------------------------------------------------- 1 | ## Pre-training data 2 | 3 | The pre-trainer (e.g., `train_audio.py` for audio) loads data from the `data` folder by default (`--data_path`), using a list of samples in a CSV file `data/files_audioset.csv` by default (`--dataset`). 4 | 5 | The CSV file should have a `file_name` column containing the relative pathname of the files containing a log-mel spectrogram (LMS) audio. 
Example: 6 | 7 | ``` 8 | file_name 9 | audioset_lms/balanced_train_segments/--aE2O5G5WE_0.000.npy 10 | audioset_lms/balanced_train_segments/--cB2ZVjpnA_30.000.npy 11 | audioset_lms/balanced_train_segments/--aaILOrkII_200.000.npy 12 | audioset_lms/balanced_train_segments/--ZhevVpy1s_50.000.npy 13 | audioset_lms/balanced_train_segments/--aO5cdqSAg_30.000.npy 14 | audioset_lms/balanced_train_segments/--PJHxphWEs_30.000.npy 15 | audioset_lms/balanced_train_segments/--ekDLDTUXA_30.000.npy 16 | ``` 17 | 18 | The folders/files should look like the following: 19 | 20 | (Example of the folder structure) 21 | data/ 22 | audioset_lms/ 23 | balanced_train_segments/ 24 | --aE2O5G5WE_0.000.npy 25 | --cB2ZVjpnA_30.000.npy 26 | : 27 | 28 | If you also have pre-processed FSD50K data, the folder will be as follows: 29 | 30 | (Example of the folder structure) 31 | data/ 32 | audioset_lms/ 33 | : 34 | fsd50k_lms/ 35 | FSD50K.dev_audio/ 36 | 2931.npy 37 | 408195.npy 38 | : 39 | 40 | ### Example preprocessing steps (AudioSet) 41 | 42 | If you have downloaded the AudioSet samples and converted them into .wav files in `/your/local/audioset` folder, the following example steps will preprocess and create a new folder, `data/audioset_lms`. 43 | 44 | 1. Convert your pre-training data to LMS using [`wav_to_lms.py`](../wav_to_lms.py). Example: `python wav_to_lms.py /your/local/audioset data/audioset_lms` 45 | 2. Then, make a list of files under your `data` folder. Example follows: 46 | 47 | ```sh 48 | echo file_name > data/files_audioset.csv 49 | (cd data && find audioset_lms -name "*.npy") >> data/files_audioset.csv 50 | ``` 51 | 52 | -------------------------------------------------------------------------------- /audioset/README.md: -------------------------------------------------------------------------------- 1 | # M2D-AS (M2D-X specialized in AudioSet) 2 | 3 | This sub-repository describes the steps to reproduce M2D-AS pre-training from our [following paper](https://ieeexplore.ieee.org/document/10502167): create a metadata file containing labels and run the pre-training. 4 | 5 | ```BibTeX 6 | @article{niizumi2024m2dx, 7 | title = {{Masked Modeling Duo: Towards a Universal Audio Pre-training Framework}}, 8 | author = {Daisuke Niizumi and Daiki Takeuchi and Yasunori Ohishi and Noboru Harada and Kunio Kashino}, 9 | journal = {IEEE/ACM Trans. Audio, Speech, Language Process.}, 10 | year = {2024}, 11 | volume = {32}, 12 | pages = {2391-2406}, 13 | url = {https://ieeexplore.ieee.org/document/10502167}, 14 | doi = {10.1109/TASLP.2024.3389636}} 15 | ``` 16 | 17 | ## 1. Creating a metadata file 18 | 19 | 1. Make a list of AudioSet files as "data/files_audioset.csv" for the M2D pre-training by following "Example preprocessing steps (AudioSet)" in data/README. 20 | 2. In the M2D folder, create "data/files_as_weighted.csv" containing both sample path and labels (and also sample weights) as follows. 21 | 22 | python util/make_as_weighted_list.py 23 | 24 | You should have a file `data/files_as_weighted.csv`. 25 | 26 | ## 2. Conduct M2D-AS pre-training 27 | 28 | The exact pre-training command line we used is as follows: 29 | 30 | ```shell 31 | OMP_NUM_THREADS=1 torchrun --nproc_per_node=4 -m audioset.train_as --input_size 80x608 --patch_size 16x16 --epochs 300 --batch_size 512 --accum_iter 1 --save_freq 50 --seed 3 --loss_off 1. 32 | ``` 33 | 34 | It requires 4x 48 GB GPU (for about two days), and the following should allow pre-training with 4x 24 GB GPU (3090Ti) within a week. 
35 | 36 | ```shell 37 | OMP_NUM_THREADS=1 torchrun --nproc_per_node=4 -m audioset.train_as --input_size 80x608 --patch_size 16x16 --epochs 300 --batch_size 256 --accum_iter 2 --save_freq 50 --seed 3 --loss_off 1. 38 | ``` 39 | 40 | ## Results on the paper 41 | 42 |
43 | Table-V-M2D-AS-le 44 |
45 | 46 |
47 | Tale-VI-M2D-AS-ft 48 |
49 | 50 | -------------------------------------------------------------------------------- /all_eval.sh: -------------------------------------------------------------------------------- 1 | cd evar 2 | GPU=0 3 | NAME=$(basename $(dirname "$1"))/$(basename "$1") 4 | echo Processing $NAME 5 | 6 | if [[ "$1" == *'p32k'* ]]; then 7 | cfg='config/m2d_32k.yaml' 8 | cfg_clap='config/m2d_clap_32k.yaml' 9 | else 10 | cfg='config/m2d.yaml' 11 | cfg_clap='config/m2d_clap.yaml' 12 | fi 13 | 14 | if [[ $NAME == *'_clap'*'/checkpoint'* ]]; then 15 | zs_opt=',flat_features=True' 16 | fi 17 | 18 | CUDA_VISIBLE_DEVICES=$GPU python 2pass_lineareval.py $cfg cremad batch_size=16,weight_file=$1 19 | CUDA_VISIBLE_DEVICES=$GPU python 2pass_lineareval.py $cfg gtzan batch_size=16,weight_file=$1 20 | CUDA_VISIBLE_DEVICES=$GPU python 2pass_lineareval.py $cfg spcv2 batch_size=64,weight_file=$1 21 | CUDA_VISIBLE_DEVICES=$GPU python 2pass_lineareval.py $cfg esc50 batch_size=64,weight_file=$1 22 | CUDA_VISIBLE_DEVICES=$GPU python 2pass_lineareval.py $cfg us8k batch_size=64,weight_file=$1 23 | CUDA_VISIBLE_DEVICES=$GPU python 2pass_lineareval.py $cfg vc1 batch_size=64,weight_file=$1 24 | CUDA_VISIBLE_DEVICES=$GPU python 2pass_lineareval.py $cfg voxforge batch_size=64,weight_file=$1 25 | CUDA_VISIBLE_DEVICES=$GPU python 2pass_lineareval.py $cfg nsynth batch_size=64,weight_file=$1 26 | CUDA_VISIBLE_DEVICES=$GPU python 2pass_lineareval.py $cfg surge batch_size=64,weight_file=$1 27 | 28 | if [[ $NAME == *'_clap'*'/checkpoint'* ]]; then 29 | echo 'Zero-shot evaluation' 30 | CUDA_VISIBLE_DEVICES=$GPU python zeroshot.py $cfg_clap cremad batch_size=16,weight_file=$1$zs_opt 31 | CUDA_VISIBLE_DEVICES=$GPU python zeroshot.py $cfg_clap gtzan batch_size=16,weight_file=$1$zs_opt 32 | CUDA_VISIBLE_DEVICES=$GPU python zeroshot.py $cfg_clap nsynth batch_size=64,weight_file=$1$zs_opt 33 | CUDA_VISIBLE_DEVICES=$GPU python zeroshot.py $cfg_clap esc50 batch_size=64,weight_file=$1$zs_opt 34 | CUDA_VISIBLE_DEVICES=$GPU python zeroshot.py $cfg_clap us8k batch_size=64,weight_file=$1$zs_opt 35 | CUDA_VISIBLE_DEVICES=$GPU python zeroshot.py $cfg_clap fsd50k batch_size=64,weight_file=$1$zs_opt 36 | CUDA_VISIBLE_DEVICES=$GPU python zeroshot.py $cfg_clap as batch_size=64,weight_file=$1$zs_opt 37 | fi 38 | 39 | if [[ $NAME == *'_clap'*'/checkpoint'* ]]; then 40 | echo 'Audio-text retrieval evaluation' 41 | CUDA_VISIBLE_DEVICES=$GPU python retr_a2t_t2a.py $cfg_clap audiocaps batch_size=64,weight_file=$1$zs_opt 42 | CUDA_VISIBLE_DEVICES=$GPU python retr_a2t_t2a.py $cfg_clap clotho batch_size=64,weight_file=$1$zs_opt 43 | fi 44 | 45 | python summarize.py $1 46 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | slurm*.out 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | pip-wheel-metadata/ 26 | share/python-wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .nox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | *.py,cover 53 | .hypothesis/ 54 | .pytest_cache/ 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | local_settings.py 63 | db.sqlite3 64 | db.sqlite3-journal 65 | 66 | # Flask stuff: 67 | instance/ 68 | .webassets-cache 69 | 70 | # Scrapy stuff: 71 | .scrapy 72 | 73 | # Sphinx documentation 74 | docs/_build/ 75 | 76 | # PyBuilder 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | .python-version 88 | 89 | # pipenv 90 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 91 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 92 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 93 | # install all needed dependencies. 94 | #Pipfile.lock 95 | 96 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 97 | __pypackages__/ 98 | 99 | # Celery stuff 100 | celerybeat-schedule 101 | celerybeat.pid 102 | 103 | # SageMath parsed files 104 | *.sage.py 105 | 106 | # Environments 107 | .env 108 | .venv 109 | env/ 110 | venv/ 111 | ENV/ 112 | env.bak/ 113 | venv.bak/ 114 | 115 | # Spyder project settings 116 | .spyderproject 117 | .spyproject 118 | 119 | # Rope project settings 120 | .ropeproject 121 | 122 | # mkdocs documentation 123 | /site 124 | 125 | # mypy 126 | .mypy_cache/ 127 | .dmypy.json 128 | dmypy.json 129 | 130 | # Pyre type checker 131 | .pyre/ 132 | -------------------------------------------------------------------------------- /common.py: -------------------------------------------------------------------------------- 1 | # Masked Modeling Duo (M2D) 2 | 3 | import datetime 4 | import hashlib 5 | import sys 6 | import re 7 | 8 | 9 | class PrintLogger(object): 10 | def __init__(self, logfile): 11 | self.stdout = sys.stdout 12 | self.log = open(logfile, 'a') 13 | sys.stdout = self 14 | 15 | def write(self, message): 16 | self.stdout.write(message) 17 | self.log.write(message) 18 | 19 | def flush(self): 20 | self.stdout.flush() 21 | 22 | 23 | def get_timestamp(): 24 | """ex) Outputs 202104220830""" 25 | return datetime.datetime.now().strftime('%y%m%d%H%M') 26 | 27 | 28 | def hash_text(text, L=128): 29 | hashed = hashlib.shake_128(text.encode()).hexdigest(L//2 + 1) 30 | return hashed[:L] 31 | 32 | 33 | def short_model_desc(model, head_len=5, tail_len=1): 34 | text = repr(model).split('\n') 35 | text = text[:head_len] + [' :'] + (text[-tail_len:] if tail_len > 0 else ['']) 36 | return '\n'.join(text) 37 | 38 | 39 | def prmstr_z(p): 40 | return str(p).replace('.0', '').replace('0.', '.') 41 | 42 | def prmstr_zz(prm): 43 | ps = [prmstr_z(p) for p in prm] 44 | return '-'.join(ps) 45 | 46 | 47 | conf_defaults={ 48 | 'dataset': ('data/files_audioset.csv', 'D', 'path'), 49 | 'ema_decay_init': (0.99995, 'ema', 'z'), 50 | 'ema_decay': (0.99999, 'ed', 'z'), 51 | 'decoder_depth': (8, 'dd', 'asis'), 52 | 'mask_ratio': (0.7, 'mr', 'z'), 53 | 'seed': (0, 's', 'asis'), 54 | 'norm_pix_loss': (True, '~N', 'b'), 55 | 'loss_fn': ('norm_mse', 'L', 'head'), 56 | 'optim': ('adamw', 'O', 'asis'), 57 | 'blr': (3e-4, 'blr', 'z'), 58 | 'lr': (None, 'lr', 'z'), 59 | 
'eff_batch_size': (2048, 'bs', 'asis'), 60 | 'accum_iter': (1, 'a', 'asis'), 61 | } 62 | 63 | 64 | def arg_conf_str(args, defaults=conf_defaults): 65 | confstr = '' 66 | for k in defaults: 67 | try: 68 | arg_value = eval('args.' + k) 69 | except: 70 | continue # no parameter k for the run. 71 | if arg_value == defaults[k][0]: 72 | continue 73 | arg_key, value_format = defaults[k][1:] 74 | value = str(arg_value) 75 | if value_format == 'z': 76 | value = prmstr_z(arg_value) 77 | elif value_format == 'zz': 78 | value = prmstr_zz(arg_value) 79 | elif value_format == 'b': 80 | value = '' # nothing to add 81 | elif value_format == 'head': 82 | value = value[:1] 83 | elif value_format == 'head_': 84 | value = ''.join([v[:1] for v in value.split('_')]) 85 | elif value_format == 'path': 86 | value = ''.join([v[:1] for v in re.split(r'_|/', value)]) 87 | confstr += arg_key + value 88 | return confstr 89 | -------------------------------------------------------------------------------- /examples/Example_1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Short example" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import warnings; warnings.simplefilter('ignore')\n", 17 | "import logging\n", 18 | "logging.basicConfig(level=logging.INFO)\n", 19 | "import sys\n", 20 | "sys.path.append('..')\n", 21 | "import torch" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 2, 27 | "metadata": {}, 28 | "outputs": [ 29 | { 30 | "name": "stderr", 31 | "output_type": "stream", 32 | "text": [ 33 | "INFO:root:\n", 34 | "INFO:root:Model input size: [80, 608]\n", 35 | "INFO:root:Using weights: m2d_vit_base-80x608p16x16-220930-mr7/checkpoint-300.pth\n", 36 | "INFO:root:Feature dimension: 3840\n", 37 | "INFO:root:Norm stats: -7.1, 4.2\n", 38 | "INFO:root:Runtime MelSpectrogram(16000, 400, 400, 160, 80, 50, 8000):\n", 39 | "INFO:root:MelSpectrogram(\n", 40 | " Mel filter banks size = (80, 201), trainable_mel=False\n", 41 | " (stft): STFT(n_fft=400, Fourier Kernel size=(201, 1, 400), iSTFT=False, trainable=False)\n", 42 | ")\n" 43 | ] 44 | }, 45 | { 46 | "name": "stdout", 47 | "output_type": "stream", 48 | "text": [ 49 | " using 150 parameters, while dropped 250 out of 400 parameters from m2d_vit_base-80x608p16x16-220930-mr7/checkpoint-300.pth\n", 50 | " (dropped: ['mask_token', 'decoder_pos_embed', 'decoder_embed.weight', 'decoder_embed.bias', 'decoder_blocks.0.norm1.weight'] ...)\n", 51 | "\n" 52 | ] 53 | } 54 | ], 55 | "source": [ 56 | "from portable_m2d import PortableM2D\n", 57 | "weight = 'm2d_vit_base-80x608p16x16-220930-mr7/checkpoint-300.pth'\n", 58 | "model = PortableM2D(weight_file=weight)\n" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 3, 64 | "metadata": {}, 65 | "outputs": [ 66 | { 67 | "name": "stdout", 68 | "output_type": "stream", 69 | "text": [ 70 | "torch.Size([1, 63, 3840])\n" 71 | ] 72 | } 73 | ], 74 | "source": [ 75 | "# A single sample of random waveform\n", 76 | "wav = torch.rand(1, 16000 * 10)\n", 77 | "\n", 78 | "# Encode with M2D\n", 79 | "with torch.no_grad():\n", 80 | " embeddings = model(wav)\n", 81 | "\n", 82 | "# The output embeddings has a shape of [Batch, Frame, Dimension]\n", 83 | "print(embeddings.shape) # --> torch.Size([1, 63, 3840])" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | 
"metadata": {}, 90 | "outputs": [], 91 | "source": [] 92 | } 93 | ], 94 | "metadata": { 95 | "kernelspec": { 96 | "display_name": "ar", 97 | "language": "python", 98 | "name": "python3" 99 | }, 100 | "language_info": { 101 | "codemirror_mode": { 102 | "name": "ipython", 103 | "version": 3 104 | }, 105 | "file_extension": ".py", 106 | "mimetype": "text/x-python", 107 | "name": "python", 108 | "nbconvert_exporter": "python", 109 | "pygments_lexer": "ipython3", 110 | "version": "3.9.18" 111 | } 112 | }, 113 | "nbformat": 4, 114 | "nbformat_minor": 2 115 | } 116 | -------------------------------------------------------------------------------- /speech/extract_offline_ls960.py: -------------------------------------------------------------------------------- 1 | """Offline Teacher Model Feature Extractor for M2D-S 2 | 3 | Masked Modeling Duo for Speech: Specializing General-Purpose Audio Representation to Speech using Denoising Distillation 4 | https://arxiv.org/abs/2305.14079 5 | 6 | This script prepares offline features obtained from a teacher model. An example follows: 7 | 8 | (cd to the root folder of your M2D copy) 9 | python speech/extract_offline_ls960.py /path/to/LibriSpeech 10 | 11 | This example will create `data/ls960_hybrid7s_hubaseL9` and `data/files_ls960_hybrid.csv`. 12 | 13 | ## Data file details 14 | 15 | `data/ls960_hybrid7s_hubaseL9` will have converted files in .npz format. Each .npz file consists of three contents: 16 | 17 | - arr_0: Log-mel spectrogram converted from the raw wave. The speech shorter than 7 seconds will be padded with zeros. 18 | - arr_1: Features (hidden_states) extracted from the teacher model. 19 | - arr_2: The length of the original hidden states excluding paddings. 20 | 21 | Find the details for how these contents are used in SpeechHybridDataset class in speech/speech_dataset.py. 22 | """ 23 | 24 | import sys 25 | import numpy as np 26 | import pandas as pd 27 | from pathlib import Path 28 | import torch 29 | import librosa 30 | import fire 31 | from transformers import Wav2Vec2Processor, HubertModel 32 | from tqdm import tqdm 33 | 34 | sys.path.append('.') # for running under your `m2d` folder to find wav_to_lms 35 | from wav_to_lms import ToLogMelSpec, FFT_parameters 36 | 37 | 38 | def prepare_ls960(src, dest='data/ls960_hybrid7s_hubaseL9', dest_csv='data/files_ls960_hybrid.csv', min_seconds=7): 39 | """ 40 | Args: 41 | src: Source LibriSpeech 960h dataset folder. 42 | dest: Destination folder to store .npz files. 43 | dest_csv: The name of the output CSV file listing the .npz file names. 
44 | """ 45 | 46 | dest = Path(dest) 47 | src = Path(src) 48 | files = sorted(src.rglob('train*/**/*.flac')) 49 | min_samples = 16000 * min_seconds 50 | 51 | # Teacher model 52 | output_layers = [9] 53 | device = 'cuda' 54 | processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h") 55 | model = HubertModel.from_pretrained("facebook/hubert-base-ls960") 56 | model.eval() 57 | model.to(device) 58 | 59 | # Spectrogram converter (M2D default) 60 | to_lms = ToLogMelSpec(FFT_parameters()) 61 | 62 | # Extract LS960 features from the teacher 63 | print(f'Processing {len(files)} files..') 64 | csv_rel_paths = [] 65 | for i, f in tqdm(enumerate(files)): 66 | wav, sr = librosa.load(f, mono=True, sr=FFT_parameters.sample_rate) 67 | org_wav_len = len(wav) 68 | 69 | # pad if short 70 | if min_samples is not None: 71 | if wav.shape[-1] < min_samples: 72 | wav = np.pad(wav, (0, min_samples - wav.shape[-1])) 73 | 74 | lms = to_lms(wav).numpy() 75 | wav = torch.tensor(wav).unsqueeze(0) 76 | 77 | preprocessed = processor(wav, return_tensors="pt", sampling_rate=16000).input_values # Batch size 1 78 | preprocessed = preprocessed[0].to(device) # [1, B, raw wave length] -> [B, raw wave length] 79 | with torch.no_grad(): 80 | hidden_states = model(preprocessed, output_hidden_states=True).hidden_states# list of [B, T, D] 81 | # stack layer outputs 82 | states_to_stack = [hidden_states[index] for index in output_layers] if output_layers else hidden_states 83 | hidden_states = torch.cat(states_to_stack, axis=-1).cpu().numpy() 84 | 85 | rel_path = str(f.relative_to(src)).replace('.flac', '.npz') 86 | csv_rel_paths.append(str(dest.relative_to('data')/rel_path)) 87 | newname = dest/rel_path 88 | newname.parent.mkdir(parents=True, exist_ok=True) 89 | 90 | org_hidden_len = (hidden_states.shape[1] * org_wav_len) // wav.shape[-1] 91 | 92 | np.savez(newname, lms, hidden_states, org_hidden_len) # arr_0: lms, arr_1: hidden_states, arr_2: original hidden states length 93 | if (i + 1) % 100 == 0: 94 | print(i, f'{i/len(files)*100:.3f}%', newname, lms.shape, hidden_states.shape, org_hidden_len) 95 | 96 | pd.DataFrame({'file_name': csv_rel_paths}).to_csv(dest_csv, index=None) 97 | print('Done.') 98 | 99 | 100 | if __name__ == '__main__': 101 | fire.Fire(prepare_ls960) 102 | -------------------------------------------------------------------------------- /wav_to_lms.py: -------------------------------------------------------------------------------- 1 | """Wave to log-mel spectrogram (LMS) audio file converter. 2 | 3 | This program converts the original audio files recursively found in the source folder, 4 | then stores them in the destination folder while holding the same relative path structure. 5 | 6 | The conversion includes the following processes: 7 | - Stereo to mono 8 | - Resampling to a sampling rate 9 | - Converting to a log-mel spectrogram 10 | 11 | Example: 12 | python wav_to_lms.py /your/local/fsd50k/FSD50K.dev_audio /your/msm_mae/fsd50kdev_lms 13 | """ 14 | 15 | import numpy as np 16 | from pathlib import Path 17 | import librosa 18 | from multiprocessing import Pool 19 | import torch.multiprocessing as mp 20 | import torch 21 | import fire 22 | from tqdm import tqdm 23 | import nnAudio.features 24 | import warnings 25 | warnings.simplefilter('ignore') 26 | 27 | 28 | class FFT_parameters: 29 | # We extract log-mel spectrograms with 80 features using a window size of 25 ms and a stride of 10 ms from a waveform sampled at 16kHz. 
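# At 16 kHz, the 400-sample window and 160-sample hop below correspond to 25 ms and 10 ms.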
30 | sample_rate = 16000 31 | window_size = 400 32 | n_fft = 400 33 | hop_size = 160 34 | n_mels = 80 35 | f_min = 50 36 | f_max = 8000 37 | 38 | 39 | def _converter_worker(args): 40 | subpathname, from_dir, to_dir, prms, to_lms, suffix, min_length, max_length, verbose = args 41 | from_dir, to_dir = Path(from_dir), Path(to_dir) 42 | to_name = to_dir/(subpathname[:-len(suffix)]+'.npy') 43 | 44 | if to_name.exists(): 45 | print('already exist', subpathname) 46 | return '' 47 | 48 | # load and convert to a log-mel spectrogram 49 | try: 50 | wav, org_sr = librosa.load(str(from_dir/subpathname), mono=True, sr=prms.sample_rate) 51 | 52 | # pad if short 53 | if min_length is not None: 54 | min_length = int(FFT_parameters.sample_rate * min_length) 55 | if wav.shape[-1] < min_length: 56 | print('from', wav.shape) 57 | wav = np.pad(wav, (0, min_length - wav.shape[-1])) 58 | print('to', wav.shape) 59 | 60 | if max_length is not None: 61 | max_length = int(FFT_parameters.sample_rate * max_length) 62 | if max_length < wav.shape[-1]: 63 | print('from', wav.shape) 64 | wav = wav[:max_length] 65 | print('to', wav.shape) 66 | 67 | lms = to_lms(wav) 68 | except Exception as e: 69 | print('ERROR failed to open or convert', subpathname, '-', str(e)) 70 | return '' 71 | 72 | to_name.parent.mkdir(parents=True, exist_ok=True) 73 | np.save(to_name, lms) 74 | 75 | if verbose: 76 | print(from_dir, '->', to_name, lms.shape) 77 | 78 | return to_name.name 79 | 80 | 81 | class ToLogMelSpec: 82 | def __init__(self, cfg): 83 | # Spectrogram extractor 84 | self.cfg = cfg 85 | self.to_spec = nnAudio.features.MelSpectrogram( 86 | sr=cfg.sample_rate, 87 | n_fft=cfg.n_fft, 88 | win_length=cfg.window_size, 89 | hop_length=cfg.hop_size, 90 | n_mels=cfg.n_mels, 91 | fmin=cfg.f_min, 92 | fmax=cfg.f_max, 93 | center=True, 94 | power=2, 95 | verbose=False, 96 | ) 97 | 98 | def __call__(self, audio): 99 | x = self.to_spec(torch.tensor(audio)) 100 | x = (x + torch.finfo().eps).log() 101 | return x 102 | 103 | 104 | def convert_wav(from_dir, to_dir, suffix='.wav', skip=0, min_length=6.1, max_length=30.0, verbose=False) -> None: 105 | from_dir = str(from_dir) 106 | files = [str(f).replace(from_dir, '') for f in Path(from_dir).glob(f'**/*{suffix}')] 107 | files = [f[1:] if f[0] == '/' else f for f in files] 108 | files = sorted(files) 109 | if skip > 0: 110 | files = files[skip:] 111 | 112 | prms = FFT_parameters() 113 | to_lms = ToLogMelSpec(prms) 114 | 115 | print(f'Processing {len(files)} {suffix} files at a sampling rate of {prms.sample_rate} Hz...') 116 | assert len(files) > 0 117 | 118 | with Pool() as p: 119 | args = [[f, from_dir, to_dir, prms, to_lms, suffix, min_length, max_length, verbose] for f in files] 120 | shapes = list(tqdm(p.imap(_converter_worker, args), total=len(args))) 121 | 122 | print('finished.') 123 | 124 | 125 | if __name__ == "__main__": 126 | mp.set_start_method('spawn', force=True) 127 | fire.Fire(convert_wav) 128 | -------------------------------------------------------------------------------- /clap/README.md: -------------------------------------------------------------------------------- 1 | # M2D-CLAP: Masked Modeling Duo Meets CLAP for Learning General-purpose Audio-Language Representation 2 | 3 |
4 | image_figure2 5 |
6 | 
7 | This sub-repository provides the code for our M2D-CLAP papers, including the setup procedure for the training caption data and the pre-training steps.
8 | 
9 | ```bibtex
10 | @article{niizumi2025m2d-clap,
11 |     author = {Niizumi, Daisuke and Takeuchi, Daiki and Yasuda, Masahiro and Nguyen, Binh Thien and Ohishi, Yasunori and Harada, Noboru},
12 |     journal = {IEEE Access},
13 |     title = {M2D-CLAP: Exploring General-purpose Audio-Language Representations Beyond CLAP},
14 |     year = {2025},
15 |     pages = {1-1},
16 |     doi={10.1109/ACCESS.2025.3611348}}
17 | 
18 | @inproceedings{niizumi2024m2d-clap,
19 |     title = {{M2D-CLAP: Masked Modeling Duo Meets CLAP for Learning General-purpose Audio-Language Representation}},
20 |     author = {Daisuke Niizumi and Daiki Takeuchi and Yasunori Ohishi and Noboru Harada and Masahiro Yasuda and Shunsuke Tsubaki and Keisuke Imoto},
21 |     booktitle={Interspeech},
22 |     year = {2024},
23 |     pages = {57--61},
24 |     doi = {10.21437/Interspeech.2024-29}}
25 | ```
26 | 
27 | ## 1. Setup
28 | 
29 | Our implementation does not convert caption texts into sentence (semantic) embeddings on the fly. Instead, we convert them into embeddings in advance (offline) in steps 2 and 3 below.
30 | 
31 | 1. Prepare for the M2D pre-training on AudioSet by following [3. Pre-training From Scratch](../README.md#3-pre-training-from-scratch).
32 |     - In particular, configure data/audioset_lms according to the [Example preprocessing steps (AudioSet)](../data/README.md#example-preprocessing-steps-audioset).
33 | 2. Run `Note-AutoACD-GTEbase.ipynb` to create `data/capemb_GTEbase_Audo_A_C_D.npy` for [Auto-ACD](https://auto-acd.github.io/) captions.
34 | 3. Run `Note-ACalt4_GTEbase.ipynb` to create `data/capemb_GTEbase_AC_alt_4.npy` for [AudioCaps Alternative 4 Captions (ACalt4)](https://github.com/KeisukeImoto/ACalt4).
35 | 
36 | In summary, the following data should be ready:
37 | 
38 | - `data/audioset_lms` -- The log-mel spectrogram audio samples (many .npy files).
39 | - `data/files_audioset.csv` -- The list of the samples in `data/audioset_lms`.
40 | - `data/capemb_GTEbase_Audo_A_C_D.npy` -- The caption embeddings of Auto-ACD.
41 | - `data/capemb_GTEbase_AC_alt_4.npy` -- The caption embeddings of ACalt4.
42 | 
43 | ## 2. Pre-training
44 | 
45 | The exact pre-training command line we used is as follows:
46 | 
47 | ```shell
48 | OMP_NUM_THREADS=1 torchrun --nproc_per_node=4 -m semantics.train_clap --input_size 80x608 --patch_size 16x16 --epochs 300 --batch_size 512 --accum_iter 1 --save_freq 50 --seed 3 --model m2d_clap_vit_base --file_caption data/capemb_GTEbase_Audo_A_C_D.npy,data/capemb_GTEbase_AC_alt_4.npy --loss_off .01
49 | ```
50 | 
51 | ## 3. Evaluation
52 | 
53 | Quick example: [examples/Example_4_CLAP2025.ipynb](../examples/Example_4_CLAP2025.ipynb).
54 | 
55 | The evaluation steps follow the [original M2D](../README.md#2-evaluating-m2d).
56 | 
57 | For the zero-shot evaluation, refer to [../all_eval.sh](../all_eval.sh), which contains the exact command lines used for the paper.
58 | 
59 | ## AudioCaps Alternative 4 Captions (ACalt4)
60 | 
61 | Refer to the repository [ACalt4](https://github.com/KeisukeImoto/ACalt4) for the details.
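As a quick check that the files listed in the Setup section were produced correctly, the following minimal sketch (not part of the repository; it assumes only that the files are NumPy archives at the paths listed above) loads each caption-embedding file and reports what it contains:

```python
import numpy as np

# Hypothetical sanity check for the caption embeddings prepared in the Setup section.
for path in ['data/capemb_GTEbase_Audo_A_C_D.npy', 'data/capemb_GTEbase_AC_alt_4.npy']:
    emb = np.load(path, allow_pickle=True)  # allow_pickle also covers dict/object payloads
    print(path, type(emb), getattr(emb, 'shape', None))
```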
62 | 63 | ## Examples 64 | 65 | | Description | Notebook | 66 | |:------------|:---------| 67 | | Zero-shot ESC-50 classification with M2D-CLAP | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg) examples/Colab_M2D-CLAP_ESC-50_ZS.ipynb](http://colab.research.google.com/github/nttcslab/m2d/blob/master/examples/Colab_M2D-CLAP_ESC-50_ZS.ipynb) | 68 | | Audio feature visualization example with M2D-CLAP | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg) examples/Colab_M2D-CLAP_ESC-50_VizualizeEmbs.ipynb](http://colab.research.google.com/github/nttcslab/m2d/blob/master/examples/Colab_M2D-CLAP_ESC-50_VizualizeEmbs.ipynb) | 69 | 70 | ### t-SNE visualization of ESC-10 samples 71 | 72 | The t-SNE visualization of the audio embeddings encoded by M2D-CLAP. The conventional audio embeddings are the output of the audio encoder for transfer learning. The CLAP audio embeddings are the output of the audio projector for ZS inference. 73 | 74 |
75 | image-ESC10-Viz 76 |
77 | 78 | ## Results on the paper 79 | 80 |
81 | image_Table3_CLAP_LE 82 |
83 | 84 |
85 | image_Table4_CLAP_FT 86 |
87 | 88 |
89 | image_Table5_CLAP_ZS 90 |
91 | -------------------------------------------------------------------------------- /util/make_as_weighted_list.py: -------------------------------------------------------------------------------- 1 | """AudioSet metadata maker for M2D-AS 2 | 3 | This utility requires `data/files_audioset.csv` as input. 4 | Before you begin, make the list of AudioSet files as "data/files_audioset.csv" for the M2D pre-training by following "Example preprocessing steps (AudioSet)" in data/README. 5 | 6 | In the M2D folder, you can create "data/files_as_weighted.csv" containing both sample path and labels (and also sample weights) with the following. 7 | 8 | python util/make_as_weighted_list.py 9 | 10 | """ 11 | 12 | from re import U 13 | import urllib.request 14 | from pathlib import Path 15 | import pandas as pd 16 | import numpy as np 17 | import csv 18 | import fire 19 | 20 | 21 | def download_segment_csv(): 22 | EVAL_URL = 'http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/eval_segments.csv' 23 | BALANCED_TRAIN_URL = 'http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/balanced_train_segments.csv' 24 | UNBALANCED_TRAIN_URL = 'http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/unbalanced_train_segments.csv' 25 | CLASS_LABEL_URL = 'http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/class_labels_indices.csv' 26 | 27 | for subset_url in [EVAL_URL, BALANCED_TRAIN_URL, UNBALANCED_TRAIN_URL, CLASS_LABEL_URL]: 28 | subset_path = '/tmp/' + Path(subset_url).name 29 | if Path(subset_path).is_file(): 30 | continue 31 | with open(subset_path, 'w') as f: 32 | subset_data = urllib.request.urlopen(subset_url).read().decode() 33 | f.write(subset_data) 34 | print('Wrote', subset_path) 35 | 36 | 37 | def gen_weight(df, label_file): 38 | # Following AudioMAE https://github.com/facebookresearch/AudioMAE/blob/main/dataset/audioset/gen_weight.py 39 | 40 | def make_index_dict(label_csv): 41 | index_lookup = {} 42 | with open(label_csv, 'r') as f: 43 | csv_reader = csv.DictReader(f) 44 | line_count = 0 45 | for row in csv_reader: 46 | index_lookup[row['mid']] = row['index'] 47 | line_count += 1 48 | return index_lookup 49 | 50 | index_dict = make_index_dict(label_file) 51 | label_count = np.zeros(527) 52 | 53 | for sample in df.label.values: 54 | sample_labels = sample.split(',') 55 | for label in sample_labels: 56 | label_idx = int(index_dict[label]) 57 | label_count[label_idx] = label_count[label_idx] + 1 58 | 59 | label_weight = 1000.0 / (label_count + 100) 60 | 61 | sample_weight = np.zeros(len(df)) 62 | for i, sample in enumerate(df.label.values): 63 | sample_labels = sample.split(',') 64 | for label in sample_labels: 65 | label_idx = int(index_dict[label]) 66 | # summing up the weight of all appeared classes in the sample, note audioset is multiple-label classification 67 | sample_weight[i] += label_weight[label_idx] 68 | sample_weight = np.power(sample_weight, 1.0/1.5) # making the weights softer 69 | df['weight'] = sample_weight 70 | return df 71 | 72 | 73 | def make_metadata(org_list='data/files_audioset.csv', to_list='data/files_as_weighted.csv'): 74 | # download the original metadata. 75 | download_segment_csv() 76 | 77 | # load label maps. 
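    # Note: each AudioSet segments CSV begins with two comment lines followed by a '# YTID, start_seconds, end_seconds, positive_labels'
    # header row; skiprows=2 keeps that header row, and sep=', ' matches the comma-plus-space field separator used in these files.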
78 | e_df = pd.read_csv('/tmp/eval_segments.csv', skiprows=2, sep=', ', engine='python') 79 | e_df['split'] = 'eval_segments' 80 | b_df = pd.read_csv('/tmp/balanced_train_segments.csv', skiprows=2, sep=', ', engine='python') 81 | b_df['split'] = 'balanced_train_segments' 82 | u_df = pd.read_csv('/tmp/unbalanced_train_segments.csv', skiprows=2, sep=', ', engine='python') 83 | u_df['split'] = 'unbalanced_train_segments' 84 | df = pd.concat([e_df, b_df, u_df]) 85 | df = df[['# YTID', 'positive_labels', 'split']].copy() 86 | df.columns = ['ytid', 'label', 'split'] 87 | # clean labels. 88 | def remove_quotations(s): 89 | assert s[0] == '"' and s[-1] == '"' 90 | return s[1:-1] 91 | df.label = df.label.apply(lambda s: remove_quotations(s)) 92 | label_mapper = {ytid: label for ytid, label in df[['ytid', 'label']].values} 93 | 94 | # calculate weights for each sample in org_list, and store the results in to_list. 95 | org_df = pd.read_csv(org_list) # assert: org_list has only one column "file_name" 96 | org_df['label'] = org_df.file_name.apply(lambda f: label_mapper[f.split('/')[-1][:11]]) # assign labels for each file_name 97 | new_df = gen_weight(org_df, '/tmp/class_labels_indices.csv') # assign sample weights for each file_name 98 | new_df.to_csv(to_list, index=None) 99 | print('Created', to_list, 'based on', org_list) 100 | 101 | 102 | fire.Fire(make_metadata) 103 | -------------------------------------------------------------------------------- /app/circor/README.md: -------------------------------------------------------------------------------- 1 | # Exploring Pre-trained General-purpose Audio Representations for Heart Murmur Detection 2 | 3 | ![EMBC](https://embc.embs.org/2024/wp-content/uploads/sites/102/2023/05/ieee-embc-2024-logo2x.png) 4 | 5 | This sub-repository provides codes for evaluating the performance of pre-trained models intended to reproduce the results in [our IEEE EMBC 2024 paper](https://arxiv.org/abs/2404.17107). 6 | 7 |
8 | Table II 9 |
We compared the results among the previous studies and four pre-trained audio models.
10 |
11 | 12 | Our contents include: 13 | 14 | - Data downloading and formatting notebook. It also covers code setup. 15 | - Training/testing codes and utility batch scripts for reproducing our experiments. 16 | - The command lines used for the paper. 17 | - The notebook used to summarize and format results for the paper. 18 | 19 | Please refer to the following paper (arXiv link) for the details. 20 | 21 | ```bibtex 22 | @article{niizumi2024embc, 23 | title = {{Exploring Pre-trained General-purpose Audio Representations for Heart Murmur Detection}}, 24 | author = {Daisuke Niizumi and Daiki Takeuchi and Yasunori Ohishi and Noboru Harada and Kunio Kashino}, 25 | journal = {to appear at IEEE EMBC}, 26 | year = {2024}, 27 | url = {https://arxiv.org/abs/2404.17107} 28 | } 29 | ``` 30 | 31 | ## 1. Setup 32 | 33 | ### 1-0. Global setup 34 | 35 | Please complete the setup for M2D first. 36 | 37 | [👉️ GLOBAL SETUP, PLEASE BE SURE TO COMPLETE THESE STEPS.](https://github.com/nttcslab/m2d/tree/master?tab=readme-ov-file#1-setup) 38 | 39 | 40 | ### 1-1. Setup for the reproduction of the paper 41 | 42 | [0-Prepare.ipynb](0-Prepare.ipynb) provides complete setup steps, including: 43 | - Code setup (training/test program and external evaluation code) 44 | - Downloading dataset from `physionet.org` 45 | - Format the code for our experiments 46 | - Integrity check for the data 47 | 48 | ### 1-2. Major folders 49 | 50 | You will find the following folders after the setup. 51 | 52 | bat -- Batch scripts for automating experiments 53 | evar -- Experiment runs under this folder 54 | /work -- The data used during the training 55 | heart-murmur-detection -- Copy of the repository of the previous study from Walker et al. 56 | /data -- The data used for the final test 57 | m2d_vit_base-80x608p16x16-220930-mr7_enconly -- The pre-trained M2D weight 58 | physionet.org -- The copy of the dataset 59 | scores -- The results of our paper 60 | 61 | ## 2. Running Experiments 62 | 63 | We provide two example notebooks for running experiments. 64 | 65 | - [1-Run-M2D.ipynb](1-Run-M2D.ipynb) provides an example of a complete command line. You can train a model using an M2D model, and you should obtain a result close to the paper. You can also check the details of fine-tuning parameters. 66 | - [2-Run-BYOL-A.ipynb](2-Run-BYOL-A.ipynb) provides an example of the experiment using a batch file. This is exactly what we performed for the paper. 67 | 68 | Please find the complete command line in [Command lines used for the paper](#command-lines-used-for-the-paper). 69 | 70 | ## 3. Summarizing the results 71 | 72 | [9-Summarize-results-CirCor.ipynb](9-Summarize-results-CirCor.ipynb) provides complete steps to summarize the results using our result files in the `scores` folder. 73 | 74 | ## Files 75 | 76 | This sub-repository contains the following files: 77 | 78 | - 0-Prepare.ipynb -- A notebook for preparing the experiment 79 | - 1-Run-M2D.ipynb -- A notebook for the M2D experiment 80 | - 2-Run-BYOL-A.ipynb -- A notebook for the BYOL-A experiment 81 | - 9-Summarize-results-CirCor.ipynb -- A notebook for summarizing results 82 | - circor_eval.py -- The main program for the experiment 83 | - bat/*.sh -- Scripts for automating experiments for each pre-trained audio representation 84 | - diff-evar.patch -- A patch file for EVAR 85 | - diff-heart-murmur-detection.patch -- A patch file for heart-murmur-detection 86 | 87 | ## Acknowledgements 88 | 89 | We appreciate the previous studies that shared their codes. 
90 | Our code uses [Benjamin-Walker/heart-murmur-detection](https://github.com/Benjamin-Walker/heart-murmur-detection) from the paper: 91 | 92 | ```bibtex 93 | @article{walker2022DBResNet, 94 | title={Dual Bayesian ResNet: A Deep Learning Approach to Heart Murmur Detection}, 95 | author={Benjamin Walker and Felix Krones and Ivan Kiskin and Guy Parsons and Terry Lyons and Adam Mahdi}, 96 | journal={Computing in Cardiology}, 97 | volume={49}, 98 | year={2022} 99 | } 100 | ``` 101 | 102 | ## Command lines used for the paper 103 | 104 | We used the following command lines. 105 | 106 | ```sh 107 | cd evar 108 | bash ../bat/m2d_ftcircor.sh ../m2d_vit_base-80x608p16x16-220930-mr7_enconly 1 5 7 300 109 | bash ../bat/m2d_ftcircor.sh ../m2d_vit_base-80x608p16x16-220930-mr7_enconly 2 5 7 300 110 | bash ../bat/m2d_ftcircor.sh ../m2d_vit_base-80x608p16x16-220930-mr7_enconly 3 5 7 300 111 | 112 | bash ../bat/ast_ftcircor.sh 1 5 42 113 | bash ../bat/ast_ftcircor.sh 2 5 42 114 | bash ../bat/ast_ftcircor.sh 3 5 42 115 | 116 | bash ../bat/byola_ftcircor.sh 1 5 42 117 | bash ../bat/byola_ftcircor.sh 2 5 42 118 | bash ../bat/byola_ftcircor.sh 3 5 42 119 | 120 | bash ../bat/cnn14_ftcircor.sh 1 5 42 121 | bash ../bat/cnn14_ftcircor.sh 2 5 42 122 | bash ../bat/cnn14_ftcircor.sh 3 5 42 123 | 124 | bash ../bat/m2d_ftcircor_rand.sh m2d_vit_base-80x608p16x16-220930-mr7_enconly 1 5 7 125 | bash ../bat/m2d_ftcircor_rand.sh m2d_vit_base-80x608p16x16-220930-mr7_enconly 2 5 7 126 | bash ../bat/m2d_ftcircor_rand.sh m2d_vit_base-80x608p16x16-220930-mr7_enconly 3 5 7 127 | 128 | bash ../bat/ast_ftcircor_noaug.sh 1 5 42 129 | bash ../bat/ast_ftcircor_noaug.sh 2 5 42 130 | bash ../bat/ast_ftcircor_noaug.sh 3 5 42 131 | 132 | bash ../bat/byola_ftcircor_noaug.sh 1 5 42 133 | bash ../bat/byola_ftcircor_noaug.sh 2 5 42 134 | bash ../bat/byola_ftcircor_noaug.sh 3 5 42 135 | 136 | bash ../bat/cnn14_ftcircor_noaug.sh 1 5 42 137 | bash ../bat/cnn14_ftcircor_noaug.sh 2 5 42 138 | bash ../bat/cnn14_ftcircor_noaug.sh 3 5 42 139 | ``` 140 | -------------------------------------------------------------------------------- /superb/upstream/m2d/expert.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- # 2 | """*********************************************************************************************""" 3 | # FileName [ upstream/m2d/expert.py ] 4 | # Synopsis [ the Masked Modeling Duo (M2D) wrapper ] 5 | """*********************************************************************************************""" 6 | 7 | 8 | ############### 9 | # IMPORTATION # 10 | ############### 11 | import math 12 | #-------------# 13 | import torch 14 | import torch.nn as nn 15 | from torch.nn.utils.rnn import pad_sequence 16 | #-------------# 17 | from .m2d.m2d.runtime_audio import RuntimeM2D 18 | 19 | 20 | class RunningMean: 21 | """Running mean calculator for arbitrary axis configuration. 
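    Each put(x) call updates the mean incrementally as mu += (x.mean(axis) - mu) / n, so no sample history needs to be stored.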
22 | Borrowed from https://github.com/nttcslab/byol-a/blob/master/v2/byol_a2/augmentations.py#L147 23 | """ 24 | 25 | def __init__(self, axis): 26 | self.n = 0 27 | self.axis = axis 28 | 29 | def put(self, x): 30 | # https://math.stackexchange.com/questions/106700/incremental-averageing 31 | self.n += 1 32 | if self.n == 1: 33 | self.mu = x.mean(self.axis, keepdims=True) 34 | else: 35 | self.mu += (x.mean(self.axis, keepdims=True) - self.mu) / self.n 36 | 37 | def __call__(self): 38 | return self.mu 39 | 40 | def __len__(self): 41 | return self.n 42 | 43 | 44 | class RunningVariance: 45 | """Calculate mean/variance of tensors online. 46 | Thanks to https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance 47 | Borrowed from https://github.com/nttcslab/byol-a/blob/master/v2/byol_a2/augmentations.py#L147 48 | """ 49 | 50 | def __init__(self, axis, mean): 51 | self.update_mean(mean) 52 | self.s2 = RunningMean(axis) 53 | 54 | def update_mean(self, mean): 55 | self.mean = mean 56 | 57 | def put(self, x): 58 | self.s2.put((x - self.mean) **2) 59 | 60 | def __call__(self): 61 | return self.s2() 62 | 63 | def std(self): 64 | return self().sqrt() 65 | 66 | 67 | class RunningNorm(nn.Module): 68 | """Online Normalization using Running Mean/Std. 69 | Borrowed from https://github.com/nttcslab/byol-a/blob/master/v2/byol_a2/augmentations.py#L147 70 | This module will only update the statistics up to the specified number of epochs. 71 | After the `max_update_epochs`, this will normalize with the last updated statistics. 72 | Args: 73 | epoch_samples: Number of samples in one epoch 74 | max_update_epochs: Number of epochs to allow update of running mean/variance. 75 | axis: Axis setting used to calculate mean/variance. 76 | """ 77 | 78 | def __init__(self, epoch_samples, max_update_epochs=10, axis=[1, 2]): 79 | super().__init__() 80 | self.max_update = epoch_samples * max_update_epochs 81 | self.ema_mean = RunningMean(axis) 82 | self.ema_var = RunningVariance(axis, 0) 83 | self.reported = False 84 | 85 | def forward(self, image): 86 | if len(self.ema_mean) < self.max_update: 87 | self.ema_mean.put(image) 88 | self.ema_var.update_mean(self.ema_mean()) 89 | self.ema_var.put(image) 90 | self.mean = self.ema_mean() 91 | self.std = torch.clamp(self.ema_var.std(), torch.finfo().eps, torch.finfo().max) 92 | elif not self.reported: 93 | self.reported = True 94 | logger.info(f'\n*** Running Norm has finished updates over {self.max_update} times, using the following stats from now on. ***\n mean={float(self.mean.view(-1))}, std={float(self.std.view(-1))}') 95 | logger.info(f'*** Please use these statistics in your model. EXIT... ***\n') 96 | exit(-1) 97 | return ((image - self.mean) / self.std) 98 | 99 | def __repr__(self): 100 | format_string = self.__class__.__name__ + f'(max_update={self.max_update},axis={self.ema_mean.axis})' 101 | return format_string 102 | 103 | 104 | ################### 105 | # UPSTREAM EXPERT # 106 | ################### 107 | class UpstreamExpert(nn.Module): 108 | """ 109 | The M2D wrapper 110 | """ 111 | 112 | def __init__( 113 | self, 114 | ckpt: str, 115 | model_config: str, 116 | feature_d: int, 117 | window_secs: float = (160 * 16) / 16000, 118 | stride_secs: float = (160 * 16) / 16000, 119 | norm_mean: float = None, # Has to be a float value to continue training. 120 | norm_std: float = None, # The same as above. 
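        # Any additional keyword arguments are accepted for compatibility but not used.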
121 |         **kwargs,
122 |     ):
123 |         super(UpstreamExpert, self).__init__()
124 | 
125 |         # Normalizer
126 |         if norm_mean is None or norm_std is None:
127 |             # ** CAUTION **
128 |             # ** Please note that here we calculate statistics using RunningNorm and will exit early in the training. **
129 |             # ** CAUTION **
130 |             self.norm = RunningNorm(epoch_samples=10_000, max_update_epochs=1, axis=[0, 1, 2, 3]) # Use single scalar mean/std values.
131 |         else:
132 |             print(f'*** Using normalization statistics: mean={norm_mean}, std={norm_std} ***')
133 |             self.norm = lambda x: (x - norm_mean) / norm_std
134 | 
135 | 
136 |         # Load pretrained weights.
137 |         self.model = RuntimeM2D(weight_file=ckpt)
138 | 
139 |         # attributes
140 |         self.output_dim = self.model.cfg.feature_d
141 |         self.max_input_length = 1024 # self.model.cfg.input_size[1]
142 | 
143 |     # Interface
144 |     def get_output_dim(self):
145 |         return self.output_dim
146 | 
147 |     # Interface
148 |     def get_downsample_rates(self, key: str) -> int:
149 |         return 160 * self.model.cfg.patch_size[1] # hop_size x time frames
150 | 
151 |     def to_feature(self, batch_audio):
152 |         x = self.model.to_spec(batch_audio)
153 |         x = (x + torch.finfo().eps).log()
154 |         return x.unsqueeze(1) #.to(device)
155 | 
156 |     # Interface
157 |     def forward(self, wavs):
158 |         """
159 |         Args:
160 |             wavs:
161 |                 list of unpadded wavs [wav1, wav2, ...]
162 |                 each wav is in torch.FloatTensor with sample rate 16000
163 |                 and already put in the device assigned by command-line args
164 | 
165 |         Return:
166 |             features:
167 |                 list of unpadded features [feat1, feat2, ...]
168 |                 each feat is in torch.FloatTensor and already
169 |                 put in the device assigned by command-line args
170 |         """
171 |         wavs = pad_sequence(wavs, batch_first=True)
172 |         features = self.to_feature(wavs)
173 |         # normalize
174 |         features = self.norm(features)
175 |         # encode
176 |         layered_features = self.model.encode_lms(features, return_layers=True)
177 |         return {
178 |             "last_hidden_state": layered_features[-1],
179 |             "hidden_states": layered_features,
180 |         }
181 | 
182 | 
-------------------------------------------------------------------------------- /speech/README.md: --------------------------------------------------------------------------------
1 | ![key_visual](figure-github.jpg)
2 | 
3 | # Masked Modeling Duo for Speech (M2D-S)
4 | 
5 | This repository provides a demo implementation of "[Masked Modeling Duo for Speech: Specializing General-Purpose Audio Representation to Speech using Denoising Distillation](https://arxiv.org/abs/2305.14079)."
6 | 
7 | - [x] Code for pre-training and pre-processing LS-960 features.
8 | - [x] Pre-trained weights.
9 | - [x] [SUPERB](https://arxiv.org/abs/2105.01051) evaluation code and instructions.
10 | 
11 | ## 1. Getting Started
12 | 
13 | For installation, follow the instructions in the ["1. Getting Started" in the main README.md](../README.md#1-getting-started).
14 | 
15 | For evaluating on SUPERB, refer to [superb/upstream/m2d/README.md](../superb/upstream/m2d/README.md).
16 | 
17 | ## 2. Pre-trained weights
18 | 
19 | Find pre-trained weight files in [releases](https://github.com/nttcslab/m2d/releases).
20 | 21 | - M2D-S T=4.0s: m2d_s_vit_base-80x400p80x2-230201 22 | - M2D-S T=5.12s: m2d_s_vit_base-80x512p80x2-230301 23 | - M2D-S T=6.08s: m2d_s_vit_base-80x608p80x2-230220 24 | 25 | | Model | Pre-trained dataset | PR | KS | IC | SID | ER | ENV | MUS | 26 | |----------|----------------------|-------|-------|-------|-------|-------|-----------|----------| 27 | | M2D-S T=4.0s | LS-960+AS | 5.72 | 96.47 | 97.80 | 81.97 | 66.36 | _53.22_ | _41.71_ | 28 | | M2D-S T=5.12s | LS-960+AS | 5.64 | 96.87 | 97.65 | 80.69 | 65.35 | _57.34_ | _43.23_ | 29 | | M2D-S T=6.08s | LS-960+AS | 5.33 | 96.80 | 97.63 | 81.74 | 66.13 | _54.77_ | _43.75_ | 30 | 31 | 32 | ## 3. Pre-training from Scratch 33 | 34 | ### 3-1. Pre-processing data files 35 | 36 | M2D-S learns from the following pre-processed files using LibriSpeech (LS-960) and HuBERT-base pre-trained model. 37 | 38 | - `data/ls960_hybrid7s_hubaseL9`: Pre-processed data consists of log-mel spectrogram samples converted from LS-960 and HuBERT layer #9 features encoded from LS-960. 39 | - `data/files_ls960_hybrid.csv`: List of pre-processed files of the `ls960_hybrid7s_hubaseL9` folder. 40 | 41 | M2D-S also requires AudioSet as a background noise. 42 | 43 | - `data/audioset_lms`: Pre-processed log-mel spectrogram samples from AudioSet, as in the original M2D. 44 | - `data/files_audioset.csv`: List of pre-processed AudioSet files, as in the original M2D. 45 | 46 | #### 3-1-1. LS-960 data files 47 | 48 | The following command line will create `data/ls960_hybrid7s_hubaseL9` and `data/files_ls960_hybrid.csv`. 49 | 50 | ``` 51 | python speech/extract_offline_ls960.py /path/to/LibriSpeech 52 | ``` 53 | 54 | #### 3-1-2. AudioSet data files 55 | 56 | For preparing AudioSet data files (`data/audioset_lms` and `data/files_audioset.csv`), please follow the [data/README.md](../data/README.md). 57 | 58 | ### 3-2. Pre-training 59 | 60 | The `train_speech.py` pre-trains for speech. 61 | 62 | The following example would run on any affordable GPU, consuming only 7,170MiB. However, please note that it will take very long (It took over 20 minutes for one epoch). 63 | You can also change the BG noise dataset by adding `--csv_bg_noise data/files_f_s_d_5_0_k.csv`, for example. 64 | 65 | ```sh 66 | python -m speech.train_speech --loss_m2d 1. --loss_off 1. --input_size 80x208 --patch_size 80x4 --noise_ratio 0.2 --batch_size 128 --accum_iter 16 67 | ``` 68 | 69 | The followings are for pre-training high-end models, taking 2.5-3.5 days to complete with 4 A100s. 70 | 71 | ```sh 72 | OMP_NUM_THREADS=1 torchrun --nproc_per_node=4 -m speech.train_speech --loss_m2d 1. --loss_off .5 --input_size 80x400 --patch_size 80x2 --noise_ratio 0.2 73 | OMP_NUM_THREADS=1 torchrun --nproc_per_node=4 -m speech.train_speech --loss_m2d 1. --loss_off .5 --input_size 80x512 --patch_size 80x2 --noise_ratio 0.2 --batch_size 256 --accum_iter 2 74 | OMP_NUM_THREADS=1 torchrun --nproc_per_node=4 -m speech.train_speech --loss_m2d 1. --loss_off .5 --input_size 80x608 --patch_size 80x2 --noise_ratio 0.2 --batch_size 256 --accum_iter 2 75 | ``` 76 | 77 | #### 3-2-1. Major pre-training options 78 | 79 | - --batch_size: Batch size per GPU, 512 by default. 80 | - --epochs: Training epochs, 1000 by default. 81 | - --accum_iter: Iterations to accumulate gradients, 1 by default. 82 | - --input_size: Input spectrogram size, 80x208 by default. 83 | - --patch_size: Patch size, 80x4 by default. 84 | - --mask_ratio: Masking ratio, 0.6 by default. 85 | - --loss_m2d: Loss ratio for M2D masked prediction, 1.0 by default. 
86 | - --loss_off: Loss ratio for offline target, 0.0 by default. 87 | - --blr: Base learning rate: absolute_lr = base_lr * total_batch_size / 256. 88 | - --csv_main: A CSV file to list sample files in the main dataset, 'data/files_ls960_hybrid.csv' by default. 89 | - --csv_bg_noise: A CSV file to list sample files in the BG noise dataset, 'data/files_audioset.csv' by default. 90 | - --noise_ratio: Noise mixing ratio, 0.2 by default. 91 | 92 | ## 4. SUPERB Evaluation 93 | 94 | We provide upstream wrapper implementation, which you can import to your [SUPERB](https://arxiv.org/abs/2105.01051) environment. 95 | 96 | - Copy the `superb/upstream/m2d` folder under your `s3prl/upstream` folder. 97 | - Make a symbolic link to your copy of M2D repository under your `s3prl/upstream/m2d`, making `s3prl/upstream/m2d/m2d`. The wrapper files will find M2D programs under this symbolic link. 98 | - You will need to run `pip install -e .` under your `s3prl` folder, so that you install your local SUPERB in your Python environment. 99 | 100 | Please refer to [superb/upstream/m2d/README.md](../superb/upstream/m2d/README.md) for more details. 101 | 102 | ## Acknowledgements 103 | 104 | - Our code is based on the [MAE PyTorch/GPU re-implementation](https://github.com/facebookresearch/mae) of the paper [Masked Autoencoders Are Scalable Vision Learners](https://openaccess.thecvf.com/content/CVPR2022/html/He_Masked_Autoencoders_Are_Scalable_Vision_Learners_CVPR_2022_paper.html). 105 | - We use [nnAudio](https://ieeexplore.ieee.org/document/9174990) ([KinWaiCheuk/nnAudio](https://github.com/KinWaiCheuk/nnAudio)) for converting raw audio into log-mel spectrogram. 106 | - We use [Hugging Face Transformers](https://huggingface.co/docs/transformers/index) for the implementation and pre-trained weights of the [HuBERT](https://ieeexplore.ieee.org/document/9585401) model. 107 | 108 | We appreciate these publicly available resources. 109 | 110 | ## References 111 | 112 | If you find our M2D-S useful in your research, please consider citing our paper: 113 | 114 | ```BibTeX 115 | @article{niizumi2023m2d4speech, 116 | title = {{Masked Modeling Duo for Speech: Specializing General-Purpose Audio Representation to Speech using Denoising Distillation}}, 117 | author = {Daisuke Niizumi and Daiki Takeuchi and Yasunori Ohishi and Noboru Harada and Kunio Kashino}, 118 | journal = {to appear at Interspeech}, 119 | year = {2023}, 120 | url = {https://arxiv.org/abs/2305.14079} 121 | } 122 | ``` 123 | 124 | - SUPERB: *[Shu-wen Yang, Po-Han Chi, Yung-Sung Chuang, Cheng-I Jeff Lai, Kushal Lakhotia, Yist Y. Lin, Andy T. Liu, Jiatong Shi, Xuankai Chang, Guan-Ting Lin, Tzu-Hsien Huang, Wei-Cheng Tseng, Ko-tik Lee, Da-Rong Liu, Zili Huang, Shuyan Dong, Shang-Wen Li, Shinji Watanabe, Abdelrahman Mohamed, and Hung-yi Lee, "SUPERB: Speech Processing Universal PERformance Benchmark," Interspeech, 2021](https://arxiv.org/abs/2105.01051).* 125 | - https://github.com/s3prl/s3prl/blob/main/s3prl/downstream/docs/superb.md 126 | - HuBERT: *[W.-N. Hsu, B. Bolte, Y.-H. H. Tsai, K. Lakhotia, R. Salakhutdinov, and A. Mohamed, “HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units,” IEEE/ACM Trans. 
Audio, Speech, Language Process., p.3451–3460, 2021](https://ieeexplore.ieee.org/document/9585401).* 127 | -------------------------------------------------------------------------------- /superb/upstream/m2d/README.md: -------------------------------------------------------------------------------- 1 | # Masked Modeling Duo (M2D) upstream model for SUPERB 2 | 3 | Masked Modeling Duo for Speech: Specializing General-Purpose Audio Representation to Speech using Denoising Distillation 4 | https://arxiv.org/abs/2305.14079 5 | 6 | This is an M2D wrapper for SUPERB, and evaluating M2D on SUPERB involves two steps: 7 | 8 | - Calculating normalization statistics first, as M2D requires the average and standard deviation of the downstream task dataset. 9 | - Evaluating M2D on SUPERB using the calculated statistics. 10 | 11 | ## Installation 12 | 13 | - Copy the `superb/upstream/m2d` folder under your `s3prl/upstream` folder. 14 | - Create a copy of entire M2D repository under your `s3prl/upstream/m2d`, or make a symbolic link instead. The wrapper `expert.py` will find M2D programs in the folder. 15 | - Edit your `s3prl/hub.py` to add `from s3prl.upstream.m2d.hubconf import *`. 16 | 17 | The expected folders/files are as follows: 18 | 19 | your_s3prl/ 20 | s3prl/ 21 | upstream/ 22 | m2d/ 23 | __init__.py 24 | expert.py 25 | hubconf.py 26 | README.md 27 | m2d/ 28 | (all the M2D contents should be here) 29 | hub.py (should have `from s3prl.upstream.m2d.hubconf import *`) 30 | 31 | You might also need to run `pip install -e .` under your `s3prl` folder, so that you install your local SUPERB in your Python environment. 32 | 33 | Here is an example of installing fresh SUPERB under your copy of the M2D repository. 34 | 35 | git clone https://github.com/s3prl/s3prl.git 36 | ln -s ../../../superb/upstream/m2d s3prl/s3prl/upstream/ 37 | ln -s ../../.. s3prl/s3prl/upstream/m2d/m2d 38 | pip install tensorboardX catalyst 39 | cd s3prl/s3prl 40 | (Now edit hub.py to add the following line.) 41 | from s3prl.upstream.m2d.hubconf import * 42 | cd .. (move to your_m2d/s3prl) 43 | pip install -e . 44 | 45 | After these steps, your SUPERB should accept the following evaluation steps. 46 | 47 | ## Step 1. Pre-compute statistics on a downstream task 48 | 49 | We need statistics for each downstream task. 50 | 51 | Use the upstream `m2d_calcnorm` to calculate statistics. Example with a downstream task `voxceleb1` (SID): 52 | 53 | python run_downstream.py -m train -n m2d_calcnorm_1 -u m2d_calcnorm -d voxceleb1 54 | 55 | This will output: 56 | 57 | *** Running Norm has finished updates over 10000 times, using the following stats from now on. *** 58 | mean=-10.571270942687988, std=4.3681135177612305 59 | *** Please use these statistics in your model. EXIT... *** 60 | 61 | These `-10.571270942687988` and `4.3681135177612305` are the statistics for the `voxceleb1` (SID). 62 | 63 | ## Step 2. Run your evaluation on the downstream task 64 | 65 | Use the upstream `m2d` to evaluate your weights with the statistics calculated in the step above. 
66 | Here an example of testing m2d_s_vit_base-80x608p80x2-230220 using `voxceleb1` (SID): 67 | 68 | python run_downstream.py -m train -n m2d_vc1_1 -u m2d -d voxceleb1 -k /your/m2d_s_vit_base-80x608p80x2-230220/checkpoint-1000.pth,-10.571271,4.3681135 69 | python run_downstream.py -m evaluate -e result/downstream/m2d_vc1_1/dev-best.ckpt 70 | 71 | ## Examples 72 | 73 | These are the scripts used for evaluating "[Masked Modeling Duo for Speech: Specializing General-Purpose Audio Representation to Speech using Denoising Distillation](https://arxiv.org/abs/2305.14079)." 74 | 75 | For example, we run the following to evaluate a weight `m2d_s_vit_base-80x608p80x2-230220/checkpoint-1000.pth` on KS. 76 | 77 | ./ks.sh 0 m2d /your/m2d_s_vit_base-80x608p80x2-230220/checkpoint-1000.pth 7 78 | 79 | The `0` is a GPU number, `m2d` is an upstream name, and the last `7` is a random seed. 80 | This command line will run the following two Python commands: 81 | 82 | CUDA_VISIBLE_DEVICES=0 python run_downstream.py -m train -n m2d_s_vit_base-80x608p80x2-230220-checkpoint-1000-KS-lr1e-4-s7 -u m2d -d speech_commands -o config.optimizer.lr=1e-4 -k /your/m2d_s_vit_base-80x608p80x2-230220/checkpoint-1000.pth,-11.506255149841309,4.314857482910156 --seed 7 83 | CUDA_VISIBLE_DEVICES=0 python run_downstream.py -m evaluate -e result/downstream/m2d_s_vit_base-80x608p80x2-230220-checkpoint-1000-KS-lr1e-4-s7/dev-best.ckpt 84 | 85 | 86 | ### ER (er.sh) 87 | 88 | ```sh 89 | gpu=$1 90 | upmodel=$2 91 | ckpt=$3 92 | lr=1e-5 93 | task=ER 94 | seed=$4 95 | 96 | parentpath=$(dirname $ckpt) 97 | parent=$(basename $parentpath) 98 | ckptbase=$(basename $ckpt) 99 | ckptstem=${ckptbase%.*} 100 | expbase=$parent-$ckptstem 101 | 102 | for test_fold in fold1 fold2 fold3 fold4 fold5; 103 | do 104 | expname=$expbase-$task-lr$lr-s$seed-$test_fold 105 | echo $expname 106 | CUDA_VISIBLE_DEVICES=$gpu python run_downstream.py -m train -n $expname -u $upmodel -d emotion -c downstream/emotion/config.yaml -o "config.optimizer.lr=$lr,, config.downstream_expert.datarc.test_fold='$test_fold'" -k $ckpt,-13.037399291992188,3.619741439819336 --seed $seed 107 | CUDA_VISIBLE_DEVICES=$gpu python run_downstream.py -m evaluate -e result/downstream/$expname/dev-best.ckpt 108 | done 109 | ``` 110 | 111 | ### IC (ic.sh) 112 | 113 | ```sh 114 | gpu=$1 115 | upmodel=$2 116 | ckpt=$3 117 | lr=1e-3 118 | task=IC 119 | seed=$4 120 | 121 | parentpath=$(dirname $ckpt) 122 | parent=$(basename $parentpath) 123 | ckptbase=$(basename $ckpt) 124 | ckptstem=${ckptbase%.*} 125 | expbase=$parent-$ckptstem 126 | 127 | expname=$expbase-$task-lr$lr-s$seed 128 | 129 | echo $expname 130 | CUDA_VISIBLE_DEVICES=$gpu python run_downstream.py -m train -n $expname -u $upmodel -d fluent_commands -o "config.optimizer.lr=$lr" -k $ckpt,-13.017439842224121,4.417759895324707 --seed $seed 131 | CUDA_VISIBLE_DEVICES=$gpu python run_downstream.py -m evaluate -e result/downstream/$expname/dev-best.ckpt 132 | ``` 133 | 134 | ### KS (ks.sh) 135 | 136 | ```sh 137 | gpu=$1 138 | upmodel=$2 139 | ckpt=$3 140 | lr=1e-4 141 | task=KS 142 | seed=$4 143 | 144 | parentpath=$(dirname $ckpt) 145 | parent=$(basename $parentpath) 146 | ckptbase=$(basename $ckpt) 147 | ckptstem=${ckptbase%.*} 148 | expbase=$parent-$ckptstem 149 | 150 | expname=$expbase-$task-lr$lr-s$seed 151 | 152 | echo $expname 153 | CUDA_VISIBLE_DEVICES=$gpu python run_downstream.py -m train -n $expname -u $upmodel -d speech_commands -o "config.optimizer.lr=$lr" -k $ckpt,-11.506255149841309,4.314857482910156 --seed $seed 154 | 
CUDA_VISIBLE_DEVICES=$gpu python run_downstream.py -m evaluate -e result/downstream/$expname/dev-best.ckpt 155 | ``` 156 | 157 | ### PR (pr.sh) 158 | 159 | ```sh 160 | gpu=$1 161 | upmodel=$2 162 | ckpt=$3 163 | lr=1e-3 164 | task=PR 165 | seed=$4 166 | 167 | parentpath=$(dirname $ckpt) 168 | parent=$(basename $parentpath) 169 | ckptbase=$(basename $ckpt) 170 | ckptstem=${ckptbase%.*} 171 | expbase=$parent-$ckptstem 172 | 173 | expname=$expbase-$task-lr$lr-s$seed 174 | 175 | echo $expname 176 | CUDA_VISIBLE_DEVICES=$gpu python run_downstream.py -m train -n $expname -u $upmodel -d ctc -c downstream/ctc/libriphone.yaml -o "config.optimizer.lr=$lr" -k $ckpt,-10.43253231048584,4.241369724273682 --seed $seed 177 | CUDA_VISIBLE_DEVICES=$gpu python run_downstream.py -m evaluate -n $expname -d ctc -e result/downstream/$expname/dev-best.ckpt 178 | ``` 179 | 180 | ### SID (sid.sh) 181 | 182 | ```sh 183 | gpu=$1 184 | upmodel=$2 185 | ckpt=$3 186 | lr=1e-3 187 | task=SID 188 | seed=$4 189 | 190 | parentpath=$(dirname $ckpt) 191 | parent=$(basename $parentpath) 192 | ckptbase=$(basename $ckpt) 193 | ckptstem=${ckptbase%.*} 194 | expbase=$parent-$ckptstem 195 | 196 | expname=$expbase-$task-lr$lr-s$seed 197 | 198 | echo $expname 199 | CUDA_VISIBLE_DEVICES=$gpu python run_downstream.py -m train -n $expname -u $upmodel -d voxceleb1 -o "config.optimizer.lr=$lr" -k $ckpt,-10.571271,4.3681135 --seed $seed 200 | CUDA_VISIBLE_DEVICES=$gpu python run_downstream.py -m evaluate -n $expname -d voxceleb1 -e result/downstream/$expname/dev-best.ckpt 201 | ``` 202 | 203 | -------------------------------------------------------------------------------- /Guide_app.md: -------------------------------------------------------------------------------- 1 | # M2D/M2D-X Application Guide (update: May 25, 2024) 2 | 3 | While our papers provide the details of methods, more is needed to guide how to use them in new applications, especially for the pre-training for each purpose. 4 | Here are guides based on the experiences and the information found afterward. 5 | 6 | CAUTION: This guide does not provide complete information covering many use cases because we are not working on many applications. Therefore, it is subject to change according to the new information/experience gained. 7 | 8 | ## 1. Transfer learning only, no pre-training 9 | If you load the pre-trained weight and use the encoder for fine-tuning as a feature extractor, you may choose a weight from ["Pre-trained/Fine-tuned Weights"](README.md#pre-trainedfine-tuned-weights): 10 | 11 | - "M2D-AS fine-tuned on AS2M" or "M2D/0.7 fine-tuned on AS2M" -- If your application setting is closer to the AudioSet ontology, including typical audio tagging (AT), sound event detection (SED), and audio captioning. 12 | - "M2D-AS fine-tuned on AS2M@32kHz" -- If application data needs higher frequency. 13 | - "M2D/0.7", "M2D/0.6", "M2D-AS", or "M2D-AS@32kHz" -- General-purpose weights. If the application domain is far from AudioSet, such as medical or industrial (e.g., factory) sound, or if it is uncertain. 14 | - "M2D-S" -- Weights for speech tasks. 15 | 16 | 17 | ## 2. Pre-training on your data 18 | 19 | ### 2.1 Pre-training strategy choice 20 | 21 | Effective pre-training depends on the available dataset and computing resources. 22 | 23 | ![chart](image-AppGuideChart.png) 24 | 25 | Possible choices: 26 | 27 | - Used the `fL` (AudioSet or LibriSpeech pre-trained weights) as they are -- The provided weights could be effective. 
28 | - Pre-training on `XLd` (a large in-domain dataset) from scratch -- As in speech, in-domain pre-training may be possible.
29 | - Further pre-training on `Xapp` (an application dataset).
30 | - If your `Xapp` is large enough (>1000h), pre-training from scratch on `Xapp` may be effective.
31 | 
32 | ### 2.2 Base weight choice
33 | 
34 | A weight closer to the application domain may be effective.
35 | 
36 | - AudioSet pre-trained weights (M2D pre-training) "M2D/0.7" or "M2D/0.6" -- For general non-speech tasks. A respiratory sound task may be non-speech.
37 |     - e.g., m2d_vit_base-80x608p16x16-221006-mr7
38 | - AudioSet pre-trained weights (M2D-AS pre-training) "M2D-AS" -- For typical audio captioning, audio tagging, and sound event detection tasks.
39 |     - e.g., m2d_as_vit_base-80x608p16x16-240213
40 | - LibriSpeech pre-trained weights "M2D-S" -- For speech tasks. Note that AudioSet weights may be more effective even for some sounds seemingly closer to speech, such as respiratory sounds.
41 |     - e.g., m2d_s_vit_base-80x400p80x2-230201 or m2d_s_vit_base-80x608p80x2-230220; starting with the 80x400 model would make your experiment easier.
42 | 
43 | ### 2.3 Parameter setting
44 | 
45 | #### 2.3.1 Pre-training from scratch
46 | 
47 | Practically, training from scratch may require >100K samples and multiple GPUs. Here's the command line we use to train an M2D model.
48 | 
49 | ```sh
50 | OMP_NUM_THREADS=1 torchrun --nproc_per_node=4 train_audio.py --input_size 80x608 --patch_size 16x16 --epochs 300 --batch_size 512 --accum_iter 1 --save_freq 50 --seed 3 --model m2d_vit_base --csv_main data/files_audioset.csv --data_path /path/to/your/data --loss_off 0.
51 | ```
52 | 
53 | The parameters that specifically matter for your purpose:
54 | 
55 | - `--epochs 300 --batch_size 512 --accum_iter 1` -- The combination of these parameters, the learning rate, and the EMA decay parameters matters. The number of epochs can be adjusted, although longer training does not always yield better results. Set the effective batch size to 2048 according to your GPU resources (this example uses 4 GPUs with a batch size of 512 each). Following these guides, you may not need to change the learning rate and EMA parameters. The successful settings we have used so far:
56 |     - bs=2048 & epochs=300 for AudioSet 2M samples.
57 |     - bs=2048 & epochs=1000 for LibriSpeech 281k samples.
58 | - `--csv_main data/files_audioset.csv` -- You may set your data list here.
59 | - `--data_path /path/to/your/data` -- You may set your data folder. I explicitly set this to a fast storage device.
60 | - `--loss_off 0.` -- No offline loss.
61 | 
62 | 
63 | #### 2.3.2 Further pre-training
64 | 
65 | Further pre-training, which continues pre-training an already pre-trained model on your data, may be the better choice when your data is small, such as <10K samples.
66 | (We have yet to determine how much data is needed to pre-train from scratch rather than do further pre-training.)
67 | 
68 | Here's the command line we use to train an M2D-X model for ICBHI 2017 (see our TASLP paper for the details).
69 | 
70 | ```sh
71 | CUDA_VISIBLE_DEVICES=0 python train_audio.py --epochs 600 --resume m2d_vit_base-80x200p16x4-230529/checkpoint-300.pth --model m2d_x_vit_base --input_size 80x200 --patch_size 16x4 --batch_size 64 --accum_iter 2 --csv_main data/files_icbhi2017.csv --csv_bg_noise data/files_f_s_d_5_0_k.csv --noise_ratio 0.3 --save_freq 100 --eval_after 600 --seed 6 --teacher m2d_vit_base-80x200p16x4-230529/checkpoint-300.pth --blr 3e-4 --loss_off 1.
--min_ds_size 10000 72 | ``` 73 | 74 | The parameters specifically matter for your purpose: 75 | 76 | - `--epochs 600 --batch_size 64 --accum_iter 2` -- The combination of these parameters matters. The epochs could be adjusted. Set the effective batch size to 128 according to your GPU resources (This example uses a GPU with a batch size of 64 and accumulating loss twice). Following these guides, you may not need to change the learning rate and EMA parameters. The successful settings we have used so far: 77 | - bs=128 & epochs=600 for 10k samples. (We virtually increased up to 10k by repeating the list of 1k samples.) 78 | - `--resume m2d_vit_base-80x200p16x4-230529/checkpoint-300.pth ` -- It initializes the online encoder weights using the pre-trained weight. 79 | - `--teacher m2d_vit_base-80x200p16x4-230529/checkpoint-300.pth` -- We also use the model as an offline teacher encoder. 80 | - `--model m2d_x_vit_base` -- Set the pre-training framework as M2D-X. 81 | - `--input_size 80x200 --patch_size 16x4` -- You explicitly need to set them when using non-default parameters. 82 | - `--csv_main data/files_icbhi2017.csv` -- You may set your data list here. 83 | - `--csv_bg_noise data/files_f_s_d_5_0_k.csv` -- Set the BG noise data list here when you set the noise ratio to >0.0. 84 | - `--noise_ratio 0.3` -- Set the mixing ratio of the BG noise. The 0.3 will mix data for main/BG with a proportion of 0.7/0.3. 85 | - `--eval_after 600` -- We skip the evaluation of the checkpoints after the epoch of 600; the intermediate checkpoints will not be tested. 86 | - `--blr 3e-4` -- The default is `3e-4`, so we just set it here in case we want to adjust. 87 | - `--loss_off 1.` -- The offline loss ratio for M2D-X. 88 | - `--min_ds_size 10000` -- We virtually increase the number of samples to 10k by repeating the list of 1k samples. 89 | 90 | #### Example use case: Further pre-training with 2 GPUs, bs=32, 50k data samples 91 | 92 | This is an example command line for two small GPUs that can accommodate a batch size of 32. 93 | 94 | ```sh 95 | CUDA_VISIBLE_DEVICES=0,1 torchrun --nproc_per_node=2 train_audio.py --epochs 600 --warmup_epochs 24 --resume m2d_vit_base-80x608p16x16-221006-mr7/checkpoint-300.pth --model m2d_x_vit_base --batch_size 32 --accum_iter 4 --csv_main __your__.csv --csv_bg_noise data/files_f_s_d_5_0_k.csv --noise_ratio 0.01 --save_freq 100 --eval_after 600 --seed 3 --teacher m2d_vit_base-80x608p16x16-221006-mr7/checkpoint-300.pth --blr 3e-4 --loss_off 1. 96 | ``` 97 | 98 | We set options for the setup. 99 | 100 | - `torchrun --nproc_per_node=2` -- For the distributed training using two GPUs. 101 | - `--batch_size 32 --accum_iter 4` -- For the effective batch size of 128. 102 | - Removed `--min_ds_size 10000` because the number of data samples would be enough to form an epoch. 103 | 104 | ## 3. Notes on pre-training 105 | 106 | #### a. Batch size, learning rate scheduling, and EMA 107 | 108 | The M2D combines the masked prediction pretext task with recent SSL techniques, such as the offline network updated by EMA and the annealing learning rate schedule, making the pre-training settings somewhat tricky. Here are some related notes. 109 | 110 | - How the offline target encoder evolves relates to the effective batch size because the system updates it by EMA every time batch samples are consumed. Thus, the effective batch size (batch size by accum_iter) matters for gaining a useful training signal created by the offline encoder. 
111 | - Other factors that affect the offline target encoder are the number of epochs and the data size. 112 | - In summary, the effective batch size, number of epochs, data size, and EMA parameters control how we get good training signals. 113 | 114 | However, searching for a set of these parameters takes time. Thus, using a similar set of parameters known to be effective is recommended. 115 | 116 | -------------------------------------------------------------------------------- /speech/speech_dataset.py: -------------------------------------------------------------------------------- 1 | """Dataset for Speech 2 | 3 | Masked Modeling Duo for Speech: Specializing General-Purpose Audio Representation to Speech using Denoising Distillation 4 | https://arxiv.org/abs/2305.14079 5 | 6 | ## Data files 7 | 8 | All the data samples used here are expected to be `.npz` preprocessed contents. 9 | Please find the details in `README.md` and preprocessor `extract_offline_ls960.py`. 10 | 11 | """ 12 | 13 | import numpy as np 14 | from pathlib import Path 15 | import torch 16 | 17 | from audio_dataset import SpectrogramDataset, get_files 18 | 19 | 20 | def log_mixup_exp(xa, xb, alpha): 21 | xa = xa.exp() 22 | xb = xb.exp() 23 | x = alpha * xa + (1. - alpha) * xb 24 | return torch.log(torch.max(x, torch.finfo(x.dtype).eps*torch.ones_like(x))) 25 | 26 | 27 | class SpeechHybridDataset(SpectrogramDataset): 28 | def __init__(self, folder, files, crop_size, norm_stats=None, 29 | random_crop=True, n_norm_calc=20000, 30 | patch_len=None): 31 | assert (crop_size[1] % 2) == 0, f'Crop frames has to be multiple of 2 (frames=100Hz vs embeddings=50Hz): {crop_size}' 32 | self.raw_emb_len = crop_size[1] // 2 # frames=100Hz vs embeddings=50Hz 33 | self.emb_len = crop_size[1] // patch_len 34 | self.patch_len = patch_len 35 | 36 | super().__init__(folder=folder, files=files, crop_frames=crop_size[1], norm_stats=norm_stats, 37 | random_crop=random_crop, n_norm_calc=n_norm_calc) 38 | 39 | def get_raw_data(self, index): 40 | filename = self.folder/self.df.file_name.values[index] 41 | try: 42 | hybrid = np.load(str(filename)) 43 | except: 44 | assert False, f'Failed to load: {filename}' 45 | lms = torch.tensor(hybrid['arr_0']) 46 | emb = torch.tensor(hybrid['arr_1']) 47 | raw_emb_len = torch.tensor(hybrid['arr_2']) 48 | 49 | # original sample is shorter than crop duration 50 | if raw_emb_len < self.raw_emb_len: 51 | raw_emb_len = self.raw_emb_len 52 | # emb_len has to be the multiple of patch_len 53 | if (raw_emb_len % self.patch_len) > 0: 54 | raw_emb_len = int(raw_emb_len / self.patch_len) * self.patch_len 55 | assert raw_emb_len >= self.raw_emb_len, f'{raw_emb_len} {self.raw_emb_len}' 56 | 57 | emb = emb[:, :raw_emb_len, :] 58 | lms = lms[:, :, :raw_emb_len * 2] # ensure lms length matches emb length, *2 = frames=100Hz vs embeddings=50Hz 59 | 60 | return lms, emb.transpose(-1, -2) # emb: [1, T, D] -> [1, D, T] to make the same shape with lms. 
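
    # complete_data() below crops and normalizes the spectrogram via the parent class, crops the offline teacher
    # embeddings at the matching position, and, when a patch spans multiple embedding frames, averages them so that
    # one embedding aligns with one patch column.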
61 | 62 | def complete_data(self, lms, emb): 63 | # crop & normalize 64 | x = super().complete_audio(lms) 65 | j = self.last_crop_start 66 | if not hasattr(self, 'norm_stats'): 67 | return x # for norm_stats calculation 68 | 69 | # rescale the cut position j from LMS frame length to embedding length 70 | emb_j = (self.raw_emb_len * j) // self.crop_frames 71 | 72 | # crop embedding 73 | emb = emb[..., emb_j:emb_j + self.raw_emb_len] 74 | 75 | # shrink embeddings to match the patch length only when needed 76 | n_emb_per_patch = self.patch_len // 2 # 20ms per offline embedding 77 | if n_emb_per_patch > 1: 78 | _, D, T = emb.shape 79 | assert (T % n_emb_per_patch) == 0, f'T:{T} self.emb_len:{self.emb_len} n_emb_per_patch:{n_emb_per_patch} emb.shape:{emb.shape}' 80 | new_len = T // n_emb_per_patch 81 | emb = emb.reshape(1, D, new_len, n_emb_per_patch).mean(axis=-1) 82 | if new_len == 0: 83 | print(f'T:{T} self.emb_len:{self.emb_len} n_emb_per_patch:{n_emb_per_patch} emb.shape:{emb.shape}') 84 | 85 | # reshape to make it useful 86 | y = emb.transpose(-1, -2).squeeze(0) # [1, D, T] to [T, D] 87 | 88 | return x, y 89 | 90 | def __getitem__(self, index): 91 | lms, emb = self.get_raw_data(index) 92 | items = self.complete_data(lms, emb) 93 | return items 94 | 95 | 96 | import pandas as pd 97 | class SpeechHybridLabelDataset(SpeechHybridDataset): 98 | def __init__(self, folder, files, crop_size, norm_stats=None, 99 | random_crop=True, n_norm_calc=20000, 100 | label_csv='data/ls960_train_hubert_base_ls960_L9_km500.csv', n_classes=500, 101 | patch_len=None): 102 | super().__init__(folder, files, crop_size, norm_stats, random_crop, n_norm_calc, patch_len) 103 | 104 | df = pd.read_csv(label_csv) 105 | df['id_'] = [x.split('/')[-1][:-5] for x in df.file_name.values] 106 | df = df.sort_values('file_name') 107 | 108 | files_id_ = [f.split('/')[-1][:-4] for f in files] 109 | assert all(files_id_ == df.id_.values), 'Mismatch between LMS files and labels.' 110 | 111 | # convert label text into list of labels 112 | df['labels'] = [[int(x) for x in label.split(' ')] for label in df.labels.values] 113 | df['file_name'] = files 114 | self.df = df 115 | self.label_len = crop_size[1] // patch_len 116 | self.n_classes = n_classes 117 | 118 | def complete_data(self, lms, label): 119 | # crop & normalize 120 | x = super().complete_audio(lms) 121 | j = self.last_crop_start 122 | if not hasattr(self, 'norm_stats'): 123 | return x # for norm_stats calculation 124 | 125 | label_j = (self.label_len * j) // self.crop_frames 126 | assert (self.crop_frames % self.label_len) == 0, f'LMS frame length has to be multiple of label length.' 
127 | # convert label into one-hot encoding and shrink the label length 128 | # repeat the last label for short labels to ensure that the frame length matches the label length 129 | padded_frames = max(lms.shape[-1], self.crop_frames) 130 | n_patches = (padded_frames + self.patch_len - 1) // self.patch_len 131 | n_frames = n_patches * self.patch_len 132 | n_labels = (n_frames + 1) // 2 # frames=100Hz vs labels=50Hz 133 | if len(label) < n_labels: 134 | n_repeat = n_labels - len(label) 135 | label = label + ([label[-1]] * n_repeat) 136 | assert len(label) == n_labels 137 | # shrink labels to match the patch length 138 | onehot = np.eye(self.n_classes)[label] 139 | n_label_per_patch = self.patch_len // 2 140 | cur_len = len(label) 141 | new_len = cur_len // n_label_per_patch 142 | onehot = onehot.T.reshape(-1, new_len, n_label_per_patch).sum(axis=-1).T 143 | onehot = onehot / n_label_per_patch # values in a one-hot label should sum to 1. 144 | # crop label 145 | onehot = onehot[label_j:label_j + self.label_len, :] 146 | 147 | if onehot.shape[0] < self.label_len: 148 | print(onehot.shape, lms.shape, i, j, h, w, n_patches, n_frames, n_labels, cur_len, new_len, label_j, label_j + self.label_len) 149 | return x, torch.tensor(onehot).to(float) 150 | 151 | def __getitem__(self, index): 152 | filename = self.folder/self.df.file_name.values[index] 153 | try: 154 | hybrid = np.load(str(filename)) 155 | except: 156 | assert False, f'Failed to load: {filename}' 157 | lms = torch.tensor(hybrid['arr_0']) 158 | 159 | label = self.df.labels.values[index] if hasattr(self, 'norm_stats') else ['not needed'] 160 | items = self.complete_data(lms, label) 161 | return items 162 | 163 | 164 | class MixedSpeechDataset(torch.utils.data.Dataset): 165 | def __init__(self, base_folder, files_speech, files_bg_noise, crop_size, patch_len, noise_ratio=0.0, 166 | random_crop=True, n_norm_calc=10000, use_label=False) -> None: 167 | super().__init__() 168 | 169 | ds_cls = SpeechHybridLabelDataset if use_label else SpeechHybridDataset 170 | self.ds1 = ds_cls(folder=base_folder, files=files_speech, crop_size=crop_size, 171 | random_crop=random_crop, norm_stats=None, n_norm_calc=n_norm_calc//2, 172 | patch_len=patch_len) 173 | # disable normalizion scaling in the ds1 174 | self.norm_std = self.ds1.norm_stats[1] 175 | self.ds1.norm_stats = (self.ds1.norm_stats[0], 1.0) 176 | 177 | if noise_ratio > 0.0: 178 | self.ds2 = SpectrogramDataset(folder=base_folder, files=files_bg_noise, crop_frames=crop_size[1], 179 | random_crop=random_crop, norm_stats=None, n_norm_calc=n_norm_calc//2, repeat_short=True) 180 | self.ds2.norm_stats = (self.ds2.norm_stats[0], 1.0) # disable normalizion scaling in the ds2 181 | 182 | self.noise_ratio = noise_ratio 183 | self.bg_index = [] 184 | 185 | def __len__(self): 186 | return len(self.ds1) 187 | 188 | def __getitem__(self, index, fixed_noise=False): 189 | # load index sample 190 | sig, label = self.ds1[index] 191 | if self.noise_ratio > 0.0: 192 | # load random noise sample ### , while making noise floor zero 193 | noise = self.ds2[index if fixed_noise else self.get_next_bgidx()][0] 194 | # mix 195 | sig = log_mixup_exp(noise, sig, self.noise_ratio) if self.noise_ratio < 1.0 else noise 196 | # finish normalization. sig and noise were averaged to zero. the following will scale to 1.0 using ds1 std. 
197 | sig = sig / self.norm_std 198 | return sig, label 199 | 200 | 201 | def get_next_bgidx(self): 202 | if len(self.bg_index) == 0: 203 | self.bg_index = torch.randperm(len(self.ds2)).tolist() 204 | # print(f'Refreshed the bg index list with {len(self.bg_index)} items: {self.bg_index[:5]}...') 205 | return self.bg_index.pop(0) 206 | 207 | def __repr__(self): 208 | format_string = self.__class__.__name__ + f'(crop_frames={self.ds1.crop_frames}, ' 209 | format_string += f'folder_sp={self.ds1.df.file_name.values[0].split("/")[0]}, ' 210 | if self.noise_ratio > 0.: format_string += f'folder_bg={self.ds2.df.file_name.values[0].split("/")[0]}, ' 211 | return format_string 212 | 213 | 214 | def build_mixed_speech_dataset(cfg): 215 | ds = MixedSpeechDataset( 216 | base_folder=cfg.data_path, files_speech=get_files(cfg.csv_main), 217 | files_bg_noise=get_files(cfg.csv_bg_noise) if cfg.noise_ratio > 0. else [], 218 | crop_size=cfg.input_size, patch_len=cfg.patch_size[1], 219 | noise_ratio=cfg.noise_ratio, use_label=(cfg.model in ['m2d_s_vit_label_base', 'm2d_s_vit_label_bce_base', 220 | 'm2d_s_vit_label2_base', 'm2d_s_vit_label2_bce_base', 'm2d_s_vit_hubert_base'])) 221 | 222 | val_ds = SpectrogramDataset(folder=cfg.data_path, files=get_files(cfg.csv_val), crop_frames=cfg.input_size[1], random_crop=True) \ 223 | if cfg.csv_val else None 224 | 225 | return ds, val_ds 226 | 227 | 228 | def build_viz_dataset(cfg): 229 | files = [str(f).replace(str(cfg.data_path) + '/', '') for f in sorted(Path(cfg.data_path).glob('vis_speect_samples/*.npy'))] 230 | if len(files) == 0: 231 | return None, [] 232 | norm_stats = cfg.norm_stats if 'norm_stats' in cfg else None 233 | ds = SpectrogramDataset(folder=cfg.data_path, files=files, crop_size=cfg.input_size, norm_stats=norm_stats) 234 | return ds, files 235 | -------------------------------------------------------------------------------- /app/circor/circor_eval.py: -------------------------------------------------------------------------------- 1 | """Main program for the paper: Exploring Pre-trained General-purpose Audio Representations for Heart Murmur Detection 2 | """ 3 | 4 | import sys 5 | sys.path.append('../heart-murmur-detection') 6 | sys.path.append('../heart-murmur-detection/ModelEvaluation') 7 | 8 | from evar.common import (sys, np, pd, kwarg_cfg, Path, 9 | torch, logging, append_to_csv, RESULT_DIR) 10 | import torchaudio 11 | import fire 12 | 13 | from evar.data import create_dataloader 14 | import evar 15 | from lineareval import make_cfg 16 | from finetune import TaskNetwork, finetune_main 17 | 18 | from DataProcessing.find_and_load_patient_files import load_patient_data 19 | from DataProcessing.helper_code import load_recordings 20 | from ModelEvaluation.evaluate_model import evaluate_model 21 | from tqdm import tqdm 22 | 23 | 24 | def infer_and_eval(cfg, model, test_root, eval_mode='follow_prior_work'): 25 | model.eval() 26 | 27 | pids = sorted(list(set([f.stem.split('_')[0] for f in Path(test_root).glob('*.wav')]))) # evaluate_model.py::find_challenge_files -> sorted(os.listdir(label_folder)) 28 | txt_files = [test_root+pid+'.txt' for pid in pids] 29 | print('Test file folder:', test_root) 30 | print('Test files:', pids[:2], txt_files[:2]) 31 | softmax_fn = torch.nn.Softmax(dim=1) 32 | probabilities, wav_probabilities = [], [] 33 | 34 | for txt in tqdm(txt_files): 35 | # Load recordigns 36 | data = load_patient_data(txt) 37 | recordings, frequencies = load_recordings(test_root, data, get_frequencies=True) 38 | recordings = [torch.tensor(r / 
32768.).to(torch.float) for r in recordings] 39 | 40 | # Note: No normalization of raw audio wave. Already normalized in the pipeline. 41 | # recordings[0].max() -> tensor(1.0000) 42 | # recordings[0].min() -> tensor(-1.) 43 | # def normalize(wav): 44 | # return wav / (1.0e-10 + wav.abs().max()) 45 | # recordings = [normalize(r) for r in recordings] 46 | 47 | wavs = [torchaudio.transforms.Resample(f, cfg.sample_rate)(r) for r, f in zip(recordings, frequencies)] 48 | 49 | # Note: *No padding* because sample lengths are very different among recordings, for example: [164608, 150272, 105472, 460544] 50 | # print([len(w) for w in wavs]) 51 | # max_len = max([len(w) for w in wavs]) 52 | # wavs = [(np.pad(w, (0, max_len - len(w)) if len(w) < max_len else w) for w in wavs)] 53 | 54 | # Process per recording (with variable length) 55 | L = cfg.unit_samples # number of samples for 5 sec 56 | logits = [] 57 | for wav in wavs: 58 | if len(wav) < L: 59 | wav = torch.nn.functional.pad(wav, (0, L - len(wav))) 60 | # Split wav into 5-s segments and encode them. 61 | segment_logits = [] 62 | for widx, pos in enumerate(range(0, len(wav) - L + 1, L)): 63 | segment = wav[pos:pos+L] 64 | if len(segment) < L: 65 | continue 66 | with torch.no_grad(): 67 | x = segment.unsqueeze(0) 68 | logit = model(x) 69 | segment_logits.append(logit) # [1, 3] for one chunk 70 | # Logits for one recording wav. 71 | logits.append(torch.stack(segment_logits).mean(0)) 72 | 73 | # Reorder classes from ["Absent", "Present", "Unknown"] -> ["Present", "Unknown", "Absent"] 74 | logits = torch.vstack(logits) 75 | logits = logits[:, [1, 2, 0]] 76 | # Probabilities for each wav 77 | probs = logits.softmax(1).detach().to('cpu') 78 | wav_probabilities.append(probs) 79 | # Probability for the average logits 80 | probs = logits.mean(0, keepdims=True).softmax(1).detach().to('cpu')[0] 81 | probabilities.append(probs) 82 | 83 | probabilities = torch.stack(probabilities) 84 | 85 | def label_decision_rule(wav_probs): 86 | # Following Panah et al. “Exploring Wav2vec 2.0 Model for Heart Murmur Detection.” EUSIPCO, 2023, pp. 1010–14. 87 | cidxs = torch.argmax(wav_probs, dim=1) 88 | PRESENT, UNKNOWN, ABSENT = 0, 1, 2 89 | # - Assign present if at least one recording was classified as present. 90 | if PRESENT in cidxs: 91 | final_label = PRESENT 92 | # - Assign unknown if none of the recordings was classified as present, and at least one recording was classified as unknown. 93 | elif UNKNOWN in cidxs: 94 | final_label = UNKNOWN 95 | # - Assign absent if all recordings were classified as absent. 96 | else: 97 | final_label = ABSENT 98 | return final_label 99 | 100 | if eval_mode is None or eval_mode == 'follow_prior_work': 101 | print('Label decision follows: Panah et al. “Exploring Wav2vec 2.0 Model for Heart Murmur Detection.” EUSIPCO, 2023, pp. 
1010–14.') 102 | cidxs = torch.tensor([label_decision_rule(wav_probs) for wav_probs in wav_probabilities]) 103 | elif eval_mode == 'normal': 104 | print('Label decision is: torch.argmax(probabilities, dim=1)') 105 | cidxs = torch.argmax(probabilities, dim=1) 106 | else: 107 | assert False, f'Unknown eval_mode: {eval_mode}' 108 | labels = torch.nn.functional.one_hot(cidxs, num_classes=3) 109 | 110 | wav_probabilities = [p.numpy() for p in wav_probabilities] 111 | probabilities = probabilities.numpy() 112 | labels = labels.numpy() 113 | return evaluate_model(test_root, probabilities, labels), (wav_probabilities, probabilities) 114 | 115 | 116 | def eval_main(config_file, task, checkpoint, options='', seed=42, lr=None, hidden=(), epochs=None, early_stop_epochs=None, warmup_epochs=None, 117 | mixup=None, freq_mask=None, time_mask=None, rrc=None, training_mask=None, batch_size=None, 118 | optim='sgd', unit_sec=None, verbose=False, data_path='work', eval_mode=None, save_prob=None): 119 | 120 | cfg, n_folds, balanced = make_cfg(config_file, task, options, extras={}, abs_unit_sec=unit_sec) 121 | lr = lr or cfg.ft_lr 122 | cfg.mixup = mixup if mixup is not None else cfg.mixup 123 | cfg.ft_early_stop_epochs = early_stop_epochs if early_stop_epochs is not None else cfg.ft_early_stop_epochs 124 | cfg.warmup_epochs = warmup_epochs if warmup_epochs is not None else cfg.warmup_epochs 125 | cfg.ft_epochs = epochs or cfg.ft_epochs 126 | cfg.ft_freq_mask = freq_mask if freq_mask is not None else cfg.ft_freq_mask 127 | cfg.ft_time_mask = time_mask if time_mask is not None else cfg.ft_time_mask 128 | cfg.ft_rrc = rrc if rrc is not None else (cfg.ft_rrc if 'ft_rrc' in cfg else False) 129 | cfg.training_mask = training_mask if training_mask is not None else (cfg.training_mask if 'training_mask' in cfg else 0.0) 130 | cfg.ft_bs = batch_size or cfg.ft_bs 131 | cfg.optim = optim 132 | cfg.unit_sec = unit_sec 133 | cfg.data_path = data_path 134 | 135 | train_loader, valid_loader, test_loader, multi_label = create_dataloader(cfg, fold=n_folds-1, seed=seed, batch_size=cfg.ft_bs, 136 | always_one_hot=True, balanced_random=balanced) 137 | print('Classes:', train_loader.dataset.classes) 138 | cfg.eval_checkpoint = checkpoint 139 | 140 | cfg.runtime_cfg = kwarg_cfg(lr=lr, seed=seed, hidden=hidden, mixup=cfg.mixup, bs=cfg.ft_bs, 141 | freq_mask=cfg.ft_freq_mask, time_mask=cfg.ft_time_mask, rrc=cfg.ft_rrc, epochs=cfg.ft_epochs, 142 | early_stop_epochs=cfg.ft_early_stop_epochs, n_class=len(train_loader.dataset.classes)) 143 | 144 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 145 | 146 | # Make a fresh model 147 | ar = eval('evar.'+cfg.audio_repr)(cfg).to(device) 148 | if hasattr(train_loader, 'lms_mode') and train_loader.lms_mode: 149 | ar.precompute_lms(device, train_loader) 150 | else: 151 | ar.precompute(device, train_loader) 152 | task_model = TaskNetwork(cfg, ar).to(device) 153 | task_model_dp = torch.nn.DataParallel(task_model).to(device) 154 | # Load checkpoint 155 | print('Using checkpoint', checkpoint) 156 | print(task_model_dp.load_state_dict(torch.load(checkpoint, map_location=device))) 157 | task_model_dp.eval() 158 | 159 | circor_no = task[-1] # ex) '1' of 'circor1' 160 | stratified_data = f'../heart-murmur-detection/data/stratified_data{circor_no}/test_data/' 161 | results, probs = infer_and_eval(cfg, task_model_dp, stratified_data, eval_mode=eval_mode) 162 | ( classes, 163 | auroc, 164 | auprc, 165 | auroc_classes, 166 | auprc_classes, 167 | f_measure, 168 | f_measure_classes, 169 | 
accuracy, 170 | accuracy_classes, 171 | weighted_accuracy, 172 | uar, 173 | ) = results 174 | 175 | name = f'{cfg.id}{"" if cfg.weight_file != "" else "/rnd"}-' 176 | report = f'Finetuning {name} on {task} -> weighted_accuracy: {weighted_accuracy:.5f}, UAR: {uar:.5f}, recall per class: {accuracy_classes}' 177 | report += f', best weight: {checkpoint}, config: {cfg}' 178 | logging.info(report) 179 | 180 | result_df = pd.DataFrame({ 181 | 'representation': [cfg.id.split('_')[-2]], # AR name 182 | 'task': [task], 183 | 'wacc': [weighted_accuracy], 184 | 'uar': [uar], 185 | 'r_Present': [accuracy_classes[0]], 186 | 'r_Unknown': [accuracy_classes[1]], 187 | 'r_Absent': [accuracy_classes[2]], 188 | 'weight_file': [cfg.weight_file], 189 | 'run_id': [cfg.id], 190 | 'report': [report], 191 | }) 192 | csv_name = { 193 | None: 'circor-scores.csv', 194 | 'follow_prior_work': 'circor-scores.csv', 195 | 'normal': 'circor-scores-wo-rule.csv', 196 | }[eval_mode] 197 | append_to_csv(f'{RESULT_DIR}/{csv_name}', result_df) 198 | 199 | if save_prob is not None: 200 | for i, var in zip(['_1', '_2'], probs): 201 | prob_name = Path(save_prob)/str(checkpoint).replace('/', '-').replace('.pth', i + '.npy') 202 | #probs = [p.numpy() for p in probs] 203 | prob_name.parent.mkdir(parents=True, exist_ok=True) 204 | np.save(prob_name, np.array(var, dtype=object)) 205 | print('Probabilities saved as:', prob_name) 206 | 207 | 208 | def finetune_circor(config_file, task, options='', seed=42, lr=None, hidden=(), epochs=None, early_stop_epochs=None, warmup_epochs=None, 209 | mixup=None, freq_mask=None, time_mask=None, rrc=None, training_mask=None, batch_size=None, 210 | optim='sgd', unit_sec=None, verbose=False, data_path='work', eval_only=None, eval_mode=None, save_prob='probs'): 211 | 212 | assert task in [f'circor{n}' for n in range(1, 3+1)] 213 | 214 | # We train a model using the original fine-tuner from the EVAR (finetune_main), and the best_path holds the path of the best weight. 215 | # This part is the same training process as what we have been doing in BYOL-A and M2D. 216 | if eval_only is None: 217 | report, scores, best_path, name, cfg, logpath = finetune_main(config_file, task, options=options, seed=seed, lr=lr, hidden=hidden, epochs=epochs, 218 | early_stop_epochs=early_stop_epochs, warmup_epochs=warmup_epochs, 219 | mixup=mixup, freq_mask=freq_mask, time_mask=time_mask, rrc=rrc, training_mask=training_mask, batch_size=batch_size, 220 | optim=optim, unit_sec=unit_sec, verbose=verbose, data_path=data_path) 221 | del report, scores, name, cfg, logpath 222 | else: 223 | best_path = eval_only 224 | 225 | # Then, we evaluate the trained model specifically for the CirCor problem setting. 226 | return eval_main(config_file, task, best_path, options=options, seed=seed, lr=lr, hidden=hidden, epochs=epochs, 227 | early_stop_epochs=early_stop_epochs, warmup_epochs=warmup_epochs, 228 | mixup=mixup, freq_mask=freq_mask, time_mask=time_mask, rrc=rrc, training_mask=training_mask, batch_size=batch_size, 229 | optim=optim, unit_sec=unit_sec, verbose=verbose, data_path=data_path, eval_mode=eval_mode, save_prob=save_prob) 230 | 231 | 232 | if __name__ == '__main__': 233 | fire.Fire(finetune_circor) 234 | -------------------------------------------------------------------------------- /audio_dataset.py: -------------------------------------------------------------------------------- 1 | """Dataset for Spectrogram Audio. 2 | 3 | ## Data files 4 | All the data samples used here are expected to be `.npy` pre-converted spectrograms. 
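Use `wav_to_lms.py` in the repository root to convert raw audio files into these `.npy` log-mel spectrograms.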
5 | Please find instructions in `README.md`. 6 | 7 | ## Data folder structure 8 | We expect the following data folder structure. 9 | Note that our training pipeline uses samples from the folder `vis_samples` for visualization. 10 | Make a folder named `vis_samples` under the root folder of the dataset, and put some samples for visualization in the `vis_samples`. 11 | 12 | (data root)/(any sub-folder)/(data samples).npy 13 | : 14 | (data root)/vis_samples/(data samples for visualization).npy 15 | : 16 | """ 17 | 18 | import pandas as pd 19 | import numpy as np 20 | from pathlib import Path 21 | import torch 22 | import torch.nn.functional as F 23 | 24 | 25 | class SpectrogramDataset(torch.utils.data.Dataset): 26 | """Spectrogram audio dataset class. 27 | Args: 28 | folder: Root folder that stores audio samples. 29 | files: List of relative path names from the root folder for all samples. 30 | crop_frames: Number of time frames of a data which this class outputs. 31 | norm_stats: Normalization statistics comprising mean and standard deviation. 32 | If None, statistics are calculated at runtime. 33 | If a pathname, the precomputed statistics will be loaded. 34 | tfms: Transform functions for data augmentation. 35 | random_crop: Set True to randomly crop data of length crop_frames, 36 | or always crop from the beginning of a sample. 37 | n_norm_calc: Number of samples to calculate normalization statistics at runtime. 38 | """ 39 | 40 | def __init__(self, folder, files, crop_frames, norm_stats=None, 41 | tfms=None, random_crop=True, n_norm_calc=10000, repeat_short=False): 42 | super().__init__() 43 | self.folder = Path(folder) 44 | self.df = pd.DataFrame({'file_name': files}) 45 | self.crop_frames = crop_frames 46 | self.tfms = tfms 47 | self.random_crop = random_crop 48 | self.repeat_short = repeat_short 49 | 50 | # Norm stats 51 | if norm_stats is None: 52 | # Calculate norm stats runtime 53 | lms_vectors = [self[i][0] for i in np.random.randint(0, len(files), size=n_norm_calc)] 54 | lms_vectors = torch.stack(lms_vectors) 55 | norm_stats = lms_vectors.mean(), lms_vectors.std() + torch.finfo().eps 56 | elif isinstance(norm_stats, (str)): 57 | # Load from a file 58 | if Path(norm_stats).exists(): 59 | norm_stats = torch.FloatTensor(np.load(norm_stats)) 60 | else: 61 | # Create a norm stat file and save it. The created file will be loaded at the next runtime. 
62 | lms_vectors = [self[i][0] for i in np.random.randint(0, len(files), size=n_norm_calc)] 63 | lms_vectors = torch.vstack(lms_vectors) 64 | new_stats = lms_vectors.mean(axis=(0, 2), keepdims=True), lms_vectors.std(axis=(0, 2), keepdims=True) + torch.finfo().eps 65 | np.save(norm_stats, torch.stack(new_stats).numpy()) 66 | norm_stats = new_stats 67 | self.norm_stats = norm_stats 68 | 69 | print(f'Dataset contains {len(self.df)} files with a normalizing stats {self.norm_stats}.') 70 | 71 | def __len__(self): 72 | return len(self.df) 73 | 74 | def get_audio_file(self, filename): 75 | lms = torch.tensor(np.load(filename)) 76 | return lms 77 | 78 | def get_audio(self, index): 79 | filename = self.folder/self.df.file_name.values[index] 80 | return self.get_audio_file(filename) 81 | 82 | def complete_audio(self, lms, dont_tfms=False, org_index=None): 83 | # Repeat if short 84 | l = lms.shape[-1] 85 | if self.repeat_short and l < self.crop_frames: 86 | while l < self.crop_frames: 87 | lms = torch.cat([lms, lms], dim=-1) 88 | l = lms.shape[-1] 89 | # print(f'Repeated short sample (< {self.crop_frames}) at {org_index} as {lms.shape}') 90 | 91 | # Trim or pad 92 | start = 0 93 | if l > self.crop_frames: 94 | start = int(torch.randint(l - self.crop_frames, (1,))[0]) if self.random_crop else 0 95 | lms = lms[..., start:start + self.crop_frames] 96 | # if org_index is not None and org_index % 1000 == 0: 97 | # print(org_index, 'trimmed from', start) 98 | elif l < self.crop_frames: 99 | pad_param = [] 100 | for i in range(len(lms.shape)): 101 | pad_param += [0, self.crop_frames - l] if i == 0 else [0, 0] 102 | lms = F.pad(lms, pad_param, mode='constant', value=0) 103 | self.last_crop_start = start 104 | lms = lms.to(torch.float) 105 | 106 | # Normalize 107 | if hasattr(self, 'norm_stats'): 108 | lms = (lms - self.norm_stats[0]) / self.norm_stats[1] 109 | 110 | # Apply transforms 111 | if self.tfms is not None: 112 | if not dont_tfms: 113 | lms = self.tfms(lms) 114 | 115 | return lms 116 | 117 | def __getitem__(self, index): 118 | lms = self.get_audio(index) 119 | return self.complete_audio(lms, org_index=index) 120 | 121 | def __repr__(self): 122 | format_string = self.__class__.__name__ + f'(crop_frames={self.crop_frames}, random_crop={self.random_crop}, ' 123 | format_string += f'tfms={self.tfms}\n' 124 | return format_string 125 | 126 | 127 | def get_files(dataset_name): 128 | files = pd.read_csv(str(dataset_name)).file_name.values 129 | files = sorted(files) 130 | return files 131 | 132 | 133 | def get_files_no_sort(dataset_name): 134 | return pd.read_csv(str(dataset_name)).file_name.values 135 | 136 | 137 | def build_dataset(cfg): 138 | """The followings configure the training dataset details. 139 | - data_path: Root folder of the training dataset. 140 | - dataset: The _name_ of the training dataset, an stem name of a `.csv` training data list. 141 | - norm_stats: Normalization statistics, a list of [mean, std]. 142 | - input_size: Input size, a list of [# of freq. bins, # of time frames]. 
143 | """ 144 | 145 | transforms = None # Future options: torch.nn.Sequential(*transforms) if transforms else None 146 | norm_stats = cfg.norm_stats if 'norm_stats' in cfg else None 147 | ds = SpectrogramDataset(folder=cfg.data_path, files=get_files(cfg.dataset), crop_frames=cfg.input_size[1], 148 | tfms=transforms, norm_stats=norm_stats) 149 | return ds 150 | 151 | 152 | def build_viz_dataset(cfg): 153 | files = [str(f).replace(str(cfg.data_path) + '/', '') for f in sorted(Path(cfg.data_path).glob('vis_samples/*.npy'))] 154 | if len(files) == 0: 155 | return None, [] 156 | norm_stats = cfg.norm_stats if 'norm_stats' in cfg else None 157 | ds = SpectrogramDataset(folder=cfg.data_path, files=files, crop_frames=cfg.input_size[1], tfms=None, norm_stats=norm_stats) 158 | return ds, files 159 | 160 | 161 | # Mixed dataset 162 | 163 | def log_mixup_exp(xa, xb, alpha): 164 | xa = xa.exp() 165 | xb = xb.exp() 166 | x = alpha * xa + (1. - alpha) * xb 167 | return torch.log(torch.max(x, torch.finfo(x.dtype).eps*torch.ones_like(x))) 168 | 169 | 170 | class MixedSpecDataset(torch.utils.data.Dataset): 171 | def __init__(self, base_folder, files_main, files_bg_noise, crop_size, noise_ratio=0.0, 172 | random_crop=True, n_norm_calc=10000) -> None: 173 | super().__init__() 174 | 175 | self.ds1 = SpectrogramDataset(folder=base_folder, files=files_main, crop_frames=crop_size[1], 176 | random_crop=random_crop, norm_stats=None, 177 | n_norm_calc=n_norm_calc//2) 178 | self.norm_stats = self.ds1.norm_stats # for compatibility with SpectrogramDataset 179 | # disable normalizion scaling in the ds1 180 | self.norm_std = self.ds1.norm_stats[1] 181 | self.ds1.norm_stats = (self.ds1.norm_stats[0], 1.0) 182 | 183 | if noise_ratio > 0.0: 184 | self.ds2 = SpectrogramDataset(folder=base_folder, files=files_bg_noise, crop_frames=crop_size[1], 185 | random_crop=random_crop, norm_stats=None, n_norm_calc=n_norm_calc//2, repeat_short=True) 186 | self.ds2.norm_stats = (self.ds2.norm_stats[0], 1.0) # disable normalizion scaling in the ds2 187 | 188 | self.noise_ratio = noise_ratio 189 | self.bg_index = [] 190 | 191 | def __len__(self): 192 | return len(self.ds1) 193 | 194 | def __getitem__(self, index, fixed_noise=False): 195 | # load index sample 196 | clean = self.ds1[index] 197 | if self.noise_ratio > 0.0: 198 | # load random noise sample ### , while making noise floor zero 199 | noise = self.ds2[index if fixed_noise else self.get_next_bgidx()] 200 | # mix 201 | mixed = log_mixup_exp(noise, clean, self.noise_ratio) if self.noise_ratio < 1.0 else noise 202 | else: 203 | mixed = clean.clone() 204 | # finish normalization. clean and noise were averaged to zero. the following will scale to 1.0 using ds1 std. 
205 | clean = clean / self.norm_std 206 | mixed = mixed / self.norm_std 207 | return clean, mixed 208 | 209 | 210 | def get_next_bgidx(self): 211 | if len(self.bg_index) == 0: 212 | self.bg_index = torch.randperm(len(self.ds2)).tolist() 213 | # print(f'Refreshed the bg index list with {len(self.bg_index)} items: {self.bg_index[:5]}...') 214 | return self.bg_index.pop(0) 215 | 216 | def __repr__(self): 217 | format_string = self.__class__.__name__ + f'(crop_frames={self.ds1.crop_frames}, ' 218 | format_string += f'folder_sp={self.ds1.df.file_name.values[0].split("/")[0]}, ' 219 | if self.noise_ratio > 0.: format_string += f'folder_bg={self.ds2.df.file_name.values[0].split("/")[0]}, ' 220 | return format_string 221 | 222 | 223 | def inflate_files(files, desired_size): 224 | if len(files) == 0: 225 | return files 226 | files = list(files) # make sure `files`` is a list 227 | while len(files) < desired_size: 228 | files = (files + files)[:desired_size] 229 | return files 230 | 231 | 232 | def build_mixed_dataset(cfg): 233 | """The followings configure the training dataset details. 234 | - data_path: Root folder of the training dataset. 235 | - dataset: The _name_ of the training dataset, an stem name of a `.csv` training data list. 236 | - norm_stats: Normalization statistics, a list of [mean, std]. 237 | - input_size: Input size, a list of [# of freq. bins, # of time frames]. 238 | """ 239 | 240 | # get files and inflate the number of files (by repeating the list) if needed 241 | files_main = get_files(cfg.csv_main) 242 | files_bg = get_files(cfg.csv_bg_noise) if cfg.noise_ratio > 0. else [] 243 | desired_min_size = 0 244 | if 'min_ds_size' in cfg and cfg.min_ds_size > 0: 245 | desired_min_size = cfg.min_ds_size 246 | if desired_min_size > 0: 247 | old_sizes = len(files_main), len(files_bg) 248 | files_main, files_bg = inflate_files(files_main, desired_min_size), inflate_files(files_bg, desired_min_size) 249 | print('The numbers of data files are increased from', old_sizes, 'to', (len(files_main), len(files_bg))) 250 | 251 | ds = MixedSpecDataset( 252 | base_folder=cfg.data_path, files_main=files_main, 253 | files_bg_noise=files_bg, 254 | crop_size=cfg.input_size, 255 | noise_ratio=cfg.noise_ratio, 256 | random_crop=True) 257 | if 'weighted' in cfg and cfg.weighted: 258 | assert desired_min_size == 0 259 | ds.weight = pd.read_csv(cfg.csv_main).weight.values 260 | 261 | val_ds = SpectrogramDataset(folder=cfg.data_path, files=get_files(cfg.csv_val), crop_frames=cfg.input_size[1], random_crop=True) \ 262 | if cfg.csv_val else None 263 | 264 | return ds, val_ds 265 | 266 | 267 | def build_mixed_viz_dataset(cfg): 268 | files = [str(f).replace(str(cfg.data_path) + '/', '') for f in sorted(Path(cfg.data_path).glob('vis_samples/*.npy'))] 269 | if len(files) == 0: 270 | return None, [] 271 | norm_stats = cfg.norm_stats if 'norm_stats' in cfg else None 272 | ds = SpectrogramDataset(folder=cfg.data_path, files=files, crop_frames=cfg.input_size[1], tfms=None, norm_stats=norm_stats) 273 | return ds, files 274 | 275 | 276 | if __name__ == '__main__': 277 | # Test 278 | ds = MixedSpecDataset(base_folder='data', files_main=get_files('data/files_gtzan.csv'), 279 | files_bg_noise=get_files('data/files_audioset.csv'), 280 | crop_size=[80, 608], noise_ratio=0.2, random_crop=True, n_norm_calc=10) 281 | for i in range(0, 10): 282 | clean, mixed = ds[i] 283 | print(clean.shape, mixed.shape) 284 | -------------------------------------------------------------------------------- /examples/Example_old4_CLAP2024.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# M2D-CLAP example\n", 8 | "\n", 9 | "This is an example of CLAP part of our Interspeech 2024 paper.\n", 10 | "\n", 11 | "```bibtex\n", 12 | "@InProceedings{\t niizumi2024M2D-CLAP,\n", 13 | " title\t\t= {{M2D-CLAP: Masked Modeling Duo Meets CLAP for Learning General-purpose Audio-Language Representation}},\n", 14 | " author\t= {Daisuke Niizumi and Daiki Takeuchi and Yasunori Ohishi and Noboru Harada and Masahiro Yasuda and Shunsuke Tsubaki and Keisuke Imoto},\n", 15 | " year\t\t= {2024},\n", 16 | " booktitle\t= {Interspeech},\n", 17 | " pages\t\t= {57--61},\n", 18 | " doi\t\t= {10.21437/Interspeech.2024-29},\n", 19 | " issn\t\t= {2958-1796}}\n", 20 | "```" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 1, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "import warnings; warnings.simplefilter('ignore')\n", 30 | "import logging\n", 31 | "logging.basicConfig(level=logging.INFO)\n", 32 | "import sys\n", 33 | "sys.path.append('..')\n", 34 | "import torch\n", 35 | "from pathlib import Path\n", 36 | "import numpy as np" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 2, 42 | "metadata": {}, 43 | "outputs": [ 44 | { 45 | "name": "stdout", 46 | "output_type": "stream", 47 | "text": [ 48 | " using default norm_stats: tensor([-7.1000, 4.2000])\n" 49 | ] 50 | }, 51 | { 52 | "name": "stderr", 53 | "output_type": "stream", 54 | "text": [ 55 | "INFO:root:\n", 56 | "INFO:root:Model input size: [80, 608]\n", 57 | "INFO:root:Using weights: m2d_clap_vit_base-80x608p16x16-240128/checkpoint-300.pth\n", 58 | "INFO:root:Feature dimension: 768\n", 59 | "INFO:root:Norm stats: -7.099999904632568, 4.199999809265137\n", 60 | "INFO:root:Runtime MelSpectrogram(16000, 400, 400, 160, 80, 50, 8000):\n", 61 | "INFO:root:MelSpectrogram(\n", 62 | " Mel filter banks size = (80, 201), trainable_mel=False\n", 63 | " (stft): STFT(n_fft=400, Fourier Kernel size=(201, 1, 400), iSTFT=False, trainable=False)\n", 64 | ")\n" 65 | ] 66 | }, 67 | { 68 | "name": "stdout", 69 | "output_type": "stream", 70 | "text": [ 71 | " using 155 parameters, while dropped 251 out of 406 parameters from m2d_clap_vit_base-80x608p16x16-240128/checkpoint-300.pth\n", 72 | " (dropped: ['mask_token', 'decoder_pos_embed', 'logit_scale', 'decoder_embed.weight', 'decoder_embed.bias'] ...)\n", 73 | "\n" 74 | ] 75 | } 76 | ], 77 | "source": [ 78 | "from portable_m2d import PortableM2D\n", 79 | "weight = 'm2d_clap_vit_base-80x608p16x16-240128/checkpoint-300.pth'\n", 80 | "model = PortableM2D(weight_file=weight, flat_features=True)\n" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 3, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "files = ['AudioSetWav16k/eval_segments/-22tna7KHzI_28.000.wav',\n", 90 | " 'AudioSetWav16k/eval_segments/-0xzrMun0Rs_30.000.wav',\n", 91 | " 'AudioSetWav16k/eval_segments/3tUlhM80ObM_0.000.wav',\n", 92 | " 'AudioSetWav16k/eval_segments/-1nilez17Dg_30.000.wav',\n", 93 | " 'AudioSetWav16k/eval_segments/--U7joUcTCo_0.000.wav',\n", 94 | " 'AudioSetWav16k/eval_segments/5hlsVoxJPNI_30.000.wav',]\n", 95 | "captions = ['The sound of Explosion.',\n", 96 | " 'The sound of Stomach rumble, and Music.',\n", 97 | " 'The sound of Knock.',\n", 98 | " 'The sound of Heart murmur, and Speech.',\n", 99 | " \"A man's laughter abruptly interrupts as someone sneezes, suggesting a 
casual gathering or social event.\",\n", 100 | " \"The sound of Christmas music, Music, and Speech.\",]" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": 4, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "import librosa\n", 110 | "\n", 111 | "with torch.no_grad():\n", 112 | " audios = [librosa.load(f, sr=16000)[0] for f in files]\n", 113 | " audios = [np.pad(a, (0, 16000 * 10 - a.shape[-1])) for a in audios] # Make sure all files are 10-s.\n", 114 | " audios = torch.tensor(audios)\n", 115 | " audio_embs = model.encode_clap_audio(audios)\n", 116 | " text_embs = model.encode_clap_text(captions)" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 5, 122 | "metadata": {}, 123 | "outputs": [ 124 | { 125 | "data": { 126 | "text/plain": [ 127 | "(torch.Size([6, 768]), torch.Size([6, 768]))" 128 | ] 129 | }, 130 | "execution_count": 5, 131 | "metadata": {}, 132 | "output_type": "execute_result" 133 | } 134 | ], 135 | "source": [ 136 | "audio_embs.shape, text_embs.shape" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 6, 142 | "metadata": {}, 143 | "outputs": [ 144 | { 145 | "name": "stdout", 146 | "output_type": "stream", 147 | "text": [ 148 | "[[ 0.08543 0.08029 0.02879 0.01966 0.00866 -0.00212]\n", 149 | " [-0.0049 0.06878 0.02576 -0.00711 -0.02833 0.01233]\n", 150 | " [ 0.04179 0.01696 0.13246 -0.00467 0.01645 -0.00098]\n", 151 | " [-0.00217 0.0425 -0.00594 0.10569 -0.00474 0.00028]\n", 152 | " [ 0.05769 0.02339 0.04664 0.01432 0.08724 0.02567]\n", 153 | " [-0.04205 -0.00013 -0.04844 0.00155 -0.02319 0.04316]]\n" 154 | ] 155 | }, 156 | { 157 | "data": { 158 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZgAAAGdCAYAAAAv9mXmAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8g+/7EAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAUWklEQVR4nO3df4yUhb3v8e+y6w7+WFZR+bFlQT1WDHqWRhQusVYrVM/GEG1OGkJISmhPe9osjYR402xuUvSPZsnNTaO3cinpL/4pQdsETUyFUlogTaXCEnLBpka8Nq7hV+1Nd5dNHHF37h833XM4iocBvvM4O69X8iTdyTM8nye0vDsz7NJUqVQqAQCX2aSiBwAwMQkMACkEBoAUAgNACoEBIIXAAJBCYABIITAApGip9QXHxsbi+PHj0dbWFk1NTbW+PACXoFKpxPDwcHR0dMSkSR//GqXmgTl+/Hh0dnbW+rIAXEYDAwMxa9asjz2n5oFpa2uLiIhlL6yIK65urfXlCzV0dnLRE2ruz3vnFD2hEFecKXpBMVoHG+8nT5WGxoqeUFOjZ9+L/pe/O/5n+cepeWD+/rbYFVe3NlxgrjjbWPcbEdFcaryoRkQ0ny16QTGaWxsvMC1XNFZg/u5CPuLwIT8AKQQGgBQCA0AKgQEghcAAkEJgAEghMACkEBgAUggMACkEBoAUAgNACoEBIIXAAJBCYABIITAApBAYAFIIDAApBAaAFAIDQAqBASCFwACQQmAASCEwAKS4qMBs3Lgxbrrpppg8eXIsWrQoXn311cu9C4A6V3VgnnvuuVi3bl2sX78+Dh06FPPnz4+HH344Tp8+nbEPgDpVdWC+973vxde+9rVYvXp1zJs3L37wgx/EVVddFT/5yU8y9gFQp6oKzPvvvx/9/f2xdOnSf/sFJk2KpUuXxiuvvPKRzymXyzE0NHTOAcDEV1Vg3n333RgdHY3p06ef8/j06dPj5MmTH/mcvr6+aG9vHz86Ozsvfi0AdSP9b5H19vbG4ODg+DEwMJB9SQA+AVqqOfmGG26I5ubmOHXq1DmPnzp1KmbMmPGRzymVSlEqlS5+IQB1qapXMK2trbFgwYLYvXv3+GNjY2Oxe/fuWLx48WUfB0D9quoVTETEunXrYtWqVXH33XfHwoUL4+mnn46RkZFYvXp1xj4A6lTVgVm+fHn85S9/ie985ztx8uTJ+MxnPhM7duz40Af/ADS2qgMTEbFmzZpYs2bN5d4CwATiZ5EBkEJgAEghMACkEBgAUggMACkEBoAUAgNACoEBIIXAAJBCYABIITAApBAYAFIIDAApBAaAFAIDQAqBASCFwACQQmAASCEwAKQQGABSCAwAKQQGgBQCA0AKgQEgRUtRFz7+vX+IlismF3X5Qgz9y1DRE2qu6YOiFxTjutfPFj2hEFcd+79FT6i5Uw9MK3pCTY2+f+GvS7yCASCFwACQQmAASCEwAKQQGABSCAwAKQQGgBQCA0AKgQEghcAAkEJgAEghMACkEBgAUggMACkEBoAUAgNACoEBIIXAAJBCYABIITAApBAYAFIIDAApBAaAFAIDQAqBASCFwACQourA7Nu3L5YtWxYdHR3R1NQUL7zwQsIsAOpd1YEZGRmJ+fPnx8aNGzP2ADBBtFT7hO7u7uju7s7YAsAEUnVgqlUul6NcLo9/PTQ0lH1JAD4B0j/k7+vri/b29vGjs7Mz+5IAfAKkB6a3tzcGBwfHj4GBgexLAvAJkP4WWalUilKplH0ZAD5hfB8MACmqfgVz5syZOHbs2PjXb731Vhw+fDimTp0as2fPvqzjAKhfVQfm4MGD8fnPf37863Xr1kVExKpVq2LLli2XbRgA9
a3qwDzwwANRqVQytgAwgfgMBoAUAgNACoEBIIXAAJBCYABIITAApBAYAFIIDAApBAaAFAIDQAqBASCFwACQQmAASCEwAKQQGABSCAwAKQQGgBQCA0AKgQEghcAAkEJgAEghMACkEBgAUrQUdeHjn5sUkyY3Vt/afj216Ak191rv/yp6QiHu/9evFz2hECceml70hJq7+uRo0RNq6oOzYxd8bmP9CQ9AzQgMACkEBoAUAgNACoEBIIXAAJBCYABIITAApBAYAFIIDAApBAaAFAIDQAqBASCFwACQQmAASCEwAKQQGABSCAwAKQQGgBQCA0AKgQEghcAAkEJgAEghMACkEBgAUggMACkEBoAUVQWmr68v7rnnnmhra4tp06bFY489Fq+//nrWNgDqWFWB2bt3b/T09MT+/ftj165dcfbs2XjooYdiZGQkax8AdaqlmpN37NhxztdbtmyJadOmRX9/f3zuc5+7rMMAqG9VBeY/GhwcjIiIqVOnnveccrkc5XJ5/OuhoaFLuSQAdeKiP+QfGxuLtWvXxr333ht33nnnec/r6+uL9vb28aOzs/NiLwlAHbnowPT09MTRo0dj27ZtH3teb29vDA4Ojh8DAwMXe0kA6shFvUW2Zs2aeOmll2Lfvn0xa9asjz23VCpFqVS6qHEA1K+qAlOpVOJb3/pWbN++Pfbs2RM333xz1i4A6lxVgenp6YmtW7fGiy++GG1tbXHy5MmIiGhvb48rr7wyZSAA9amqz2A2bdoUg4OD8cADD8TMmTPHj+eeey5rHwB1quq3yADgQvhZZACkEBgAUggMACkEBoAUAgNACoEBIIXAAJBCYABIITAApBAYAFIIDAApBAaAFAIDQAqBASCFwACQQmAASCEwAKQQGABSCAwAKQQGgBQCA0AKgQEghcAAkKKlqAvPeKUSLVdUirp8IY7fN1b0hJq7/+tfL3pCIVb/jxeKnlCIn/7Xx4qeUHMfTG4qekJNVaq4Xa9gAEghMACkEBgAUggMACkEBoAUAgNACoEBIIXAAJBCYABIITAApBAYAFIIDAApBAaAFAIDQAqBASCFwACQQmAASCEwAKQQGABSCAwAKQQGgBQCA0AKgQEghcAAkEJgAEghMACkqCowmzZtiq6urpgyZUpMmTIlFi9eHC+//HLWNgDqWFWBmTVrVmzYsCH6+/vj4MGD8eCDD8ajjz4ar732WtY+AOpUSzUnL1u27Jyvv/vd78amTZti//79cccdd1zWYQDUt6oC8++Njo7Gz3/+8xgZGYnFixef97xyuRzlcnn866GhoYu9JAB1pOoP+Y8cORLXXHNNlEql+MY3vhHbt2+PefPmnff8vr6+aG9vHz86OzsvaTAA9aHqwMydOzcOHz4cf/jDH+Kb3/xmrFq1Kv74xz+e9/ze3t4YHBwcPwYGBi5pMAD1oeq3yFpbW+PWW2+NiIgFCxbEgQMH4plnnonNmzd/5PmlUilKpdKlrQSg7lzy98GMjY2d8xkLAERU+Qqmt7c3uru7Y/bs2TE8PBxbt26NPXv2xM6dO7P2AVCnqgrM6dOn48tf/nKcOHEi2tvbo6urK3bu3Blf+MIXsvYBUKeqCsyPf/zjrB0ATDB+FhkAKQQGgBQCA0AKgQEghcAAkEJgAEghMACkEBgAUggMACkEBoAUAgNACoEBIIXAAJBCYABIITAApBAYAFIIDAApBAaAFAIDQAqBASCFwACQQmAASCEwAKQQGABStBR14ZPL3o9JVzVW367631cWPaHmBrorRU8oxP985p+LnlCI+f/taNETau7VF/+x6Ak1NVpuvuBzG+tPeABqRmAASCEwAKQQGABSCAwAKQQGgBQCA0AKgQEghcAAkEJgAEghMACkEBgAUggMACkEBoAUAgNACoEBIIXAAJBCYABIITAApBAYAFIIDAApBAaAFAIDQAqBASCFwACQQmAASHFJgdmwYUM0NTXF2rVrL9McACaKiw7MgQMHYvPmzdHV1XU59wAwQVxUYM6cORMrV66MH/7wh3Hddddd7k0ATAAXFZienp545JFHYunSpf/pueVyOYaGhs45AJj4Wqp9wrZt2+LQoUNx4MCBCzq/r68vnnrqqaqHAVDfqnoFMzAwEI8//nj87Gc/i8mTJ1/Qc3p7e2NwcHD8GBgYuKihANSXql7B9Pf3x+nTp+Ouu+4af2x0dDT27dsXzz77bJTL5Whubj7nOaVSKUql0uVZC0DdqCowS5YsiSNHjpzz2OrVq+P222+Pb3/72x+KCwCNq6rAtLW1xZ133nnOY1dffXVcf/31H3ocgMbmO/kBSFH13yL7j/bs2XMZZgAw0XgFA0AKgQEghcAAkEJgAEghMACkEBgAUggMACkEBoAUAgNACoEBIIXAAJBCYABIITAApBAYAFIIDAApBAaAFAIDQAqBASCFwACQQmAASCEwAKQQGABSCAwAKVqKuvCt6/5PtDS1FnX5Qgz907yiJ9Rcx3//Q9ETCjG8/L8UPaEQJ/91VtETau691WNFT6ipsfcu/H69ggEghcAAkEJgAEghMACkEBgAUggMACkEBoAUAgNACoEBIIXAAJBCYABIITAApBAYAFIIDAApBAaAFAIDQAqBASCFwACQQmAASCEwAKQQGABSCAwAKQQGgBQCA0AKgQEghcAAkKKqwDz55JPR1NR0znH77bdnbQOgjrVU+4Q77rgjfv3rX//bL9BS9S8BQAOoug4tLS0xY8aMjC0ATCBVfwbzxhtvREdHR9xyyy2xcuXKePvttz/2/HK5HENDQ+ccAEx8VQVm0aJFsWXLltixY0ds2rQp3nrrrbjvvvtieHj4vM/p6+uL9vb28aOzs/OSRwPwyVdVYLq7u+NLX/pSdHV1xcMPPxy//OUv429/+1s8//zz531Ob29vDA4Ojh8DAwOXPBqAT75L+oT+2muvjdtuuy2OHTt23nNKpVKUSqVLuQwAdeiSvg/mzJkz8eabb8bMmTMv1x4AJoiqAvPEE0/E3r17489//nP8/ve/jy9+8YvR3NwcK1asyNoHQJ2q6i2yd955J1asWBF//etf48Ybb4zPfvazsX///rjxxhuz9gFQp6oKzLZt27J2ADDB+FlkAKQQGABSCAwAKQQGgBQCA0AKgQEghcAAkEJgAEghMACkEBgAUggMACkEBoAUAgNACoEBIIXAAJBCYABIITAApBAYAFIIDAApBAaAFAIDQAqBASCFwACQoqXWF6xUKhER8UHlbK0vXbgPzr5X9ISaa8Tf54jG/L2OiPhgtFz0hJobe6+xfq//fr9//7P84zRVLuSsy+idd96Jzs7OWl4SgMtsYGAgZs2a9bHn1DwwY2Njcfz48Whra4umpqaaXXdoaCg6OztjYGAgpkyZUrPrFs19N859N+I9RzTmfRd5z5VKJYaHh6OjoyMmTfr4T1lq/hbZpEmT/tPqZZoyZUrD/Jfw33PfjaMR7zmiMe+7qHtub2+/oPN8yA9ACoEBIEXDBKZUKsX69eujVCoVPaWm3Hfj3Hcj3nNEY953vdxzzT/kB6AxNMwrGABq
S2AASCEwAKQQGABSNExgNm7cGDfddFNMnjw5Fi1aFK+++mrRk1Lt27cvli1bFh0dHdHU1BQvvPBC0ZPS9fX1xT333BNtbW0xbdq0eOyxx+L1118vela6TZs2RVdX1/g33S1evDhefvnlomfV1IYNG6KpqSnWrl1b9JRUTz75ZDQ1NZ1z3H777UXPOq+GCMxzzz0X69ati/Xr18ehQ4di/vz58fDDD8fp06eLnpZmZGQk5s+fHxs3bix6Ss3s3bs3enp6Yv/+/bFr1644e/ZsPPTQQzEyMlL0tFSzZs2KDRs2RH9/fxw8eDAefPDBePTRR+O1114relpNHDhwIDZv3hxdXV1FT6mJO+64I06cODF+/O53vyt60vlVGsDChQsrPT0941+Pjo5WOjo6Kn19fQWuqp2IqGzfvr3oGTV3+vTpSkRU9u7dW/SUmrvuuusqP/rRj4qekW54eLjy6U9/urJr167K/fffX3n88ceLnpRq/fr1lfnz5xc944JN+Fcw77//fvT398fSpUvHH5s0aVIsXbo0XnnllQKXkW1wcDAiIqZOnVrwktoZHR2Nbdu2xcjISCxevLjoOel6enrikUceOed/3xPdG2+8ER0dHXHLLbfEypUr4+233y560nnV/Idd1tq7774bo6OjMX369HMenz59evzpT38qaBXZxsbGYu3atXHvvffGnXfeWfScdEeOHInFixfHe++9F9dcc01s37495s2bV/SsVNu2bYtDhw7FgQMHip5SM4sWLYotW7bE3Llz48SJE/HUU0/FfffdF0ePHo22trai533IhA8MjamnpyeOHj36yX5/+jKaO3duHD58OAYHB+MXv/hFrFq1Kvbu3TthIzMwMBCPP/547Nq1KyZPnlz0nJrp7u4e/89dXV2xaNGimDNnTjz//PPx1a9+tcBlH23CB+aGG26I5ubmOHXq1DmPnzp1KmbMmFHQKjKtWbMmXnrppdi3b1+h/zRELbW2tsatt94aERELFiyIAwcOxDPPPBObN28ueFmO/v7+OH36dNx1113jj42Ojsa+ffvi2WefjXK5HM3NzQUurI1rr702brvttjh27FjRUz7ShP8MprW1NRYsWBC7d+8ef2xsbCx2797dEO9RN5JKpRJr1qyJ7du3x29+85u4+eabi55UmLGxsSiXJ+4/X7xkyZI4cuRIHD58ePy4++67Y+XKlXH48OGGiEtExJkzZ+LNN9+MmTNnFj3lI034VzAREevWrYtVq1bF3XffHQsXLoynn346RkZGYvXq1UVPS3PmzJlz/l/NW2+9FYcPH46pU6fG7NmzC1yWp6enJ7Zu3RovvvhitLW1xcmTJyPi///jSFdeeWXB6/L09vZGd3d3zJ49O4aHh2Pr1q2xZ8+e2LlzZ9HT0rS1tX3os7Wrr746rr/++gn9mdsTTzwRy5Ytizlz5sTx48dj/fr10dzcHCtWrCh62kcr+q+x1cr3v//9yuzZsyutra2VhQsXVvbv31/0pFS//e1vKxHxoWPVqlVFT0vzUfcbEZWf/vSnRU9L9ZWvfKUyZ86cSmtra+XGG2+sLFmypPKrX/2q6Fk11wh/TXn58uWVmTNnVlpbWyuf+tSnKsuXL68cO3as6Fnn5cf1A5Biwn8GA0AxBAaAFAIDQAqBASCFwACQQmAASCEwAKQQGABSCAwAKQQGgBQCA0AKgQEgxf8DSMGKMe/DP+0AAAAASUVORK5CYII=", 159 | "text/plain": [ 160 | "
" 161 | ] 162 | }, 163 | "metadata": {}, 164 | "output_type": "display_data" 165 | } 166 | ], 167 | "source": [ 168 | "from sklearn.metrics.pairwise import cosine_similarity\n", 169 | "import matplotlib.pyplot as plt\n", 170 | "\n", 171 | "H = cosine_similarity(audio_embs, text_embs)\n", 172 | "plt.imshow(H, interpolation='none')\n", 173 | "np.set_printoptions(precision=5, suppress=True)\n", 174 | "print(H)" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "metadata": {}, 181 | "outputs": [], 182 | "source": [] 183 | } 184 | ], 185 | "metadata": { 186 | "kernelspec": { 187 | "display_name": "ar", 188 | "language": "python", 189 | "name": "python3" 190 | }, 191 | "language_info": { 192 | "codemirror_mode": { 193 | "name": "ipython", 194 | "version": 3 195 | }, 196 | "file_extension": ".py", 197 | "mimetype": "text/x-python", 198 | "name": "python", 199 | "nbconvert_exporter": "python", 200 | "pygments_lexer": "ipython3", 201 | "version": "3.9.18" 202 | } 203 | }, 204 | "nbformat": 4, 205 | "nbformat_minor": 2 206 | } 207 | -------------------------------------------------------------------------------- /examples/Example_4_CLAP2025.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# M2D-CLAP example\n", 8 | "\n", 9 | "This is an example of the CLAP features from M2D-CLAP $_{2025}$, the journal paper version.\n", 10 | "\n", 11 | "Download and prepare the wweight [`m2d_clap_vit_base-80x1001p16x16p16kpBpTI-2025`](https://github.com/nttcslab/m2d/releases/download/v0.5.0/m2d_clap_vit_base-80x1001p16x16p16kpBpTI-2025.zip) before you run.\n", 12 | "\n", 13 | "```bibtex\n", 14 | "@article{niizumi2025m2d-clap,\n", 15 | " title = {{M2D-CLAP: Exploring General-purpose Audio-Language Representations Beyond CLAP}},\n", 16 | " author = {Daisuke Niizumi and Daiki Takeuchi and Masahiro Yasuda and Binh Thien Nguyen and Yasunori Ohishi and Noboru Harada},\n", 17 | " journal = {IEEE Access},\n", 18 | " year = {2025},\n", 19 | " url = {https://ieeexplore.ieee.org/document/11168481}}\n", 20 | "```" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 1, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "import warnings; warnings.simplefilter('ignore')\n", 30 | "import logging\n", 31 | "logging.basicConfig(level=logging.INFO)\n", 32 | "import torch\n", 33 | "from pathlib import Path\n", 34 | "import numpy as np" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [ 42 | { 43 | "name": "stderr", 44 | "output_type": "stream", 45 | "text": [ 46 | "INFO:root:\n", 47 | "INFO:root:Model input size: [80, 1001]\n", 48 | "INFO:root:Using weights: m2d_clap_vit_base-80x1001p16x16p16kpBpTI-2025/checkpoint-30.pth\n", 49 | "INFO:root:Feature dimension: 768\n", 50 | "INFO:root:Norm stats: -7.261779308319092, 4.3511505126953125\n", 51 | "INFO:root:Runtime MelSpectrogram(16000, 400, 400, 160, 80, 50, 8000):\n", 52 | "INFO:root:MelSpectrogram(\n", 53 | " Mel filter banks size = (80, 201), trainable_mel=False\n", 54 | " (stft): STFT(n_fft=400, Fourier Kernel size=(201, 1, 400), iSTFT=False, trainable=False)\n", 55 | ")\n" 56 | ] 57 | }, 58 | { 59 | "name": "stdout", 60 | "output_type": "stream", 61 | "text": [ 62 | " using 166 parameters from m2d_clap_vit_base-80x1001p16x16p16kpBpTI-2025/checkpoint-30.pth\n", 63 | " (included audio_proj params: 
['audio_proj.sem_token', 'audio_proj.sem_blocks.0.norm1.weight', 'audio_proj.sem_blocks.0.norm1.bias', 'audio_proj.sem_blocks.0.attn.qkv.weight', 'audio_proj.sem_blocks.0.attn.qkv.bias']\n", 64 | " (included text_proj params: []\n", 65 | " (dropped: [] )\n", 66 | "\n", 67 | " using norm_stats: -7.261779308319092, 4.3511505126953125\n" 68 | ] 69 | } 70 | ], 71 | "source": [ 72 | "from portable_m2d import PortableM2D\n", 73 | "weight = 'm2d_clap_vit_base-80x1001p16x16p16kpBpTI-2025/checkpoint-30.pth'\n", 74 | "# Use flat_features=True for CLAP features only. For conventional audio features, flat_features should be False.\n", 75 | "model = PortableM2D(weight_file=weight, flat_features=True)" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 3, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "files = ['AudioSetWav16k/eval_segments/-22tna7KHzI_28.000.wav',\n", 85 | " 'AudioSetWav16k/eval_segments/-0xzrMun0Rs_30.000.wav',\n", 86 | " 'AudioSetWav16k/eval_segments/3tUlhM80ObM_0.000.wav',\n", 87 | " 'AudioSetWav16k/eval_segments/-1nilez17Dg_30.000.wav',\n", 88 | " 'AudioSetWav16k/eval_segments/--U7joUcTCo_0.000.wav',\n", 89 | " 'AudioSetWav16k/eval_segments/5hlsVoxJPNI_30.000.wav',]\n", 90 | "captions = ['The sound of Explosion.',\n", 91 | " 'The sound of Stomach rumble, and Music.',\n", 92 | " 'The sound of Knock.',\n", 93 | " 'The sound of Heart murmur, and Speech.',\n", 94 | " \"A man's laughter abruptly interrupts as someone sneezes, suggesting a casual gathering or social event.\",\n", 95 | " \"The sound of Christmas music, Music, and Speech.\",]" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 4, 101 | "metadata": {}, 102 | "outputs": [ 103 | { 104 | "name": "stderr", 105 | "output_type": "stream", 106 | "text": [ 107 | "INFO:root: using text encoder: BERT base\n" 108 | ] 109 | }, 110 | { 111 | "name": "stdout", 112 | "output_type": "stream", 113 | "text": [ 114 | " using model.text_encoder from m2d_clap_vit_base-80x1001p16x16p16kpBpTI-2025/checkpoint-30.pth\n" 115 | ] 116 | } 117 | ], 118 | "source": [ 119 | "import librosa\n", 120 | "\n", 121 | "with torch.no_grad():\n", 122 | " audios = [librosa.load(f, sr=16000)[0] for f in files]\n", 123 | " audios = [np.pad(a, (0, 16000 * 10 - a.shape[-1])) for a in audios] # Make sure all files are 10-s.\n", 124 | " audios = torch.tensor(audios)\n", 125 | " audio_embs = model.encode_clap_audio(audios)\n", 126 | " text_embs = model.encode_clap_text(captions)" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 5, 132 | "metadata": {}, 133 | "outputs": [ 134 | { 135 | "data": { 136 | "text/plain": [ 137 | "(torch.Size([6, 768]), torch.Size([6, 768]))" 138 | ] 139 | }, 140 | "execution_count": 5, 141 | "metadata": {}, 142 | "output_type": "execute_result" 143 | } 144 | ], 145 | "source": [ 146 | "audio_embs.shape, text_embs.shape" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": 6, 152 | "metadata": {}, 153 | "outputs": [ 154 | { 155 | "name": "stdout", 156 | "output_type": "stream", 157 | "text": [ 158 | "[[0.28606 0.2381 0.23852 0.21239 0.11076 0.13236]\n", 159 | " [0.22148 0.37747 0.2139 0.18893 0.12501 0.21492]\n", 160 | " [0.26712 0.24247 0.37288 0.21389 0.10618 0.15825]\n", 161 | " [0.18678 0.22834 0.20472 0.39384 0.10315 0.22582]\n", 162 | " [0.25185 0.13545 0.23883 0.18921 0.32387 0.1312 ]\n", 163 | " [0.19546 0.24592 0.1791 0.23728 0.0799 0.31999]]\n" 164 | ] 165 | }, 166 | { 167 | "data": { 168 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAZgAAAGdCAYAAAAv9mXmAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8g+/7EAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAUaklEQVR4nO3df4yVhZ3v8e8ws3PGH8MoCsiUwR+xaoAMvYIQYm2tUL3EEO0fhmtISmjTpN2hgbBmm/mnaHKbIbnZRlNZSmpb9o8StE3Qu6ZqKS2QplLHIdwFuzXi0jiGX9W9nRkm6wFmzv6x2dllFZcDfM/jmXm9kifpnDyH53Oi+O45DzM0VCqVSgDAZTap6AEAjE8CA0AKgQEghcAAkEJgAEghMACkEBgAUggMACmaan3B0dHROHr0aLS2tkZDQ0OtLw/AJahUKjE0NBTt7e0xadLHv0epeWCOHj0aHR0dtb4sAJdRf39/zJw582PPqXlgWltbIyJi7t+ticYrS7W+fKH+/z9dW/QEaqT5/Yn56fNNPz1W9ISaO3t9a9ETaursSDl+s/9vxv5b/nFqHph//1is8crShAvMpJaWoidQI42liRmYpkkT6/d0REQ0Tczf1xdyi2Ni/i4AIJ3AAJBCYABIITAApBAYAFIIDAApBAaAFAIDQAqBASCFwACQQmAASCEwAKQQGABSCAwAKQQGgBQCA0AKgQEghcAAkEJgAEghMACkEBgAUggMACkEBoAUFxWYTZs2xU033RQtLS2xaNGieO211y73LgDqXNWBefbZZ2P9+vWxYcOG2L9/f8ybNy8eeOCBOHnyZMY+AOpU1YH57ne/G1/72tdi9erVMXv27Pj+978fV155ZfzoRz/K2AdAnaoqMKdPn46+vr5YunTpf/wCkybF0qVL49VXX/3I55TL5RgcHDznAGD8qyow7733XoyMjMT06dPPeXz69Olx/Pjxj3xOT09PtLW1jR0dHR0XvxaAupH+p8i6u7tjYGBg7Ojv78++JACfAE3VnHz99ddHY2NjnDhx4pzHT5w4ETfccMNHPqdUKkWpVLr4hQDUparewTQ3N8f8+fNj165dY4+Njo7Grl27YvHixZd9HAD1q6p3MBER69evj1WrVsWCBQti4cKF8eSTT8bw8HCsXr06Yx8AdarqwKxYsSL+9Kc/xbe//e04fvx4fOYzn4mXX375Qzf+AZjYqg5MRMSaNWtizZo1l3sLAOOIn0UGQAqBASCFwACQQmAASCEwAKQQGABSCAwAKQQGgBQCA0AKgQEghcAAkEJgAEghMACkEBgAUggMACkEBoAUAgNACoEBIIXAAJBCYABIITAApBAYAFIIDAApBAaAFE1FXXjo/02JSS0tRV2+EL/88v8pekLNPfjMXxc9oRAt71eKnlCI0clXFj2h5t65/+qiJ9TUSLkpovfCzvUOBoAUAgNACoEBIIXAAJBCYABIITAApBAYAFIIDAApBAaAFAIDQAqBASCFwACQQmAASCEwAKQQGABSCAwAKQQGgBQCA0AKgQEghcAAkEJgAEghMACkEBgAUggMACkEBoAUAgNAiqoDs3fv3li+fHm0t7dHQ0NDPP/88wmzAKh3VQdmeHg45s2bF5s2bcrYA8A40VTtE5YtWxbLli3L2ALAOFJ1YKpVLpejXC6PfT04OJh9SQA+AdJv8vf09ERbW9vY0dHRkX1JAD4B0gPT3d0dAwMDY0d/f3/2JQH4BEj/iKxUKkWpVMq+DACfML4PBoAUVb+DOXXqVBw+fHjs6yNHjsSBAwdiypQpMWvWrMs6DoD6VXVgXn/99fjCF74w9vX69esjImLVqlWxdevWyzYMgPpWdWDuvffeqFQqGVsAGEfcgwEghcAAkEJgAEghMACkEBgAUggMACkEBoAUAgNACoEBIIXAAJBCYABIITAApBAYAFIIDAApBAaAFAIDQAqBASCFwACQQmAASCEwAKQQGABSCAwAKQQGgBRNRV145OZ/icqVlaIuX4gl//evip5Qc3//1b8pekIhHvnhxPtnHREx6f3BoifU3NR/uLroCTV19szZePsCz/UOBoAUAgNACoEBIIXAAJBCYABIITAApBAYAFIIDAApBAaAFAIDQAqBASCFwACQQmAASCEwAKQQGABSCAwAKQQGgBQCA0AKgQEghcAAkEJgAEghMACkEBgAUggMACkEBoAUAgNACoEBIEVVgenp6Ym77rorWltbY9q0afHwww/Hm2++mbUNgDpWVWD27NkTXV1dsW/fvti5c2ecOXMm7r///hgeHs7aB0Cdaqrm5Jdffvmcr7du3RrTpk2Lvr6++NznPndZhwFQ36oKzH81MDAQERFTpkw57znlcjnK5fLY14ODg5dySQDqxEXf5B8dHY1169bF3XffHXPnzj3veT09PdHW1jZ2dHR0XOwlAagjFx2Yrq6uOHToUGzfvv1jz+vu7o6BgYGxo7+//2IvCUAduaiPyNasWRMvvvhi7N27N2bOnPmx55ZKpSiVShc1DoD6VVVgKpVKfPOb34wdO3bE7t274+abb87aBUCdqyowXV1dsW3btnjhhReitbU1jh8/HhERbW1tccUVV6QMBKA+VXUPZvPmzTEwMBD33ntvzJgxY+x49tlns/YBUKeq/ogMAC6En0UGQAqBASCFwACQQmAASCEwAKQQGABSCAwAKQQGgBQCA0AKgQEghcAAkEJgAEghMACkEBgAUggMACkEBoAUAgNACoEBIIXAAJBCYABIITAApBAYAFIIDAApmoq6cOtvr4jG5paiLl+IoZuKXlB7/+tv/6roCYX4/bq/LXpCIR585n8WPaHmji9sLHpCTY1+0Bjx4oWd6x0MACkEBoAUAgNACoEBIIXAAJBCYABIITAApBAYAFIIDAApBAaAFAIDQAqBASCFwACQQmAASCEwAKQQGABSCAwAKQQGgBQCA0AKgQEghcAAkEJgAEghMACkEBgAUggMACkEBoAUVQVm8+bN0dnZGZMnT47JkyfH4sWL46WXXsraBkAdqyowM2fOjI0bN0ZfX1+8/vrrcd9998VDDz0Ub7zxRtY+AOpUUzUnL1++/Jyvv/Od78TmzZtj3759MWfOnMs6DID6VlVg/rORkZH46U9/GsPDw7F48eLznlcul6NcLo99PTg4eLGXBKCOVH2T/+DBg3H11VdHqVSKr3/967Fjx46YPXv2ec/v6emJtra2saOjo+OSBgNQH6oOzO233x4HDhyI3/3ud/GNb3wjVq1aFb///e/Pe353d3cMDAyMHf39/Zc0GID6UPVHZM3NzXHrrbdGRMT8+fOjt7c3nnrqqdiyZctHnl8qlaJUKl3aSgDqziV/H8zo6Og591gAIKLKdzDd3d2xbNmymDVrVgwNDcW2bdti9+7d8corr2TtA6BOVRWYkydPxpe//OU4duxYtLW1RWdnZ7zyyivxxS9+MWsfAHWqqsD88Ic/zNoBwDjjZ5EBkEJgAEghMACkEBgAUggMACkEBoAUAgNACoEBIIXAAJBCYABIITAApBAYAFIIDAApBAaAFA
IDQAqBASCFwACQQmAASCEwAKQQGABSCAwAKQQGgBQCA0AKgQEgRVNRF/7zvLMx6YqzRV2+EJ/+uzNFT6i5tx9pKXpCIf7H//7LoicU4p6/7y16Qs2d/cupRU+oqbNnP4gjF3iudzAApBAYAFIIDAApBAaAFAIDQAqBASCFwACQQmAASCEwAKQQGABSCAwAKQQGgBQCA0AKgQEghcAAkEJgAEghMACkEBgAUggMACkEBoAUAgNACoEBIIXAAJBCYABIITAApBAYAFJcUmA2btwYDQ0NsW7duss0B4Dx4qID09vbG1u2bInOzs7LuQeAceKiAnPq1KlYuXJl/OAHP4hrr732cm8CYBy4qMB0dXXFgw8+GEuXLv1vzy2XyzE4OHjOAcD411TtE7Zv3x779++P3t7eCzq/p6cnnnjiiaqHAVDfqnoH09/fH2vXro2f/OQn0dLSckHP6e7ujoGBgbGjv7//ooYCUF+qegfT19cXJ0+ejDvvvHPssZGRkdi7d288/fTTUS6Xo7Gx8ZznlEqlKJVKl2ctAHWjqsAsWbIkDh48eM5jq1evjjvuuCO+9a1vfSguAExcVQWmtbU15s6de85jV111VVx33XUfehyAic138gOQouo/RfZf7d69+zLMAGC88Q4GgBQCA0AKgQEghcAAkEJgAEghMACkEBgAUggMACkEBoAUAgNACoEBIIXAAJBCYABIITAApBAYAFIIDAApBAaAFAIDQAqBASCFwACQQmAASCEwAKQQGABSNBV14dZ//ItoLP1FUZcvxFsrG4qeUHPX90681xwR8c+do0VPKMSbC84UPaHmbnrtcNETaur0qdMRX7iwc72DASCFwACQQmAASCEwAKQQGABSCAwAKQQGgBQCA0AKgQEghcAAkEJgAEghMACkEBgAUggMACkEBoAUAgNACoEBIIXAAJBCYABIITAApBAYAFIIDAApBAaAFAIDQAqBASCFwACQoqrAPP7449HQ0HDOcccdd2RtA6CONVX7hDlz5sQvf/nL//gFmqr+JQCYAKquQ1NTU9xwww0ZWwAYR6q+B/PWW29Fe3t73HLLLbFy5cp45513Pvb8crkcg4OD5xwAjH9VBWbRokWxdevWePnll2Pz5s1x5MiRuOeee2JoaOi8z+np6Ym2traxo6Oj45JHA/DJV1Vgli1bFo888kh0dnbGAw88ED//+c/jz3/+czz33HPnfU53d3cMDAyMHf39/Zc8GoBPvku6Q3/NNdfEbbfdFocPHz7vOaVSKUql0qVcBoA6dEnfB3Pq1Kl4++23Y8aMGZdrDwDjRFWBeeyxx2LPnj3xxz/+MX7729/Gl770pWhsbIxHH300ax8Adaqqj8jefffdePTRR+P999+PqVOnxmc/+9nYt29fTJ06NWsfAHWqqsBs3749awcA44yfRQZACoEBIIXAAJBCYABIITAApBAYAFIIDAApBAaAFAIDQAqBASCFwACQQmAASCEwAKQQGABSCAwAKQQGgBQCA0AKgQEghcAAkEJgAEghMACkEBgAUggMACmaan3BSqUSEREjpz+o9aULN/ovZ4ueUHMjpxuLnlCI0Q8qRU8oxNnKmaIn1NzpUxPr3/HTw//2z/jf/1v+cRoqF3LWZfTuu+9GR0dHLS8JwGXW398fM2fO/Nhzah6Y0dHROHr0aLS2tkZDQ0PNrjs4OBgdHR3R398fkydPrtl1i+Z1T5zXPRFfc8TEfN1FvuZKpRJDQ0PR3t4ekyZ9/F2Wmn9ENmnSpP+2epkmT548Yf4l/M+87oljIr7miIn5uot6zW1tbRd0npv8AKQQGABSTJjAlEql2LBhQ5RKpaKn1JTXPXFe90R8zRET83XXy2uu+U1+ACaGCfMOBoDaEhgAUggMACkEBoAUEyYwmzZtiptuuilaWlpi0aJF8dprrxU9KdXevXtj+fLl0d7eHg0NDfH8888XPSldT09P3HXXXdHa2hrTpk2Lhx9+ON58882iZ6XbvHlzdHZ2jn3T3eLFi+Oll14qelZNbdy4MRoaGmLdunVFT0n1+OOPR0NDwznHHXfcUfSs85oQgXn22Wdj/fr1sWHDhti/f3/MmzcvHnjggTh58mTR09IMDw/HvHnzYtOmTUVPqZk9e/ZEV1dX7Nu3L3bu3BlnzpyJ+++/P4aHh4uelmrmzJmxcePG6Ovri9dffz3uu+++eOihh+KNN94oelpN9Pb2xpYtW6Kzs7PoKTUxZ86cOHbs2Njxm9/8puhJ51eZABYuXFjp6uoa+3pkZKTS3t5e6enpKXBV7UREZceOHUXPqLmTJ09WIqKyZ8+eoqfU3LXXXlt55plnip6RbmhoqPLpT3+6snPnzsrnP//5ytq1a4uelGrDhg2VefPmFT3jgo37dzCnT5+Ovr6+WLp06dhjkyZNiqVLl8arr75a4DKyDQwMRETElClTCl5SOyMjI7F9+/YYHh6OxYsXFz0nXVdXVzz44IPn/P4e7956661ob2+PW265JVauXBnvvPNO0ZPOq+Y/7LLW3nvvvRgZGYnp06ef8/j06dPjD3/4Q0GryDY6Ohrr1q2Lu+++O+bOnVv0nHQHDx6MxYsXxwcffBBXX3117NixI2bPnl30rFTbt2+P/fv3R29vb9FTambRokWxdevWuP322+PYsWPxxBNPxD333BOHDh2K1tbWoud9yLgPDBNTV1dXHDp06JP9+fRldPvtt8eBAwdiYGAgfvazn8WqVatiz5494zYy/f39sXbt2ti5c2e0tLQUPadmli1bNva/Ozs7Y9GiRXHjjTfGc889F1/96lcLXPbRxn1grr/++mhsbIwTJ06c8/iJEyfihhtuKGgVmdasWRMvvvhi7N27t9C/GqKWmpub49Zbb42IiPnz50dvb2889dRTsWXLloKX5ejr64uTJ0/GnXfeOfbYyMhI7N27N55++ukol8vR2Dj+/6bJa665Jm677bY4fPhw0VM+0ri/B9Pc3Bzz58+PXbt2jT02Ojoau3btmhCfUU8klUol1qxZEzt27Ihf/epXcfPNNxc9qTCjo6NRLpeLnpFmyZIlcfDgwThw4MDYsWDBgli5cmUcOHBgQsQlIuLUqVPx9ttvx4wZM4qe8pHG/TuYiIj169fHqlWrYsGCBbFw4cJ48sknY3h4OFavXl30tDSnTp065//VHDlyJA4cOBBTpkyJWbNmFbgsT1dXV2zbti1eeOGFaG1tjePHj0fEv/3lSFdccUXB6/J0d3fHsmXLYtasWTE0NBTbtm2L3bt3xyuvvFL0tDStra0furd21VVXxXXXXTeu77k99thjsXz58rjxxhvj6NGjsWHDhmhsbIxHH3206Gkfreg/xlYr3/ve9yqzZs2qNDc3VxYuXFjZt29f0ZNS/frXv65ExIeOVatWFT0tzUe93oio/PjHPy56WqqvfOUrlRtvvLHS3NxcmTp1amXJkiWVX/ziF0XPqrmJ8MeUV6xYUZkxY0alubm58qlPfaqyYsWKyuHDh4uedV5+XD8AKcb9PRgAiiEwAKQQGABSCAwAKQQGgBQCA0AKgQEgh
cAAkEJgAEghMACkEBgAUggMACn+FVFAhnoLRBuZAAAAAElFTkSuQmCC", 169 | "text/plain": [ 170 | "
" 171 | ] 172 | }, 173 | "metadata": {}, 174 | "output_type": "display_data" 175 | } 176 | ], 177 | "source": [ 178 | "from sklearn.metrics.pairwise import cosine_similarity\n", 179 | "import matplotlib.pyplot as plt\n", 180 | "\n", 181 | "H = cosine_similarity(audio_embs, text_embs)\n", 182 | "plt.imshow(H, interpolation='none')\n", 183 | "np.set_printoptions(precision=5, suppress=True)\n", 184 | "print(H)" 185 | ] 186 | } 187 | ], 188 | "metadata": { 189 | "kernelspec": { 190 | "display_name": "ar", 191 | "language": "python", 192 | "name": "python3" 193 | }, 194 | "language_info": { 195 | "codemirror_mode": { 196 | "name": "ipython", 197 | "version": 3 198 | }, 199 | "file_extension": ".py", 200 | "mimetype": "text/x-python", 201 | "name": "python", 202 | "nbconvert_exporter": "python", 203 | "pygments_lexer": "ipython3", 204 | "version": "3.9.18" 205 | } 206 | }, 207 | "nbformat": 4, 208 | "nbformat_minor": 2 209 | } 210 | -------------------------------------------------------------------------------- /app/icbhi_sprs/README_ICBHI_SPRS.md: -------------------------------------------------------------------------------- 1 | # M2D-X Applied on Respiratory Sound Tasks: ICBHI2017 & SPRSound 2 | 3 | This sub-repository provides application examples in a realistic setting described in our [TASLP paper](https://ieeexplore.ieee.org/document/10502167). 4 | 5 | ```BibTeX 6 | @article{niizumi2024m2dx, 7 | title = {{Masked Modeling Duo: Towards a Universal Audio Pre-training Framework}}, 8 | author = {Daisuke Niizumi and Daiki Takeuchi and Yasunori Ohishi and Noboru Harada and Kunio Kashino}, 9 | journal = {IEEE/ACM Trans. Audio, Speech, Language Process.}, 10 | year = {2024}, 11 | volume = {32}, 12 | pages = {2391-2406}, 13 | url = {https://ieeexplore.ieee.org/document/10502167}, 14 | doi = {10.1109/TASLP.2024.3389636}} 15 | ``` 16 | 17 | We applied the pre-trained models to respiratory sound tasks. While usually we fine-tune these models, we also further pre-train models on the application data that enhances the final performance (the step 2 below). The example contains data and test environmental setup, further pre-training steps, and fine-tuning steps. 18 | 19 |
20 | A schematic illustration of M2D-X further pre-training 21 |
22 | 23 | Notably, the example follows a test setup compatible with previous studies, enabling comparison with the SOTA. In addition, it runs on a smaller GPU, an RTX 3090 Ti (24 GB). 24 | 25 | # ICBHI2017 26 | 27 | NEWS: The weight of the best model, `M2D-X/0.7 (η= 0.3)`, is available in the release [v0.2.0](https://github.com/nttcslab/m2d/releases/tag/v0.2.0). 28 | 29 | ## 1. Data and test setup 30 | 31 | ### 1-1. Set up the application files 32 | 33 | In the `app/icbhi_sprs` folder, running the following steps will download and set up the application program files. 34 | 35 | ```sh 36 | pip install torchinfo 37 | git clone https://github.com/ilyassmoummad/scl_icbhi2017.git 38 | cd scl_icbhi2017 39 | git reset --hard 915c1120719a9357d662c5fe484bce7fbe845139 40 | mv dataset.py augmentations.py utils.py losses.py args.py .. 41 | mv data .. 42 | mv main.py ../app_main.py 43 | mv ce.py models.py .. 44 | cd .. 45 | patch -p2 < patch_scl_icbhi2017.diff 46 | ``` 47 | 48 | When you finish these steps, you will find many .py files and a folder: 49 | - Program files: app/icbhi_sprs/{app_main.py, args.py, augmentations.py, ce.py, dataset.py, losses.py, models.py, utils.py} 50 | - Data folder: app/icbhi_sprs/data 51 | 52 | ### 1-2. Download the ICBHI2017 data 53 | 54 | In the `app/icbhi_sprs` folder, running the following steps will download and set up the ICBHI2017 data. The last step converts the raw audio files into spectrograms. 55 | 56 | ```sh 57 | wget https://bhichallenge.med.auth.gr/sites/default/files/ICBHI_final_database/ICBHI_final_database.zip --no-check-certificate 58 | 59 | unzip ICBHI_final_database.zip | awk 'BEGIN {ORS=" "} {if(NR%10==0)print "."}' 60 | mv ICBHI_final_database/* data/ICBHI 61 | rmdir ICBHI_final_database 62 | 63 | python ../../wav_to_lms.py data/ICBHI ../../data/icbhi2017_lms 64 | cp files_icbhi2017.csv ../../data/files_icbhi2017.csv 65 | ``` 66 | 67 | When you finish these steps, you will find the following: 68 | - app/icbhi_sprs/data/ICBHI -- For fine-tuning, the original data files. 69 | - data/icbhi2017_lms -- For further pre-training, the log-mel spectrogram (LMS) files. 70 | - data/files_icbhi2017.csv -- For further pre-training, the list of LMS files. 71 | 72 | ### 1-3. Download FSD50K and set up the data 73 | 74 | We use FSD50K as the background noise for the further pre-training. 75 | 76 | To create the log-mel spectrogram FSD50K files, follow the [steps in the main README](../../README.md#3-1-preparing-pre-training-data-samples). 77 | When you finish, you will have the following: 78 | - data/fsd50k_lms -- For further pre-training, the log-mel spectrogram (LMS) files. 79 | - data/files_f_s_d_5_0_k.csv -- For further pre-training, the list of LMS files. 80 | 81 | ### 1-4. Download the pre-trained weight 82 | 83 | We use an M2D weight with an input size of 80x200 and a patch size of 16x4. Be sure to download the weight to your copy's M2D root folder. 84 | 85 | ```sh 86 | cd (your M2D root folder) 87 | wget https://github.com/nttcslab/m2d/releases/download/v0.1.0/m2d_vit_base-80x200p16x4-230529.zip 88 | unzip m2d_vit_base-80x200p16x4-230529.zip 89 | ``` 90 | 91 | You will find `(your M2D root folder)/m2d_vit_base-80x200p16x4-230529/checkpoint-300.pth`. 92 | 93 | 94 | ## 2. Further pre-training 95 | 96 | We further pre-train the pre-trained model to make it more suitable for the data distribution of the target application.
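If you want to check what this step feeds the model before launching the full run below, the short sketch here builds the same ICBHI2017 + FSD50K mixture directly with `build_mixed_dataset` from `audio_dataset.py` in this repository. This is only an illustration: we assume `train_audio.py` builds its dataset this way, and the `EasyDict` config keys simply mirror the command-line options of the command that follows.

```python
# A minimal, illustrative sketch -- not part of the training pipeline itself.
from easydict import EasyDict
from audio_dataset import build_mixed_dataset

cfg = EasyDict(
    data_path='data',                           # root folder of the .npy log-mel spectrograms
    csv_main='data/files_icbhi2017.csv',        # main (application) data list
    csv_bg_noise='data/files_f_s_d_5_0_k.csv',  # FSD50K background-noise list
    csv_val='',                                 # skip the validation split for this quick check
    noise_ratio=0.3,                            # mirrors --noise_ratio 0.3
    input_size=[80, 200],                       # mirrors --input_size 80x200
    min_ds_size=10000,                          # mirrors --min_ds_size 10000
)

# Building the dataset also estimates normalization statistics from random samples,
# so the first run takes a while.
ds, _ = build_mixed_dataset(cfg)
clean, mixed = ds[0]             # a (clean, noise-mixed) log-mel spectrogram pair
print(clean.shape, mixed.shape)  # expected: torch.Size([1, 80, 200]) each
```

Each pair holds the original ICBHI2017 spectrogram crop and the same crop mixed with FSD50K background noise at the specified `noise_ratio`.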
97 | 98 | ** *Be sure to move to your copy's M2D root folder before you run the following.* ** 99 | 100 | ```sh 101 | cd (your M2D root folder) 102 | 103 | CUDA_VISIBLE_DEVICES=0 python train_audio.py --epochs 600 --resume m2d_vit_base-80x200p16x4-230529/checkpoint-300.pth --model m2d_x_vit_base --input_size 80x200 --patch_size 16x4 --batch_size 64 --accum_iter 2 --csv_main data/files_icbhi2017.csv --csv_bg_noise data/files_f_s_d_5_0_k.csv --noise_ratio 0.3 --save_freq 100 --eval_after 600 --seed 6 --teacher m2d_vit_base-80x200p16x4-230529/checkpoint-300.pth --blr 3e-4 --loss_off 1. --min_ds_size 10000 104 | ``` 105 | 106 | When you finish, you will find the further pre-trained model folder named `m2d_x_vit_base-80x200p16x4p16k-240514-MdfiDdffsd50ks6bs128a2lo1nr.3-e600` (`240514` will be the date you ran it). 107 | The folder contains six checkpoint files, one for every 100 epochs. 108 | 109 | In this step, the source `m2d_vit_base-80x200p16x4-230529` model pre-trained on AudioSet is further pre-trained using the files listed in `data/files_icbhi2017.csv`, making it more effective for solving ICBHI2017. 110 | 111 | 112 | ## 3. Fine-tuning 113 | 114 | We are almost done. The last step is fine-tuning in the app/icbhi_sprs folder. 115 | Use your further pre-trained weight, such as `m2d_x_vit_base-80x200p16x4p16k-240514-MdfiDdffsd50ks6bs128a2lo1nr.3-e600/checkpoint-600.pth`: 116 | 117 | ** *Be sure to change the folder name to yours.* ** 118 | 119 | ** *Be sure to move to your copy's M2D root folder before you run the following.* ** 120 | 121 | ```sh 122 | cd (your M2D root folder)/app/icbhi_sprs 123 | 124 | CUDA_VISIBLE_DEVICES=0 python app_main.py --method sl --backbone m2d --epochs 150 --bs 64 --lr 5e-5 --freeze_embed --split_iter 4 --weightspath ../../m2d_x_vit_base-80x200p16x4p16k-240514-MdfiDdffsd50ks6bs128a2lo1nr.3-e600/checkpoint-600.pth 125 | ``` 126 | 127 | We appreciate the codebase [ilyassmoummad/scl_icbhi2017](https://github.com/ilyassmoummad/scl_icbhi2017) from the following paper. We customized the code to load and fine-tune the M2D models. 128 | 129 | - [Moummad and Farrugia, "Pretraining Respiratory Sound Representations using Metadata and Contrastive Learning," in WASPAA, 2023](https://arxiv.org/abs/2210.16192) 130 | 131 | To repeat the fine-tuning and obtain statistical results, we actually used a shell script instead of a raw command line. 132 | 133 | ```sh 134 | bash eval_icbhi.sh ../../m2d_x_vit_base-80x200p16x4p16k-240514-MdfiDdffsd50ks6bs128a2lo1nr.3-e600/checkpoint-600.pth 5 135 | ``` 136 | 137 | The last `5` denotes the number of fine-tuning iterations; see the score aggregation sketch after the results figure below. 138 | 139 | The results in the paper: 140 |
141 | ICBHI results 142 |
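For reference, each fine-tuning run appends a row with its best SE, SP, and ICBHI scores to `results/icbhi-scores.csv` under `app/icbhi_sprs` (written by the patched `app_main.py`; the SPRSound runs write `results/sprs-scores.csv` in the same way). Below is a minimal sketch for summarizing the repeated runs; it is not part of the provided scripts and assumes it is run from `app/icbhi_sprs` after the iterations have finished.

```py
# Minimal sketch (assumption: run from app/icbhi_sprs after eval_icbhi.sh has finished).
# results/icbhi-scores.csv holds one row per fine-tuning run, including the columns
# model, best_se, best_sp, and best_icbhi_score.
import pandas as pd

df = pd.read_csv('results/icbhi-scores.csv')
summary = df.groupby('model')[['best_se', 'best_sp', 'best_icbhi_score']].agg(['mean', 'std'])
print(summary)
```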
143 | 144 | # SPRSound 145 | 146 | ## 4. SPRSound 147 | 148 | For SPRSound, we use the same code; we only need to set up the SPRSound data. 149 | 150 | ### 4-1. Download SPRSound and set up the data 151 | 152 | In the `app/icbhi_sprs` folder, running the following steps will download the SPRSound data. 153 | 154 | ```sh 155 | git clone https://github.com/SJTU-YONGFU-RESEARCH-GRP/SPRSound.git 156 | (cd SPRSound && git reset --hard 45b0d5d435ff320c46585762fa1090afd0ebb318) 157 | ``` 158 | 159 | ```sh 160 | cp -r SPRSound/train_wav SPRSound/test_wav data/SPRS/ 161 | cp files_sprs.csv ../../data 162 | python cut_data_sprs.py 163 | ``` 164 | 165 | When you finish these steps, you will find the following: 166 | - app/icbhi_sprs/data/SPRS -- For fine-tuning, the original data files. 167 | - data/sprsound_lms -- For further pre-training, the log-mel spectrogram (LMS) files. 168 | - data/files_sprs.csv -- For further pre-training, the list of LMS files. 169 | 170 | ### 4-2. Further pre-training 171 | 172 | ** *Be sure to move to your copy's M2D root folder before you run the following.* ** 173 | 174 | ```sh 175 | cd (your M2D root folder) 176 | 177 | CUDA_VISIBLE_DEVICES=0 python train_audio.py --epochs 600 --resume m2d_vit_base-80x200p16x4-230529/checkpoint-300.pth --model m2d_x_vit_base --input_size 80x200 --patch_size 16x4 --batch_size 64 --accum_iter 2 --csv_main data/files_sprs.csv --csv_bg_noise data/files_f_s_d_5_0_k.csv --noise_ratio 0.01 --save_freq 100 --eval_after 600 --seed 3 --teacher m2d_vit_base-80x200p16x4-230529/checkpoint-300.pth --blr 3e-4 --loss_off 1. --min_ds_size 0 178 | ``` 179 | 180 | ### 4-3. Fine-tuning 181 | 182 | To repeat the fine-tuning and obtain statistical results, we used a shell script. Be sure to replace the folder name below with your further pre-trained weight from step 4-2. 183 | 184 | ```sh 185 | bash eval_sprs.sh ../../m2d_x_vit_base-80x200p16x4p16k-240514-MdfiDdffsd50ks6bs128a2lo1nr.3-e600/checkpoint-600.pth 5 186 | ``` 187 | 188 | The last `5` denotes the number of fine-tuning iterations. 189 | 190 | The results in the paper: 191 |
192 | SPRSound results 193 |
194 | 195 | 196 | ## Complete command lines 197 | 198 | The command lines for reproduction follow. 199 | 200 | ### ICBHI2017 201 | 202 | #### Further pre-training 203 | 204 | We explain the details in [Guide_app.md](../../Guide_app.md). 205 | 206 | ```sh 207 | # M2D-X, noise_ratio 0.3 208 | CUDA_VISIBLE_DEVICES=0 python train_audio.py --epochs 600 --resume m2d_vit_base-80x200p16x4-230529/checkpoint-300.pth --model m2d_x_vit_base --input_size 80x200 --patch_size 16x4 --batch_size 64 --accum_iter 2 --csv_main data/files_icbhi2017.csv --csv_bg_noise data/files_f_s_d_5_0_k.csv --noise_ratio 0.3 --save_freq 100 --eval_after 600 --seed 6 --teacher m2d_vit_base-80x200p16x4-230529/checkpoint-300.pth --blr 3e-4 --loss_off 1. --min_ds_size 10000 209 | # M2D-X, noise_ratio 0.0 210 | CUDA_VISIBLE_DEVICES=0 python train_audio.py --epochs 600 --resume m2d_vit_base-80x200p16x4-230529/checkpoint-300.pth --model m2d_x_vit_base --input_size 80x200 --patch_size 16x4 --batch_size 64 --accum_iter 2 --csv_main data/files_icbhi2017.csv --csv_bg_noise data/files_f_s_d_5_0_k.csv --noise_ratio 0.0 --save_freq 100 --eval_after 600 --seed 6 --teacher m2d_vit_base-80x200p16x4-230529/checkpoint-300.pth --blr 3e-4 --loss_off 1. --min_ds_size 10000 211 | 212 | # We show the command lines for a single random seed only; we ran them for all four seeds (3 to 6). 213 | 214 | # M2D, noise_ratio 0.3 215 | CUDA_VISIBLE_DEVICES=0 python train_audio.py --epochs 600 --resume m2d_vit_base-80x200p16x4-230529/checkpoint-300.pth --model m2d_vit_base --input_size 80x200 --patch_size 16x4 --batch_size 64 --accum_iter 2 --csv_main data/files_icbhi2017.csv --csv_bg_noise data/files_f_s_d_5_0_k.csv --noise_ratio 0.3 --save_freq 100 --eval_after 600 --seed 3 --blr 3e-4 --loss_off 0. --min_ds_size 10000 216 | # M2D, noise_ratio 0.0 217 | CUDA_VISIBLE_DEVICES=0 python train_audio.py --epochs 600 --resume m2d_vit_base-80x200p16x4-230529/checkpoint-300.pth --model m2d_vit_base --input_size 80x200 --patch_size 16x4 --batch_size 64 --accum_iter 2 --csv_main data/files_icbhi2017.csv --csv_bg_noise data/files_f_s_d_5_0_k.csv --noise_ratio 0.0 --save_freq 100 --eval_after 600 --seed 3 --blr 3e-4 --loss_off 0. --min_ds_size 10000 218 | ``` 219 | 220 | An example log for M2D-X with noise_ratio 0.3 is available in [example_logs.zip](https://github.com/nttcslab/m2d/releases/download/v0.1.0/example_logs.zip). 221 | Find `examples/logs/log_m2d_x_vit_base-80x200p16x4-230814-Ddffsd50ks5blr0003bs128a2nr.3-e600.out` in the zip file. 222 | 223 | 224 | #### Fine-tuning 225 | 226 | We show only the command lines for checkpoint-600.pth; we ran them for all six checkpoints.
227 | 228 | ```sh 229 | # M2D-X, noise_ratio 0.3, random seeds 3 to 6 230 | bash eval_icbhi.sh ../../m2d_x_vit_base-80x200p16x4-230814-Ddffsd50ks3blr0003bs128a2nr.3-e600/checkpoint-600.pth 5 231 | bash eval_icbhi.sh ../../m2d_x_vit_base-80x200p16x4-230814-Ddffsd50ks4blr0003bs128a2nr.3-e600/checkpoint-600.pth 5 232 | bash eval_icbhi.sh ../../m2d_x_vit_base-80x200p16x4-230814-Ddffsd50ks5blr0003bs128a2nr.3-e600/checkpoint-600.pth 5 233 | bash eval_icbhi.sh ../../m2d_x_vit_base-80x200p16x4-230814-Ddffsd50ks6blr0003bs128a2nr.3-e600/checkpoint-600.pth 5 234 | # M2D-X, noise_ratio 0.0, random seeds 3 to 6 235 | bash eval_icbhi.sh ../../m2d_x_vit_base-80x200p16x4-230818-Ddffsd50ks3blr0003bs128a2nr0-e600/checkpoint-600.pth 5 236 | bash eval_icbhi.sh ../../m2d_x_vit_base-80x200p16x4-230818-Ddffsd50ks4blr0003bs128a2nr0-e600/checkpoint-600.pth 5 237 | bash eval_icbhi.sh ../../m2d_x_vit_base-80x200p16x4-230818-Ddffsd50ks5blr0003bs128a2nr0-e600/checkpoint-600.pth 5 238 | bash eval_icbhi.sh ../../m2d_x_vit_base-80x200p16x4-230818-Ddffsd50ks6blr0003bs128a2nr0-e600/checkpoint-600.pth 5 239 | 240 | # M2D, noise_ratio 0.3, random seeds 3 to 6 241 | bash eval_icbhi.sh ../../m2d_vit_base-80x200p16x4-mix-230814-s3blr0003bs128a2MdfiDdffsd50knr.3-e600/checkpoint-600.pth 5 242 | bash eval_icbhi.sh ../../m2d_vit_base-80x200p16x4-mix-230814-s4blr0003bs128a2MdfiDdffsd50knr.3-e600/checkpoint-600.pth 5 243 | bash eval_icbhi.sh ../../m2d_vit_base-80x200p16x4-mix-230814-s5blr0003bs128a2MdfiDdffsd50knr.3-e600/checkpoint-600.pth 5 244 | bash eval_icbhi.sh ../../m2d_vit_base-80x200p16x4-mix-230814-s6blr0003bs128a2MdfiDdffsd50knr.3-e600/checkpoint-600.pth 5 245 | # M2D, noise_ratio 0.0, random seeds 3 to 6 246 | bash eval_icbhi.sh ../../m2d_vit_base-80x200p16x4-mix-230814-s3blr0003bs128a2MdfiDdffsd50knr0-e600/checkpoint-600.pth 5 247 | bash eval_icbhi.sh ../../m2d_vit_base-80x200p16x4-mix-230814-s4blr0003bs128a2MdfiDdffsd50knr0-e600/checkpoint-600.pth 5 248 | bash eval_icbhi.sh ../../m2d_vit_base-80x200p16x4-mix-230814-s5blr0003bs128a2MdfiDdffsd50knr0-e600/checkpoint-600.pth 5 249 | bash eval_icbhi.sh ../../m2d_vit_base-80x200p16x4-mix-230814-s6blr0003bs128a2MdfiDdffsd50knr0-e600/checkpoint-600.pth 5 250 | ``` 251 | ### SPRSound 252 | 253 | #### Further pre-training 254 | 255 | We show only the command lines for random seed 3; we ran them for all four seeds (3 to 6). 256 | 257 | ```sh 258 | # M2D-X, noise_ratio 0.01 259 | CUDA_VISIBLE_DEVICES=0 python train_audio.py --epochs 600 --resume m2d_vit_base-80x200p16x4-230529/checkpoint-300.pth --model m2d_x_vit_base --input_size 80x200 --patch_size 16x4 --batch_size 64 --accum_iter 2 --csv_main data/files_sprs.csv --csv_bg_noise data/files_f_s_d_5_0_k.csv --noise_ratio 0.01 --save_freq 100 --eval_after 600 --seed 3 --teacher m2d_vit_base-80x200p16x4-230529/checkpoint-300.pth --blr 3e-4 --loss_off 1. --min_ds_size 0 260 | # M2D-X, noise_ratio 0.0 261 | CUDA_VISIBLE_DEVICES=0 python train_audio.py --epochs 600 --resume m2d_vit_base-80x200p16x4-230529/checkpoint-300.pth --model m2d_x_vit_base --input_size 80x200 --patch_size 16x4 --batch_size 64 --accum_iter 2 --csv_main data/files_sprs.csv --csv_bg_noise data/files_f_s_d_5_0_k.csv --noise_ratio 0.0 --save_freq 100 --eval_after 600 --seed 3 --teacher m2d_vit_base-80x200p16x4-230529/checkpoint-300.pth --blr 3e-4 --loss_off 1. --min_ds_size 0 262 | ``` 263 | 264 | #### Fine-tuning 265 | 266 | We show only the command lines for checkpoint-600.pth; we ran them for all six checkpoints.
267 | 268 | ```sh 269 | # M2D-X, noise_ratio 0.01, random seeds 3 to 6 270 | bash eval_sprs.sh ../../m2d_x_vit_base-80x200p16x4-240222-MdfsDdffsd50ks3blr0003bs128a2lo0nr01dn0-e600/checkpoint-600.pth 5 271 | bash eval_sprs.sh ../../m2d_x_vit_base-80x200p16x4-240222-MdfsDdffsd50ks4blr0003bs128a2lo0nr01dn0-e600/checkpoint-600.pth 5 272 | bash eval_sprs.sh ../../m2d_x_vit_base-80x200p16x4-240222-MdfsDdffsd50ks5blr0003bs128a2lo0nr01dn0-e600/checkpoint-600.pth 5 273 | bash eval_sprs.sh ../../m2d_x_vit_base-80x200p16x4-240222-MdfsDdffsd50ks6blr0003bs128a2lo0nr01dn0-e600/checkpoint-600.pth 5 274 | # M2D-X, noise_ratio 0.0, random seeds 3 to 6 275 | bash eval_sprs.sh ../../m2d_x_vit_base-80x200p16x4-240223-MdfsDdffsd50ks3blr0003bs128a2lo0nr0dn0-e600/checkpoint-600.pth 5 276 | bash eval_sprs.sh ../../m2d_x_vit_base-80x200p16x4-240223-MdfsDdffsd50ks4blr0003bs128a2lo0nr0dn0-e600/checkpoint-600.pth 5 277 | bash eval_sprs.sh ../../m2d_x_vit_base-80x200p16x4-240223-MdfsDdffsd50ks5blr0003bs128a2lo0nr0dn0-e600/checkpoint-600.pth 5 278 | bash eval_sprs.sh ../../m2d_x_vit_base-80x200p16x4-240223-MdfsDdffsd50ks6blr0003bs128a2lo0nr0dn0-e600/checkpoint-600.pth 5 279 | ``` 280 | -------------------------------------------------------------------------------- /clap/Note-ACalt4_GTEbase.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Preparing Caption Embeddings for AudioCaps Alternative 4 Captions (ACalt4)\n", 8 | "\n", 9 | "Our implementation does not convert texts into sentence (semantic) embeddings on the fly. Instead, we convert them into embeddings in advance in an offline fashion.\n", 10 | "\n", 11 | "- Download ACalt4 as `../data/audiocaps_alternative_4.csv` in advance from the external website DOSHISHA.\n", 12 | "- The following will create `../data/capemb_GTEbase_AC_BLIP_Aug.npy` using the GTE base sentence embedding encoder model." 
13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 1, 18 | "metadata": {}, 19 | "outputs": [ 20 | { 21 | "name": "stderr", 22 | "output_type": "stream", 23 | "text": [ 24 | "INFO:numexpr.utils:Note: detected 80 virtual cores but NumExpr set to maximum of 64, check \"NUMEXPR_MAX_THREADS\" environment variable.\n", 25 | "INFO:numexpr.utils:Note: NumExpr detected 80 cores but \"NUMEXPR_MAX_THREADS\" not set, so enforcing safe limit of 8.\n", 26 | "INFO:numexpr.utils:NumExpr defaulting to 8 threads.\n" 27 | ] 28 | } 29 | ], 30 | "source": [ 31 | "import warnings; warnings.simplefilter('ignore')\n", 32 | "import logging; logging.basicConfig(level=logging.INFO)\n", 33 | "import numpy as np\n", 34 | "import pandas as pd\n", 35 | "import torch" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 2, 41 | "metadata": {}, 42 | "outputs": [ 43 | { 44 | "name": "stdout", 45 | "output_type": "stream", 46 | "text": [ 47 | "[[69.65808868408203, 88.03551483154297, 68.79684448242188]]\n" 48 | ] 49 | } 50 | ], 51 | "source": [ 52 | "# https://huggingface.co/thenlper/gte-base\n", 53 | "\n", 54 | "import torch.nn.functional as F\n", 55 | "from torch import Tensor\n", 56 | "from transformers import AutoTokenizer, AutoModel\n", 57 | "\n", 58 | "def average_pool(last_hidden_states: Tensor,\n", 59 | " attention_mask: Tensor) -> Tensor:\n", 60 | " last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)\n", 61 | " return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]\n", 62 | "\n", 63 | "input_texts = [\n", 64 | " \"what is the capital of China?\",\n", 65 | " \"how to implement quick sort in python?\",\n", 66 | " \"Beijing\",\n", 67 | " \"sorting algorithms\"\n", 68 | "]\n", 69 | "\n", 70 | "tokenizer = AutoTokenizer.from_pretrained(\"thenlper/gte-base\")\n", 71 | "model = AutoModel.from_pretrained(\"thenlper/gte-base\")\n", 72 | "\n", 73 | "# Tokenize the input texts\n", 74 | "batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt')\n", 75 | "\n", 76 | "outputs = model(**batch_dict)\n", 77 | "embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])\n", 78 | "\n", 79 | "# (Optionally) normalize embeddings\n", 80 | "embeddings = F.normalize(embeddings, p=2, dim=1)\n", 81 | "scores = (embeddings[:1] @ embeddings[1:].T) * 100\n", 82 | "print(scores.tolist())" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 3, 88 | "metadata": {}, 89 | "outputs": [ 90 | { 91 | "data": { 92 | "text/html": [ 93 | "
\n", 94 | "\n", 107 | "\n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | "
caption1caption2caption3caption4
youtube_id
---1_cCGK4MA train is moving along the tracks with the rh...A train swiftly moving along the tracks, accom...A train horn blaring in the distance, blending...The unmistakable sound of a train, with the cl...
---lTs1dxhUA racing car speeding past in a virtual raceA car zooming around a track in a video gameThe fast-paced sound of a car zooming along a ...A dynamic sound of a vehicle racing on a track...
--0PQM4-hqgWater flowing through a river with a gurgling ...A waterfall cascading down with a rush of waterGurgling water flowing through a peaceful land...Natures symphony includes the gentle gurgling ...
--299m5_DdEExcitement fills the indoor water park as chil...The joyful sounds of children playing fill the...Gurgling water and a waterfall fill the indoor...The air in an indoor water park is filled with...
--2XRMjyizoBird vocalizations, with chirps and tweets, fi...Two police officers standing in front of a mapBirds chirping and tweeting in the backgroundAmidst the scene of two police officers studyi...
...............
zzlfP-snUeYA bulldozer idling in a rural areaA bulldozer idles and its engine rumbles softl...An idling engine of a vehicle in an outdoor se...The engine of a parked bulldozer purrs quietly...
zzm3dwoXY8YBirds chirping and cooing in a natural outdoor...Birds chirping and cooing in an outdoor settingA soft cooing sound coming from a group of bir...The cooing of pigeons in an outdoor environment
zzvWbSyZfr0The snoring in this image is occasionally inte...There is snoring and occasional speech coming ...A young girl is peacefully sleeping on a bed i...In the background, there is a gentle snoring s...
zzwBazlj0OcThe soft sound of pigeons cooing in a confined...Birds cooing softly in a confined spacePigeons cooing softly in a confined spacePigeons cooing softly in a small room
zzznDcamMpwDucks quacking and people speaking can be hear...The echoes of ducks and people talking can be ...The echoes of ducks and people talking can be ...The echoes of ducks and people talking can be ...
\n", 204 | "

41785 rows × 4 columns

\n", 205 | "
" 206 | ], 207 | "text/plain": [ 208 | " caption1 \\\n", 209 | "youtube_id \n", 210 | "---1_cCGK4M A train is moving along the tracks with the rh... \n", 211 | "---lTs1dxhU A racing car speeding past in a virtual race \n", 212 | "--0PQM4-hqg Water flowing through a river with a gurgling ... \n", 213 | "--299m5_DdE Excitement fills the indoor water park as chil... \n", 214 | "--2XRMjyizo Bird vocalizations, with chirps and tweets, fi... \n", 215 | "... ... \n", 216 | "zzlfP-snUeY A bulldozer idling in a rural area \n", 217 | "zzm3dwoXY8Y Birds chirping and cooing in a natural outdoor... \n", 218 | "zzvWbSyZfr0 The snoring in this image is occasionally inte... \n", 219 | "zzwBazlj0Oc The soft sound of pigeons cooing in a confined... \n", 220 | "zzznDcamMpw Ducks quacking and people speaking can be hear... \n", 221 | "\n", 222 | " caption2 \\\n", 223 | "youtube_id \n", 224 | "---1_cCGK4M A train swiftly moving along the tracks, accom... \n", 225 | "---lTs1dxhU A car zooming around a track in a video game \n", 226 | "--0PQM4-hqg A waterfall cascading down with a rush of water \n", 227 | "--299m5_DdE The joyful sounds of children playing fill the... \n", 228 | "--2XRMjyizo Two police officers standing in front of a map \n", 229 | "... ... \n", 230 | "zzlfP-snUeY A bulldozer idles and its engine rumbles softl... \n", 231 | "zzm3dwoXY8Y Birds chirping and cooing in an outdoor setting \n", 232 | "zzvWbSyZfr0 There is snoring and occasional speech coming ... \n", 233 | "zzwBazlj0Oc Birds cooing softly in a confined space \n", 234 | "zzznDcamMpw The echoes of ducks and people talking can be ... \n", 235 | "\n", 236 | " caption3 \\\n", 237 | "youtube_id \n", 238 | "---1_cCGK4M A train horn blaring in the distance, blending... \n", 239 | "---lTs1dxhU The fast-paced sound of a car zooming along a ... \n", 240 | "--0PQM4-hqg Gurgling water flowing through a peaceful land... \n", 241 | "--299m5_DdE Gurgling water and a waterfall fill the indoor... \n", 242 | "--2XRMjyizo Birds chirping and tweeting in the background \n", 243 | "... ... \n", 244 | "zzlfP-snUeY An idling engine of a vehicle in an outdoor se... \n", 245 | "zzm3dwoXY8Y A soft cooing sound coming from a group of bir... \n", 246 | "zzvWbSyZfr0 A young girl is peacefully sleeping on a bed i... \n", 247 | "zzwBazlj0Oc Pigeons cooing softly in a confined space \n", 248 | "zzznDcamMpw The echoes of ducks and people talking can be ... \n", 249 | "\n", 250 | " caption4 \n", 251 | "youtube_id \n", 252 | "---1_cCGK4M The unmistakable sound of a train, with the cl... \n", 253 | "---lTs1dxhU A dynamic sound of a vehicle racing on a track... \n", 254 | "--0PQM4-hqg Natures symphony includes the gentle gurgling ... \n", 255 | "--299m5_DdE The air in an indoor water park is filled with... \n", 256 | "--2XRMjyizo Amidst the scene of two police officers studyi... \n", 257 | "... ... \n", 258 | "zzlfP-snUeY The engine of a parked bulldozer purrs quietly... \n", 259 | "zzm3dwoXY8Y The cooing of pigeons in an outdoor environment \n", 260 | "zzvWbSyZfr0 In the background, there is a gentle snoring s... \n", 261 | "zzwBazlj0Oc Pigeons cooing softly in a small room \n", 262 | "zzznDcamMpw The echoes of ducks and people talking can be ... 
\n", 263 | "\n", 264 | "[41785 rows x 4 columns]" 265 | ] 266 | }, 267 | "execution_count": 3, 268 | "metadata": {}, 269 | "output_type": "execute_result" 270 | } 271 | ], 272 | "source": [ 273 | "df = pd.read_csv('../data/audiocaps_alternative_4.csv').set_index('youtube_id')\n", 274 | "df" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": 4, 280 | "metadata": {}, 281 | "outputs": [], 282 | "source": [ 283 | "def chunks(lst, n):\n", 284 | " \"\"\"Yield successive n-sized chunks from lst.\"\"\"\n", 285 | " for i in range(0, len(lst), n):\n", 286 | " yield lst[i:i + n]\n", 287 | "\n", 288 | "cap_chunks = [c for c in chunks(list(df.values), 64)]" 289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "execution_count": 5, 294 | "metadata": {}, 295 | "outputs": [ 296 | { 297 | "name": "stderr", 298 | "output_type": "stream", 299 | "text": [ 300 | "100%|██████████| 653/653 [01:25<00:00, 7.62it/s]\n" 301 | ] 302 | } 303 | ], 304 | "source": [ 305 | "from tqdm import tqdm\n", 306 | "\n", 307 | "model = model.to('cuda:0')\n", 308 | "\n", 309 | "emb_chunks = []\n", 310 | "for i, caps in enumerate(tqdm(cap_chunks)):\n", 311 | " flat_caps = []\n", 312 | " for cap4 in caps:\n", 313 | " assert len(cap4) == 4 # asserts 4 captions each\n", 314 | " for cap in cap4:\n", 315 | " flat_caps.append(cap)\n", 316 | "\n", 317 | " with torch.no_grad():\n", 318 | " batch_dict = tokenizer(flat_caps, max_length=512, padding=True, truncation=True, return_tensors='pt')\n", 319 | " batch_dict['input_ids'] = batch_dict['input_ids'].to('cuda:0')\n", 320 | " batch_dict['token_type_ids'] = batch_dict['token_type_ids'].to('cuda:0')\n", 321 | " batch_dict['attention_mask'] = batch_dict['attention_mask'].to('cuda:0')\n", 322 | " outputs = model(**batch_dict)\n", 323 | " embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask']).detach().cpu()\n", 324 | " embeddings = embeddings.reshape(-1, 4, embeddings.shape[-1])\n", 325 | " emb_chunks.append(embeddings)\n" 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": 6, 331 | "metadata": {}, 332 | "outputs": [ 333 | { 334 | "data": { 335 | "text/plain": [ 336 | "(41785, 4, 768)" 337 | ] 338 | }, 339 | "execution_count": 6, 340 | "metadata": {}, 341 | "output_type": "execute_result" 342 | } 343 | ], 344 | "source": [ 345 | "embs = torch.cat(emb_chunks, dim=0).numpy().astype(np.float16)\n", 346 | "embs.shape" 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": 7, 352 | "metadata": {}, 353 | "outputs": [], 354 | "source": [ 355 | "embdic = {y: c for y, c in zip(df.index.values, embs)}\n", 356 | "np.save('../data/capemb_GTEbase_AC_BLIP_Aug.npy', embdic)" 357 | ] 358 | }, 359 | { 360 | "cell_type": "code", 361 | "execution_count": 8, 362 | "metadata": {}, 363 | "outputs": [ 364 | { 365 | "data": { 366 | "text/plain": [ 367 | "((4, 768),\n", 368 | " array([[-0.1776 , -0.2524 , 0.2241 , ..., 0.568 , 0.501 , -0.3445 ],\n", 369 | " [-0.1724 , -0.3872 , 0.0874 , ..., 0.247 , 0.6016 , -0.3633 ],\n", 370 | " [ 0.1284 , -0.0255 , 0.1407 , ..., 0.4292 , 0.4458 , -0.1812 ],\n", 371 | " [-0.04327, -0.3618 , 0.4766 , ..., 0.3176 , 0.2566 , -0.4915 ]],\n", 372 | " dtype=float16))" 373 | ] 374 | }, 375 | "execution_count": 8, 376 | "metadata": {}, 377 | "output_type": "execute_result" 378 | } 379 | ], 380 | "source": [ 381 | "embdic['---1_cCGK4M'].shape, embdic['---1_cCGK4M']" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": null, 387 | "metadata": {}, 388 | "outputs": [], 
389 | "source": [] 390 | } 391 | ], 392 | "metadata": { 393 | "kernelspec": { 394 | "display_name": "ar", 395 | "language": "python", 396 | "name": "python3" 397 | }, 398 | "language_info": { 399 | "codemirror_mode": { 400 | "name": "ipython", 401 | "version": 3 402 | }, 403 | "file_extension": ".py", 404 | "mimetype": "text/x-python", 405 | "name": "python", 406 | "nbconvert_exporter": "python", 407 | "pygments_lexer": "ipython3", 408 | "version": "3.9.18" 409 | } 410 | }, 411 | "nbformat": 4, 412 | "nbformat_minor": 2 413 | } 414 | -------------------------------------------------------------------------------- /app/icbhi_sprs/patch_scl_icbhi2017.diff: -------------------------------------------------------------------------------- 1 | --- _org/app/app_main.py 2024-05-14 09:58:27.909947715 +0900 2 | +++ app/icbhi_sprs/app_main.py 2024-05-14 10:08:03.645092839 +0900 3 | @@ -1,16 +1,18 @@ 4 | -import os 5 | +import os 6 | import torch 7 | import torch.nn as nn 8 | +from pathlib import Path 9 | +import pandas as pd 10 | from torchaudio import transforms as T 11 | import torch.nn.functional as F 12 | from torchinfo import summary 13 | from augmentations import SpecAugment 14 | -from models import CNN6, CNN10, CNN14, Projector, LinearClassifier 15 | +from models import CNN6, CNN10, CNN14, Projector, LinearClassifier, RT_LMS_M2D 16 | from dataset import ICBHI, SPRS 17 | from utils import Normalize, Standardize 18 | from losses import SupConLoss, SupConCELoss 19 | from ce import train_ce 20 | -from hybrid import train_supconce 21 | +# from hybrid import train_supconce 22 | from args import args 23 | if args.method == 'scl': 24 | from scl import train_scl, linear_scl 25 | @@ -26,12 +28,13 @@ 26 | elif args.dataset == 'SPRS': 27 | DEFAULT_NUM_CLASSES = 7 28 | DEFAULT_OUT_DIM = 128 #for ssl embedding space dimension 29 | -DEFAULT_NFFT = 1024 30 | -DEFAULT_NMELS = 64 31 | -DEFAULT_WIN_LENGTH = 1024 32 | -DEFAULT_HOP_LENGTH = 512 33 | +DEFAULT_NFFT = 400 34 | +DEFAULT_NMELS = 80 35 | +DEFAULT_WIN_LENGTH = 400 36 | +DEFAULT_HOP_LENGTH = 160 37 | DEFAULT_FMIN = 50 38 | -DEFAULT_FMAX = 2000 39 | +DEFAULT_FMAX = 8000 40 | +args.backbone = 'm2d' 41 | 42 | # Model definition 43 | if args.method == 'sl': 44 | @@ -52,6 +55,9 @@ 45 | elif args.backbone == 'cnn14': 46 | PATH_TO_WEIGHTS = os.path.join(args.weightspath, 'Cnn14_mAP=0.431.pth') 47 | model = CNN14(num_classes=DEFAULT_NUM_CLASSES, do_dropout=args.dropout, embed_only=embed_only, from_scratch=args.scratch, path_to_weights=PATH_TO_WEIGHTS, device=args.device) 48 | +elif args.backbone == 'm2d': 49 | + model = RT_LMS_M2D(num_classes=DEFAULT_NUM_CLASSES, embed_only=embed_only, weight_file=args.weightspath, training_mask=0.0, freeze_embed=args.freeze_embed, adjust_pos=args.adjust_pos) 50 | + model = model.to(args.device) 51 | s = summary(model, device=args.device) 52 | nparams = s.trainable_params 53 | 54 | @@ -59,7 +65,13 @@ 55 | melspec = T.MelSpectrogram(n_fft=DEFAULT_NFFT, n_mels=DEFAULT_NMELS, win_length=DEFAULT_WIN_LENGTH, hop_length=DEFAULT_HOP_LENGTH, f_min=DEFAULT_FMIN, f_max=DEFAULT_FMAX).to(args.device) 56 | normalize = Normalize() 57 | melspec = torch.nn.Sequential(melspec, normalize) 58 | -standardize = Standardize(device=args.device) 59 | +if True: ## Switch to False for calculating statistics 60 | + stat_mean, stat_std = [0.3671, 0.2391] if args.dataset == 'ICBHI' else [0.2000, 0.2094] 61 | +else: 62 | + print('**** FOR STATS CALCULATION ONLY ****') 63 | + stat_mean, stat_std = 0., 1. 
64 | +print(f'** Using T.MelSpectrogram & Standardize({stat_mean}, std={stat_std}) **') 65 | +standardize = Standardize(mean=stat_mean, std=stat_std, device=args.device) 66 | 67 | # Data transformations 68 | specaug = SpecAugment(freq_mask=args.freqmask, time_mask=args.timemask, freq_stripes=args.freqstripes, time_stripes=args.timestripes).to(args.device) 69 | @@ -72,13 +84,29 @@ 70 | val_ds = ICBHI(data_path=args.datapath, metadatafile=args.metadata, duration=args.duration, split='test', device=args.device, samplerate=args.samplerate, pad_type=args.pad, meta_label=args.metalabel) 71 | elif args.dataset == 'SPRS': 72 | train_ds = SPRS(data_path=args.datapath, metadatafile=args.metadata, duration=args.duration, split='train', device="cpu", samplerate=args.samplerate, pad_type=args.pad, meta_label=args.metalabel) 73 | - if args.mode == 'intra': 74 | + if args.appmode == 'intra': 75 | val_ds = SPRS(data_path=args.datapath, metadatafile=args.metadata, duration=args.duration, split='intra_test', device="cpu", samplerate=args.samplerate, pad_type=args.pad, meta_label=args.metalabel) 76 | - elif args.mode == 'inter': 77 | + elif args.appmode == 'inter': 78 | val_ds = SPRS(data_path=args.datapath, metadatafile=args.metadata, duration=args.duration, split='inter_test', device="cpu", samplerate=args.samplerate, pad_type=args.pad, meta_label=args.metalabel) 79 | train_loader = torch.utils.data.DataLoader(train_ds, batch_size=args.bs, shuffle=True) 80 | val_loader = torch.utils.data.DataLoader(val_ds, batch_size=args.bs, shuffle=False) 81 | 82 | +# ***** Calculating statistics of your dataset ***** 83 | +# 1. Change True to False in "if True:" above. 84 | +# 2. Change the following False to True 85 | +# 3. Run: python app_main.py --dataset ICBHI --datapath data/ICBHI --weightspath ../m2d_vit_base-80x200p16x4-230529/random 86 | +# or python app_main.py --dataset SPRS --datapath data/SPRS --weightspath ../m2d_vit_base-80x200p16x4-230529/random 87 | +if False: 88 | + Xs = [] 89 | + for X, *_ in train_loader: 90 | + with torch.no_grad(): 91 | + X = train_transform(X.to('cuda')) 92 | + Xs.append(X.cpu()) 93 | + X = torch.vstack(Xs) 94 | + print(X.mean(), X.std()) 95 | + import pdb; pdb.set_trace() 96 | + exit(0) 97 | + 98 | ### Optimizer 99 | if METHOD == 'sl': 100 | optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.wd) 101 | @@ -105,8 +133,9 @@ 102 | else: 103 | criterion_ce = nn.CrossEntropyLoss() 104 | 105 | +print(args) 106 | if METHOD == 'sl': 107 | - history = train_ce(model, train_loader, val_loader, train_transform, val_transform, criterion_ce, optimizer, args.epochs, scheduler) 108 | + history = train_ce(model, train_loader, val_loader, train_transform, val_transform, criterion_ce, optimizer, args.epochs, scheduler, DEFAULT_NUM_CLASSES, args.split_iter) 109 | del model 110 | 111 | elif METHOD == 'scl': 112 | @@ -126,5 +155,38 @@ 113 | history = train_supconce(model, projector, classifier, train_loader, val_loader, train_transform, val_transform, criterion, criterion_ce, optimizer, args.epochs, scheduler) 114 | del model; del projector; del classifier 115 | 116 | +report, (best_sp, best_se, best_icbhi_score, best_weight), train_losses, val_losses, train_se_scores, train_sp_scores, train_icbhi_scores, train_acc_scores, val_se_scores, val_sp_scores, val_icbhi_scores, val_acc_scores = history 117 | +scores_csv = Path('results')/(str(args.dataset).lower() + '-scores.csv') 118 | +scores_csv.parent.mkdir(parents=True, exist_ok=True) 119 | +weight_name = 
Path(args.weightspath).parent.name + '_' + Path(args.weightspath).stem 120 | +model_name = f'{args.backbone}-{METHOD}-{weight_name}-lr{args.lr}-bs{args.bs}' 121 | +if args.split_iter > 1: model_name += f's{args.split_iter}' 122 | +if args.freeze_embed: model_name += 'Z' 123 | +if args.adjust_pos: model_name += 'P' 124 | +text_all_args = str(dict(mode=model_name, **dict(vars(args)))) 125 | +report = f'{model_name}: {report}' 126 | +print(report) 127 | + 128 | +weight_path = Path('results/checkpoints') 129 | +weight_path.mkdir(parents=True, exist_ok=True) 130 | +torch.save(best_weight, weight_path/(model_name + '.pth')) 131 | + 132 | +# scores 133 | +try: 134 | + dforg = pd.read_csv(scores_csv) 135 | +except: 136 | + print(f'Create a new {scores_csv}') 137 | + dforg = pd.DataFrame() 138 | +df = pd.DataFrame(dict(model=[model_name], best_sp=[best_sp], best_se=[best_se], best_icbhi_score=[best_icbhi_score], report=[report], args=[text_all_args])) 139 | +pd.concat([dforg, df]).to_csv(scores_csv, index=None) 140 | + 141 | +# logs 142 | +epoch_logs = dict(train_losses=train_losses, val_losses=val_losses, train_se_scores=train_se_scores, train_sp_scores=train_sp_scores, 143 | + train_icbhi_scores=train_icbhi_scores, train_acc_scores=train_acc_scores, val_se_scores=val_se_scores, 144 | + val_sp_scores=val_sp_scores, val_icbhi_scores=val_icbhi_scores, val_acc_scores=val_acc_scores) 145 | +df = pd.DataFrame(epoch_logs) 146 | +Path('results/logs').mkdir(parents=True, exist_ok=True) 147 | +df.to_csv(f'results/logs/{weight_name}.csv') 148 | + 149 | del train_ds; del val_ds 150 | -del train_loader; del val_loader 151 | \ ファイル末尾に改行がありません 152 | +del train_loader; del val_loader 153 | --- _org/app/models.py 2024-05-14 10:06:11.612480963 +0900 154 | +++ app/icbhi_sprs/models.py 2024-05-14 10:01:31.634951532 +0900 155 | @@ -1,3 +1,5 @@ 156 | +import sys 157 | +sys.path.append('../..') 158 | import torch 159 | import torch.nn as nn 160 | import torch.nn.functional as F 161 | @@ -288,8 +290,43 @@ 162 | def cnn14(**kwargs): 163 | return CNN14(**kwargs) 164 | 165 | + 166 | +from m2d.runtime_audio import RuntimeM2D, Config 167 | +class RT_LMS_M2D(RuntimeM2D): 168 | + def __init__(self, num_classes=4, embed_only=False, training_mask=0.0, weight_file='m2d_vit_base-80x608p16x16-220930-mr7/checkpoint-300.pth', freeze_embed=None, adjust_pos=False): 169 | + cfg = Config() 170 | + if adjust_pos: 171 | + cfg.dur_frames = 801 172 | + super().__init__(cfg=cfg, weight_file=weight_file, training_mask=training_mask, encoder_only=True, freeze_embed=freeze_embed) 173 | + self.embed_only = embed_only 174 | + if not embed_only: 175 | + self.linear = nn.Linear(self.cfg.feature_d, num_classes, bias=True) 176 | + # remove unneeded modules for encoding audio 177 | + #del self.backbone.decoder_blocks 178 | + #del self.backbone.target_blocks 179 | + self.accum_mean, self.accum_std = 0., 1. 
180 | + 181 | + def forward(self, features): 182 | + # def ema(old, new): 183 | + # alpha = 0.999 184 | + # return alpha*old + (1 - alpha)*new 185 | + # _mean, _std = features.mean(), features.std() 186 | + # self.accum_mean, self.accum_std = ema(self.accum_mean, _mean), ema(self.accum_std, _std) 187 | + # print(_mean, _std, self.accum_mean, self.accum_std) 188 | + x = self.encode_lms(features) # [128, 51, 3840] 189 | + x = torch.mean(x, dim=1) # [128, 768] 190 | + if self.embed_only: 191 | + return x 192 | + return self.linear(x) # [128, num_classes] 193 | + 194 | + 195 | +def m2d(**kwargs): 196 | + return RT_LMS_M2D(**kwargs) 197 | + 198 | + 199 | model_dict = { 200 | 'cnn6' : [cnn6, 512], 201 | 'cnn10' : [cnn10, 512], 202 | 'cnn14' : [cnn14, 2048], 203 | + 'm2d': [m2d, 768*5] 204 | } 205 | --- _org/app/args.py 2024-05-14 10:34:44.317784486 +0900 206 | +++ app/icbhi_sprs/args.py 2024-05-14 10:35:13.997945151 +0900 207 | @@ -19,7 +19,7 @@ 208 | 209 | #Data 210 | parser.add_argument("--dataset", type=str, default='ICBHI') # which dataset to use ['ICBHI', 'SPRS'] 211 | -parser.add_argument("--mode", type=str, default='inter') # for SPRS dataset, there are two test splits ['inter', 'intra'] 212 | +parser.add_argument("--appmode", type=str, default='inter') # for SPRS dataset, there are two test splits ['inter', 'intra'] 213 | parser.add_argument("--datapath", type=str, default='data/ICBHI') # path of the dataset files 214 | parser.add_argument("--metadata", type=str, default='metadata.csv') #metadata file 215 | parser.add_argument("--metalabel", type=str, default='sa') #meta label used for mscl, 's' stands for sex, 'a' for age, and 'c' for respiratory class 216 | @@ -44,4 +44,10 @@ 217 | parser.add_argument("--alpha", type=float, default=0.5) #tradeoff between cross entropy and nt xent 218 | parser.add_argument("--lam", type=float, default=0.75) #tradeoff between scl label and scl metadata 219 | 220 | -args = parser.parse_args() 221 | \ ファイル末尾に改行がありません 222 | +#M2D 223 | +parser.add_argument("--freeze_embed", action='store_true') #freeze ViT embedding layer 224 | +parser.add_argument("--adjust_pos", action='store_true') #adjust positional embedding length 225 | +parser.add_argument("--split_iter", type=int, default=1) #for a low-memory run, split actual batch size by this number 226 | + 227 | + 228 | +args = parser.parse_args() 229 | --- _org/app/ce.py 2024-05-14 10:34:44.317784486 +0900 230 | +++ app/icbhi_sprs/ce.py 2024-05-14 10:35:17.561964443 +0900 231 | @@ -2,35 +2,42 @@ 232 | import torch 233 | from args import args 234 | 235 | -def train_epoch(model, train_loader, train_transform, criterion, optimizer, scheduler): 236 | +def train_epoch(model, train_loader, train_transform, criterion, optimizer, scheduler, n_classes, K=1): 237 | 238 | - TP = [0, 0, 0 ,0] 239 | - GT = [0, 0, 0, 0] 240 | + TP = [0 for _ in range(n_classes)] 241 | + GT = [0 for _ in range(n_classes)] 242 | 243 | epoch_loss = 0.0 244 | 245 | model.train() 246 | 247 | - for data, target, _ in train_loader: 248 | - data, target = data.to(args.device), target.to(args.device) 249 | + for batch_data, batch_target, _ in train_loader: 250 | + batch_data, batch_target = batch_data.to(args.device), batch_target.to(args.device) 251 | 252 | with torch.no_grad(): 253 | - data_t = train_transform(data) 254 | + batch_data_t = train_transform(batch_data) 255 | 256 | optimizer.zero_grad() 257 | 258 | - output = model(data_t) 259 | - loss = criterion(output, target) 260 | + L = len(batch_data_t) 261 | + D = L // K 262 | + for i in 
range(K): 263 | + data = batch_data_t[i*D:(i+1)*D] 264 | + target = batch_target[i*D:(i+1)*D] 265 | + 266 | + output = model(data) 267 | + loss = criterion(output, target) 268 | 269 | - epoch_loss += loss.item() 270 | + epoch_loss += loss.item() 271 | 272 | - _, labels_predicted = torch.max(output, dim=1) 273 | + _, labels_predicted = torch.max(output, dim=1) 274 | 275 | - for idx in range(len(TP)): 276 | - TP[idx] += torch.logical_and((labels_predicted==idx),(target==idx)).sum().item() 277 | - GT[idx] += (target==idx).sum().item() 278 | + for idx in range(len(TP)): 279 | + TP[idx] += torch.logical_and((labels_predicted==idx),(target==idx)).sum().item() 280 | + GT[idx] += (target==idx).sum().item() 281 | 282 | - loss.backward() 283 | + loss.backward() 284 | + 285 | optimizer.step() 286 | 287 | scheduler.step() 288 | @@ -43,10 +50,10 @@ 289 | 290 | return epoch_loss, se, sp, icbhi_score, acc 291 | 292 | -def val_epoch(model, val_loader, val_transform, criterion): 293 | +def val_epoch(model, val_loader, val_transform, criterion, n_classes, K=1): 294 | 295 | - TP = [0, 0, 0 ,0] 296 | - GT = [0, 0, 0, 0] 297 | + TP = [0 for _ in range(n_classes)] 298 | + GT = [0 for _ in range(n_classes)] 299 | 300 | epoch_loss = 0.0 301 | 302 | @@ -54,18 +61,24 @@ 303 | 304 | with torch.no_grad(): 305 | 306 | - for data, target, _ in val_loader: 307 | - data, target = data.to(args.device), target.to(args.device) 308 | + for batch_data, batch_target, _ in val_loader: 309 | + batch_data, batch_target = batch_data.to(args.device), batch_target.to(args.device) 310 | 311 | - output = model(val_transform(data)) 312 | - loss = criterion(output, target) 313 | - epoch_loss += loss.item() 314 | - 315 | - _, labels_predicted = torch.max(output, dim=1) 316 | - 317 | - for idx in range(len(TP)): 318 | - TP[idx] += torch.logical_and((labels_predicted==idx),(target==idx)).sum().item() 319 | - GT[idx] += (target==idx).sum().item() 320 | + L = len(batch_data) 321 | + D = L // K 322 | + for i in range(K): 323 | + data = batch_data[i*D:(i+1)*D] 324 | + target = batch_target[i*D:(i+1)*D] 325 | + 326 | + output = model(val_transform(data)) 327 | + loss = criterion(output, target) 328 | + epoch_loss += loss.item() 329 | + 330 | + _, labels_predicted = torch.max(output, dim=1) 331 | + 332 | + for idx in range(len(TP)): 333 | + TP[idx] += torch.logical_and((labels_predicted==idx),(target==idx)).sum().item() 334 | + GT[idx] += (target==idx).sum().item() 335 | 336 | 337 | epoch_loss = epoch_loss / len(val_loader) 338 | @@ -76,7 +89,7 @@ 339 | 340 | return epoch_loss, se, sp, icbhi_score, acc 341 | 342 | -def train_ce(model, train_loader, val_loader, train_transform, val_transform, criterion, optimizer, epochs, scheduler): 343 | +def train_ce(model, train_loader, val_loader, train_transform, val_transform, criterion, optimizer, epochs, scheduler, n_classes, K=1): 344 | 345 | train_losses = []; val_losses = []; train_se_scores = []; train_sp_scores = []; train_icbhi_scores = []; train_acc_scores = []; val_se_scores = []; val_sp_scores = []; val_icbhi_scores = []; val_acc_scores = [] 346 | 347 | @@ -86,16 +99,17 @@ 348 | best_sp = 0 349 | best_epoch_acc = 0 350 | best_epoch_icbhi = 0 351 | + best_weight = None 352 | 353 | for i in range(1, epochs+1): 354 | 355 | print(f"Epoch {i}") 356 | 357 | - train_loss, train_se, train_sp, train_icbhi_score, train_acc = train_epoch(model, train_loader, train_transform, criterion, optimizer, scheduler) 358 | + train_loss, train_se, train_sp, train_icbhi_score, train_acc = train_epoch(model, 
train_loader, train_transform, criterion, optimizer, scheduler, n_classes, K) 359 | train_losses.append(train_loss); train_se_scores.append(train_se); train_sp_scores.append(train_sp); train_icbhi_scores.append(train_icbhi_score); train_acc_scores.append(train_acc) 360 | print(f"Train loss : {format(train_loss, '.4f')}\tTrain SE : {format(train_se, '.4f')}\tTrain SP : {format(train_sp, '.4f')}\tTrain Score : {format(train_icbhi_score, '.4f')}\tTrain Acc : {format(train_acc, '.4f')}") 361 | 362 | - val_loss, val_se, val_sp, val_icbhi_score, val_acc = val_epoch(model, val_loader, val_transform, criterion) 363 | + val_loss, val_se, val_sp, val_icbhi_score, val_acc = val_epoch(model, val_loader, val_transform, criterion, n_classes, K) 364 | val_losses.append(val_loss); val_se_scores.append(val_se); val_sp_scores.append(val_sp); val_icbhi_scores.append(val_icbhi_score); val_acc_scores.append(val_acc) 365 | print(f"Val loss : {format(val_loss, '.4f')}\tVal SE : {format(val_se, '.4f')}\tVal SP : {format(val_sp, '.4f')}\tVal Score : {format(val_icbhi_score, '.4f')}\tVal Acc : {format(val_acc, '.4f')}") 366 | 367 | @@ -112,11 +126,15 @@ 368 | best_icbhi_score = val_icbhi_score 369 | best_se = val_se 370 | best_sp = val_sp 371 | + best_weight = {k: v.cpu() for k, v in model.state_dict().items()} 372 | 373 | if best_val_acc < val_acc: 374 | best_epoch_acc = i 375 | best_val_acc = val_acc 376 | 377 | - print(f"best icbhi score is {format(best_icbhi_score, '.4f')} (se:{format(best_se, '.4f')} sp:{format(best_sp, '.4f')}) at epoch {best_epoch_icbhi}") 378 | + print(f"Val loss : {format(val_loss, '.4f')}\tVal SE : {format(val_se, '.4f')}\tVal SP : {format(val_sp, '.4f')}\tVal Score : {format(val_icbhi_score, '.4f')}\tVal Acc : {format(val_acc, '.4f')} best_icbhi_score so far: {format(best_icbhi_score, '.4f')}") 379 | + 380 | + report = f"best icbhi score is {format(best_icbhi_score, '.4f')} (se:{format(best_se, '.4f')} sp:{format(best_sp, '.4f')}) at epoch {best_epoch_icbhi}" 381 | + print(report) 382 | 383 | - return train_losses, val_losses, train_se_scores, train_sp_scores, train_icbhi_scores, train_acc_scores, val_se_scores, val_sp_scores, val_icbhi_scores, val_acc_scores 384 | \ ファイル末尾に改行がありません 385 | + return report, (best_sp, best_se, best_icbhi_score, best_weight), train_losses, val_losses, train_se_scores, train_sp_scores, train_icbhi_scores, train_acc_scores, val_se_scores, val_sp_scores, val_icbhi_scores, val_acc_scores 386 | --------------------------------------------------------------------------------