├── .gitattributes
├── README.md
├── config
│   ├── magna
│   │   ├── east_final_openl3.json
│   │   ├── east_final_pann.json
│   │   ├── east_final_passt.json
│   │   ├── east_final_vggish.json
│   │   ├── east_kd_openl3.json
│   │   ├── east_kd_pann.json
│   │   ├── east_kd_passt.json
│   │   ├── east_kd_vggish.json
│   │   ├── lr_openl3.json
│   │   ├── lr_pann.json
│   │   ├── lr_passt.json
│   │   └── lr_vggish.json
│   └── openmic
│       ├── east_final_openl3.json
│       ├── east_final_pann.json
│       ├── east_final_passt.json
│       ├── east_final_vggish.json
│       ├── east_kd_openl3.json
│       ├── east_kd_pann.json
│       ├── east_kd_passt.json
│       ├── east_kd_vggish.json
│       ├── lr_openl3.json
│       ├── lr_pann.json
│       ├── lr_passt.json
│       └── lr_vggish.json
├── data
│   ├── __init__.py
│   ├── datamodule.py
│   ├── dataset.py
│   └── featurizer
│       ├── build_featurizer.py
│       ├── log_mel_featurizer.py
│       └── waveform_featurizer.py
├── eval_model.py
├── magna_test.json
├── magna_train.json
├── magna_valid.json
├── models
│   ├── __init__.py
│   ├── basemodels
│   │   ├── __init__.py
│   │   ├── cp_resnet.py
│   │   ├── mobile_fcn.py
│   │   └── pretrained_feature_classifier.py
│   ├── metrics
│   │   ├── __init__.py
│   │   └── metrics.py
│   └── reg_loss_fn
│       ├── __init__.py
│       ├── combine_dist_reg.py
│       ├── feature_space_reg.py
│       └── kd.py
├── openmic_test.json
├── openmic_train.json
├── openmic_valid.json
├── preprocess
│   ├── parse_magna.py
│   ├── parse_openmic.py
│   ├── pretrained_feature_extractor
│   │   ├── extract_openl3.py
│   │   ├── extract_passt.py
│   │   └── extract_vggish.py
│   ├── resample_openmic.py
│   ├── split_magna.py
│   └── split_openmic_train.py
├── train.py
├── work_dir_magna
│   ├── east_final_openl3
│   │   └── epoch=93-val_mAP=0.448.ckpt
│   ├── east_final_pann
│   │   └── epoch=82-val_mAP=0.450.ckpt
│   ├── east_final_passt
│   │   └── epoch=64-val_mAP=0.466.ckpt
│   └── east_final_vggish
│       └── epoch=67-val_mAP=0.451.ckpt
└── work_dir_openmic
    ├── east_final_openl3
    │   └── epoch=62-val_mAP=0.851.ckpt
    ├── east_final_pann
    │   └── epoch=90-val_mAP=0.860.ckpt
    ├── east_final_passt
    │   └── epoch=96-val_mAP=0.864.ckpt
    └── east_final_vggish
        └── epoch=75-val_mAP=0.854.ckpt

/.gitattributes:
--------------------------------------------------------------------------------
work_dir_openmic/lr_vggish/epoch=23-val_mAP=0.817.ckpt filter=lfs diff=lfs merge=lfs -text
work_dir_magna/east_final_pann/epoch=82-val_mAP=0.450.ckpt filter=lfs diff=lfs merge=lfs -text
work_dir_magna/lr_pann/epoch=39-val_mAP=0.442.ckpt filter=lfs diff=lfs merge=lfs -text
work_dir_magna/lr_passt/epoch=14-val_mAP=0.464.ckpt filter=lfs diff=lfs merge=lfs -text
work_dir_openmic/east_kd_openl3/epoch=70-val_mAP=0.847.ckpt filter=lfs diff=lfs merge=lfs -text
work_dir_openmic/east_kd_vggish/epoch=70-val_mAP=0.857.ckpt filter=lfs diff=lfs merge=lfs -text
work_dir_openmic/lr_passt/epoch=11-val_mAP=0.865.ckpt filter=lfs diff=lfs merge=lfs -text
work_dir_magna/east_final_openl3/epoch=93-val_mAP=0.448.ckpt filter=lfs diff=lfs merge=lfs -text
work_dir_openmic/east_final_pann/epoch=90-val_mAP=0.860.ckpt filter=lfs diff=lfs merge=lfs -text
work_dir_openmic/east_final_passt/epoch=96-val_mAP=0.864.ckpt filter=lfs diff=lfs merge=lfs -text
work_dir_openmic/east_final_vggish/epoch=75-val_mAP=0.854.ckpt filter=lfs diff=lfs merge=lfs -text
work_dir_magna/east_final_passt/epoch=64-val_mAP=0.466.ckpt filter=lfs diff=lfs merge=lfs -text
work_dir_magna/lr_openl3/epoch=26-val_mAP=0.417.ckpt filter=lfs diff=lfs merge=lfs -text
work_dir_openmic/lr_openl3/epoch=16-val_mAP=0.808.ckpt filter=lfs diff=lfs merge=lfs -text
work_dir_magna/east_final_vggish/epoch=67-val_mAP=0.451.ckpt filter=lfs diff=lfs merge=lfs -text
work_dir_magna/lr_vggish/epoch=82-val_mAP=0.414.ckpt filter=lfs diff=lfs merge=lfs -text
work_dir_openmic/east_final_openl3/epoch=62-val_mAP=0.851.ckpt filter=lfs diff=lfs merge=lfs -text
work_dir_openmic/lr_pann/epoch=5-val_mAP=0.865.ckpt filter=lfs diff=lfs merge=lfs -text
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Audio Embeddings As Teachers for Music Classification
Official Implementation
--------------------------------------------------------------------------------
/config/magna/east_final_openl3.json:
--------------------------------------------------------------------------------
{
    "seed": 1223,
    "data":
    {
        "train_manifest_path": "magna_train.json",
        "val_manifest_path": "magna_valid.json",
        "test_manifest_path": "magna_test.json",
        "requires_vggish": false,
        "requires_openl3": true,
        "requires_passt": false,
        "requires_pann": false,

        "mixup": 0.0,

        "featurizer":{
            "name": "log_mel_featurizer",
            "args": {
                "n_fft": 1024,
                "hop_length": 320,
                "win_length": 800,
                "sr": 16000,
                "n_mels": 96,

                "fmin": 0,
                "fmax": 8000,
                "fmin_aug_range": 0,
                "fmax_aug_range": 0,

                "normalize_mean": -3.7,
                "normalize_std": 3.1,

                "freqm": 0,
                "timem": 0
            }
        },
        "batch_size": 16,
        "train_shuffle": true,
        "num_workers": 16
    },

    "model":
    {
        "backbone":{
            "feature": "openl3",
            "student": {
                "name": "MobileFCN",
                "ckpt": null,
                "args": {
                    "n_mels": 96,
                    "num_classes": 50
                }
            },
            "teacher": {
                "name": "PretrainedFeatureClassifier",
                "ckpt": "work_dir_magna/lr_openl3/epoch=26-val_mAP=0.417.ckpt",
                "args": {
                    "input_dim": 512,
                    "num_classes": 50
                }
            },
            "reg_loss_weight": 0.6,
            "regularization":{
                "name": "FeatureSpaceRegularizationLoss",
                "args": {
                    "mode": "distance_correlation",
                    "stages_args": {
                        "1": {"weight": 0.0, "student_expand": 1, "teacher_expand": 1},
                        "2": {"weight": 0.0, "student_expand": 2, "teacher_expand": 1},
                        "3": {"weight": 1.0, "student_expand": 2, "teacher_expand": 1}
                    }
                }
            }
        },

        "optim":{
            "optimizer": {
                "name": "Adam",
                "args": {
                    "lr": 0.0001,
                    "weight_decay": 0.0001
                }
            },
            "scheduler": {
                "name": "ReduceLROnPlateau",
                "monitor": "val/loss/total",
                "args": {
                    "factor": 0.2,
                    "patience": 5
                }
            }
        }
    },

    "trainer":
    {
        "args":
        {
            "max_epochs": -1,
            "max_steps": 100000,
            "gradient_clip_val": 1.0,
            "num_sanity_val_steps": -1,

            "accelerator": "gpu",
            "devices": [0],
            "deterministic": true,

            "check_val_every_n_epoch": 1,
            "log_every_n_steps": 50
        },

        "logger":
        {
            "save_dir": "work_dir_magna/east_final_openl3",
            "name": "log"
        },

        "checkpoint":
        {
            "dirpath": "work_dir_magna/east_final_openl3",
            "filename": "epoch={epoch}-val_mAP={val/mAP:.3f}",
            "auto_insert_metric_name": false,

            "monitor": "val/mAP",
            "mode": "max",
            "every_n_epochs": 1,
            "save_top_k": 1
        },

        "early_stopping":
        {
            "monitor": "val/loss/total",
            "mode": "min",
            "patience": 12
        }
    }
}
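Note on the config above: the student (`MobileFCN`, trained on 96-bin log-mels) is pulled toward frozen OpenL3 clip embeddings through the `FeatureSpaceRegularizationLoss` entry, with all per-stage weight on stage 3 and the whole term blended in via `"reg_loss_weight": 0.6`. The `"distance_correlation"` mode compares batch-level distance structure rather than raw activations, so student and teacher features may have different dimensionalities (512-d OpenL3 here; 768-d PaSST, 2048-d PANN, and 128-d VGGish in the sibling configs). Below is a minimal sketch of such a distance-correlation penalty, assuming each clip is flattened to a single vector; the function names are illustrative, and the authoritative implementation is `models/reg_loss_fn/feature_space_reg.py`.

```python
# Illustrative sketch only -- NOT the repository's FeatureSpaceRegularizationLoss.
import torch

def _double_centered_dists(x: torch.Tensor) -> torch.Tensor:
    """Pairwise Euclidean distances of (batch, dim) features, double-centered."""
    d = torch.cdist(x, x, p=2)
    return d - d.mean(dim=0, keepdim=True) - d.mean(dim=1, keepdim=True) + d.mean()

def distance_correlation_loss(student_feats: torch.Tensor,
                              teacher_feats: torch.Tensor,
                              eps: float = 1e-8) -> torch.Tensor:
    """Return 1 - dCorr(student, teacher); low when the batch geometries agree."""
    a = _double_centered_dists(student_feats.flatten(start_dim=1))
    b = _double_centered_dists(teacher_feats.flatten(start_dim=1))
    dcov2 = (a * b).mean().clamp(min=0.0)   # squared distance covariance
    dvar_a = (a * a).mean().sqrt()          # distance variance, student side
    dvar_b = (b * b).mean().sqrt()          # distance variance, teacher side
    dcorr = dcov2.sqrt() / (dvar_a * dvar_b).clamp(min=eps).sqrt()
    return 1.0 - dcorr

# Blending with the classification loss; the additive form is an assumption,
# only the 0.6 comes from "reg_loss_weight" in the config above:
# total_loss = bce_loss + 0.6 * distance_correlation_loss(stage3_feats, openl3_emb)
```

The `student_expand`/`teacher_expand` factors in `stages_args` presumably repeat one side along time so both feature sequences have matching lengths before the comparison; the sketch sidesteps that by flattening each clip to one vector.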
-------------------------------------------------------------------------------- /config/magna/east_final_pann.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 1223, 3 | "data": 4 | { 5 | "train_manifest_path": "magna_train.json", 6 | "val_manifest_path": "magna_valid.json", 7 | "test_manifest_path": "magna_test.json", 8 | "requires_vggish": false, 9 | "requires_openl3": false, 10 | "requires_passt": false, 11 | "requires_pann": true, 12 | 13 | "mixup": 0.0, 14 | 15 | "featurizer":{ 16 | "name": "log_mel_featurizer", 17 | "args": { 18 | "n_fft": 1024, 19 | "hop_length": 320, 20 | "win_length": 800, 21 | "sr": 16000, 22 | "n_mels": 96, 23 | 24 | "fmin": 0, 25 | "fmax": 8000, 26 | "fmin_aug_range": 0, 27 | "fmax_aug_range": 0, 28 | 29 | "normalize_mean": -3.7, 30 | "normalize_std": 3.1, 31 | 32 | "freqm": 0, 33 | "timem": 0 34 | } 35 | }, 36 | "batch_size": 16, 37 | "train_shuffle": true, 38 | "num_workers": 16 39 | }, 40 | 41 | "model": 42 | { 43 | "backbone":{ 44 | "feature": "pann", 45 | "student": { 46 | "name": "MobileFCN", 47 | "ckpt": null, 48 | "args": { 49 | "n_mels": 96, 50 | "num_classes": 50 51 | } 52 | }, 53 | "teacher": { 54 | "name": "PretrainedFeatureClassifier", 55 | "ckpt": "work_dir_magna/lr_pann/epoch=39-val_mAP=0.442.ckpt", 56 | "args": { 57 | "input_dim": 2048, 58 | "num_classes": 50 59 | } 60 | }, 61 | "reg_loss_weight": 0.6, 62 | "regularization":{ 63 | "name": "FeatureSpaceRegularizationLoss", 64 | "args": { 65 | "mode": "distance_correlation", 66 | "stages_args": { 67 | "1": {"weight": 0.0, "student_expand": 1, "teacher_expand": 30}, 68 | "2": {"weight": 0.0, "student_expand": 1, "teacher_expand": 15}, 69 | "3": {"weight": 1.0, "student_expand": 1, "teacher_expand": 15} 70 | } 71 | } 72 | } 73 | }, 74 | 75 | "optim":{ 76 | "optimizer": { 77 | "name": "Adam", 78 | "args": { 79 | "lr": 0.0001, 80 | "weight_decay": 0.0001 81 | } 82 | }, 83 | "scheduler": { 84 | "name": "ReduceLROnPlateau", 85 | "monitor": "val/loss/total", 86 | "args": { 87 | "factor": 0.2, 88 | "patience": 5 89 | } 90 | } 91 | } 92 | }, 93 | 94 | "trainer": 95 | { 96 | "args": 97 | { 98 | "max_epochs": -1, 99 | "max_steps": 100000, 100 | "gradient_clip_val": 1.0, 101 | "num_sanity_val_steps": -1, 102 | 103 | "accelerator": "gpu", 104 | "devices": [0], 105 | "deterministic": true, 106 | 107 | "check_val_every_n_epoch": 1, 108 | "log_every_n_steps": 50 109 | }, 110 | 111 | "logger": 112 | { 113 | "save_dir": "work_dir_magna/east_final_pann", 114 | "name": "log" 115 | }, 116 | 117 | "checkpoint": 118 | { 119 | "dirpath": "work_dir_magna/east_final_pann", 120 | "filename": "epoch={epoch}-val_mAP={val/mAP:.3f}", 121 | "auto_insert_metric_name": false, 122 | 123 | "monitor": "val/mAP", 124 | "mode": "max", 125 | "every_n_epochs": 1, 126 | "save_top_k": 1 127 | }, 128 | 129 | "early_stopping": 130 | { 131 | "monitor": "val/loss/total", 132 | "mode": "min", 133 | "patience": 12 134 | } 135 | } 136 | } -------------------------------------------------------------------------------- /config/magna/east_final_passt.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 1223, 3 | "data": 4 | { 5 | "train_manifest_path": "magna_train.json", 6 | "val_manifest_path": "magna_valid.json", 7 | "test_manifest_path": "magna_test.json", 8 | "requires_vggish": false, 9 | "requires_openl3": false, 10 | "requires_passt": true, 11 | "requires_pann": false, 12 | 13 | "mixup": 0.0, 14 | 15 | "featurizer":{ 16 | "name": 
"log_mel_featurizer", 17 | "args": { 18 | "n_fft": 1024, 19 | "hop_length": 320, 20 | "win_length": 800, 21 | "sr": 16000, 22 | "n_mels": 96, 23 | 24 | "fmin": 0, 25 | "fmax": 8000, 26 | "fmin_aug_range": 0, 27 | "fmax_aug_range": 0, 28 | 29 | "normalize_mean": -3.7, 30 | "normalize_std": 3.1, 31 | 32 | "freqm": 0, 33 | "timem": 0 34 | } 35 | }, 36 | "batch_size": 16, 37 | "train_shuffle": true, 38 | "num_workers": 16 39 | }, 40 | 41 | "model": 42 | { 43 | "backbone":{ 44 | "feature": "passt", 45 | "student": { 46 | "name": "MobileFCN", 47 | "ckpt": null, 48 | "args": { 49 | "n_mels": 96, 50 | "num_classes": 50 51 | } 52 | }, 53 | "teacher": { 54 | "name": "PretrainedFeatureClassifier", 55 | "ckpt": "work_dir_magna/lr_passt/epoch=14-val_mAP=0.464.ckpt", 56 | "args": { 57 | "input_dim": 768, 58 | "num_classes": 50 59 | } 60 | }, 61 | "reg_loss_weight": 0.6, 62 | "regularization":{ 63 | "name": "FeatureSpaceRegularizationLoss", 64 | "args": { 65 | "mode": "distance_correlation", 66 | "stages_args": { 67 | "1": {"weight": 0.0, "student_expand": 1, "teacher_expand": 1}, 68 | "2": {"weight": 0.0, "student_expand": 2, "teacher_expand": 1}, 69 | "3": {"weight": 1.0, "student_expand": 2, "teacher_expand": 1} 70 | } 71 | } 72 | } 73 | }, 74 | 75 | "optim":{ 76 | "optimizer": { 77 | "name": "Adam", 78 | "args": { 79 | "lr": 0.0001, 80 | "weight_decay": 0.0001 81 | } 82 | }, 83 | "scheduler": { 84 | "name": "ReduceLROnPlateau", 85 | "monitor": "val/loss/total", 86 | "args": { 87 | "factor": 0.2, 88 | "patience": 5 89 | } 90 | } 91 | } 92 | }, 93 | 94 | "trainer": 95 | { 96 | "args": 97 | { 98 | "max_epochs": -1, 99 | "max_steps": 100000, 100 | "gradient_clip_val": 1.0, 101 | "num_sanity_val_steps": -1, 102 | 103 | "accelerator": "gpu", 104 | "devices": [0], 105 | "deterministic": true, 106 | 107 | "check_val_every_n_epoch": 1, 108 | "log_every_n_steps": 50 109 | }, 110 | 111 | "logger": 112 | { 113 | "save_dir": "work_dir_magna/east_final_passt", 114 | "name": "log" 115 | }, 116 | 117 | "checkpoint": 118 | { 119 | "dirpath": "work_dir_magna/east_final_passt", 120 | "filename": "epoch={epoch}-val_mAP={val/mAP:.3f}", 121 | "auto_insert_metric_name": false, 122 | 123 | "monitor": "val/mAP", 124 | "mode": "max", 125 | "every_n_epochs": 1, 126 | "save_top_k": 1 127 | }, 128 | 129 | "early_stopping": 130 | { 131 | "monitor": "val/loss/total", 132 | "mode": "min", 133 | "patience": 12 134 | } 135 | } 136 | } -------------------------------------------------------------------------------- /config/magna/east_final_vggish.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 1223, 3 | "data": 4 | { 5 | "train_manifest_path": "magna_train.json", 6 | "val_manifest_path": "magna_valid.json", 7 | "test_manifest_path": "magna_test.json", 8 | "requires_vggish": true, 9 | "requires_openl3": false, 10 | "requires_passt": false, 11 | "requires_pann": false, 12 | 13 | "mixup": 0.0, 14 | 15 | "featurizer":{ 16 | "name": "log_mel_featurizer", 17 | "args": { 18 | "n_fft": 1024, 19 | "hop_length": 320, 20 | "win_length": 800, 21 | "sr": 16000, 22 | "n_mels": 96, 23 | 24 | "fmin": 0, 25 | "fmax": 8000, 26 | "fmin_aug_range": 0, 27 | "fmax_aug_range": 0, 28 | 29 | "normalize_mean": -3.7, 30 | "normalize_std": 3.1, 31 | 32 | "freqm": 0, 33 | "timem": 0 34 | } 35 | }, 36 | "batch_size": 16, 37 | "train_shuffle": true, 38 | "num_workers": 16 39 | }, 40 | 41 | "model": 42 | { 43 | "backbone":{ 44 | "feature": "vggish", 45 | "student": { 46 | "name": "MobileFCN", 47 | "ckpt": 
null, 48 | "args": { 49 | "n_mels": 96, 50 | "num_classes": 50 51 | } 52 | }, 53 | "teacher": { 54 | "name": "PretrainedFeatureClassifier", 55 | "ckpt": "work_dir_magna/lr_vggish/epoch=82-val_mAP=0.414.ckpt", 56 | "args": { 57 | "input_dim": 128, 58 | "num_classes": 50 59 | } 60 | }, 61 | "reg_loss_weight": 0.6, 62 | "regularization":{ 63 | "name": "FeatureSpaceRegularizationLoss", 64 | "args": { 65 | "mode": "distance_correlation", 66 | "stages_args": { 67 | "1": {"weight": 0.0, "student_expand": 1, "teacher_expand": 1}, 68 | "2": {"weight": 0.0, "student_expand": 2, "teacher_expand": 1}, 69 | "3": {"weight": 1.0, "student_expand": 2, "teacher_expand": 1} 70 | } 71 | } 72 | } 73 | }, 74 | 75 | "optim":{ 76 | "optimizer": { 77 | "name": "Adam", 78 | "args": { 79 | "lr": 0.0001, 80 | "weight_decay": 0.0001 81 | } 82 | }, 83 | "scheduler": { 84 | "name": "ReduceLROnPlateau", 85 | "monitor": "val/loss/total", 86 | "args": { 87 | "factor": 0.2, 88 | "patience": 5 89 | } 90 | } 91 | } 92 | }, 93 | 94 | "trainer": 95 | { 96 | "args": 97 | { 98 | "max_epochs": -1, 99 | "max_steps": 100000, 100 | "gradient_clip_val": 1.0, 101 | "num_sanity_val_steps": -1, 102 | 103 | "accelerator": "gpu", 104 | "devices": [0], 105 | "deterministic": true, 106 | 107 | "check_val_every_n_epoch": 1, 108 | "log_every_n_steps": 50 109 | }, 110 | 111 | "logger": 112 | { 113 | "save_dir": "work_dir_magna/east_final_vggish", 114 | "name": "log" 115 | }, 116 | 117 | "checkpoint": 118 | { 119 | "dirpath": "work_dir_magna/east_final_vggish", 120 | "filename": "epoch={epoch}-val_mAP={val/mAP:.3f}", 121 | "auto_insert_metric_name": false, 122 | 123 | "monitor": "val/mAP", 124 | "mode": "max", 125 | "every_n_epochs": 1, 126 | "save_top_k": 1 127 | }, 128 | 129 | "early_stopping": 130 | { 131 | "monitor": "val/loss/total", 132 | "mode": "min", 133 | "patience": 12 134 | } 135 | } 136 | } -------------------------------------------------------------------------------- /config/magna/east_kd_openl3.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 1223, 3 | "data": 4 | { 5 | "train_manifest_path": "magna_train.json", 6 | "val_manifest_path": "magna_valid.json", 7 | "test_manifest_path": "magna_test.json", 8 | "requires_vggish": false, 9 | "requires_openl3": true, 10 | "requires_passt": false, 11 | "requires_pann": false, 12 | 13 | "mixup": 0.0, 14 | 15 | "featurizer":{ 16 | "name": "log_mel_featurizer", 17 | "args": { 18 | "n_fft": 1024, 19 | "hop_length": 320, 20 | "win_length": 800, 21 | "sr": 16000, 22 | "n_mels": 96, 23 | 24 | "fmin": 0, 25 | "fmax": 8000, 26 | "fmin_aug_range": 0, 27 | "fmax_aug_range": 0, 28 | 29 | "normalize_mean": -3.7, 30 | "normalize_std": 3.1, 31 | 32 | "freqm": 0, 33 | "timem": 0 34 | } 35 | }, 36 | "batch_size": 16, 37 | "train_shuffle": true, 38 | "num_workers": 16 39 | }, 40 | 41 | "model": 42 | { 43 | "backbone":{ 44 | "feature": "openl3", 45 | "student": { 46 | "name": "MobileFCN", 47 | "ckpt": null, 48 | "args": { 49 | "n_mels": 96, 50 | "num_classes": 50 51 | } 52 | }, 53 | "teacher": { 54 | "name": "PretrainedFeatureClassifier", 55 | "ckpt": "work_dir_magna/lr_openl3/epoch=26-val_mAP=0.417.ckpt", 56 | "args": { 57 | "input_dim": 512, 58 | "num_classes": 50 59 | } 60 | }, 61 | "reg_loss_weight": 0.6, 62 | "regularization":{ 63 | "name": "CombineDistRegLoss", 64 | "args": { 65 | "feature_mode": "distance_correlation", 66 | "feature_stages_args": { 67 | "1": {"weight": 0.0, "student_expand": 1, "teacher_expand": 1}, 68 | "2": {"weight": 0.0, 
"student_expand": 2, "teacher_expand": 1}, 69 | "3": {"weight": 1.0, "student_expand": 2, "teacher_expand": 1} 70 | }, 71 | "label_mode": "soft", 72 | "label_tau": 2.0, 73 | "kd_weight": 0.3 74 | } 75 | } 76 | }, 77 | 78 | "optim":{ 79 | "optimizer": { 80 | "name": "Adam", 81 | "args": { 82 | "lr": 0.0001, 83 | "weight_decay": 0.0001 84 | } 85 | }, 86 | "scheduler": { 87 | "name": "ReduceLROnPlateau", 88 | "monitor": "val/loss/total", 89 | "args": { 90 | "factor": 0.2, 91 | "patience": 5 92 | } 93 | } 94 | } 95 | }, 96 | 97 | "trainer": 98 | { 99 | "args": 100 | { 101 | "max_epochs": -1, 102 | "max_steps": 100000, 103 | "gradient_clip_val": 1.0, 104 | "num_sanity_val_steps": -1, 105 | 106 | "accelerator": "gpu", 107 | "devices": [0], 108 | "deterministic": true, 109 | 110 | "check_val_every_n_epoch": 1, 111 | "log_every_n_steps": 50 112 | }, 113 | 114 | "logger": 115 | { 116 | "save_dir": "work_dir_magna/east_kd_openl3", 117 | "name": "log" 118 | }, 119 | 120 | "checkpoint": 121 | { 122 | "dirpath": "work_dir_magna/east_kd_openl3", 123 | "filename": "epoch={epoch}-val_mAP={val/mAP:.3f}", 124 | "auto_insert_metric_name": false, 125 | 126 | "monitor": "val/mAP", 127 | "mode": "max", 128 | "every_n_epochs": 1, 129 | "save_top_k": 1 130 | }, 131 | 132 | "early_stopping": 133 | { 134 | "monitor": "val/loss/total", 135 | "mode": "min", 136 | "patience": 12 137 | } 138 | } 139 | } -------------------------------------------------------------------------------- /config/magna/east_kd_pann.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 1223, 3 | "data": 4 | { 5 | "train_manifest_path": "magna_train.json", 6 | "val_manifest_path": "magna_valid.json", 7 | "test_manifest_path": "magna_test.json", 8 | "requires_vggish": false, 9 | "requires_openl3": false, 10 | "requires_passt": false, 11 | "requires_pann": true, 12 | 13 | "mixup": 0.0, 14 | 15 | "featurizer":{ 16 | "name": "log_mel_featurizer", 17 | "args": { 18 | "n_fft": 1024, 19 | "hop_length": 320, 20 | "win_length": 800, 21 | "sr": 16000, 22 | "n_mels": 96, 23 | 24 | "fmin": 0, 25 | "fmax": 8000, 26 | "fmin_aug_range": 0, 27 | "fmax_aug_range": 0, 28 | 29 | "normalize_mean": -3.7, 30 | "normalize_std": 3.1, 31 | 32 | "freqm": 0, 33 | "timem": 0 34 | } 35 | }, 36 | "batch_size": 16, 37 | "train_shuffle": true, 38 | "num_workers": 16 39 | }, 40 | 41 | "model": 42 | { 43 | "backbone":{ 44 | "feature": "pann", 45 | "student": { 46 | "name": "MobileFCN", 47 | "ckpt": null, 48 | "args": { 49 | "n_mels": 96, 50 | "num_classes": 50 51 | } 52 | }, 53 | "teacher": { 54 | "name": "PretrainedFeatureClassifier", 55 | "ckpt": "work_dir_magna/lr_pann/epoch=39-val_mAP=0.442.ckpt", 56 | "args": { 57 | "input_dim": 2048, 58 | "num_classes": 50 59 | } 60 | }, 61 | "reg_loss_weight": 0.6, 62 | "regularization":{ 63 | "name": "CombineDistRegLoss", 64 | "args": { 65 | "feature_mode": "distance_correlation", 66 | "feature_stages_args": { 67 | "1": {"weight": 0.0, "student_expand": 1, "teacher_expand": 30}, 68 | "2": {"weight": 0.0, "student_expand": 1, "teacher_expand": 15}, 69 | "3": {"weight": 1.0, "student_expand": 1, "teacher_expand": 15} 70 | }, 71 | "label_mode": "soft", 72 | "label_tau": 2.0, 73 | "kd_weight": 0.3 74 | } 75 | } 76 | }, 77 | 78 | "optim":{ 79 | "optimizer": { 80 | "name": "Adam", 81 | "args": { 82 | "lr": 0.0001, 83 | "weight_decay": 0.0001 84 | } 85 | }, 86 | "scheduler": { 87 | "name": "ReduceLROnPlateau", 88 | "monitor": "val/loss/total", 89 | "args": { 90 | "factor": 0.2, 91 | 
"patience": 5 92 | } 93 | } 94 | } 95 | }, 96 | 97 | "trainer": 98 | { 99 | "args": 100 | { 101 | "max_epochs": -1, 102 | "max_steps": 100000, 103 | "gradient_clip_val": 1.0, 104 | "num_sanity_val_steps": -1, 105 | 106 | "accelerator": "gpu", 107 | "devices": [0], 108 | "deterministic": true, 109 | 110 | "check_val_every_n_epoch": 1, 111 | "log_every_n_steps": 50 112 | }, 113 | 114 | "logger": 115 | { 116 | "save_dir": "work_dir_magna/east_kd_pann", 117 | "name": "log" 118 | }, 119 | 120 | "checkpoint": 121 | { 122 | "dirpath": "work_dir_magna/east_kd_pann", 123 | "filename": "epoch={epoch}-val_mAP={val/mAP:.3f}", 124 | "auto_insert_metric_name": false, 125 | 126 | "monitor": "val/mAP", 127 | "mode": "max", 128 | "every_n_epochs": 1, 129 | "save_top_k": 1 130 | }, 131 | 132 | "early_stopping": 133 | { 134 | "monitor": "val/loss/total", 135 | "mode": "min", 136 | "patience": 12 137 | } 138 | } 139 | } -------------------------------------------------------------------------------- /config/magna/east_kd_passt.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 1223, 3 | "data": 4 | { 5 | "train_manifest_path": "magna_train.json", 6 | "val_manifest_path": "magna_valid.json", 7 | "test_manifest_path": "magna_test.json", 8 | "requires_vggish": false, 9 | "requires_openl3": false, 10 | "requires_passt": true, 11 | "requires_pann": false, 12 | 13 | "mixup": 0.0, 14 | 15 | "featurizer":{ 16 | "name": "log_mel_featurizer", 17 | "args": { 18 | "n_fft": 1024, 19 | "hop_length": 320, 20 | "win_length": 800, 21 | "sr": 16000, 22 | "n_mels": 96, 23 | 24 | "fmin": 0, 25 | "fmax": 8000, 26 | "fmin_aug_range": 0, 27 | "fmax_aug_range": 0, 28 | 29 | "normalize_mean": -3.7, 30 | "normalize_std": 3.1, 31 | 32 | "freqm": 0, 33 | "timem": 0 34 | } 35 | }, 36 | "batch_size": 16, 37 | "train_shuffle": true, 38 | "num_workers": 16 39 | }, 40 | 41 | "model": 42 | { 43 | "backbone":{ 44 | "feature": "passt", 45 | "student": { 46 | "name": "MobileFCN", 47 | "ckpt": null, 48 | "args": { 49 | "n_mels": 96, 50 | "num_classes": 50 51 | } 52 | }, 53 | "teacher": { 54 | "name": "PretrainedFeatureClassifier", 55 | "ckpt": "work_dir_magna/lr_passt/epoch=14-val_mAP=0.464.ckpt", 56 | "args": { 57 | "input_dim": 768, 58 | "num_classes": 50 59 | } 60 | }, 61 | "reg_loss_weight": 0.6, 62 | "regularization":{ 63 | "name": "CombineDistRegLoss", 64 | "args": { 65 | "feature_mode": "distance_correlation", 66 | "feature_stages_args": { 67 | "1": {"weight": 0.0, "student_expand": 1, "teacher_expand": 1}, 68 | "2": {"weight": 0.0, "student_expand": 2, "teacher_expand": 1}, 69 | "3": {"weight": 1.0, "student_expand": 2, "teacher_expand": 1} 70 | }, 71 | "label_mode": "soft", 72 | "label_tau": 2.0, 73 | "kd_weight": 0.3 74 | } 75 | } 76 | }, 77 | 78 | "optim":{ 79 | "optimizer": { 80 | "name": "Adam", 81 | "args": { 82 | "lr": 0.0001, 83 | "weight_decay": 0.0001 84 | } 85 | }, 86 | "scheduler": { 87 | "name": "ReduceLROnPlateau", 88 | "monitor": "val/loss/total", 89 | "args": { 90 | "factor": 0.2, 91 | "patience": 5 92 | } 93 | } 94 | } 95 | }, 96 | 97 | "trainer": 98 | { 99 | "args": 100 | { 101 | "max_epochs": -1, 102 | "max_steps": 100000, 103 | "gradient_clip_val": 1.0, 104 | "num_sanity_val_steps": -1, 105 | 106 | "accelerator": "gpu", 107 | "devices": [0], 108 | "deterministic": true, 109 | 110 | "check_val_every_n_epoch": 1, 111 | "log_every_n_steps": 50 112 | }, 113 | 114 | "logger": 115 | { 116 | "save_dir": "work_dir_magna/east_kd_passt", 117 | "name": "log" 118 | }, 119 | 120 
| "checkpoint": 121 | { 122 | "dirpath": "work_dir_magna/east_kd_passt", 123 | "filename": "epoch={epoch}-val_mAP={val/mAP:.3f}", 124 | "auto_insert_metric_name": false, 125 | 126 | "monitor": "val/mAP", 127 | "mode": "max", 128 | "every_n_epochs": 1, 129 | "save_top_k": 1 130 | }, 131 | 132 | "early_stopping": 133 | { 134 | "monitor": "val/loss/total", 135 | "mode": "min", 136 | "patience": 12 137 | } 138 | } 139 | } -------------------------------------------------------------------------------- /config/magna/east_kd_vggish.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 1223, 3 | "data": 4 | { 5 | "train_manifest_path": "magna_train.json", 6 | "val_manifest_path": "magna_valid.json", 7 | "test_manifest_path": "magna_test.json", 8 | "requires_vggish": true, 9 | "requires_openl3": false, 10 | "requires_passt": false, 11 | "requires_pann": false, 12 | 13 | "mixup": 0.0, 14 | 15 | "featurizer":{ 16 | "name": "log_mel_featurizer", 17 | "args": { 18 | "n_fft": 1024, 19 | "hop_length": 320, 20 | "win_length": 800, 21 | "sr": 16000, 22 | "n_mels": 96, 23 | 24 | "fmin": 0, 25 | "fmax": 8000, 26 | "fmin_aug_range": 0, 27 | "fmax_aug_range": 0, 28 | 29 | "normalize_mean": -3.7, 30 | "normalize_std": 3.1, 31 | 32 | "freqm": 0, 33 | "timem": 0 34 | } 35 | }, 36 | "batch_size": 16, 37 | "train_shuffle": true, 38 | "num_workers": 16 39 | }, 40 | 41 | "model": 42 | { 43 | "backbone":{ 44 | "feature": "vggish", 45 | "student": { 46 | "name": "MobileFCN", 47 | "ckpt": null, 48 | "args": { 49 | "n_mels": 96, 50 | "num_classes": 50 51 | } 52 | }, 53 | "teacher": { 54 | "name": "PretrainedFeatureClassifier", 55 | "ckpt": "work_dir_magna/lr_vggish/epoch=82-val_mAP=0.414.ckpt", 56 | "args": { 57 | "input_dim": 128, 58 | "num_classes": 50 59 | } 60 | }, 61 | "reg_loss_weight": 0.6, 62 | "regularization":{ 63 | "name": "CombineDistRegLoss", 64 | "args": { 65 | "feature_mode": "distance_correlation", 66 | "feature_stages_args": { 67 | "1": {"weight": 0.0, "student_expand": 1, "teacher_expand": 1}, 68 | "2": {"weight": 0.0, "student_expand": 2, "teacher_expand": 1}, 69 | "3": {"weight": 1.0, "student_expand": 2, "teacher_expand": 1} 70 | }, 71 | "label_mode": "soft", 72 | "label_tau": 2.0, 73 | "kd_weight": 0.3 74 | } 75 | } 76 | }, 77 | 78 | "optim":{ 79 | "optimizer": { 80 | "name": "Adam", 81 | "args": { 82 | "lr": 0.0001, 83 | "weight_decay": 0.0001 84 | } 85 | }, 86 | "scheduler": { 87 | "name": "ReduceLROnPlateau", 88 | "monitor": "val/loss/total", 89 | "args": { 90 | "factor": 0.2, 91 | "patience": 5 92 | } 93 | } 94 | } 95 | }, 96 | 97 | "trainer": 98 | { 99 | "args": 100 | { 101 | "max_epochs": -1, 102 | "max_steps": 100000, 103 | "gradient_clip_val": 1.0, 104 | "num_sanity_val_steps": -1, 105 | 106 | "accelerator": "gpu", 107 | "devices": [0], 108 | "deterministic": true, 109 | 110 | "check_val_every_n_epoch": 1, 111 | "log_every_n_steps": 50 112 | }, 113 | 114 | "logger": 115 | { 116 | "save_dir": "work_dir_magna/east_kd_vggish", 117 | "name": "log" 118 | }, 119 | 120 | "checkpoint": 121 | { 122 | "dirpath": "work_dir_magna/east_kd_vggish", 123 | "filename": "epoch={epoch}-val_mAP={val/mAP:.3f}", 124 | "auto_insert_metric_name": false, 125 | 126 | "monitor": "val/mAP", 127 | "mode": "max", 128 | "every_n_epochs": 1, 129 | "save_top_k": 1 130 | }, 131 | 132 | "early_stopping": 133 | { 134 | "monitor": "val/loss/total", 135 | "mode": "min", 136 | "patience": 12 137 | } 138 | } 139 | } 
-------------------------------------------------------------------------------- /config/magna/lr_openl3.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 1223, 3 | "data": 4 | { 5 | "train_manifest_path": "magna_train.json", 6 | "val_manifest_path": "magna_valid.json", 7 | "test_manifest_path": "magna_test.json", 8 | "requires_vggish": false, 9 | "requires_openl3": true, 10 | "requires_passt": false, 11 | "requires_pann": false, 12 | 13 | "mixup": 0, 14 | 15 | "featurizer":{ 16 | "name": "waveform_featurizer", 17 | "args": {} 18 | }, 19 | 20 | "batch_size": 16, 21 | "train_shuffle": true, 22 | "num_workers": 8 23 | }, 24 | 25 | "model": 26 | { 27 | "backbone":{ 28 | "feature": "openl3", 29 | "student": { 30 | "name": "PretrainedFeatureClassifier", 31 | "ckpt": null, 32 | "args": { 33 | "input_dim": 512, 34 | "num_classes": 50 35 | } 36 | }, 37 | "teacher": null 38 | }, 39 | 40 | "optim":{ 41 | "optimizer": { 42 | "name": "SGD", 43 | "args": { 44 | "lr": 0.1, 45 | "weight_decay": 0.0 46 | } 47 | }, 48 | "scheduler": { 49 | "name": "ReduceLROnPlateau", 50 | "monitor": "val/loss/total", 51 | "args": { 52 | "factor": 0.1, 53 | "patience": 1 54 | } 55 | } 56 | } 57 | }, 58 | 59 | "trainer": 60 | { 61 | "args": 62 | { 63 | "max_epochs": -1, 64 | "max_steps": 80000, 65 | "gradient_clip_val": 1.0, 66 | "num_sanity_val_steps": -1, 67 | 68 | "accelerator": "gpu", 69 | "devices": [0], 70 | "deterministic": true, 71 | 72 | "check_val_every_n_epoch": 1, 73 | "log_every_n_steps": 50 74 | }, 75 | 76 | "logger": 77 | { 78 | "save_dir": "work_dir_magna/lr_openl3", 79 | "name": "log" 80 | }, 81 | 82 | "checkpoint": 83 | { 84 | "dirpath": "work_dir_magna/lr_openl3", 85 | "filename": "epoch={epoch}-val_mAP={val/mAP:.3f}", 86 | "auto_insert_metric_name": false, 87 | 88 | "monitor": "val/mAP", 89 | "mode": "max", 90 | "every_n_epochs": 1, 91 | "save_top_k": 1 92 | }, 93 | 94 | "early_stopping": 95 | { 96 | "monitor": "val/loss/total", 97 | "mode": "min", 98 | "patience": 3 99 | } 100 | } 101 | } -------------------------------------------------------------------------------- /config/magna/lr_pann.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 1223, 3 | "data": 4 | { 5 | "train_manifest_path": "magna_train.json", 6 | "val_manifest_path": "magna_valid.json", 7 | "test_manifest_path": "magna_test.json", 8 | "requires_vggish": false, 9 | "requires_openl3": false, 10 | "requires_passt": false, 11 | "requires_pann": true, 12 | 13 | "mixup": 0, 14 | 15 | "featurizer":{ 16 | "name": "waveform_featurizer", 17 | "args": {} 18 | }, 19 | 20 | "batch_size": 16, 21 | "train_shuffle": true, 22 | "num_workers": 8 23 | }, 24 | 25 | "model": 26 | { 27 | "backbone":{ 28 | "feature": "pann", 29 | "student": { 30 | "name": "PretrainedFeatureClassifier", 31 | "ckpt": null, 32 | "args": { 33 | "input_dim": 2048, 34 | "num_classes": 50 35 | } 36 | }, 37 | "teacher": null 38 | }, 39 | 40 | "optim":{ 41 | "optimizer": { 42 | "name": "SGD", 43 | "args": { 44 | "lr": 0.1, 45 | "weight_decay": 0.0 46 | } 47 | }, 48 | "scheduler": { 49 | "name": "ReduceLROnPlateau", 50 | "monitor": "val/loss/total", 51 | "args": { 52 | "factor": 0.1, 53 | "patience": 1 54 | } 55 | } 56 | } 57 | }, 58 | 59 | "trainer": 60 | { 61 | "args": 62 | { 63 | "max_epochs": -1, 64 | "max_steps": 80000, 65 | "gradient_clip_val": 1.0, 66 | "num_sanity_val_steps": -1, 67 | 68 | "accelerator": "gpu", 69 | "devices": [0], 70 | "deterministic": true, 71 | 72 | 
"check_val_every_n_epoch": 1, 73 | "log_every_n_steps": 50 74 | }, 75 | 76 | "logger": 77 | { 78 | "save_dir": "work_dir_magna/lr_pann", 79 | "name": "log" 80 | }, 81 | 82 | "checkpoint": 83 | { 84 | "dirpath": "work_dir_magna/lr_pann", 85 | "filename": "epoch={epoch}-val_mAP={val/mAP:.3f}", 86 | "auto_insert_metric_name": false, 87 | 88 | "monitor": "val/mAP", 89 | "mode": "max", 90 | "every_n_epochs": 1, 91 | "save_top_k": 1 92 | }, 93 | 94 | "early_stopping": 95 | { 96 | "monitor": "val/loss/total", 97 | "mode": "min", 98 | "patience": 3 99 | } 100 | } 101 | } -------------------------------------------------------------------------------- /config/magna/lr_passt.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 1223, 3 | "data": 4 | { 5 | "train_manifest_path": "magna_train.json", 6 | "val_manifest_path": "magna_valid.json", 7 | "test_manifest_path": "magna_test.json", 8 | "requires_vggish": false, 9 | "requires_openl3": false, 10 | "requires_passt": true, 11 | "requires_pann": false, 12 | 13 | "mixup": 0, 14 | 15 | "featurizer":{ 16 | "name": "waveform_featurizer", 17 | "args": {} 18 | }, 19 | 20 | "batch_size": 16, 21 | "train_shuffle": true, 22 | "num_workers": 8 23 | }, 24 | 25 | "model": 26 | { 27 | "backbone":{ 28 | "feature": "passt", 29 | "student": { 30 | "name": "PretrainedFeatureClassifier", 31 | "ckpt": null, 32 | "args": { 33 | "input_dim": 768, 34 | "num_classes": 50 35 | } 36 | }, 37 | "teacher": null 38 | }, 39 | 40 | "optim":{ 41 | "optimizer": { 42 | "name": "SGD", 43 | "args": { 44 | "lr": 0.2, 45 | "weight_decay": 0.0 46 | } 47 | }, 48 | "scheduler": { 49 | "name": "ReduceLROnPlateau", 50 | "monitor": "val/loss/total", 51 | "args": { 52 | "factor": 0.1, 53 | "patience": 1 54 | } 55 | } 56 | } 57 | }, 58 | 59 | "trainer": 60 | { 61 | "args": 62 | { 63 | "max_epochs": -1, 64 | "max_steps": 80000, 65 | "gradient_clip_val": 1.0, 66 | "num_sanity_val_steps": -1, 67 | 68 | "accelerator": "gpu", 69 | "devices": [0], 70 | "deterministic": true, 71 | 72 | "check_val_every_n_epoch": 1, 73 | "log_every_n_steps": 50 74 | }, 75 | 76 | "logger": 77 | { 78 | "save_dir": "work_dir_magna/lr_passt", 79 | "name": "log" 80 | }, 81 | 82 | "checkpoint": 83 | { 84 | "dirpath": "work_dir_magna/lr_passt", 85 | "filename": "epoch={epoch}-val_mAP={val/mAP:.3f}", 86 | "auto_insert_metric_name": false, 87 | 88 | "monitor": "val/mAP", 89 | "mode": "max", 90 | "every_n_epochs": 1, 91 | "save_top_k": 1 92 | }, 93 | 94 | "early_stopping": 95 | { 96 | "monitor": "val/loss/total", 97 | "mode": "min", 98 | "patience": 3 99 | } 100 | } 101 | } -------------------------------------------------------------------------------- /config/magna/lr_vggish.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 1223, 3 | "data": 4 | { 5 | "train_manifest_path": "magna_train.json", 6 | "val_manifest_path": "magna_valid.json", 7 | "test_manifest_path": "magna_test.json", 8 | "requires_vggish": true, 9 | "requires_openl3": false, 10 | "requires_passt": false, 11 | "requires_pann": false, 12 | 13 | "mixup": 0, 14 | 15 | "featurizer":{ 16 | "name": "waveform_featurizer", 17 | "args": {} 18 | }, 19 | 20 | "batch_size": 16, 21 | "train_shuffle": true, 22 | "num_workers": 8 23 | }, 24 | 25 | "model": 26 | { 27 | "backbone":{ 28 | "feature": "vggish", 29 | "student": { 30 | "name": "PretrainedFeatureClassifier", 31 | "ckpt": null, 32 | "args": { 33 | "input_dim": 128, 34 | "num_classes": 50 35 | } 36 | }, 37 | 
"teacher": null 38 | }, 39 | 40 | "optim":{ 41 | "optimizer": { 42 | "name": "SGD", 43 | "args": { 44 | "lr": 0.1, 45 | "weight_decay": 0.0 46 | } 47 | }, 48 | "scheduler": { 49 | "name": "ReduceLROnPlateau", 50 | "monitor": "val/loss/total", 51 | "args": { 52 | "factor": 0.1, 53 | "patience": 1 54 | } 55 | } 56 | } 57 | }, 58 | 59 | "trainer": 60 | { 61 | "args": 62 | { 63 | "max_epochs": -1, 64 | "max_steps": 80000, 65 | "gradient_clip_val": 1.0, 66 | "num_sanity_val_steps": -1, 67 | 68 | "accelerator": "gpu", 69 | "devices": [0], 70 | "deterministic": true, 71 | 72 | "check_val_every_n_epoch": 1, 73 | "log_every_n_steps": 50 74 | }, 75 | 76 | "logger": 77 | { 78 | "save_dir": "work_dir_magna/lr_vggish", 79 | "name": "log" 80 | }, 81 | 82 | "checkpoint": 83 | { 84 | "dirpath": "work_dir_magna/lr_vggish", 85 | "filename": "epoch={epoch}-val_mAP={val/mAP:.3f}", 86 | "auto_insert_metric_name": false, 87 | 88 | "monitor": "val/mAP", 89 | "mode": "max", 90 | "every_n_epochs": 1, 91 | "save_top_k": 1 92 | }, 93 | 94 | "early_stopping": 95 | { 96 | "monitor": "val/loss/total", 97 | "mode": "min", 98 | "patience": 3 99 | } 100 | } 101 | } -------------------------------------------------------------------------------- /config/openmic/east_final_openl3.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 1223, 3 | "data": 4 | { 5 | "train_manifest_path": "openmic_train.json", 6 | "val_manifest_path": "openmic_valid.json", 7 | "test_manifest_path": "openmic_test.json", 8 | "requires_vggish": false, 9 | "requires_openl3": true, 10 | "requires_passt": false, 11 | "requires_pann": false, 12 | 13 | "mixup": 0.3, 14 | 15 | "featurizer":{ 16 | "name": "log_mel_featurizer", 17 | "args": { 18 | "n_fft": 1024, 19 | "hop_length": 320, 20 | "win_length": 800, 21 | "sr": 32000, 22 | "n_mels": 128, 23 | 24 | "fmin": 0, 25 | "fmax": 16000, 26 | "fmin_aug_range": 20, 27 | "fmax_aug_range": 2000, 28 | 29 | "normalize_mean": -3.7, 30 | "normalize_std": 3.1, 31 | 32 | "freqm": 48, 33 | "timem": 192 34 | } 35 | }, 36 | "batch_size": 16, 37 | "train_shuffle": true, 38 | "num_workers": 16 39 | }, 40 | 41 | "model": 42 | { 43 | "backbone":{ 44 | "feature": "openl3", 45 | "student": { 46 | "name": "CPResNet", 47 | "ckpt": null, 48 | "args": { 49 | "rho": 7, 50 | "in_channel": 1, 51 | "base_channels": 128, 52 | "num_classes": 20 53 | } 54 | }, 55 | "teacher": { 56 | "name": "PretrainedFeatureClassifier", 57 | "ckpt": "work_dir_openmic/lr_openl3/epoch=16-val_mAP=0.808.ckpt", 58 | "args": { 59 | "input_dim": 512, 60 | "num_classes": 20 61 | } 62 | }, 63 | "reg_loss_weight": 0.8, 64 | "regularization":{ 65 | "name": "FeatureSpaceRegularizationLoss", 66 | "args": { 67 | "mode": "distance_correlation", 68 | "stages_args": { 69 | "1": {"weight": 0.0, "student_expand": 1, "teacher_expand": 6}, 70 | "2": {"weight": 0.0, "student_expand": 1, "teacher_expand": 6}, 71 | "3": {"weight": 1.0, "student_expand": 1, "teacher_expand": 6} 72 | } 73 | } 74 | } 75 | }, 76 | 77 | "optim":{ 78 | "optimizer": { 79 | "name": "Adam", 80 | "args": { 81 | "lr": 0.0001, 82 | "weight_decay": 0.0001 83 | } 84 | }, 85 | "scheduler": { 86 | "name": "ReduceLROnPlateau", 87 | "monitor": "val/loss/total", 88 | "args": { 89 | "factor": 0.2, 90 | "patience": 8 91 | } 92 | } 93 | } 94 | }, 95 | 96 | "trainer": 97 | { 98 | "args": 99 | { 100 | "max_epochs": -1, 101 | "max_steps": 80000, 102 | "gradient_clip_val": 1.0, 103 | "num_sanity_val_steps": -1, 104 | 105 | "accelerator": "gpu", 106 | "devices": [0], 
107 | "deterministic": true, 108 | 109 | "check_val_every_n_epoch": 1, 110 | "log_every_n_steps": 50 111 | }, 112 | 113 | "logger": 114 | { 115 | "save_dir": "work_dir_openmic/east_final_openl3", 116 | "name": "log" 117 | }, 118 | 119 | "checkpoint": 120 | { 121 | "dirpath": "work_dir_openmic/east_final_openl3", 122 | "filename": "epoch={epoch}-val_mAP={val/mAP:.3f}", 123 | "auto_insert_metric_name": false, 124 | 125 | "monitor": "val/mAP", 126 | "mode": "max", 127 | "every_n_epochs": 1, 128 | "save_top_k": 1 129 | }, 130 | 131 | "early_stopping": 132 | { 133 | "monitor": "val/loss/total", 134 | "mode": "min", 135 | "patience": 15 136 | } 137 | } 138 | } -------------------------------------------------------------------------------- /config/openmic/east_final_pann.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 1223, 3 | "data": 4 | { 5 | "train_manifest_path": "openmic_train.json", 6 | "val_manifest_path": "openmic_valid.json", 7 | "test_manifest_path": "openmic_test.json", 8 | "requires_vggish": false, 9 | "requires_openl3": false, 10 | "requires_passt": false, 11 | "requires_pann": true, 12 | 13 | "mixup": 0.3, 14 | 15 | "featurizer":{ 16 | "name": "log_mel_featurizer", 17 | "args": { 18 | "n_fft": 1024, 19 | "hop_length": 320, 20 | "win_length": 800, 21 | "sr": 32000, 22 | "n_mels": 128, 23 | 24 | "fmin": 0, 25 | "fmax": 16000, 26 | "fmin_aug_range": 20, 27 | "fmax_aug_range": 2000, 28 | 29 | "normalize_mean": -3.7, 30 | "normalize_std": 3.1, 31 | 32 | "freqm": 48, 33 | "timem": 192 34 | } 35 | }, 36 | "batch_size": 16, 37 | "train_shuffle": true, 38 | "num_workers": 16 39 | }, 40 | 41 | "model": 42 | { 43 | "backbone":{ 44 | "feature": "pann", 45 | "student": { 46 | "name": "CPResNet", 47 | "ckpt": null, 48 | "args": { 49 | "rho": 7, 50 | "in_channel": 1, 51 | "base_channels": 128, 52 | "num_classes": 20 53 | } 54 | }, 55 | "teacher": { 56 | "name": "PretrainedFeatureClassifier", 57 | "ckpt": "work_dir_openmic/lr_pann/epoch=5-val_mAP=0.865.ckpt", 58 | "args": { 59 | "input_dim": 2048, 60 | "num_classes": 20 61 | } 62 | }, 63 | "reg_loss_weight": 0.9, 64 | "regularization":{ 65 | "name": "FeatureSpaceRegularizationLoss", 66 | "args": { 67 | "mode": "distance_correlation", 68 | "stages_args": { 69 | "1": {"weight": 0.0, "student_expand": 1, "teacher_expand": 60}, 70 | "2": {"weight": 0.0, "student_expand": 1, "teacher_expand": 60}, 71 | "3": {"weight": 1.0, "student_expand": 1, "teacher_expand": 60} 72 | } 73 | } 74 | } 75 | }, 76 | 77 | "optim":{ 78 | "optimizer": { 79 | "name": "Adam", 80 | "args": { 81 | "lr": 0.0001, 82 | "weight_decay": 0.0001 83 | } 84 | }, 85 | "scheduler": { 86 | "name": "ReduceLROnPlateau", 87 | "monitor": "val/loss/total", 88 | "args": { 89 | "factor": 0.2, 90 | "patience": 8 91 | } 92 | } 93 | } 94 | }, 95 | 96 | "trainer": 97 | { 98 | "args": 99 | { 100 | "max_epochs": -1, 101 | "max_steps": 80000, 102 | "gradient_clip_val": 1.0, 103 | "num_sanity_val_steps": -1, 104 | 105 | "accelerator": "gpu", 106 | "devices": [0], 107 | "deterministic": true, 108 | 109 | "check_val_every_n_epoch": 1, 110 | "log_every_n_steps": 50 111 | }, 112 | 113 | "logger": 114 | { 115 | "save_dir": "work_dir_openmic/east_final_pann", 116 | "name": "log" 117 | }, 118 | 119 | "checkpoint": 120 | { 121 | "dirpath": "work_dir_openmic/east_final_pann", 122 | "filename": "epoch={epoch}-val_mAP={val/mAP:.3f}", 123 | "auto_insert_metric_name": false, 124 | 125 | "monitor": "val/mAP", 126 | "mode": "max", 127 | "every_n_epochs": 1, 128 
| "save_top_k": 1 129 | }, 130 | 131 | "early_stopping": 132 | { 133 | "monitor": "val/loss/total", 134 | "mode": "min", 135 | "patience": 15 136 | } 137 | } 138 | } -------------------------------------------------------------------------------- /config/openmic/east_final_passt.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 1223, 3 | "data": 4 | { 5 | "train_manifest_path": "openmic_train.json", 6 | "val_manifest_path": "openmic_valid.json", 7 | "test_manifest_path": "openmic_test.json", 8 | "requires_vggish": false, 9 | "requires_openl3": false, 10 | "requires_passt": true, 11 | "requires_pann": false, 12 | 13 | "mixup": 0.3, 14 | 15 | "featurizer":{ 16 | "name": "log_mel_featurizer", 17 | "args": { 18 | "n_fft": 1024, 19 | "hop_length": 320, 20 | "win_length": 800, 21 | "sr": 32000, 22 | "n_mels": 128, 23 | 24 | "fmin": 0, 25 | "fmax": 16000, 26 | "fmin_aug_range": 20, 27 | "fmax_aug_range": 2000, 28 | 29 | "normalize_mean": -3.7, 30 | "normalize_std": 3.1, 31 | 32 | "freqm": 48, 33 | "timem": 192 34 | } 35 | }, 36 | "batch_size": 16, 37 | "train_shuffle": true, 38 | "num_workers": 16 39 | }, 40 | 41 | "model": 42 | { 43 | "backbone":{ 44 | "feature": "passt", 45 | "student": { 46 | "name": "CPResNet", 47 | "ckpt": null, 48 | "args": { 49 | "rho": 7, 50 | "in_channel": 1, 51 | "base_channels": 128, 52 | "num_classes": 20 53 | } 54 | }, 55 | "teacher": { 56 | "name": "PretrainedFeatureClassifier", 57 | "ckpt": "work_dir_openmic/lr_passt/epoch=11-val_mAP=0.865.ckpt", 58 | "args": { 59 | "input_dim": 768, 60 | "num_classes": 20 61 | } 62 | }, 63 | "reg_loss_weight": 0.9, 64 | "regularization":{ 65 | "name": "FeatureSpaceRegularizationLoss", 66 | "args": { 67 | "mode": "distance_correlation", 68 | "stages_args": { 69 | "1": {"weight": 0.0, "student_expand": 1, "teacher_expand": 6}, 70 | "2": {"weight": 0.0, "student_expand": 1, "teacher_expand": 6}, 71 | "3": {"weight": 1.0, "student_expand": 1, "teacher_expand": 6} 72 | } 73 | } 74 | } 75 | }, 76 | 77 | "optim":{ 78 | "optimizer": { 79 | "name": "Adam", 80 | "args": { 81 | "lr": 0.0001, 82 | "weight_decay": 0.0001 83 | } 84 | }, 85 | "scheduler": { 86 | "name": "ReduceLROnPlateau", 87 | "monitor": "val/loss/total", 88 | "args": { 89 | "factor": 0.2, 90 | "patience": 8 91 | } 92 | } 93 | } 94 | }, 95 | 96 | "trainer": 97 | { 98 | "args": 99 | { 100 | "max_epochs": -1, 101 | "max_steps": 80000, 102 | "gradient_clip_val": 1.0, 103 | "num_sanity_val_steps": -1, 104 | 105 | "accelerator": "gpu", 106 | "devices": [0], 107 | "deterministic": true, 108 | 109 | "check_val_every_n_epoch": 1, 110 | "log_every_n_steps": 50 111 | }, 112 | 113 | "logger": 114 | { 115 | "save_dir": "work_dir_openmic/east_final_passt", 116 | "name": "log" 117 | }, 118 | 119 | "checkpoint": 120 | { 121 | "dirpath": "work_dir_openmic/east_final_passt", 122 | "filename": "epoch={epoch}-val_mAP={val/mAP:.3f}", 123 | "auto_insert_metric_name": false, 124 | 125 | "monitor": "val/mAP", 126 | "mode": "max", 127 | "every_n_epochs": 1, 128 | "save_top_k": 1 129 | }, 130 | 131 | "early_stopping": 132 | { 133 | "monitor": "val/loss/total", 134 | "mode": "min", 135 | "patience": 15 136 | } 137 | } 138 | } -------------------------------------------------------------------------------- /config/openmic/east_final_vggish.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 1223, 3 | "data": 4 | { 5 | "train_manifest_path": "openmic_train.json", 6 | "val_manifest_path": 
"openmic_valid.json", 7 | "test_manifest_path": "openmic_test.json", 8 | "requires_vggish": true, 9 | "requires_openl3": false, 10 | "requires_passt": false, 11 | "requires_pann": false, 12 | 13 | "mixup": 0.3, 14 | 15 | "featurizer":{ 16 | "name": "log_mel_featurizer", 17 | "args": { 18 | "n_fft": 1024, 19 | "hop_length": 320, 20 | "win_length": 800, 21 | "sr": 32000, 22 | "n_mels": 128, 23 | 24 | "fmin": 0, 25 | "fmax": 16000, 26 | "fmin_aug_range": 20, 27 | "fmax_aug_range": 2000, 28 | 29 | "normalize_mean": -3.7, 30 | "normalize_std": 3.1, 31 | 32 | "freqm": 48, 33 | "timem": 192 34 | } 35 | }, 36 | "batch_size": 16, 37 | "train_shuffle": true, 38 | "num_workers": 16 39 | }, 40 | 41 | "model": 42 | { 43 | "backbone":{ 44 | "feature": "vggish", 45 | "student": { 46 | "name": "CPResNet", 47 | "ckpt": null, 48 | "args": { 49 | "rho": 7, 50 | "in_channel": 1, 51 | "base_channels": 128, 52 | "num_classes": 20 53 | } 54 | }, 55 | "teacher": { 56 | "name": "PretrainedFeatureClassifier", 57 | "ckpt": "work_dir_openmic/lr_vggish/epoch=23-val_mAP=0.817.ckpt", 58 | "args": { 59 | "input_dim": 128, 60 | "num_classes": 20 61 | } 62 | }, 63 | "reg_loss_weight": 0.8, 64 | "regularization":{ 65 | "name": "FeatureSpaceRegularizationLoss", 66 | "args": { 67 | "mode": "distance_correlation", 68 | "stages_args": { 69 | "1": {"weight": 0.0, "student_expand": 1, "teacher_expand": 6}, 70 | "2": {"weight": 0.0, "student_expand": 1, "teacher_expand": 6}, 71 | "3": {"weight": 1.0, "student_expand": 1, "teacher_expand": 6} 72 | } 73 | } 74 | } 75 | }, 76 | 77 | "optim":{ 78 | "optimizer": { 79 | "name": "Adam", 80 | "args": { 81 | "lr": 0.0001, 82 | "weight_decay": 0.0001 83 | } 84 | }, 85 | "scheduler": { 86 | "name": "ReduceLROnPlateau", 87 | "monitor": "val/loss/total", 88 | "args": { 89 | "factor": 0.2, 90 | "patience": 8 91 | } 92 | } 93 | } 94 | }, 95 | 96 | "trainer": 97 | { 98 | "args": 99 | { 100 | "max_epochs": -1, 101 | "max_steps": 80000, 102 | "gradient_clip_val": 1.0, 103 | "num_sanity_val_steps": -1, 104 | 105 | "accelerator": "gpu", 106 | "devices": [0], 107 | "deterministic": true, 108 | 109 | "check_val_every_n_epoch": 1, 110 | "log_every_n_steps": 50 111 | }, 112 | 113 | "logger": 114 | { 115 | "save_dir": "work_dir_openmic/east_final_vggish", 116 | "name": "log" 117 | }, 118 | 119 | "checkpoint": 120 | { 121 | "dirpath": "work_dir_openmic/east_final_vggish", 122 | "filename": "epoch={epoch}-val_mAP={val/mAP:.3f}", 123 | "auto_insert_metric_name": false, 124 | 125 | "monitor": "val/mAP", 126 | "mode": "max", 127 | "every_n_epochs": 1, 128 | "save_top_k": 1 129 | }, 130 | 131 | "early_stopping": 132 | { 133 | "monitor": "val/loss/total", 134 | "mode": "min", 135 | "patience": 15 136 | } 137 | } 138 | } -------------------------------------------------------------------------------- /config/openmic/east_kd_openl3.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 1223, 3 | "data": 4 | { 5 | "train_manifest_path": "openmic_train.json", 6 | "val_manifest_path": "openmic_valid.json", 7 | "test_manifest_path": "openmic_test.json", 8 | "requires_vggish": false, 9 | "requires_openl3": true, 10 | "requires_passt": false, 11 | "requires_pann": false, 12 | 13 | "mixup": 0.3, 14 | 15 | "featurizer":{ 16 | "name": "log_mel_featurizer", 17 | "args": { 18 | "n_fft": 1024, 19 | "hop_length": 320, 20 | "win_length": 800, 21 | "sr": 32000, 22 | "n_mels": 128, 23 | 24 | "fmin": 0, 25 | "fmax": 16000, 26 | "fmin_aug_range": 20, 27 | "fmax_aug_range": 2000, 28 
| 29 | "normalize_mean": -3.7, 30 | "normalize_std": 3.1, 31 | 32 | "freqm": 48, 33 | "timem": 192 34 | } 35 | }, 36 | "batch_size": 16, 37 | "train_shuffle": true, 38 | "num_workers": 16 39 | }, 40 | 41 | "model": 42 | { 43 | "backbone":{ 44 | "feature": "openl3", 45 | "student": { 46 | "name": "CPResNet", 47 | "ckpt": null, 48 | "args": { 49 | "rho": 7, 50 | "in_channel": 1, 51 | "base_channels": 128, 52 | "num_classes": 20 53 | } 54 | }, 55 | "teacher": { 56 | "name": "PretrainedFeatureClassifier", 57 | "ckpt": "work_dir_openmic/lr_openl3/epoch=16-val_mAP=0.808.ckpt", 58 | "args": { 59 | "input_dim": 512, 60 | "num_classes": 20 61 | } 62 | }, 63 | "reg_loss_weight": 0.8, 64 | "regularization":{ 65 | "name": "CombineDistRegLoss", 66 | "args": { 67 | "feature_mode": "distance_correlation", 68 | "feature_stages_args": { 69 | "1": {"weight": 0.0, "student_expand": 1, "teacher_expand": 6}, 70 | "2": {"weight": 0.0, "student_expand": 1, "teacher_expand": 6}, 71 | "3": {"weight": 1.0, "student_expand": 1, "teacher_expand": 6} 72 | }, 73 | "label_mode": "soft", 74 | "label_tau": 2.0, 75 | "kd_weight": 0.3 76 | } 77 | } 78 | }, 79 | 80 | "optim":{ 81 | "optimizer": { 82 | "name": "Adam", 83 | "args": { 84 | "lr": 0.0001, 85 | "weight_decay": 0.0001 86 | } 87 | }, 88 | "scheduler": { 89 | "name": "ReduceLROnPlateau", 90 | "monitor": "val/loss/total", 91 | "args": { 92 | "factor": 0.2, 93 | "patience": 8 94 | } 95 | } 96 | } 97 | }, 98 | 99 | "trainer": 100 | { 101 | "args": 102 | { 103 | "max_epochs": -1, 104 | "max_steps": 80000, 105 | "gradient_clip_val": 1.0, 106 | "num_sanity_val_steps": -1, 107 | 108 | "accelerator": "gpu", 109 | "devices": [0], 110 | "deterministic": true, 111 | 112 | "check_val_every_n_epoch": 1, 113 | "log_every_n_steps": 50 114 | }, 115 | 116 | "logger": 117 | { 118 | "save_dir": "work_dir_openmic/east_kd_openl3", 119 | "name": "log" 120 | }, 121 | 122 | "checkpoint": 123 | { 124 | "dirpath": "work_dir_openmic/east_kd_openl3", 125 | "filename": "epoch={epoch}-val_mAP={val/mAP:.3f}", 126 | "auto_insert_metric_name": false, 127 | 128 | "monitor": "val/mAP", 129 | "mode": "max", 130 | "every_n_epochs": 1, 131 | "save_top_k": 1 132 | }, 133 | 134 | "early_stopping": 135 | { 136 | "monitor": "val/loss/total", 137 | "mode": "min", 138 | "patience": 15 139 | } 140 | } 141 | } -------------------------------------------------------------------------------- /config/openmic/east_kd_pann.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 1223, 3 | "data": 4 | { 5 | "train_manifest_path": "openmic_train.json", 6 | "val_manifest_path": "openmic_valid.json", 7 | "test_manifest_path": "openmic_test.json", 8 | "requires_vggish": false, 9 | "requires_openl3": false, 10 | "requires_passt": false, 11 | "requires_pann": true, 12 | 13 | "mixup": 0.3, 14 | 15 | "featurizer":{ 16 | "name": "log_mel_featurizer", 17 | "args": { 18 | "n_fft": 1024, 19 | "hop_length": 320, 20 | "win_length": 800, 21 | "sr": 32000, 22 | "n_mels": 128, 23 | 24 | "fmin": 0, 25 | "fmax": 16000, 26 | "fmin_aug_range": 20, 27 | "fmax_aug_range": 2000, 28 | 29 | "normalize_mean": -3.7, 30 | "normalize_std": 3.1, 31 | 32 | "freqm": 48, 33 | "timem": 192 34 | } 35 | }, 36 | "batch_size": 16, 37 | "train_shuffle": true, 38 | "num_workers": 16 39 | }, 40 | 41 | "model": 42 | { 43 | "backbone":{ 44 | "feature": "pann", 45 | "student": { 46 | "name": "CPResNet", 47 | "ckpt": null, 48 | "args": { 49 | "rho": 7, 50 | "in_channel": 1, 51 | "base_channels": 128, 52 | 
"num_classes": 20 53 | } 54 | }, 55 | "teacher": { 56 | "name": "PretrainedFeatureClassifier", 57 | "ckpt": "work_dir_openmic/lr_pann/epoch=5-val_mAP=0.865.ckpt", 58 | "args": { 59 | "input_dim": 2048, 60 | "num_classes": 20 61 | } 62 | }, 63 | "reg_loss_weight": 0.9, 64 | "regularization":{ 65 | "name": "CombineDistRegLoss", 66 | "args": { 67 | "feature_mode": "distance_correlation", 68 | "feature_stages_args": { 69 | "1": {"weight": 0.0, "student_expand": 1, "teacher_expand": 60}, 70 | "2": {"weight": 0.0, "student_expand": 1, "teacher_expand": 60}, 71 | "3": {"weight": 1.0, "student_expand": 1, "teacher_expand": 60} 72 | }, 73 | "label_mode": "soft", 74 | "label_tau": 2.0, 75 | "kd_weight": 0.3 76 | } 77 | } 78 | }, 79 | 80 | "optim":{ 81 | "optimizer": { 82 | "name": "Adam", 83 | "args": { 84 | "lr": 0.0001, 85 | "weight_decay": 0.0001 86 | } 87 | }, 88 | "scheduler": { 89 | "name": "ReduceLROnPlateau", 90 | "monitor": "val/loss/total", 91 | "args": { 92 | "factor": 0.2, 93 | "patience": 8 94 | } 95 | } 96 | } 97 | }, 98 | 99 | "trainer": 100 | { 101 | "args": 102 | { 103 | "max_epochs": -1, 104 | "max_steps": 80000, 105 | "gradient_clip_val": 1.0, 106 | "num_sanity_val_steps": -1, 107 | 108 | "accelerator": "gpu", 109 | "devices": [0], 110 | "deterministic": true, 111 | 112 | "check_val_every_n_epoch": 1, 113 | "log_every_n_steps": 50 114 | }, 115 | 116 | "logger": 117 | { 118 | "save_dir": "work_dir_openmic/east_kd_pann", 119 | "name": "log" 120 | }, 121 | 122 | "checkpoint": 123 | { 124 | "dirpath": "work_dir_openmic/east_kd_pann", 125 | "filename": "epoch={epoch}-val_mAP={val/mAP:.3f}", 126 | "auto_insert_metric_name": false, 127 | 128 | "monitor": "val/mAP", 129 | "mode": "max", 130 | "every_n_epochs": 1, 131 | "save_top_k": 1 132 | }, 133 | 134 | "early_stopping": 135 | { 136 | "monitor": "val/loss/total", 137 | "mode": "min", 138 | "patience": 15 139 | } 140 | } 141 | } -------------------------------------------------------------------------------- /config/openmic/east_kd_passt.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 1223, 3 | "data": 4 | { 5 | "train_manifest_path": "openmic_train.json", 6 | "val_manifest_path": "openmic_valid.json", 7 | "test_manifest_path": "openmic_test.json", 8 | "requires_vggish": false, 9 | "requires_openl3": false, 10 | "requires_passt": true, 11 | "requires_pann": false, 12 | 13 | "mixup": 0.3, 14 | 15 | "featurizer":{ 16 | "name": "log_mel_featurizer", 17 | "args": { 18 | "n_fft": 1024, 19 | "hop_length": 320, 20 | "win_length": 800, 21 | "sr": 32000, 22 | "n_mels": 128, 23 | 24 | "fmin": 0, 25 | "fmax": 16000, 26 | "fmin_aug_range": 20, 27 | "fmax_aug_range": 2000, 28 | 29 | "normalize_mean": -3.7, 30 | "normalize_std": 3.1, 31 | 32 | "freqm": 48, 33 | "timem": 192 34 | } 35 | }, 36 | "batch_size": 16, 37 | "train_shuffle": true, 38 | "num_workers": 16 39 | }, 40 | 41 | "model": 42 | { 43 | "backbone":{ 44 | "feature": "passt", 45 | "student": { 46 | "name": "CPResNet", 47 | "ckpt": null, 48 | "args": { 49 | "rho": 7, 50 | "in_channel": 1, 51 | "base_channels": 128, 52 | "num_classes": 20 53 | } 54 | }, 55 | "teacher": { 56 | "name": "PretrainedFeatureClassifier", 57 | "ckpt": "work_dir_openmic/lr_passt/epoch=11-val_mAP=0.865.ckpt", 58 | "args": { 59 | "input_dim": 768, 60 | "num_classes": 20 61 | } 62 | }, 63 | "reg_loss_weight": 0.9, 64 | "regularization":{ 65 | "name": "CombineDistRegLoss", 66 | "args": { 67 | "feature_mode": "distance_correlation", 68 | "feature_stages_args": { 69 | 
"1": {"weight": 0.0, "student_expand": 1, "teacher_expand": 6}, 70 | "2": {"weight": 0.0, "student_expand": 1, "teacher_expand": 6}, 71 | "3": {"weight": 1.0, "student_expand": 1, "teacher_expand": 6} 72 | }, 73 | "label_mode": "soft", 74 | "label_tau": 2.0, 75 | "kd_weight": 0.3 76 | } 77 | } 78 | }, 79 | 80 | "optim":{ 81 | "optimizer": { 82 | "name": "Adam", 83 | "args": { 84 | "lr": 0.0001, 85 | "weight_decay": 0.0001 86 | } 87 | }, 88 | "scheduler": { 89 | "name": "ReduceLROnPlateau", 90 | "monitor": "val/loss/total", 91 | "args": { 92 | "factor": 0.2, 93 | "patience": 8 94 | } 95 | } 96 | } 97 | }, 98 | 99 | "trainer": 100 | { 101 | "args": 102 | { 103 | "max_epochs": -1, 104 | "max_steps": 80000, 105 | "gradient_clip_val": 1.0, 106 | "num_sanity_val_steps": -1, 107 | 108 | "accelerator": "gpu", 109 | "devices": [0], 110 | "deterministic": true, 111 | 112 | "check_val_every_n_epoch": 1, 113 | "log_every_n_steps": 50 114 | }, 115 | 116 | "logger": 117 | { 118 | "save_dir": "work_dir_openmic/east_kd_passt", 119 | "name": "log" 120 | }, 121 | 122 | "checkpoint": 123 | { 124 | "dirpath": "work_dir_openmic/east_kd_passt", 125 | "filename": "epoch={epoch}-val_mAP={val/mAP:.3f}", 126 | "auto_insert_metric_name": false, 127 | 128 | "monitor": "val/mAP", 129 | "mode": "max", 130 | "every_n_epochs": 1, 131 | "save_top_k": 1 132 | }, 133 | 134 | "early_stopping": 135 | { 136 | "monitor": "val/loss/total", 137 | "mode": "min", 138 | "patience": 15 139 | } 140 | } 141 | } -------------------------------------------------------------------------------- /config/openmic/east_kd_vggish.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 1223, 3 | "data": 4 | { 5 | "train_manifest_path": "openmic_train.json", 6 | "val_manifest_path": "openmic_valid.json", 7 | "test_manifest_path": "openmic_test.json", 8 | "requires_vggish": true, 9 | "requires_openl3": false, 10 | "requires_passt": false, 11 | "requires_pann": false, 12 | 13 | "mixup": 0.3, 14 | 15 | "featurizer":{ 16 | "name": "log_mel_featurizer", 17 | "args": { 18 | "n_fft": 1024, 19 | "hop_length": 320, 20 | "win_length": 800, 21 | "sr": 32000, 22 | "n_mels": 128, 23 | 24 | "fmin": 0, 25 | "fmax": 16000, 26 | "fmin_aug_range": 20, 27 | "fmax_aug_range": 2000, 28 | 29 | "normalize_mean": -3.7, 30 | "normalize_std": 3.1, 31 | 32 | "freqm": 48, 33 | "timem": 192 34 | } 35 | }, 36 | "batch_size": 16, 37 | "train_shuffle": true, 38 | "num_workers": 16 39 | }, 40 | 41 | "model": 42 | { 43 | "backbone":{ 44 | "feature": "vggish", 45 | "student": { 46 | "name": "CPResNet", 47 | "ckpt": null, 48 | "args": { 49 | "rho": 7, 50 | "in_channel": 1, 51 | "base_channels": 128, 52 | "num_classes": 20 53 | } 54 | }, 55 | "teacher": { 56 | "name": "PretrainedFeatureClassifier", 57 | "ckpt": "work_dir_openmic/lr_vggish/epoch=23-val_mAP=0.817.ckpt", 58 | "args": { 59 | "input_dim": 128, 60 | "num_classes": 20 61 | } 62 | }, 63 | "reg_loss_weight": 0.8, 64 | "regularization":{ 65 | "name": "CombineDistRegLoss", 66 | "args": { 67 | "feature_mode": "distance_correlation", 68 | "feature_stages_args": { 69 | "1": {"weight": 0.0, "student_expand": 1, "teacher_expand": 6}, 70 | "2": {"weight": 0.0, "student_expand": 1, "teacher_expand": 6}, 71 | "3": {"weight": 1.0, "student_expand": 1, "teacher_expand": 6} 72 | }, 73 | "label_mode": "soft", 74 | "label_tau": 2.0, 75 | "kd_weight": 0.3 76 | } 77 | } 78 | }, 79 | 80 | "optim":{ 81 | "optimizer": { 82 | "name": "Adam", 83 | "args": { 84 | "lr": 0.0001, 85 | "weight_decay": 
0.0001 86 | } 87 | }, 88 | "scheduler": { 89 | "name": "ReduceLROnPlateau", 90 | "monitor": "val/loss/total", 91 | "args": { 92 | "factor": 0.2, 93 | "patience": 8 94 | } 95 | } 96 | } 97 | }, 98 | 99 | "trainer": 100 | { 101 | "args": 102 | { 103 | "max_epochs": -1, 104 | "max_steps": 80000, 105 | "gradient_clip_val": 1.0, 106 | "num_sanity_val_steps": -1, 107 | 108 | "accelerator": "gpu", 109 | "devices": [0], 110 | "deterministic": true, 111 | 112 | "check_val_every_n_epoch": 1, 113 | "log_every_n_steps": 50 114 | }, 115 | 116 | "logger": 117 | { 118 | "save_dir": "work_dir_openmic/east_kd_vggish", 119 | "name": "log" 120 | }, 121 | 122 | "checkpoint": 123 | { 124 | "dirpath": "work_dir_openmic/east_kd_vggish", 125 | "filename": "epoch={epoch}-val_mAP={val/mAP:.3f}", 126 | "auto_insert_metric_name": false, 127 | 128 | "monitor": "val/mAP", 129 | "mode": "max", 130 | "every_n_epochs": 1, 131 | "save_top_k": 1 132 | }, 133 | 134 | "early_stopping": 135 | { 136 | "monitor": "val/loss/total", 137 | "mode": "min", 138 | "patience": 15 139 | } 140 | } 141 | } -------------------------------------------------------------------------------- /config/openmic/lr_openl3.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 1223, 3 | "data": 4 | { 5 | "train_manifest_path": "openmic_train.json", 6 | "val_manifest_path": "openmic_valid.json", 7 | "test_manifest_path": "openmic_test.json", 8 | "requires_vggish": false, 9 | "requires_openl3": true, 10 | "requires_pann": false, 11 | "requires_passt": false, 12 | 13 | "mixup": 0, 14 | 15 | "featurizer":{ 16 | "name": "waveform_featurizer", 17 | "args": {} 18 | }, 19 | 20 | "batch_size": 16, 21 | "train_shuffle": true, 22 | "num_workers": 16 23 | }, 24 | 25 | "model": 26 | { 27 | "backbone":{ 28 | "feature": "openl3", 29 | "student": { 30 | "name": "PretrainedFeatureClassifier", 31 | "ckpt": null, 32 | "args": { 33 | "input_dim": 512, 34 | "num_classes": 20 35 | } 36 | }, 37 | "teacher": null 38 | }, 39 | 40 | "optim":{ 41 | "optimizer": { 42 | "name": "SGD", 43 | "args": { 44 | "lr": 0.1, 45 | "weight_decay": 0.0 46 | } 47 | }, 48 | "scheduler": { 49 | "name": "ReduceLROnPlateau", 50 | "monitor": "val/loss/total", 51 | "args": { 52 | "factor": 0.1, 53 | "patience": 1 54 | } 55 | } 56 | } 57 | }, 58 | 59 | "trainer": 60 | { 61 | "args": 62 | { 63 | "max_epochs": -1, 64 | "max_steps": 80000, 65 | "gradient_clip_val": 1.0, 66 | "num_sanity_val_steps": -1, 67 | 68 | "accelerator": "gpu", 69 | "devices": [0], 70 | "deterministic": true, 71 | 72 | "check_val_every_n_epoch": 1, 73 | "log_every_n_steps": 50 74 | }, 75 | 76 | "logger": 77 | { 78 | "save_dir": "work_dir_openmic/lr_openl3", 79 | "name": "log" 80 | }, 81 | 82 | "checkpoint": 83 | { 84 | "dirpath": "work_dir_openmic/lr_openl3", 85 | "filename": "epoch={epoch}-val_mAP={val/mAP:.3f}", 86 | "auto_insert_metric_name": false, 87 | 88 | "monitor": "val/mAP", 89 | "mode": "max", 90 | "every_n_epochs": 1, 91 | "save_top_k": 1 92 | }, 93 | 94 | "early_stopping": 95 | { 96 | "monitor": "val/loss/total", 97 | "mode": "min", 98 | "patience": 3 99 | } 100 | } 101 | } -------------------------------------------------------------------------------- /config/openmic/lr_pann.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 1223, 3 | "data": 4 | { 5 | "train_manifest_path": "openmic_train.json", 6 | "val_manifest_path": "openmic_valid.json", 7 | "test_manifest_path": "openmic_test.json", 8 | "requires_vggish": 
false, 9 | "requires_openl3": false, 10 | "requires_passt": false, 11 | "requires_pann": true, 12 | 13 | "mixup": 0, 14 | 15 | "featurizer":{ 16 | "name": "waveform_featurizer", 17 | "args": {} 18 | }, 19 | 20 | "batch_size": 16, 21 | "train_shuffle": true, 22 | "num_workers": 16 23 | }, 24 | 25 | "model": 26 | { 27 | "backbone":{ 28 | "feature": "pann", 29 | "student": { 30 | "name": "PretrainedFeatureClassifier", 31 | "ckpt": null, 32 | "args": { 33 | "input_dim": 2048, 34 | "num_classes": 20 35 | } 36 | }, 37 | "teacher": null 38 | }, 39 | 40 | "optim":{ 41 | "optimizer": { 42 | "name": "SGD", 43 | "args": { 44 | "lr": 0.1, 45 | "weight_decay": 0.0 46 | } 47 | }, 48 | "scheduler": { 49 | "name": "ReduceLROnPlateau", 50 | "monitor": "val/loss/total", 51 | "args": { 52 | "factor": 0.1, 53 | "patience": 1 54 | } 55 | } 56 | } 57 | }, 58 | 59 | "trainer": 60 | { 61 | "args": 62 | { 63 | "max_epochs": -1, 64 | "max_steps": 80000, 65 | "gradient_clip_val": 1.0, 66 | "num_sanity_val_steps": -1, 67 | 68 | "accelerator": "gpu", 69 | "devices": [0], 70 | "deterministic": true, 71 | 72 | "check_val_every_n_epoch": 1, 73 | "log_every_n_steps": 50 74 | }, 75 | 76 | "logger": 77 | { 78 | "save_dir": "work_dir_openmic/lr_pann", 79 | "name": "log" 80 | }, 81 | 82 | "checkpoint": 83 | { 84 | "dirpath": "work_dir_openmic/lr_pann", 85 | "filename": "epoch={epoch}-val_mAP={val/mAP:.3f}", 86 | "auto_insert_metric_name": false, 87 | 88 | "monitor": "val/mAP", 89 | "mode": "max", 90 | "every_n_epochs": 1, 91 | "save_top_k": 1 92 | }, 93 | 94 | "early_stopping": 95 | { 96 | "monitor": "val/loss/total", 97 | "mode": "min", 98 | "patience": 3 99 | } 100 | } 101 | } -------------------------------------------------------------------------------- /config/openmic/lr_passt.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 1223, 3 | "data": 4 | { 5 | "train_manifest_path": "openmic_train.json", 6 | "val_manifest_path": "openmic_valid.json", 7 | "test_manifest_path": "openmic_test.json", 8 | "requires_vggish": false, 9 | "requires_openl3": false, 10 | "requires_passt": true, 11 | "requires_pann": false, 12 | 13 | "mixup": 0, 14 | 15 | "featurizer":{ 16 | "name": "waveform_featurizer", 17 | "args": {} 18 | }, 19 | 20 | "batch_size": 16, 21 | "train_shuffle": true, 22 | "num_workers": 16 23 | }, 24 | 25 | "model": 26 | { 27 | "backbone":{ 28 | "feature": "passt", 29 | "student": { 30 | "name": "PretrainedFeatureClassifier", 31 | "ckpt": null, 32 | "args": { 33 | "input_dim": 768, 34 | "num_classes": 20 35 | } 36 | }, 37 | "teacher": null 38 | }, 39 | 40 | "optim":{ 41 | "optimizer": { 42 | "name": "SGD", 43 | "args": { 44 | "lr": 0.1, 45 | "weight_decay": 0.0 46 | } 47 | }, 48 | "scheduler": { 49 | "name": "ReduceLROnPlateau", 50 | "monitor": "val/loss/total", 51 | "args": { 52 | "factor": 0.1, 53 | "patience": 1 54 | } 55 | } 56 | } 57 | }, 58 | 59 | "trainer": 60 | { 61 | "args": 62 | { 63 | "max_epochs": -1, 64 | "max_steps": 80000, 65 | "gradient_clip_val": 1.0, 66 | "num_sanity_val_steps": -1, 67 | 68 | "accelerator": "gpu", 69 | "devices": [0], 70 | "deterministic": true, 71 | 72 | "check_val_every_n_epoch": 1, 73 | "log_every_n_steps": 50 74 | }, 75 | 76 | "logger": 77 | { 78 | "save_dir": "work_dir_openmic/lr_passt", 79 | "name": "log" 80 | }, 81 | 82 | "checkpoint": 83 | { 84 | "dirpath": "work_dir_openmic/lr_passt", 85 | "filename": "epoch={epoch}-val_mAP={val/mAP:.3f}", 86 | "auto_insert_metric_name": false, 87 | 88 | "monitor": "val/mAP", 89 | "mode": 
"max", 90 | "every_n_epochs": 1, 91 | "save_top_k": 1 92 | }, 93 | 94 | "early_stopping": 95 | { 96 | "monitor": "val/loss/total", 97 | "mode": "min", 98 | "patience": 3 99 | } 100 | } 101 | } -------------------------------------------------------------------------------- /config/openmic/lr_vggish.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed": 1223, 3 | "data": 4 | { 5 | "train_manifest_path": "openmic_train.json", 6 | "val_manifest_path": "openmic_valid.json", 7 | "test_manifest_path": "openmic_test.json", 8 | "requires_vggish": true, 9 | "requires_openl3": false, 10 | "requires_passt": false, 11 | "requires_pann": false, 12 | 13 | "mixup": 0, 14 | 15 | "featurizer":{ 16 | "name": "waveform_featurizer", 17 | "args": {} 18 | }, 19 | 20 | "batch_size": 16, 21 | "train_shuffle": true, 22 | "num_workers": 16 23 | }, 24 | 25 | "model": 26 | { 27 | "backbone":{ 28 | "feature": "vggish", 29 | "student": { 30 | "name": "PretrainedFeatureClassifier", 31 | "ckpt": null, 32 | "args": { 33 | "input_dim": 128, 34 | "num_classes": 20 35 | } 36 | }, 37 | "teacher": null 38 | }, 39 | 40 | "optim":{ 41 | "optimizer": { 42 | "name": "SGD", 43 | "args": { 44 | "lr": 0.2, 45 | "weight_decay": 0.0 46 | } 47 | }, 48 | "scheduler": { 49 | "name": "ReduceLROnPlateau", 50 | "monitor": "val/loss/total", 51 | "args": { 52 | "factor": 0.1, 53 | "patience": 1 54 | } 55 | } 56 | } 57 | }, 58 | 59 | "trainer": 60 | { 61 | "args": 62 | { 63 | "max_epochs": -1, 64 | "max_steps": 80000, 65 | "gradient_clip_val": 1.0, 66 | "num_sanity_val_steps": -1, 67 | 68 | "accelerator": "gpu", 69 | "devices": [0], 70 | "deterministic": true, 71 | 72 | "check_val_every_n_epoch": 1, 73 | "log_every_n_steps": 50 74 | }, 75 | 76 | "logger": 77 | { 78 | "save_dir": "work_dir_openmic/lr_vggish", 79 | "name": "log" 80 | }, 81 | 82 | "checkpoint": 83 | { 84 | "dirpath": "work_dir_openmic/lr_vggish", 85 | "filename": "epoch={epoch}-val_mAP={val/mAP:.3f}", 86 | "auto_insert_metric_name": false, 87 | 88 | "monitor": "val/mAP", 89 | "mode": "max", 90 | "every_n_epochs": 1, 91 | "save_top_k": 1 92 | }, 93 | 94 | "early_stopping": 95 | { 96 | "monitor": "val/loss/total", 97 | "mode": "min", 98 | "patience": 3 99 | } 100 | } 101 | } -------------------------------------------------------------------------------- /data/__init__.py: -------------------------------------------------------------------------------- 1 | from .datamodule import LatentSpaceDataModule -------------------------------------------------------------------------------- /data/datamodule.py: -------------------------------------------------------------------------------- 1 | import torch.utils.data as Data 2 | import pytorch_lightning as pl 3 | 4 | from .dataset import LatentSpaceDataset 5 | from .featurizer.build_featurizer import build_featurizer 6 | 7 | 8 | class LatentSpaceDataModule(pl.LightningDataModule): 9 | def __init__( 10 | self, 11 | train_manifest_path, 12 | val_manifest_path, 13 | test_manifest_path, 14 | 15 | requires_vggish, 16 | requires_openl3, 17 | requires_passt, 18 | requires_pann, 19 | 20 | mixup, 21 | featurizer, 22 | 23 | batch_size, 24 | train_shuffle, 25 | num_workers 26 | ) -> None: 27 | super().__init__() 28 | 29 | self.train_dataset = LatentSpaceDataset( 30 | train_manifest_path, 31 | requires_vggish=requires_vggish, 32 | requires_openl3=requires_openl3, 33 | requires_passt=requires_passt, 34 | requires_pann=requires_pann, 35 | mixup=mixup, 36 | featurizer=build_featurizer(featurizer, 
training=True) 37 | ) 38 | self.val_dataset = LatentSpaceDataset( 39 | val_manifest_path, 40 | requires_vggish=requires_vggish, 41 | requires_openl3=requires_openl3, 42 | requires_pann=requires_pann, 43 | requires_passt=requires_passt, 44 | mixup=0.0, 45 | featurizer=build_featurizer(featurizer, training=False) 46 | ) 47 | self.test_dataset = LatentSpaceDataset( 48 | test_manifest_path, 49 | requires_vggish=requires_vggish, 50 | requires_openl3=requires_openl3, 51 | requires_pann=requires_pann, 52 | requires_passt=requires_passt, 53 | mixup=0.0, 54 | featurizer=build_featurizer(featurizer, training=False) 55 | ) 56 | 57 | self.batch_size = batch_size 58 | self.train_shuffle = train_shuffle 59 | self.num_workers = num_workers 60 | 61 | def train_dataloader(self): 62 | return Data.DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=self.train_shuffle, num_workers=self.num_workers) 63 | 64 | def val_dataloader(self): 65 | return Data.DataLoader(self.val_dataset, batch_size=self.batch_size, num_workers=self.num_workers) 66 | 67 | def test_dataloader(self): 68 | return Data.DataLoader(self.test_dataset, batch_size=self.batch_size, num_workers=self.num_workers) 69 | -------------------------------------------------------------------------------- /data/dataset.py: -------------------------------------------------------------------------------- 1 | import json 2 | import soundfile as sf 3 | 4 | import numpy as np 5 | import torch 6 | import torch.utils.data as Data 7 | 8 | class LatentSpaceDataset(Data.Dataset): 9 | def __init__( 10 | self, 11 | manifest_path, 12 | 13 | requires_vggish=False, 14 | requires_openl3=False, 15 | requires_pann=False, 16 | requires_passt=False, 17 | 18 | mixup=0.0, 19 | 20 | featurizer=None 21 | ) -> None: 22 | super().__init__() 23 | 24 | with open(manifest_path) as f: 25 | self.data = [json.loads(line) for line in f] 26 | 27 | self.featurizer = featurizer 28 | 29 | self.requires_vggish = requires_vggish 30 | self.requires_openl3 = requires_openl3 31 | self.requires_pann = requires_pann 32 | self.requires_passt = requires_passt 33 | 34 | self.mixup = mixup 35 | 36 | def __len__(self): 37 | return len(self.data) 38 | 39 | def __getitem__(self, idx): 40 | if self.mixup <= 0: 41 | return self._transform_data(self.data[idx]) 42 | 43 | mixup_idx = np.random.randint(low=0, high=len(self.data)) 44 | mixup_lam = np.random.beta(self.mixup, self.mixup) 45 | mixup_lam = max(mixup_lam, 1 - mixup_lam) # keep the current sample dominant 46 | 47 | return self._mixup_data( 48 | self._transform_data(self.data[idx]), 49 | self._transform_data(self.data[mixup_idx]), 50 | mixup_lam 51 | ) 52 | 53 | def _transform_data(self, data): 54 | output_data = dict() 55 | 56 | audio, _ = sf.read(data["audio_path"]) 57 | if len(audio.shape) > 1: 58 | audio = audio.mean(axis=1) 59 | output_data["x"] = self.featurizer(torch.from_numpy(audio).float()) 60 | 61 | y = torch.tensor(data["label"], dtype=torch.float32) 62 | 63 | output_data["y"] = (y == 1).float() 64 | output_data["y_mask"] = (y != 0).bool() 65 | 66 | if self.requires_vggish: 67 | feature = torch.from_numpy(np.load(data["vggish"])) 68 | output_data["vggish"] = (feature.float() - 128) / 256 # normalization 69 | if self.requires_openl3: 70 | feature = torch.from_numpy(np.load(data["openl3"])) 71 | output_data["openl3"] = feature.float() - 2.24 # quick-and-dirty mean normalization 72 | if self.requires_pann: 73 | output_data["pann"] = torch.from_numpy(np.load(data["pann"])) 74 | if self.requires_passt: 75 | output_data["passt"] =
torch.from_numpy(np.load(data["passt"])) 76 | 77 | return output_data 78 | 79 | def _mixup_data(self, data1, data2, lam): 80 | output_data = dict() 81 | output_data["x"] = data1["x"] * lam + data2["x"] * (1. - lam) 82 | output_data["y"] = data1["y"] * lam + data2["y"] * (1. - lam) 83 | output_data["y_mask"] = data1["y_mask"] | data2["y_mask"] 84 | if self.requires_vggish: 85 | output_data["vggish"] = data1["vggish"] * lam + data2["vggish"] * (1. - lam) 86 | if self.requires_openl3: 87 | output_data["openl3"] = data1["openl3"] * lam + data2["openl3"] * (1. - lam) 88 | if self.requires_pann: 89 | output_data["pann"] = data1["pann"] * lam + data2["pann"] * (1. - lam) 90 | if self.requires_passt: 91 | output_data["passt"] = data1["passt"] * lam + data2["passt"] * (1. - lam) 92 | 93 | return output_data -------------------------------------------------------------------------------- /data/featurizer/build_featurizer.py: -------------------------------------------------------------------------------- 1 | from .waveform_featurizer import WaveformFeaturizer 2 | from .log_mel_featurizer import LogMelFeaturizer 3 | 4 | _featurizers_dict = dict( 5 | waveform_featurizer=WaveformFeaturizer, 6 | log_mel_featurizer=LogMelFeaturizer 7 | ) 8 | 9 | def build_featurizer(cfg, training=True): 10 | featurizer_name = cfg.get("name").lower() 11 | 12 | if featurizer_name in _featurizers_dict: 13 | Featurizer = _featurizers_dict[featurizer_name] 14 | else: 15 | raise KeyError("Expect model name in {}, but got {}!".format(_featurizers_dict.keys(), featurizer_name)) 16 | 17 | featurizer = Featurizer(training=training, **cfg.get('args')) 18 | return featurizer -------------------------------------------------------------------------------- /data/featurizer/log_mel_featurizer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torchaudio 6 | 7 | EPSILON = 1e-5 8 | 9 | class LogMelFeaturizer(nn.Module): 10 | def __init__(self, n_fft=2048, hop_length=512, win_length=1024, sr=44100, 11 | n_mels=128, fmin=20.0, fmax=22050.0, fmin_aug_range=0, fmax_aug_range=0, 12 | normalize_mean=0.0, normalize_std=1.0, 13 | freqm=48, timem=192, training=True 14 | ) -> None: 15 | super().__init__() 16 | 17 | self.n_fft = n_fft 18 | self.hop_length = hop_length 19 | self.win_length = win_length 20 | self.sr = sr 21 | 22 | self.n_mels = n_mels 23 | self.fmin = fmin 24 | self.fmax = fmax 25 | self.fmin_aug_range = fmin_aug_range 26 | self.fmax_aug_range = fmax_aug_range 27 | 28 | self.normalize_mean = normalize_mean 29 | self.normalize_std = normalize_std 30 | 31 | self.training = training 32 | self.freqm = torchaudio.transforms.FrequencyMasking(freqm) if (freqm > 0 and training) else nn.Identity() 33 | self.timem = torchaudio.transforms.TimeMasking(timem) if (timem > 0 and training) else nn.Identity() 34 | 35 | window = torch.hann_window(win_length, periodic=False) 36 | self.register_buffer("window", window, persistent=False) 37 | 38 | self.register_buffer("preemphasis_coefficient", torch.as_tensor([[[-0.97, 1]]]), persistent=False) 39 | 40 | def forward(self, x): 41 | with torch.no_grad(): 42 | x = x.unsqueeze(dim=0) 43 | x = F.conv1d(x.unsqueeze(dim=1), self.preemphasis_coefficient).squeeze(dim=1) 44 | 45 | x = torch.stft(x, n_fft=self.n_fft, hop_length=self.hop_length, win_length=self.win_length, window=self.window, return_complex=True) 46 | x = torch.abs(x) ** 2 47 | 48 | fmin = self.fmin + 
np.random.randint(0, self.fmin_aug_range) if self.training and self.fmin_aug_range > 0 else self.fmin 49 | fmax = self.fmax - np.random.randint(0, self.fmax_aug_range) if self.training and self.fmax_aug_range > 0 else self.fmax 50 | mel_basis, _ = torchaudio.compliance.kaldi.get_mel_banks( 51 | self.n_mels, self.n_fft, self.sr, fmin, fmax, vtln_low=100, vtln_high=-500, vtln_warp_factor=1.0) 52 | mel_basis = F.pad(mel_basis, (0, 1), mode='constant', value=0) 53 | 54 | mel_spec = torch.matmul(mel_basis, x) 55 | mel_spec = torch.log(mel_spec + EPSILON) 56 | 57 | if self.training: 58 | mel_spec = self.freqm(mel_spec) 59 | mel_spec = self.timem(mel_spec) 60 | mel_spec = mel_spec.squeeze(dim=0) 61 | 62 | return (mel_spec - self.normalize_mean) / self.normalize_std 63 | -------------------------------------------------------------------------------- /data/featurizer/waveform_featurizer.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | 3 | class WaveformFeaturizer(nn.Identity): 4 | def __init__(self, training=True) -> None: 5 | super().__init__() -------------------------------------------------------------------------------- /eval_model.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import pytorch_lightning as pl 4 | 5 | from data import LatentSpaceDataModule 6 | from models import TeacherStudentModel 7 | 8 | 9 | def eval_model(config, ckpt_path, manifest_path): 10 | with open(config) as f: 11 | config = json.load(f) 12 | 13 | data_cfg = config["data"] 14 | data_cfg["test_manifest_path"] = manifest_path 15 | datamodule = LatentSpaceDataModule(**data_cfg) 16 | 17 | model = TeacherStudentModel.load_from_checkpoint(ckpt_path) 18 | 19 | evaluator = pl.Trainer(accelerator='gpu') 20 | evaluator.test(model, datamodule=datamodule) 21 | 22 | if __name__ == '__main__': 23 | import fire 24 | 25 | fire.Fire(eval_model) -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | import torch 4 | import torch.nn.functional as F 5 | import pytorch_lightning as pl 6 | 7 | from .basemodels import get_base_model_and_pred_from 8 | from .reg_loss_fn import get_reg_loss_fn 9 | from .metrics import MultiLabelBinaryEval 10 | 11 | class TeacherStudentModel(pl.LightningModule): 12 | def __init__(self, configs) -> None: 13 | super().__init__() 14 | 15 | self.save_hyperparameters() 16 | 17 | self.backbone_cfg = configs["backbone"] 18 | self.optim_cfg = configs["optim"] 19 | 20 | self.student, self.student_pred_from = get_base_model_and_pred_from(self.backbone_cfg["student"]) 21 | self.feature = self.backbone_cfg["feature"] 22 | 23 | if self.backbone_cfg["teacher"] is not None: 24 | self.teacher, self.teacher_pred_from = get_base_model_and_pred_from(self.backbone_cfg["teacher"]) 25 | for param in self.teacher.parameters(): 26 | param.requires_grad = False 27 | 28 | self.reg_loss_weight = self.backbone_cfg["reg_loss_weight"] 29 | self.reg_loss_fn = get_reg_loss_fn(self.backbone_cfg["regularization"]) 30 | else: 31 | self.teacher = None 32 | 33 | self.train_metrics = MultiLabelBinaryEval(num_classes=self.backbone_cfg["student"]["args"]["num_classes"]) 34 | self.val_metrics = MultiLabelBinaryEval(num_classes=self.backbone_cfg["student"]["args"]["num_classes"]) 35 | self.test_metrics =
MultiLabelBinaryEval(num_classes=self.backbone_cfg["student"]["args"]["num_classes"]) 36 | 37 | def training_step(self, batch, batch_idx) -> Any: 38 | loss_dict, logits = self.common_step(batch) 39 | 40 | self.log("lr", self.optimizers().optimizer.param_groups[0]["lr"]) 41 | self.log_dict_prefix(loss_dict, "train") 42 | 43 | self.train_metrics.update(logits, torch.round(batch["y"]), batch["y_mask"]) 44 | 45 | return loss_dict["loss/total"] 46 | 47 | def validation_step(self, batch, batch_idx) -> Any: 48 | loss_dict, logits = self.common_step(batch) 49 | 50 | self.log_dict_prefix(loss_dict, "val") 51 | 52 | self.val_metrics.update(logits, torch.round(batch["y"]), batch["y_mask"]) 53 | 54 | return loss_dict["loss/total"] 55 | 56 | def test_step(self, batch, batch_idx): 57 | loss_dict, logits = self.common_step(batch) 58 | 59 | self.log_dict_prefix(loss_dict, "test") 60 | 61 | self.test_metrics.update(logits, torch.round(batch["y"]), batch["y_mask"]) 62 | 63 | def common_step(self, batch): 64 | y = batch['y'] 65 | y_mask = batch['y_mask'] 66 | 67 | loss_dict = dict() 68 | output_dict_s = self.submodel_forward(batch, self.student, self.student_pred_from) 69 | 70 | loss_pred = F.binary_cross_entropy(output_dict_s['logits'], y, reduction='none') 71 | loss_pred = loss_pred[y_mask].mean() 72 | 73 | loss_dict['loss/pred'] = loss_pred 74 | 75 | if self.teacher is not None: 76 | output_dict_t = self.submodel_forward(batch, self.teacher, self.teacher_pred_from) 77 | loss_reg = self.reg_loss_fn(output_dict_s, output_dict_t, y_mask) 78 | loss_dict['loss/reg'] = loss_reg 79 | loss = loss_pred * (1 - self.reg_loss_weight) + loss_reg * self.reg_loss_weight 80 | else: 81 | loss = loss_pred 82 | loss_dict['loss/total'] = loss 83 | return loss_dict, output_dict_s['logits'] 84 | 85 | def submodel_forward(self, batch, model, pred_from): 86 | if pred_from == "x": 87 | return model(batch["x"]) 88 | elif pred_from == "feature": 89 | return model(batch[self.feature]) 90 | else: 91 | raise NotImplementedError 92 | 93 | def on_train_epoch_start(self) -> None: 94 | self.train_metrics.reset() 95 | 96 | def on_validation_epoch_start(self) -> None: 97 | self.val_metrics.reset() 98 | 99 | def on_test_epoch_start(self) -> None: 100 | self.test_metrics.reset() 101 | 102 | def on_train_epoch_end(self) -> None: 103 | metric_dict = self.train_metrics.compute() 104 | self.log_dict_prefix(metric_dict, 'train') 105 | 106 | def on_validation_epoch_end(self) -> None: 107 | metric_dict = self.val_metrics.compute() 108 | self.log_dict_prefix(metric_dict, 'val') 109 | 110 | def on_test_epoch_end(self) -> None: 111 | metric_dict = self.test_metrics.compute() 112 | self.log_dict_prefix(metric_dict, 'test') 113 | 114 | def log_dict_prefix(self, d, prefix): 115 | for k, v in d.items(): 116 | self.log("{}/{}".format(prefix, k), v) 117 | 118 | def configure_optimizers(self) -> Any: 119 | optimizer_cfg = self.optim_cfg["optimizer"] 120 | scheduler_cfg = self.optim_cfg["scheduler"] 121 | 122 | optimizer = torch.optim.__dict__.get(optimizer_cfg["name"])(self.parameters(), **optimizer_cfg["args"]) 123 | scheduler = torch.optim.lr_scheduler.__dict__.get(scheduler_cfg["name"])(optimizer, **scheduler_cfg["args"]) 124 | return dict( 125 | optimizer=optimizer, 126 | lr_scheduler=dict( 127 | scheduler=scheduler, 128 | monitor=scheduler_cfg["monitor"], 129 | )) -------------------------------------------------------------------------------- /models/basemodels/__init__.py: -------------------------------------------------------------------------------- 
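The loader in the file below reloads Lightning checkpoints as bare modules: `TeacherStudentModel` keeps the base network under the attribute `student`, so saved keys look like `student.proj.weight` and the prefix must be stripped before `load_state_dict`. A minimal sketch of that remapping against one of the shipped checkpoints; an illustration of the convention, not a second API:

import torch

ckpt = torch.load("work_dir_openmic/lr_pann/epoch=5-val_mAP=0.865.ckpt", map_location="cpu")
state_dict = {
    name.replace("student.", "", 1): weight       # drop the "student." attribute prefix
    for name, weight in ckpt["state_dict"].items()
    if name.startswith("student.")                # teacher weights are not reloaded here
}
# state_dict now matches a bare PretrainedFeatureClassifier and can be passed to load_state_dict.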
1 | import torch 2 | 3 | from .pretrained_feature_classifier import PretrainedFeatureClassifier 4 | from .cp_resnet import CPResNet 5 | from .mobile_fcn import MobileFCN 6 | 7 | ALL_MODELS = dict( 8 | PretrainedFeatureClassifier=PretrainedFeatureClassifier, 9 | CPResNet=CPResNet, 10 | MobileFCN=MobileFCN 11 | ) 12 | PRED_FROM_DICT = dict( 13 | PretrainedFeatureClassifier="feature", 14 | CPResNet="x", 15 | MobileFCN="x" 16 | ) 17 | 18 | def get_base_model_and_pred_from(cfg: dict): 19 | name = cfg.get("name") 20 | ckpt = cfg.get("ckpt", None) 21 | args = cfg.get("args", {}) 22 | 23 | model = ALL_MODELS[name](**args) 24 | pred_from = PRED_FROM_DICT[name] 25 | if ckpt is not None: 26 | print("Loading pretrained model from {}!".format(ckpt)) 27 | pretrained_state_dict = torch.load(ckpt)["state_dict"] 28 | state_dict = dict() 29 | for layer_name in pretrained_state_dict: 30 | if not layer_name.startswith('student.'): 31 | continue 32 | new_layer_name = layer_name.replace('student.', '') 33 | state_dict[new_layer_name] = pretrained_state_dict[layer_name] 34 | model.load_state_dict(state_dict) 35 | 36 | return model, pred_from 37 | -------------------------------------------------------------------------------- /models/basemodels/cp_resnet.py: -------------------------------------------------------------------------------- 1 | """ 2 | Code for CP ResNet 3 | 4 | Referred to the code from the official implementation: https://github.com/kkoutini/cpjku_dcase20/blob/master/models/cp_resnet.py 5 | """ 6 | # coding: utf-8 7 | import math 8 | 9 | import torch 10 | import torch.nn as nn 11 | import torch.nn.functional as F 12 | 13 | 14 | def initialize_weights(module): 15 | if isinstance(module, nn.Conv2d): 16 | nn.init.kaiming_normal_(module.weight.data, mode='fan_in', nonlinearity="relu") 17 | # nn.init.kaiming_normal_(module.weight.data, mode='fan_out') 18 | elif isinstance(module, nn.BatchNorm2d): 19 | module.weight.data.fill_(1) 20 | module.bias.data.zero_() 21 | elif isinstance(module, nn.Linear): 22 | module.bias.data.zero_() 23 | 24 | 25 | layer_index_total = 0 26 | 27 | 28 | def initialize_weights_fixup(module): 29 | # source: https://github.com/ajbrock/BoilerPlate/blob/master/Models/fixup.py 30 | if isinstance(module, BasicBlock): 31 | # He init, rescaled by Fixup multiplier 32 | b = module 33 | n = b.conv1.kernel_size[0] * b.conv1.kernel_size[1] * b.conv1.out_channels 34 | #print(b.layer_index, math.sqrt(2. / n), layer_index_total ** (-0.5)) 35 | b.conv1.weight.data.normal_(0, (layer_index_total ** (-0.5)) * math.sqrt(2. / n)) 36 | b.conv2.weight.data.zero_() 37 | if b.shortcut._modules.get('conv') is not None: 38 | convShortcut = b.shortcut._modules.get('conv') 39 | n = convShortcut.kernel_size[0] * convShortcut.kernel_size[1] * convShortcut.out_channels 40 | convShortcut.weight.data.normal_(0, math.sqrt(2. 
/ n)) 41 | if isinstance(module, nn.Conv2d): 42 | pass 43 | # nn.init.kaiming_normal_(module.weight.data, mode='fan_in', nonlinearity="relu") 44 | # nn.init.kaiming_normal_(module.weight.data, mode='fan_out') 45 | elif isinstance(module, nn.BatchNorm2d): 46 | module.weight.data.fill_(1) 47 | module.bias.data.zero_() 48 | elif isinstance(module, nn.Linear): 49 | module.bias.data.zero_() 50 | 51 | 52 | def calc_padding(kernal): 53 | try: 54 | return kernal // 3 55 | except TypeError: 56 | return [k // 3 for k in kernal] 57 | 58 | 59 | class BasicBlock(nn.Module): 60 | expansion = 1 61 | 62 | def __init__(self, in_channels, out_channels, stride, k1=3, k2=3): 63 | super(BasicBlock, self).__init__() 64 | global layer_index_total 65 | self.layer_index = layer_index_total 66 | layer_index_total = layer_index_total + 1 67 | self.conv1 = nn.Conv2d( 68 | in_channels, 69 | out_channels, 70 | kernel_size=k1, 71 | stride=stride, # downsample with first conv 72 | padding=calc_padding(k1), 73 | bias=False) 74 | self.bn1 = nn.BatchNorm2d(out_channels) 75 | self.conv2 = nn.Conv2d( 76 | out_channels, 77 | out_channels, 78 | kernel_size=k2, 79 | stride=1, 80 | padding=calc_padding(k2), 81 | bias=False) 82 | self.bn2 = nn.BatchNorm2d(out_channels) 83 | 84 | self.shortcut = nn.Sequential() 85 | if in_channels != out_channels: 86 | self.shortcut.add_module( 87 | 'conv', 88 | nn.Conv2d( 89 | in_channels, 90 | out_channels, 91 | kernel_size=1, 92 | stride=stride, # downsample 93 | padding=0, 94 | bias=False)) 95 | self.shortcut.add_module('bn', nn.BatchNorm2d(out_channels)) # BN 96 | 97 | def forward(self, x): 98 | y = F.relu(self.bn1(self.conv1(x)), inplace=True) 99 | y = self.bn2(self.conv2(y)) 100 | y += self.shortcut(x) 101 | y = F.relu(y, inplace=True) # apply ReLU after addition 102 | return y 103 | 104 | 105 | 106 | class CPResNet(nn.Module): 107 | def __init__( 108 | self, 109 | rho, 110 | in_channel, 111 | base_channels=128, 112 | num_classes=20, 113 | ): 114 | 115 | super(CPResNet, self).__init__() 116 | 117 | 118 | self.in_c = nn.Sequential( 119 | nn.Conv2d(in_channel, base_channels, 5, 2, 2, bias=False), 120 | nn.BatchNorm2d(base_channels), 121 | nn.ReLU(True) 122 | ) 123 | 124 | extra_kernal_rf = rho - 7 125 | 126 | self.stage1 = self._make_stage( 127 | base_channels, base_channels, 4, 128 | maxpool={1:2, 2:2, 4:2}, 129 | k1s=( 130 | 3, 131 | 3 - (-extra_kernal_rf > 6) * 2, 132 | 3 - (-extra_kernal_rf > 4) * 2, 133 | 3 - (-extra_kernal_rf > 2) * 2), 134 | k2s=( 135 | 1, 136 | 3 - (-extra_kernal_rf > 5) * 2, 137 | 3 - (-extra_kernal_rf > 3) * 2, 138 | 3 - (-extra_kernal_rf > 1) * 2)) 139 | 140 | self.stage2 = self._make_stage( 141 | base_channels, base_channels * 2, 4, 142 | k1s=( 143 | 3 - (-extra_kernal_rf > 0) * 2, 144 | 1 + (extra_kernal_rf > 1) * 2, 145 | 1 + (extra_kernal_rf > 3) * 2, 146 | 1 + (extra_kernal_rf > 5) * 2), 147 | k2s=(1, 148 | 3 - (-extra_kernal_rf > 5) * 2, 149 | 3 - (-extra_kernal_rf > 3) * 2, 150 | 3 - (-extra_kernal_rf > 1) * 2)) 151 | 152 | self.stage3 = self._make_stage( 153 | base_channels * 2, base_channels * 4, 4, 154 | k1s=( 155 | 1 + (extra_kernal_rf > 7) * 2, 156 | 1 + (extra_kernal_rf > 9) * 2, 157 | 1 + (extra_kernal_rf > 11) * 2, 158 | 1 + (extra_kernal_rf > 13) * 2), 159 | k2s=( 160 | 1 + (extra_kernal_rf > 8) * 2, 161 | 1 + (extra_kernal_rf > 10) * 2, 162 | 1 + (extra_kernal_rf > 12) * 2, 163 | 1 + (extra_kernal_rf > 14) * 2)) 164 | 165 | self.feed_forward = nn.Linear(base_channels * 4, num_classes) 166 | 167 | # initialize weights 168 | 
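As a quick sanity check of the `extra_kernal_rf` kernel-size arithmetic used in the three stages above, worked out by hand for the `rho=7` that this repo's CPResNet configs use: stages 1 and 2 keep genuine 3x3 convolutions, stage 3 degenerates to pointwise 1x1 kernels, and a larger `rho` turns more of the later kernels back into 3x3, enlarging the receptive field.

rho = 7
extra_kernal_rf = rho - 7                    # 0 for the rho=7 configs
assert 3 - (-extra_kernal_rf > 6) * 2 == 3   # stage 1: second conv keeps a 3x3 kernel
assert 3 - (-extra_kernal_rf > 0) * 2 == 3   # stage 2: first conv keeps a 3x3 kernel
assert 1 + (extra_kernal_rf > 7) * 2 == 1    # stage 3: first conv is only 1x1 (pointwise)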
self.apply(initialize_weights) 169 | self.apply(initialize_weights_fixup) 170 | 171 | def _make_stage(self, in_channels, out_channels, n_blocks, maxpool=set(), k1s=[3, 3, 3, 3, 3, 3], 172 | k2s=[3, 3, 3, 3, 3, 3]): 173 | stage = nn.Sequential() 174 | 175 | for index in range(n_blocks): 176 | stage.add_module('block{}'.format(index + 1), 177 | BasicBlock(in_channels, out_channels, stride=1, k1=k1s[index], k2=k2s[index])) 178 | 179 | in_channels = out_channels 180 | if index + 1 in maxpool: 181 | stage.add_module("maxpool{}".format(index + 1), nn.MaxPool2d(maxpool[index + 1])) 182 | return stage 183 | 184 | def forward_conv(self, x): 185 | x = self.in_c(x) 186 | output_1 = self.stage1(x) 187 | output_2 = self.stage2(output_1) 188 | output_3 = self.stage3(output_2) 189 | return output_1, output_2, output_3 190 | 191 | def forward(self, x): 192 | x = x.unsqueeze(dim=1) 193 | output_1, output_2, output_3 = self.forward_conv(x) 194 | output = output_3.mean(dim=-1).mean(dim=-1) 195 | output = self.feed_forward(output) 196 | return dict( 197 | logits=torch.sigmoid(output), 198 | scores=output, 199 | output_1=output_1.mean(dim=-2).transpose(-1, -2), 200 | output_2=output_2.mean(dim=-2).transpose(-1, -2), 201 | output_3=output_3.mean(dim=-2).transpose(-1, -2) 202 | ) 203 | -------------------------------------------------------------------------------- /models/basemodels/mobile_fcn.py: -------------------------------------------------------------------------------- 1 | """ 2 | Referred to the implementation by Minz Won: https://github.com/minzwon/sota-music-tagging-models/blob/master/training/model.py 3 | """ 4 | 5 | import torch 6 | import torch.nn as nn 7 | 8 | class Conv_2d(nn.Module): 9 | def __init__(self, input_channels, output_channels, shape=3, stride=1, pooling=2): 10 | super(Conv_2d, self).__init__() 11 | self.conv = nn.Conv2d(input_channels, output_channels, shape, stride=stride, padding=shape//2, bias=False) 12 | self.bn = nn.BatchNorm2d(output_channels) 13 | self.relu = nn.ReLU() 14 | self.mp = nn.MaxPool2d(pooling) 15 | def forward(self, x): 16 | out = self.mp(self.relu(self.bn(self.conv(x)))) 17 | return out 18 | 19 | class Conv_2d_DW(nn.Module): 20 | def __init__(self, input_channels, output_channels, shape=3, stride=1, pooling=2) -> None: 21 | super().__init__() 22 | self.conv1 = nn.Conv2d(input_channels, input_channels, shape, stride=stride, padding=shape//2, groups=input_channels, bias=False) 23 | self.bn1 = nn.BatchNorm2d(input_channels) 24 | self.conv2 = nn.Conv2d(input_channels, output_channels, 1, bias=False) 25 | self.bn2 = nn.BatchNorm2d(output_channels) 26 | self.relu = nn.ReLU() 27 | self.mp = nn.MaxPool2d(pooling) 28 | 29 | def forward(self, x): 30 | out = self.relu(self.bn1(self.conv1(x))) 31 | out = self.mp(self.relu(self.bn2(self.conv2(out)))) 32 | return out 33 | 34 | 35 | class MobileFCN(nn.Module): 36 | ''' 37 | Choi et al. 2016 38 | Automatic tagging using deep convolutional neural networks. 39 | Fully convolutional network. 
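Both student backbones end `forward` the same way as `CPResNet` just above: besides `logits` and `scores`, they expose per-stage maps pooled over the frequency axis and transposed to (batch, time, channels), the layout `FeatureSpaceRegularizationLoss` consumes as `output_1..3`. A shape sketch under assumed input sizes:

import torch
from models.basemodels.cp_resnet import CPResNet

model = CPResNet(rho=7, in_channel=1, base_channels=128, num_classes=20)
out = model(torch.randn(2, 128, 998))   # (batch, n_mels, frames): assumed sizes
print(out["logits"].shape)              # torch.Size([2, 20])
print(out["output_3"].shape)            # (2, T', 512): T' time steps by base_channels * 4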
40 | ''' 41 | def __init__(self, n_mels=128, num_classes=50): 42 | super(MobileFCN, self).__init__() 43 | 44 | # FCN 45 | self.layer1 = Conv_2d(1, 64, pooling=(2,4)) 46 | self.layer2 = Conv_2d(64, 128, pooling=(2,3)) 47 | self.layer3 = Conv_2d_DW(128, 128, pooling=(2,2)) 48 | if n_mels == 128: 49 | self.layer4 = Conv_2d_DW(128, 128, pooling=(4,2)) 50 | else: 51 | self.layer4 = Conv_2d_DW(128, 128, pooling=(3,2)) 52 | 53 | self.layer5 = Conv_2d_DW(128, 128, pooling=(4,2)) 54 | self.layer6 = Conv_2d_DW(128, 256, pooling=1) 55 | self.layer7 = Conv_2d_DW(256, 256, pooling=1) 56 | self.layer8 = Conv_2d_DW(256, 256, pooling=1) 57 | 58 | # Dense 59 | self.dense = nn.Linear(256, num_classes) 60 | self.dropout = nn.Dropout(0.5) 61 | 62 | def forward(self, x): 63 | x = x.unsqueeze(dim=1) 64 | 65 | # FCN 66 | x = self.layer1(x) 67 | x = self.layer2(x) 68 | x = self.layer3(x) 69 | x = self.layer4(x) 70 | output_1 = x 71 | 72 | x = self.dropout(x) 73 | x = self.layer5(x) 74 | x = self.layer6(x) 75 | output_2 = x 76 | 77 | x = self.dropout(x) 78 | x = self.layer7(x) 79 | x = self.layer8(x) 80 | output_3 = x 81 | 82 | # Dense 83 | x = self.dropout(x) 84 | output = x.mean(dim=-1).mean(dim=-1) 85 | output = self.dense(output) 86 | # print(output_1.shape, output_2.shape, output_3.shape) 87 | return dict( 88 | logits=torch.sigmoid(output), 89 | scores=output, 90 | output_1=output_1.mean(dim=-2).transpose(-1, -2), 91 | output_2=output_2.mean(dim=-2).transpose(-1, -2), 92 | output_3=output_3.mean(dim=-2).transpose(-1, -2) 93 | ) 94 | 95 | if __name__ == "__main__": 96 | model = MobileFCN(num_classes=50) 97 | import numpy as np 98 | print(np.sum([np.prod(param.shape) for param in model.parameters()])) 99 | 100 | x = torch.randn(2, 128, 1457) 101 | model(x) -------------------------------------------------------------------------------- /models/basemodels/pretrained_feature_classifier.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class PretrainedFeatureClassifier(nn.Module): 6 | def __init__( 7 | self, 8 | input_dim, 9 | num_classes 10 | ) -> None: 11 | super().__init__() 12 | 13 | self.proj = nn.Linear(input_dim, num_classes) 14 | 15 | def forward(self, x): 16 | output = self.proj(x.mean(dim=1)) 17 | return dict( 18 | scores=output, 19 | logits=torch.sigmoid(output), 20 | feature=x 21 | ) 22 | -------------------------------------------------------------------------------- /models/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | from .metrics import MultiLabelBinaryEval -------------------------------------------------------------------------------- /models/metrics/metrics.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchmetrics import Metric 3 | from torchmetrics.functional.classification import binary_auroc, binary_average_precision 4 | 5 | 6 | class MultiLabelBinaryEval(Metric): 7 | def __init__(self, num_classes) -> None: 8 | super().__init__() 9 | 10 | self.num_classes = num_classes 11 | self.logits = {i: [] for i in range(self.num_classes)} 12 | self.target = {i: [] for i in range(self.num_classes)} 13 | 14 | def update(self, logits, target, mask): 15 | with torch.no_grad(): 16 | for i in range(self.num_classes): 17 | idx = mask[:, i] 18 | 19 | if sum(idx) == 0: 20 | # This will only be used in OpenMIC to avoid error 21 | # The batch does not contain any label for this class 22 | continue 23 
| 24 | self.logits[i].append(logits[idx, i]) 25 | self.target[i].append(target[idx, i]) 26 | 27 | def compute(self): 28 | with torch.no_grad(): 29 | logits = [torch.concat(self.logits[i]) for i in range(self.num_classes)] 30 | targets = [torch.concat(self.target[i]) for i in range(self.num_classes)] 31 | 32 | f1 = [self._compute_f1(logits[i], targets[i].int()) for i in range(self.num_classes)] 33 | binary_f1 = [x[0] for x in f1] 34 | macro_f1 = [x[1] for x in f1] 35 | 36 | mAP = [binary_average_precision(logits[i], targets[i].int()) for i in range(self.num_classes)] 37 | auc_roc = [binary_auroc(logits[i], targets[i].int()) for i in range(self.num_classes)] 38 | 39 | return dict( 40 | binary_f1=sum(binary_f1) / len(binary_f1), 41 | macro_f1=sum(macro_f1) / len(macro_f1), 42 | mAP=sum(mAP) / len(mAP), 43 | auc_roc=sum(auc_roc) / len(auc_roc), 44 | ) 45 | 46 | def reset(self) -> None: 47 | super().reset() 48 | self.logits = {i: [] for i in range(self.num_classes)} 49 | self.target = {i: [] for i in range(self.num_classes)} 50 | 51 | def _compute_f1(self, logits, target, threshold=0.4): 52 | # Hard-coded decision threshold 53 | # F1 only used on OpenMIC, not used on MagnaTagATune 54 | tp = torch.count_nonzero((logits > threshold) & (target == 1)) 55 | fp = torch.count_nonzero((logits > threshold) & (target == 0)) 56 | tn = torch.count_nonzero((logits <= threshold) & (target == 0)) 57 | fn = torch.count_nonzero((logits <= threshold) & (target == 1)) 58 | 59 | precision_p = tp / (tp + fp) 60 | recall_p = tp / (tp + fn) 61 | f1_p = 2 * precision_p * recall_p / (precision_p + recall_p) 62 | f1_p = torch.nan_to_num(f1_p, 0.) 63 | 64 | precision_n = tn / (tn + fn) 65 | recall_n = tn / (tn + fp) 66 | f1_n = torch.nan_to_num(2 * precision_n * recall_n / (precision_n + recall_n), 0.) # guard against 0/0, as for f1_p 67 | return f1_p, (f1_p + f1_n) / 2 68 | -------------------------------------------------------------------------------- /models/reg_loss_fn/__init__.py: -------------------------------------------------------------------------------- 1 | from .kd import KDLoss 2 | from .feature_space_reg import FeatureSpaceRegularizationLoss 3 | from .combine_dist_reg import CombineDistRegLoss 4 | 5 | ALL_LOSSES = dict( 6 | KDLoss=KDLoss, 7 | FeatureSpaceRegularizationLoss=FeatureSpaceRegularizationLoss, 8 | CombineDistRegLoss=CombineDistRegLoss 9 | ) 10 | 11 | def get_reg_loss_fn(cfg: dict): 12 | return ALL_LOSSES[cfg["name"]](**cfg["args"]) -------------------------------------------------------------------------------- /models/reg_loss_fn/combine_dist_reg.py: -------------------------------------------------------------------------------- 1 | from typing import Literal, Dict 2 | 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | from .feature_space_reg import FeatureSpaceRegularizationLoss 7 | from .kd import KDLoss 8 | 9 | class CombineDistRegLoss(nn.Module): 10 | def __init__( 11 | self, 12 | feature_mode, 13 | feature_stages_args: Dict[str, Dict], 14 | label_mode : Literal['soft', 'hard'], 15 | label_tau : float = 2.0, 16 | kd_weight : float = 0.3 17 | ) -> None: 18 | super().__init__() 19 | 20 | self.feature_loss_fn = FeatureSpaceRegularizationLoss(feature_mode, feature_stages_args) 21 | self.kd_loss_fn = KDLoss(label_mode, label_tau) 22 | self.kd_weight = kd_weight 23 | 24 | def forward(self, output_dict_s, output_dict_t, mask): 25 | feature_loss = self.feature_loss_fn(output_dict_s, output_dict_t, mask) 26 | kd_loss = self.kd_loss_fn(output_dict_s, output_dict_t, mask) 27 | return kd_loss * self.kd_weight + feature_loss * (1 -
self.kd_weight) 28 | -------------------------------------------------------------------------------- /models/reg_loss_fn/feature_space_reg.py: -------------------------------------------------------------------------------- 1 | from typing import List, Dict 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | class FeatureSpaceRegularizationLoss(nn.Module): 8 | """ 9 | Feature space regularization loss 10 | 11 | Inputs 12 | ---------- 13 | output_dict_s: Dict 14 | student output dict, might use "output_1", "output_2", "output_3", (N, T, C) 15 | output_dict_t: Dict 16 | teacher output dict, might use "feature", (N, T, C) 17 | """ 18 | def __init__( 19 | self, 20 | mode, 21 | stages_args: Dict[str, Dict], 22 | ) -> None: 23 | super().__init__() 24 | 25 | self.mode = mode 26 | self.stage_args = stages_args 27 | weights = {stage: args.get("weight", 0.0) for stage, args in stages_args.items()} 28 | self.weights = {stage: weight / sum(weights.values()) for stage, weight in weights.items()} 29 | self.student_expand = {stage: args.get("student_expand", -1) for stage, args in stages_args.items()} 30 | self.teacher_expand = {stage: args.get("teacher_expand", -1) for stage, args in stages_args.items()} 31 | 32 | def forward(self, output_dict_s, output_dict_t, mask): 33 | loss = None 34 | for stage in self.stage_args: 35 | if not self.weights[stage] > 0: 36 | continue 37 | output = output_dict_s["output_{}".format(stage)] 38 | target = output_dict_t["feature"] 39 | 40 | output = self.expand_feature_time(output, self.student_expand[stage]) 41 | target = self.expand_feature_time(target, self.teacher_expand[stage]) 42 | # print(output.shape, target.shape) 43 | if len(target.shape) == 3: 44 | length = min(target.shape[1], output.shape[1]) 45 | output = output[:, :length] 46 | target = target[:, :length] 47 | assert output.shape[:-1] == target.shape[:-1] 48 | 49 | if loss is None: 50 | loss = self.weights[stage] * self.compute_reg_loss(output, target) 51 | else: 52 | loss += self.weights[stage] * self.compute_reg_loss(output, target) 53 | return loss 54 | 55 | def expand_feature_time(self, feature, expand): 56 | if expand == -1: 57 | return torch.mean(feature, dim=1) 58 | else: 59 | return torch.repeat_interleave(feature, expand, dim=1) 60 | 61 | def compute_distance_correlation(self, x, y): 62 | # x = F.normalize(x, dim=-1) # N, T, C or N, C 63 | # y = F.normalize(y, dim=-1) # N, T, C or N, C 64 | 65 | if len(x.shape) == 3: 66 | x = x.transpose(0, 1) # T, N, C 67 | y = y.transpose(0, 1) # T, N, C 68 | 69 | x = torch.sqrt(torch.sum(torch.square(x.unsqueeze(-3) - x.unsqueeze(-2)), dim = -1) + 1e-12) # T, N, N or N, N 70 | y = torch.sqrt(torch.sum(torch.square(y.unsqueeze(-3) - y.unsqueeze(-2)), dim = -1) + 1e-12) # T, N, N or N, N 71 | 72 | x = x - torch.mean(x, dim=-2, keepdims=True) - torch.mean(x, dim=-1, keepdims=True) + torch.mean(x, dim=(-2, -1), keepdims=True) 73 | y = y - torch.mean(y, dim=-2, keepdims=True) - torch.mean(y, dim=-1, keepdims=True) + torch.mean(y, dim=(-2, -1), keepdims=True) 74 | 75 | xy = torch.mean(x * y, dim=(-2, -1)) 76 | xx = torch.mean(x * x, dim=(-2, -1)) 77 | yy = torch.mean(y * y, dim=(-2, -1)) 78 | 79 | correlation_r = xy / torch.sqrt(xx * yy + 1e-9) 80 | return (1 - correlation_r).mean() 81 | 82 | def compute_cosine_distance_difference(self, x, y): 83 | x = F.normalize(x, dim=-1) # N, T, C or N, C 84 | y = F.normalize(y, dim=-1) # N, T, C or N, C 85 | 86 | if len(x.shape) == 3: 87 | x = x.transpose(0, 1) # T, N, C 88 | y = 
y.transpose(0, 1) # T, N, C 89 | 90 | x = torch.matmul(x, x.transpose(-1, -2)) # T, N, N or N, N 91 | y = torch.matmul(y, y.transpose(-1, -2)) # T, N, N or N, N 92 | 93 | dist = torch.abs(x - y) 94 | return dist.mean() 95 | 96 | def compute_reg_loss(self, x, y): 97 | if self.mode == "distance_correlation": 98 | return self.compute_distance_correlation(x, y) 99 | elif self.mode == "cosine_distance_difference": 100 | return self.compute_cosine_distance_difference(x, y) 101 | else: 102 | raise NotImplementedError -------------------------------------------------------------------------------- /models/reg_loss_fn/kd.py: -------------------------------------------------------------------------------- 1 | from typing import Literal 2 | 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | EPSILON = 1e-9 7 | 8 | class KDLoss(nn.Module): 9 | def __init__( 10 | self, 11 | mode : Literal['soft', 'hard'], 12 | tau : float = 2.0 13 | ) -> None: 14 | super().__init__() 15 | 16 | self.mode = mode 17 | self.tau = tau 18 | 19 | def forward(self, output_dict_s, output_dict_t, mask): 20 | logits_s = output_dict_s['logits'] 21 | logits_t = output_dict_t['logits'] 22 | 23 | if self.mode == 'soft': 24 | scores_s = output_dict_s['scores'] / self.tau 25 | scores_t = output_dict_t['scores'] / self.tau 26 | logits_s = scores_s.sigmoid() 27 | logits_t = scores_t.sigmoid() 28 | # KD for binary classification: per-class KL between teacher and student Bernoulli distributions 29 | loss = logits_t * (scores_t - scores_s) + ((1 - logits_t + EPSILON) / (1 - logits_s + EPSILON)).log() 30 | loss = loss[mask].mean() * self.tau ** 2 31 | 32 | else: 33 | loss = F.binary_cross_entropy(logits_s, logits_t.round(), reduction="none") 34 | loss = loss[mask].mean() 35 | return loss 36 | 37 | 38 | 39 | -------------------------------------------------------------------------------- /preprocess/parse_magna.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from tqdm import tqdm 4 | import soundfile as sf 5 | import warnings 6 | warnings.filterwarnings('ignore', message='PySoundFile failed') 7 | 8 | import librosa 9 | import numpy as np 10 | import pandas as pd 11 | 12 | def parse_magna(data_root, feature_root, output_dir): 13 | label_file = os.path.join(data_root, "annotations_final.csv") 14 | df_label = pd.read_csv(label_file, sep="\t") 15 | 16 | col_names = df_label.columns.to_list()[1:-1] 17 | label_count = df_label.to_numpy()[:, 1:-1].astype(np.int32).sum(axis=0) 18 | 19 | top_50_col_index = np.argsort(-label_count)[:50] 20 | top_50_col_names = [col_names[i] for i in top_50_col_index] 21 | 22 | df_label = df_label[["clip_id"] + top_50_col_names + ["mp3_path"]] 23 | 24 | if not os.path.exists(os.path.join(data_root, "wav")): 25 | os.mkdir(os.path.join(data_root, "wav")) 26 | 27 | data_list = [] 28 | for _, line in tqdm(df_label.iterrows()): 29 | clip_id = line["clip_id"] 30 | mp3_path = os.path.join(data_root, line["mp3_path"]) 31 | label = line.values[1:-1].astype(np.int32) 32 | label[label == 0] = -1 33 | wav_path = os.path.join(data_root, "wav/{}.wav".format(clip_id)) 34 | 35 | try: 36 | y, _ = librosa.load(mp3_path, sr=16000) 37 | sf.write(wav_path, y, samplerate=16000) 38 | except Exception: 39 | print(mp3_path) 40 | continue 41 | 42 | vggish = os.path.join(feature_root, 'vggish', '{}.npy'.format(clip_id)) 43 | # if not os.path.exists(vggish): 44 | # warnings.warn("Missing VGGish feature: {}".format(vggish)) 45 | 46 | openl3 = os.path.join(feature_root, 'openl3', '{}.npy'.format(clip_id)) 47 | # if not
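The `soft` branch of `KDLoss` above is the per-class KL divergence between teacher and student Bernoulli distributions written directly in logit space: p_t * (z_t - z_s) + log((1 - p_t) / (1 - p_s)) equals KL(Ber(p_t) || Ber(p_s)). A small numerical check of that identity with arbitrary values:

import torch

z_t, z_s = torch.tensor(1.3), torch.tensor(-0.4)   # temperature-scaled teacher/student scores
p_t, p_s = z_t.sigmoid(), z_s.sigmoid()

closed_form = p_t * (z_t - z_s) + torch.log((1 - p_t) / (1 - p_s))
kl = p_t * torch.log(p_t / p_s) + (1 - p_t) * torch.log((1 - p_t) / (1 - p_s))
assert torch.allclose(closed_form, kl)             # identical up to floating-point error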
os.path.exists(openl3): 48 | # warnings.warn("Missing Open-L3 feature: {}".format(openl3)) 49 | 50 | passt = os.path.join(feature_root, 'passt', '{}.npy'.format(clip_id)) 51 | # if not os.path.exists(passt): 52 | # warnings.warn("Missing PaSST feature: {}".format(passt)) 53 | 54 | pann = os.path.join(feature_root, 'pann', '{}.npy'.format(clip_id)) 55 | # if not os.path.exists(pann): 56 | # warnings.warn("Missing pann feature: {}".format(pann)) 57 | 58 | data = dict( 59 | clip_id=clip_id, 60 | audio_path=wav_path, 61 | label=label.tolist(), 62 | vggish=vggish, 63 | openl3=openl3, 64 | passt=passt, 65 | pann=pann 66 | ) 67 | 68 | data_list.append(data) 69 | 70 | with open(os.path.join(output_dir, "magna.json"), "w") as f: 71 | for data in data_list: 72 | json.dump(data, f) 73 | f.write('\n') 74 | f.flush() 75 | 76 | if __name__ == '__main__': 77 | import fire 78 | 79 | fire.Fire(parse_magna) -------------------------------------------------------------------------------- /preprocess/parse_openmic.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import warnings 4 | 5 | import numpy as np 6 | import pandas as pd 7 | 8 | 9 | def parse_openmic_data(data_root, feature_root, output_dir): 10 | """ 11 | Parse the openmic data and generate json line data 12 | Raise error if missing audio file 13 | Raise warning if missing feature file 14 | 15 | Parameters 16 | ---------- 17 | data_root : str 18 | root of the openmic dataset 19 | feature_root : str 20 | root of the extracted feature for openmic 21 | output_dir : str 22 | path to write the summary json line file 23 | """ 24 | openmic_data = np.load(os.path.join(data_root, 'openmic-2018.npz'), allow_pickle=True) 25 | data_y, data_sample_key = openmic_data['Y_true'], openmic_data['sample_key'] 26 | 27 | data_y[data_y > 0.5] = 1 28 | data_y[data_y < 0.5] = -1 29 | data_y[data_y == 0.5] = 0 30 | data_y = data_y.astype(np.int32) 31 | 32 | split_train = pd.read_csv(os.path.join(data_root, 'partitions/split01_train.csv'), header=None).squeeze("columns") 33 | split_test = pd.read_csv(os.path.join(data_root, 'partitions/split01_test.csv'), header=None).squeeze("columns") 34 | split_train = set(split_train) 35 | split_test = set(split_test) 36 | 37 | train_output = os.path.join(output_dir, 'openmic_train.json') 38 | test_output = os.path.join(output_dir, 'openmic_test.json') 39 | 40 | fout_train = open(train_output, 'w') 41 | fout_test = open(test_output, 'w') 42 | 43 | for idx, key in enumerate(data_sample_key): 44 | 45 | audio_path = os.path.join(data_root, 'audio', key[:3], key + '.ogg') 46 | if not os.path.exists(audio_path): 47 | raise RuntimeError("Audio file not found! 
{}".format(audio_path)) 48 | 49 | vggish = os.path.join(feature_root, 'vggish', key + '.npy') 50 | if not os.path.exists(vggish): 51 | warnings.warn("Missing VGGish feature: {}".format(vggish)) 52 | 53 | openl3 = os.path.join(feature_root, 'openl3', key + '.npy') 54 | if not os.path.exists(openl3): 55 | warnings.warn("Missing Open-L3 feature: {}".format(openl3)) 56 | 57 | passt = os.path.join(feature_root, 'passt', key + '.npy') 58 | if not os.path.exists(passt): 59 | warnings.warn("Missing PaSST feature: {}".format(passt)) 60 | 61 | pann = os.path.join(feature_root, 'pann', key + '.npy') 62 | if not os.path.exists(pann): 63 | warnings.warn("Missing Pann feature: {}".format(pann)) 64 | 65 | data = dict( 66 | sample_key=key, 67 | audio_path=audio_path, 68 | label=data_y[idx].tolist(), 69 | vggish=vggish, 70 | openl3=openl3, 71 | passt=passt, 72 | pann=pann 73 | ) 74 | 75 | if key in split_train: 76 | json.dump(data, fout_train) 77 | fout_train.write('\n') 78 | fout_train.flush() 79 | elif key in split_test: 80 | json.dump(data, fout_test) 81 | fout_test.write('\n') 82 | fout_test.flush() 83 | else: 84 | raise RuntimeError('Unknown sample key={}! Abort!'.format(key)) 85 | 86 | return 87 | 88 | 89 | if __name__ == '__main__': 90 | import fire 91 | 92 | fire.Fire(parse_openmic_data) -------------------------------------------------------------------------------- /preprocess/pretrained_feature_extractor/extract_openl3.py: -------------------------------------------------------------------------------- 1 | import os 2 | import soundfile as sf 3 | from tqdm import tqdm 4 | 5 | import numpy as np 6 | import openl3 7 | 8 | def extract_openl3_feature(input_path, output_path, model): 9 | """ 10 | Extract vggish file for one wavefile and save as an .npy file 11 | 12 | Parameters 13 | ---------- 14 | input_path : str 15 | path to the audio file to extract the feature 16 | output_path : str 17 | path to save the output feature 18 | input_str : int 19 | sampling rate of the input file 20 | """ 21 | audio, sr = sf.read(input_path) 22 | output, _ = openl3.get_audio_embedding(audio, sr, hop_size=0.96, model=model) 23 | np.save(output_path, output) 24 | 25 | def extract_multiple_vggish_feature(input_dir, output_dir, dataset="openmic"): 26 | """ 27 | Extract openl3 file for multiple wav files and save as .npy files 28 | 29 | Parameters 30 | ---------- 31 | input_dir : str 32 | directory of the input wav files 33 | output_dir : str 34 | directory of the output feature files 35 | input_str : int 36 | sampling rate of the input file 37 | """ 38 | input_file_list = os.listdir(input_dir) 39 | 40 | if not os.path.exists(output_dir): 41 | os.mkdir(output_dir) 42 | 43 | model = openl3.models.load_audio_embedding_model(input_repr="mel128", content_type="music", embedding_size=512) 44 | 45 | if dataset == "magna": 46 | for input_file in tqdm(input_file_list): 47 | input_path = os.path.join(input_dir, input_file) 48 | 49 | output_file = input_file.replace('.wav', '.npy') 50 | output_path = os.path.join(output_dir, output_file) 51 | 52 | extract_openl3_feature(input_path, output_path, model) 53 | elif dataset == "openmic": 54 | for input_subdir in tqdm(input_file_list): 55 | input_subdir_path = os.path.join(input_dir, input_subdir) 56 | for input_file in tqdm(os.listdir(input_subdir_path)): 57 | input_path = os.path.join(input_subdir_path, input_file) 58 | 59 | output_file = input_file.replace('.ogg', '.npy') 60 | output_path = os.path.join(output_dir, output_file) 61 | 62 | extract_openl3_feature(input_path, 
output_path, model) 63 | 64 | if __name__ == '__main__': 65 | import fire 66 | 67 | fire.Fire() -------------------------------------------------------------------------------- /preprocess/pretrained_feature_extractor/extract_passt.py: -------------------------------------------------------------------------------- 1 | import os 2 | import soundfile as sf 3 | import sys 4 | sys.path.append("../../../src/hear21passt") 5 | from tqdm import tqdm 6 | 7 | import librosa 8 | import numpy as np 9 | import torch 10 | 11 | from hear21passt.base import load_model, get_timestamp_embeddings 12 | 13 | 14 | def extract_passt_feature(input_path, output_path, model): 15 | """ 16 | Extract the PaSST feature for one wave file and save it as an .npy file 17 | 18 | Parameters 19 | ---------- 20 | input_path : str 21 | path to the audio file to extract the feature 22 | output_path : str 23 | path to save the output feature 24 | model 25 | the loaded PaSST embedding model 26 | """ 27 | 28 | audio, _ = librosa.load(input_path, sr=32000) 29 | audio = torch.from_numpy(audio[np.newaxis]).float() 30 | output, _ = get_timestamp_embeddings(audio, model) 31 | output = output.cpu().detach().squeeze(dim=0).numpy() 32 | np.save(output_path, output) 33 | 34 | def extract_multiple_passt_feature(input_dir, output_dir, device): 35 | """ 36 | Extract PaSST features for multiple wav files and save as .npy files 37 | 38 | Parameters 39 | ---------- 40 | input_dir : str 41 | directory of the input wav files 42 | output_dir : str 43 | directory of the output feature files 44 | """ 45 | input_file_list = os.listdir(input_dir) 46 | 47 | model = load_model(mode="embed_only", timestamp_window=960, timestamp_hop=960) 48 | model = model.to(device) 49 | 50 | if not os.path.exists(output_dir): 51 | os.mkdir(output_dir) 52 | 53 | for input_file in tqdm(input_file_list): 54 | input_path = os.path.join(input_dir, input_file) 55 | 56 | output_file = input_file.replace('.wav', '.npy') 57 | output_path = os.path.join(output_dir, output_file) 58 | 59 | extract_passt_feature(input_path, output_path, model) 60 | 61 | 62 | if __name__ == '__main__': 63 | import fire 64 | 65 | fire.Fire() -------------------------------------------------------------------------------- /preprocess/pretrained_feature_extractor/extract_vggish.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | from tqdm import tqdm 4 | 5 | import numpy as np 6 | import torch 7 | 8 | model = torch.hub.load('harritaylor/torchvggish', 'vggish', device='cuda', postprocess=True) 9 | model.eval() 10 | 11 | def extract_vggish_feature(input_path, output_path, input_sr=44100): 12 | """ 13 | Extract the VGGish feature for one wave file and save it as an .npy file 14 | 15 | Parameters 16 | ---------- 17 | input_path : str 18 | path to the audio file to extract the feature 19 | output_path : str 20 | path to save the output feature 21 | input_sr : int 22 | sampling rate of the input file 23 | """ 24 | output = model(input_path, fs=input_sr) 25 | output = output.cpu().detach().numpy().astype(np.int32) 26 | np.save(output_path, output) 27 | 28 | def extract_multiple_vggish_feature(input_dir, output_dir, input_sr=44100): 29 | """ 30 | Extract VGGish features for multiple wav files and save as .npy files 31 | 32 | Parameters 33 | ---------- 34 | input_dir : str 35 | directory of the input wav files 36 | output_dir : str 37 | directory of the output feature files 38 | input_sr : int 39 | sampling rate of the input file 40 | """ 41 |
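Since these extractor scripts end in a bare `fire.Fire()`, every module-level function is addressable by name from the shell; a plausible invocation of the batch extractor below, with placeholder directories rather than paths from the repo:

python preprocess/pretrained_feature_extractor/extract_vggish.py extract_multiple_vggish_feature --input_dir <wav_dir> --output_dir <feature_dir> --input_sr 44100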
/preprocess/pretrained_feature_extractor/extract_vggish.py:
--------------------------------------------------------------------------------
1 | import os
2 | from tqdm import tqdm
3 | 
4 | import numpy as np
5 | import torch
6 | 
7 | # load the pretrained VGGish model once at import time; postprocess=True
8 | # applies the released PCA + quantization, so embeddings are integer-valued
9 | model = torch.hub.load('harritaylor/torchvggish', 'vggish', device='cuda', postprocess=True)
10 | model.eval()
11 | 
12 | def extract_vggish_feature(input_path, output_path, input_sr=44100):
13 |     """
14 |     Extract the VGGish feature for one audio file and save it as an .npy file
15 | 
16 |     Parameters
17 |     ----------
18 |     input_path : str
19 |         path to the audio file to extract the feature from
20 |     output_path : str
21 |         path to save the output feature
22 |     input_sr : int
23 |         sampling rate of the input file
24 |     """
25 |     output = model(input_path, fs=input_sr)
26 |     output = output.cpu().detach().numpy().astype(np.int32)
27 |     np.save(output_path, output)
28 | 
29 | def extract_multiple_vggish_feature(input_dir, output_dir, input_sr=44100):
30 |     """
31 |     Extract VGGish features for multiple wav files and save them as .npy files
32 | 
33 |     Parameters
34 |     ----------
35 |     input_dir : str
36 |         directory of the input wav files
37 |     output_dir : str
38 |         directory of the output feature files
39 |     input_sr : int
40 |         sampling rate of the input files
41 |     """
42 |     input_file_list = os.listdir(input_dir)
43 | 
44 |     if not os.path.exists(output_dir):
45 |         os.mkdir(output_dir)
46 | 
47 |     for input_file in tqdm(input_file_list):
48 |         input_path = os.path.join(input_dir, input_file)
49 | 
50 |         output_file = input_file.replace('.wav', '.npy')
51 |         output_path = os.path.join(output_dir, output_file)
52 | 
53 |         extract_vggish_feature(input_path, output_path, input_sr)
54 | 
55 | if __name__ == '__main__':
56 |     import fire
57 | 
58 |     fire.Fire()
--------------------------------------------------------------------------------
/preprocess/resample_openmic.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | import soundfile as sf
4 | import shutil
5 | from tqdm import tqdm
6 | 
7 | import librosa
8 | 
9 | def resample_openmic(data_root, target_root, target_sr=44100):
10 |     """
11 |     Resample the OpenMIC data
12 | 
13 |     Parameters
14 |     ----------
15 |     data_root : str
16 |         root of the openmic dataset
17 |     target_root : str
18 |         root to write the output data
19 |     target_sr : int
20 |         target sampling rate
21 |     """
22 |     data_root_audio = os.path.join(data_root, 'audio')
23 |     target_root_audio = os.path.join(target_root, 'audio')
24 |     if not os.path.exists(target_root_audio):
25 |         os.mkdir(target_root_audio)
26 | 
27 |     for subdir in os.listdir(data_root_audio):
28 |         target_subdir = os.path.join(target_root_audio, subdir)
29 |         if not os.path.exists(target_subdir):
30 |             os.mkdir(target_subdir)
31 |         for filename in tqdm(os.listdir(os.path.join(data_root_audio, subdir))):
32 |             input_file = os.path.join(data_root_audio, subdir, filename)
33 |             output_file = os.path.join(target_root_audio, subdir, filename)
34 |             if os.path.exists(output_file):
35 |                 os.remove(output_file)
36 | 
37 |             audio, sr = sf.read(input_file)
38 |             # downmix to mono before resampling
39 |             if len(audio.shape) > 1:
40 |                 audio = audio.mean(axis=1)
41 |             if sr != target_sr:
42 |                 audio = librosa.resample(audio, orig_sr=sr, target_sr=target_sr)
43 |             sf.write(output_file, audio, samplerate=target_sr)
44 | 
45 |     shutil.copyfile(
46 |         os.path.join(data_root, 'openmic-2018.npz'),
47 |         os.path.join(target_root, 'openmic-2018.npz')
48 |     )
49 |     shutil.copytree(
50 |         os.path.join(data_root, 'partitions'),
51 |         os.path.join(target_root, 'partitions')
52 |     )
53 | 
54 | 
55 | if __name__ == '__main__':
56 |     parser = argparse.ArgumentParser()
57 | 
58 |     parser.add_argument('-i', '--data_root', type=str, required=True,
59 |                         help="Root of openmic data")
60 |     parser.add_argument('-o', '--target_root', type=str, required=True,
61 |                         help="Root to write the output data")
62 |     parser.add_argument('-sr', '--target_sr', type=int, required=True,
63 |                         help="Target sampling rate")
64 | 
65 |     args = parser.parse_args()
66 | 
67 |     resample_openmic(
68 |         data_root=args.data_root,
69 |         target_root=args.target_root,
70 |         target_sr=args.target_sr
71 |     )
72 | 
--------------------------------------------------------------------------------
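resample_openmic.py is the one preprocessing step with an argparse front end rather than fire. A sketch of the equivalent programmatic call, with placeholder dataset roots; 44100 Hz mirrors both the script's own default and the input_sr assumed by the VGGish extractor above:

# Placeholder roots; point these at the real OpenMIC-2018 download.
from resample_openmic import resample_openmic

resample_openmic(
    data_root='openmic-2018/',
    target_root='openmic-2018-44k/',
    target_sr=44100
)

The shell equivalent is python resample_openmic.py -i openmic-2018/ -o openmic-2018-44k/ -sr 44100.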
/preprocess/split_magna.py:
--------------------------------------------------------------------------------
1 | import os
2 | import random
3 | random.seed(2022)
4 | 
5 | def split_magna(root_dir, valid=0.15, test=0.15):
6 |     """
7 |     Split the MagnaTagATune data into separate training, validation, and test sets
8 | 
9 |     Parameters
10 |     ----------
11 |     root_dir : str
12 |         directory with magna.json
13 |     valid : float
14 |         fraction of validation data
15 |     test : float
16 |         fraction of test data
17 |     """
18 |     input_json = os.path.join(root_dir, 'magna.json')
19 |     with open(input_json, 'r') as f:
20 |         data = f.readlines()
21 | 
22 |     train_json = open(os.path.join(root_dir, 'magna_train.json'), 'w')
23 |     valid_json = open(os.path.join(root_dir, 'magna_valid.json'), 'w')
24 |     test_json = open(os.path.join(root_dir, 'magna_test.json'), 'w')
25 |     for line in data:
26 |         # draw once per sample so the expected split sizes are
27 |         # (1 - valid - test) / test / valid of the full manifest
28 |         t = random.uniform(0., 1.)
29 |         if t > valid + test:
30 |             train_json.write(line)
31 |         elif t < test:
32 |             test_json.write(line)
33 |         else:
34 |             valid_json.write(line)
35 | 
36 |     train_json.close()
37 |     valid_json.close()
38 |     test_json.close()
39 | 
40 |     return
41 | 
42 | 
43 | if __name__ == '__main__':
44 |     import fire
45 | 
46 |     fire.Fire(split_magna)
47 | 
--------------------------------------------------------------------------------
/preprocess/split_openmic_train.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | import random
4 | random.seed(2022)
5 | 
6 | def split_openmic_train(root_dir, valid=0.15):
7 |     """
8 |     Split the OpenMIC training data into separate training and validation sets
9 | 
10 |     Parameters
11 |     ----------
12 |     root_dir : str
13 |         directory with openmic_train.json
14 |     valid : float
15 |         fraction of validation data
16 |     """
17 |     input_json = os.path.join(root_dir, 'openmic_train.json')
18 |     # read the full manifest into memory first: openmic_train.json is
19 |     # overwritten in place with the reduced training split below
20 |     with open(input_json, 'r') as f:
21 |         data = f.readlines()
22 | 
23 |     train_json = open(os.path.join(root_dir, 'openmic_train.json'), 'w')
24 |     valid_json = open(os.path.join(root_dir, 'openmic_valid.json'), 'w')
25 |     for line in data:
26 |         if random.uniform(0., 1.) < valid:
27 |             valid_json.write(line)
28 |         else:
29 |             train_json.write(line)
30 | 
31 |     train_json.close()
32 |     valid_json.close()
33 | 
34 |     return
35 | 
36 | 
37 | if __name__ == '__main__':
38 |     parser = argparse.ArgumentParser()
39 | 
40 |     parser.add_argument('-d', '--root_dir', type=str, required=True,
41 |                         help="Directory containing openmic_train.json")
42 |     parser.add_argument('-p', '--valid', type=float, default=0.15,
43 |                         help="Fraction of validation data")
44 | 
45 |     args = parser.parse_args()
46 | 
47 |     split_openmic_train(
48 |         root_dir=args.root_dir,
49 |         valid=args.valid,
50 |     )
51 | 
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
1 | import json
2 | 
3 | import pytorch_lightning as pl
4 | from pytorch_lightning.loggers import TensorBoardLogger
5 | from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
6 | 
7 | from data import LatentSpaceDataModule
8 | from models import TeacherStudentModel
9 | 
10 | def train(config):
11 |     with open(config) as f:
12 |         config = json.load(f)
13 | 
14 |     pl.seed_everything(config["seed"], workers=True)
15 | 
16 |     data_cfg = config["data"]
17 |     model_cfg = config["model"]
18 |     trainer_cfg = config["trainer"]
19 | 
20 |     datamodule = LatentSpaceDataModule(**data_cfg)
21 |     model = TeacherStudentModel(model_cfg)
22 | 
23 |     callbacks = [
24 |         ModelCheckpoint(**trainer_cfg["checkpoint"]),
25 |         EarlyStopping(**trainer_cfg["early_stopping"])
26 |     ]
27 | 
28 |     trainer = pl.Trainer(
29 |         **trainer_cfg["args"],
30 |         logger=TensorBoardLogger(**trainer_cfg["logger"]),
31 |         callbacks=callbacks
32 |     )
33 | 
34 |     trainer.fit(model, datamodule=datamodule)
35 |     # evaluate the best checkpoint selected by the ModelCheckpoint callback
36 |     trainer.test(datamodule=datamodule, ckpt_path="best")
37 | 
38 | if __name__ == '__main__':
39 |     import fire
40 | 
41 |     fire.Fire(train)
--------------------------------------------------------------------------------
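train.py wires a JSON config into the Lightning trainer and exposes train() through fire. A minimal sketch of launching a run, using a config that actually ships with the repository; it assumes a CUDA device is available, since the configs set the accelerator to gpu:

# Train the teacher-student model with the MagnaTagATune OpenL3 config.
from train import train

train('config/magna/east_final_openl3.json')

From the shell: python train.py --config=config/magna/east_final_openl3.json.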
/work_dir_magna/east_final_openl3/epoch=93-val_mAP=0.448.ckpt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:1e438d11c0bf0096e1dba7bcae2e97070407274baa86b5ccb5f2e954c2c0ec20
3 | size 3962949
4 | 
--------------------------------------------------------------------------------
/work_dir_magna/east_final_pann/epoch=82-val_mAP=0.450.ckpt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:c1cead4a0e03d0175f6fccd25de04197122f85213ddbed567b930f2651fd7414
3 | size 4270149
4 | 
--------------------------------------------------------------------------------
/work_dir_magna/east_final_passt/epoch=64-val_mAP=0.466.ckpt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:722690d5b48418af026413d3b00e12f00eb8197ec4a604deaae0b8e174c502f2
3 | size 4014149
4 | 
--------------------------------------------------------------------------------
/work_dir_magna/east_final_vggish/epoch=67-val_mAP=0.451.ckpt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:f15deefd761440b1a77ec43a7397f0f2a6ef94e0f3c677fbb25dae252c1ecbdd
3 | size 3886149
4 | 
--------------------------------------------------------------------------------
/work_dir_openmic/east_final_openl3/epoch=62-val_mAP=0.851.ckpt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:6ffad4b2e7c087de6eea46156f2e78ffdc212f97435ac48ac8a67f50575878f8
3 | size 66654143
4 | 
--------------------------------------------------------------------------------
/work_dir_openmic/east_final_pann/epoch=90-val_mAP=0.860.ckpt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:e113ac0b6831539f5cb35fe956ae177b56e2a39e3c585a9afb04983a7316febe
3 | size 66777023
4 | 
--------------------------------------------------------------------------------
/work_dir_openmic/east_final_passt/epoch=96-val_mAP=0.864.ckpt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:7605bb58b1d8737db909e61a8bbed994c186e579b9b909ac2488d899a5e3cc15
3 | size 66674623
4 | 
--------------------------------------------------------------------------------
/work_dir_openmic/east_final_vggish/epoch=75-val_mAP=0.854.ckpt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:e95cee8df9a94feeb2528075cd498de2d7bac91d8338c52e71db1b6006aea868
3 | size 66623423
4 | 
--------------------------------------------------------------------------------