├── AudioSep_Colab.ipynb ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── assets └── results.png ├── benchmark.py ├── callbacks └── base.py ├── cog.yaml ├── config └── audiosep_base.yaml ├── data ├── audiotext_dataset.py ├── datamodules.py └── waveform_mixers.py ├── datafiles └── template.json ├── environment.yml ├── environment_win64.yaml ├── evaluation ├── evaluate_audiocaps.py ├── evaluate_audioset.py ├── evaluate_clotho.py ├── evaluate_esc50.py ├── evaluate_music.py ├── evaluate_vggsound.py └── metadata │ ├── audiocaps_eval.csv │ ├── audioset_eval.csv │ ├── class_labels_indices.csv │ ├── clotho_eval.csv │ ├── esc50_eval.csv │ ├── music_eval.csv │ └── vggsound_eval.csv ├── losses.py ├── models ├── CLAP │ ├── __init__.py │ ├── open_clip │ │ ├── __init__.py │ │ ├── bert.py │ │ ├── bpe_simple_vocab_16e6.txt.gz │ │ ├── factory.py │ │ ├── feature_fusion.py │ │ ├── htsat.py │ │ ├── linear_probe.py │ │ ├── loss.py │ │ ├── model.py │ │ ├── model_configs │ │ │ ├── HTSAT-base.json │ │ │ ├── HTSAT-large.json │ │ │ ├── HTSAT-tiny-win-1536.json │ │ │ ├── HTSAT-tiny.json │ │ │ ├── PANN-10.json │ │ │ ├── PANN-14-fmax-18k.json │ │ │ ├── PANN-14-fmax-8k-20s.json │ │ │ ├── PANN-14-tiny-transformer.json │ │ │ ├── PANN-14-win-1536.json │ │ │ ├── PANN-14.json │ │ │ ├── PANN-6.json │ │ │ ├── RN101-quickgelu.json │ │ │ ├── RN101.json │ │ │ ├── RN50-quickgelu.json │ │ │ ├── RN50.json │ │ │ ├── RN50x16.json │ │ │ ├── RN50x4.json │ │ │ ├── ViT-B-16.json │ │ │ ├── ViT-B-32-quickgelu.json │ │ │ ├── ViT-B-32.json │ │ │ └── ViT-L-14.json │ │ ├── openai.py │ │ ├── pann_model.py │ │ ├── pretrained.py │ │ ├── timm_model.py │ │ ├── tokenizer.py │ │ ├── transform.py │ │ ├── utils.py │ │ └── version.py │ └── training │ │ ├── __init__.py │ │ ├── audioset_textmap.npy │ │ ├── data.py │ │ ├── distributed.py │ │ ├── imagenet_zeroshot_data.py │ │ ├── infer_demo.py │ │ ├── logger.py │ │ ├── lp_main.py │ │ ├── lp_train.py │ │ ├── main.py │ │ ├── params.py │ │ ├── scheduler.py │ │ ├── train.py │ │ └── zero_shot.py ├── audiosep.py ├── base.py ├── clap_encoder.py └── resunet.py ├── optimizers └── lr_schedulers.py ├── pipeline.py ├── predict.py ├── train.py └── utils.py /AudioSep_Colab.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/AudioSep_Colab.ipynb -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/CONTRIBUTING.md -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/LICENSE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/README.md -------------------------------------------------------------------------------- /assets/results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/assets/results.png -------------------------------------------------------------------------------- /benchmark.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/benchmark.py -------------------------------------------------------------------------------- /callbacks/base.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/callbacks/base.py -------------------------------------------------------------------------------- /cog.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/cog.yaml -------------------------------------------------------------------------------- /config/audiosep_base.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/config/audiosep_base.yaml -------------------------------------------------------------------------------- /data/audiotext_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/data/audiotext_dataset.py -------------------------------------------------------------------------------- /data/datamodules.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/data/datamodules.py -------------------------------------------------------------------------------- /data/waveform_mixers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/data/waveform_mixers.py -------------------------------------------------------------------------------- /datafiles/template.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/datafiles/template.json -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/environment.yml -------------------------------------------------------------------------------- /environment_win64.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/environment_win64.yaml -------------------------------------------------------------------------------- /evaluation/evaluate_audiocaps.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/evaluation/evaluate_audiocaps.py -------------------------------------------------------------------------------- /evaluation/evaluate_audioset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/evaluation/evaluate_audioset.py -------------------------------------------------------------------------------- /evaluation/evaluate_clotho.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/evaluation/evaluate_clotho.py -------------------------------------------------------------------------------- /evaluation/evaluate_esc50.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/evaluation/evaluate_esc50.py -------------------------------------------------------------------------------- /evaluation/evaluate_music.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/evaluation/evaluate_music.py -------------------------------------------------------------------------------- /evaluation/evaluate_vggsound.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/evaluation/evaluate_vggsound.py -------------------------------------------------------------------------------- /evaluation/metadata/audiocaps_eval.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/evaluation/metadata/audiocaps_eval.csv -------------------------------------------------------------------------------- /evaluation/metadata/audioset_eval.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/evaluation/metadata/audioset_eval.csv -------------------------------------------------------------------------------- /evaluation/metadata/class_labels_indices.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/evaluation/metadata/class_labels_indices.csv -------------------------------------------------------------------------------- /evaluation/metadata/clotho_eval.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/evaluation/metadata/clotho_eval.csv -------------------------------------------------------------------------------- /evaluation/metadata/esc50_eval.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/evaluation/metadata/esc50_eval.csv -------------------------------------------------------------------------------- /evaluation/metadata/music_eval.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/evaluation/metadata/music_eval.csv -------------------------------------------------------------------------------- /evaluation/metadata/vggsound_eval.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/evaluation/metadata/vggsound_eval.csv -------------------------------------------------------------------------------- /losses.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/losses.py -------------------------------------------------------------------------------- /models/CLAP/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /models/CLAP/open_clip/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/models/CLAP/open_clip/__init__.py -------------------------------------------------------------------------------- /models/CLAP/open_clip/bert.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/models/CLAP/open_clip/bert.py -------------------------------------------------------------------------------- /models/CLAP/open_clip/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/models/CLAP/open_clip/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /models/CLAP/open_clip/factory.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/models/CLAP/open_clip/factory.py -------------------------------------------------------------------------------- /models/CLAP/open_clip/feature_fusion.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/models/CLAP/open_clip/feature_fusion.py -------------------------------------------------------------------------------- /models/CLAP/open_clip/htsat.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/models/CLAP/open_clip/htsat.py -------------------------------------------------------------------------------- /models/CLAP/open_clip/linear_probe.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/models/CLAP/open_clip/linear_probe.py -------------------------------------------------------------------------------- /models/CLAP/open_clip/loss.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/models/CLAP/open_clip/loss.py -------------------------------------------------------------------------------- /models/CLAP/open_clip/model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/models/CLAP/open_clip/model.py -------------------------------------------------------------------------------- /models/CLAP/open_clip/model_configs/HTSAT-base.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/models/CLAP/open_clip/model_configs/HTSAT-base.json -------------------------------------------------------------------------------- /models/CLAP/open_clip/model_configs/HTSAT-large.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/models/CLAP/open_clip/model_configs/HTSAT-large.json -------------------------------------------------------------------------------- /models/CLAP/open_clip/model_configs/HTSAT-tiny-win-1536.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/models/CLAP/open_clip/model_configs/HTSAT-tiny-win-1536.json -------------------------------------------------------------------------------- /models/CLAP/open_clip/model_configs/HTSAT-tiny.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/models/CLAP/open_clip/model_configs/HTSAT-tiny.json -------------------------------------------------------------------------------- /models/CLAP/open_clip/model_configs/PANN-10.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/models/CLAP/open_clip/model_configs/PANN-10.json -------------------------------------------------------------------------------- /models/CLAP/open_clip/model_configs/PANN-14-fmax-18k.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/models/CLAP/open_clip/model_configs/PANN-14-fmax-18k.json -------------------------------------------------------------------------------- /models/CLAP/open_clip/model_configs/PANN-14-fmax-8k-20s.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/models/CLAP/open_clip/model_configs/PANN-14-fmax-8k-20s.json -------------------------------------------------------------------------------- /models/CLAP/open_clip/model_configs/PANN-14-tiny-transformer.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/models/CLAP/open_clip/model_configs/PANN-14-tiny-transformer.json -------------------------------------------------------------------------------- /models/CLAP/open_clip/model_configs/PANN-14-win-1536.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/models/CLAP/open_clip/model_configs/PANN-14-win-1536.json -------------------------------------------------------------------------------- /models/CLAP/open_clip/model_configs/PANN-14.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/models/CLAP/open_clip/model_configs/PANN-14.json -------------------------------------------------------------------------------- /models/CLAP/open_clip/model_configs/PANN-6.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/models/CLAP/open_clip/model_configs/PANN-6.json -------------------------------------------------------------------------------- /models/CLAP/open_clip/model_configs/RN101-quickgelu.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/models/CLAP/open_clip/model_configs/RN101-quickgelu.json -------------------------------------------------------------------------------- /models/CLAP/open_clip/model_configs/RN101.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/models/CLAP/open_clip/model_configs/RN101.json -------------------------------------------------------------------------------- /models/CLAP/open_clip/model_configs/RN50-quickgelu.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/models/CLAP/open_clip/model_configs/RN50-quickgelu.json -------------------------------------------------------------------------------- /models/CLAP/open_clip/model_configs/RN50.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/models/CLAP/open_clip/model_configs/RN50.json -------------------------------------------------------------------------------- /models/CLAP/open_clip/model_configs/RN50x16.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/models/CLAP/open_clip/model_configs/RN50x16.json -------------------------------------------------------------------------------- /models/CLAP/open_clip/model_configs/RN50x4.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/models/CLAP/open_clip/model_configs/RN50x4.json -------------------------------------------------------------------------------- /models/CLAP/open_clip/model_configs/ViT-B-16.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/models/CLAP/open_clip/model_configs/ViT-B-16.json -------------------------------------------------------------------------------- /models/CLAP/open_clip/model_configs/ViT-B-32-quickgelu.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/models/CLAP/open_clip/model_configs/ViT-B-32-quickgelu.json -------------------------------------------------------------------------------- /models/CLAP/open_clip/model_configs/ViT-B-32.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/models/CLAP/open_clip/model_configs/ViT-B-32.json -------------------------------------------------------------------------------- /models/CLAP/open_clip/model_configs/ViT-L-14.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/models/CLAP/open_clip/model_configs/ViT-L-14.json -------------------------------------------------------------------------------- /models/CLAP/open_clip/openai.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/models/CLAP/open_clip/openai.py -------------------------------------------------------------------------------- /models/CLAP/open_clip/pann_model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/models/CLAP/open_clip/pann_model.py -------------------------------------------------------------------------------- /models/CLAP/open_clip/pretrained.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/models/CLAP/open_clip/pretrained.py -------------------------------------------------------------------------------- /models/CLAP/open_clip/timm_model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/models/CLAP/open_clip/timm_model.py -------------------------------------------------------------------------------- /models/CLAP/open_clip/tokenizer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/models/CLAP/open_clip/tokenizer.py -------------------------------------------------------------------------------- /models/CLAP/open_clip/transform.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/models/CLAP/open_clip/transform.py -------------------------------------------------------------------------------- /models/CLAP/open_clip/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/models/CLAP/open_clip/utils.py -------------------------------------------------------------------------------- /models/CLAP/open_clip/version.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.2.1" 2 | -------------------------------------------------------------------------------- /models/CLAP/training/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /models/CLAP/training/audioset_textmap.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/models/CLAP/training/audioset_textmap.npy -------------------------------------------------------------------------------- /models/CLAP/training/data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/models/CLAP/training/data.py -------------------------------------------------------------------------------- /models/CLAP/training/distributed.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/models/CLAP/training/distributed.py -------------------------------------------------------------------------------- /models/CLAP/training/imagenet_zeroshot_data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/models/CLAP/training/imagenet_zeroshot_data.py -------------------------------------------------------------------------------- /models/CLAP/training/infer_demo.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/models/CLAP/training/infer_demo.py -------------------------------------------------------------------------------- /models/CLAP/training/logger.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/models/CLAP/training/logger.py -------------------------------------------------------------------------------- /models/CLAP/training/lp_main.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/models/CLAP/training/lp_main.py -------------------------------------------------------------------------------- /models/CLAP/training/lp_train.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/models/CLAP/training/lp_train.py -------------------------------------------------------------------------------- /models/CLAP/training/main.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/models/CLAP/training/main.py -------------------------------------------------------------------------------- /models/CLAP/training/params.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/models/CLAP/training/params.py -------------------------------------------------------------------------------- /models/CLAP/training/scheduler.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/models/CLAP/training/scheduler.py -------------------------------------------------------------------------------- /models/CLAP/training/train.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/models/CLAP/training/train.py -------------------------------------------------------------------------------- /models/CLAP/training/zero_shot.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/models/CLAP/training/zero_shot.py -------------------------------------------------------------------------------- /models/audiosep.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/models/audiosep.py -------------------------------------------------------------------------------- /models/base.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/models/base.py -------------------------------------------------------------------------------- /models/clap_encoder.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/models/clap_encoder.py -------------------------------------------------------------------------------- /models/resunet.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/models/resunet.py -------------------------------------------------------------------------------- /optimizers/lr_schedulers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/optimizers/lr_schedulers.py -------------------------------------------------------------------------------- /pipeline.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/pipeline.py -------------------------------------------------------------------------------- /predict.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/predict.py -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/train.py -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-AGI/AudioSep/HEAD/utils.py --------------------------------------------------------------------------------