├── codec_evaluation ├── __init__.py ├── codecs │ ├── __init__.py │ ├── YuE │ │ ├── models │ │ │ └── __init__.py │ │ ├── quantization │ │ │ └── __init__.py │ │ ├── descriptaudiocodec │ │ │ └── dac │ │ │ │ ├── nn │ │ │ │ ├── __init__.py │ │ │ │ └── layers.py │ │ │ │ ├── model │ │ │ │ ├── __init__.py │ │ │ │ └── base.py │ │ │ │ └── __init__.py │ │ ├── modules │ │ │ ├── __init__.py │ │ │ └── norm.py │ │ └── RepCodec │ │ │ └── repcodec │ │ │ ├── modules │ │ │ └── residual_unit.py │ │ │ └── layers │ │ │ └── conv_layer.py │ ├── xcodec │ │ ├── models │ │ │ └── __init__.py │ │ ├── modules │ │ │ └── __init__.py │ │ ├── quantization │ │ │ └── __init__.py │ │ └── descriptaudiocodec │ │ │ └── dac │ │ │ ├── nn │ │ │ ├── __init__.py │ │ │ └── layers.py │ │ │ ├── model │ │ │ ├── __init__.py │ │ │ └── base.py │ │ │ └── __init__.py │ ├── levo_modules │ │ ├── Flow1dVAE │ │ │ ├── models │ │ │ │ └── __init__.py │ │ │ ├── tools │ │ │ │ ├── __init__.py │ │ │ │ ├── extract_rvq.py │ │ │ │ ├── safetensor2torch.py │ │ │ │ ├── get_1dvae.py │ │ │ │ ├── get_1dvae_1920.py │ │ │ │ ├── get_1dvae_large_melvae.py │ │ │ │ ├── get_1dvae_large.py │ │ │ │ ├── compare_2models.py │ │ │ │ ├── get_whisper_encoder.py │ │ │ │ ├── transmodelnorm.py │ │ │ │ ├── mix.py │ │ │ │ ├── check_stereo.py │ │ │ │ ├── infer_encodec.py │ │ │ │ ├── infer_encodec_speech.py │ │ │ │ ├── infer_encodec_vocal.py │ │ │ │ ├── creat_jsonl.py │ │ │ │ ├── infer_bsrnnvae441k.py │ │ │ │ └── infer_bsrnnvae441k_vocal.py │ │ │ ├── our_MERT_BESTRQ │ │ │ │ ├── mert_fairseq │ │ │ │ │ ├── models │ │ │ │ │ │ ├── musicfm │ │ │ │ │ │ │ ├── model │ │ │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ │ │ └── w2v2_config.json │ │ │ │ │ │ │ ├── modules │ │ │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ │ │ └── features.py │ │ │ │ │ │ │ └── __init__.py │ │ │ │ │ │ ├── mert │ │ │ │ │ │ │ ├── README.md │ │ │ │ │ │ │ └── __init__.py │ │ │ │ │ │ └── eat │ │ │ │ │ │ │ └── __init__.py │ │ │ │ │ ├── config │ │ │ │ │ │ └── pretrain │ │ │ │ │ │ │ ├── MERT_RVQ-VAE_CQT_95M_dac.yaml │ │ │ │ │ │ │ ├── run │ │ │ │ │ │ │ └── submitit_reg.yaml │ │ │ │ │ │ │ ├── MusicFM_95M_multinodes.yaml │ │ │ │ │ │ │ ├── MusicFM_95M_speech_multinodes.yaml │ │ │ │ │ │ │ └── MusicFM_95M_bestrvq_multinodes.yaml │ │ │ │ │ └── data │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── eat_data │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── add_class_target_dataset.py │ │ │ │ ├── modify_env.md │ │ │ │ ├── test.py │ │ │ │ └── run_training_eat.sh │ │ │ ├── models_gpt │ │ │ │ └── models │ │ │ │ │ └── tokenizer │ │ │ │ │ ├── structure.yaml │ │ │ │ │ └── pinyin │ │ │ │ │ └── symbols.py │ │ │ ├── compare_model_weight.py │ │ │ ├── configs │ │ │ │ ├── scheduler │ │ │ │ │ └── stable_diffusion_2.1_largenoise_sample.json │ │ │ │ └── models │ │ │ │ │ └── transformer2D_wocross_inch112_1x4_multi_large.json │ │ │ ├── cal_token_stat.py │ │ │ ├── extract_codes_stereo_7_1x4.py │ │ │ └── extract_codes_stereo_7_1x2.py │ │ ├── __init__.py │ │ └── stable_audio_tools │ │ │ ├── data │ │ │ └── __init__.py │ │ │ ├── inference │ │ │ ├── __init__.py │ │ │ └── utils.py │ │ │ ├── interface │ │ │ └── __init__.py │ │ │ ├── training │ │ │ ├── losses │ │ │ │ └── __init__.py │ │ │ └── __init__.py │ │ │ ├── models │ │ │ ├── __init__.py │ │ │ ├── pretrained.py │ │ │ └── diffusion_prior.py │ │ │ ├── __init__.py │ │ │ └── configs │ │ │ ├── dataset_configs │ │ │ ├── custom_metadata │ │ │ │ └── custom_md_example.py │ │ │ ├── s3_wds_example.json │ │ │ └── local_training_example.json │ │ │ └── model_configs │ │ │ ├── dance_diffusion │ │ │ ├── dance_diffusion_base.json │ │ │ ├── 
dance_diffusion_large.json │ │ │ ├── dance_diffusion_base_16k.json │ │ │ └── dance_diffusion_base_44k.json │ │ │ └── autoencoders │ │ │ └── dac_2048_32_vae.json │ ├── config │ │ ├── hubert_preprocessor_config.json │ │ └── qwen2audioencoder_preprocessor_config.json │ └── version.py ├── utils │ ├── demucs │ │ ├── __init__.py │ │ ├── models │ │ │ ├── __init__.py │ │ │ ├── pretrained.py │ │ │ └── spec.py │ │ └── ckpt │ │ │ └── htdemucs.yaml │ ├── schedule.py │ └── logger.py ├── perplexity │ └── config │ │ └── ppl_model_config.json └── probe │ └── config │ ├── Common_Voice_dataset │ ├── dac.yaml │ ├── YuE.yaml │ ├── mimi.yaml │ ├── encodec.yaml │ ├── wavtokenizer.yaml │ ├── semanticodec.yaml │ ├── speechtokenizer.yaml │ └── xcodec.yaml │ ├── MTT_dataset │ ├── encodec.yaml │ └── dac.yaml │ ├── NSynthI_dataset │ ├── dac.yaml │ ├── encodec.yaml │ ├── semanticodec.yaml │ ├── wavtokenizer.yaml │ └── YuE.yaml │ ├── NSynthP_dataset │ ├── dac.yaml │ ├── semanticodec.yaml │ ├── encodec.yaml │ └── YuE.yaml │ ├── GTZAN_dataset │ ├── wavtokenizer.yaml │ ├── YuE.yaml │ ├── dac.yaml │ └── encodec.yaml │ ├── EMO_dataset │ ├── encodec.yaml │ ├── YuE.yaml │ ├── dac.yaml │ ├── semanticodec.yaml │ └── wavtokenizer.yaml │ ├── GS_dataset │ ├── YuE.yaml │ ├── dac.yaml │ ├── encodec.yaml │ ├── semanticodec.yaml │ └── wavtokenizer.yaml │ ├── MELD_dataset │ ├── YuE.yaml │ ├── dac.yaml │ ├── mimi.yaml │ └── encodec.yaml │ └── ESC50_dataset │ ├── YuE.yaml │ ├── dac.yaml │ ├── encodec.yaml │ ├── mimi.yaml │ ├── semanticodec.yaml │ └── wavtokenizer.yaml ├── MANIFEST.in ├── env_build.sh ├── requirements.txt ├── doc └── chore.md └── setup.py /codec_evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /codec_evaluation/codecs/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /codec_evaluation/utils/demucs/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /codec_evaluation/codecs/YuE/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /codec_evaluation/utils/demucs/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /codec_evaluation/codecs/xcodec/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /codec_evaluation/codecs/xcodec/modules/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/Flow1dVAE/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/Flow1dVAE/tools/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/__init__.py: -------------------------------------------------------------------------------- 1 | # no need for training -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/stable_audio_tools/data/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /codec_evaluation/utils/demucs/ckpt/htdemucs.yaml: -------------------------------------------------------------------------------- 1 | models: ['htdemucs'] 2 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include requirements.txt 2 | include README.md 3 | include LICENSE -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/stable_audio_tools/inference/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/stable_audio_tools/interface/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /codec_evaluation/codecs/YuE/quantization/__init__.py: -------------------------------------------------------------------------------- 1 | from .vq import ResidualVectorQuantizer 2 | -------------------------------------------------------------------------------- /codec_evaluation/codecs/xcodec/quantization/__init__.py: -------------------------------------------------------------------------------- 1 | from .vq import ResidualVectorQuantizer 2 | -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/stable_audio_tools/training/losses/__init__.py: -------------------------------------------------------------------------------- 1 | from .losses import * -------------------------------------------------------------------------------- /codec_evaluation/codecs/YuE/descriptaudiocodec/dac/nn/__init__.py: -------------------------------------------------------------------------------- 1 | from . import layers 2 | from . 
import quantize 3 | -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/models/musicfm/model/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/config/pretrain/MERT_RVQ-VAE_CQT_95M_dac.yaml: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/models/musicfm/modules/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /codec_evaluation/codecs/xcodec/descriptaudiocodec/dac/nn/__init__.py: -------------------------------------------------------------------------------- 1 | from . import layers 2 | from . import quantize 3 | -------------------------------------------------------------------------------- /codec_evaluation/codecs/YuE/descriptaudiocodec/dac/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import CodecMixin 2 | from .dac import DAC 3 | -------------------------------------------------------------------------------- /codec_evaluation/codecs/xcodec/descriptaudiocodec/dac/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import CodecMixin 2 | from .dac import DAC -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/models/musicfm/__init__.py: -------------------------------------------------------------------------------- 1 | from .musicfm_model import * -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/stable_audio_tools/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .factory import create_model_from_config, create_model_from_config_path -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/data/__init__.py: -------------------------------------------------------------------------------- 1 | from .mert_dataset import MERTDataset 2 | from .eat_data import * -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/stable_audio_tools/training/__init__.py: -------------------------------------------------------------------------------- 1 | from .factory import create_training_wrapper_from_config, create_demo_callback_from_config 2 | -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/stable_audio_tools/__init__.py: -------------------------------------------------------------------------------- 1 | from .models.factory import create_model_from_config, create_model_from_config_path 2 | from .models.pretrained import get_pretrained_model -------------------------------------------------------------------------------- 
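Note: the `stable_audio_tools/__init__.py` block above re-exports the model factory and the pretrained-model helper, so downstream code can import them from the package root. A minimal usage sketch follows; the local config filename and the Hub repo id are illustrative assumptions, and `get_pretrained_model` requires the repo to ship a `model_config.json` plus `model.safetensors` or `model.ckpt` (see `models/pretrained.py` later in this listing).

```python
import json

from codec_evaluation.codecs.levo_modules.stable_audio_tools import (
    create_model_from_config,
    get_pretrained_model,
)

# Build a model from an explicit config dict (schema defined by stable_audio_tools)...
with open("model_config.json") as f:  # hypothetical local config file
    model = create_model_from_config(json.load(f))

# ...or fetch config and weights from the Hugging Face Hub in one call.
# The repo id is an example; any repo carrying model_config.json + weights works.
model, model_config = get_pretrained_model("stabilityai/stable-audio-open-1.0")
```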
/codec_evaluation/codecs/levo_modules/stable_audio_tools/configs/dataset_configs/custom_metadata/custom_md_example.py: -------------------------------------------------------------------------------- 1 | def get_custom_metadata(info, audio): 2 | 3 | # Use relative path as the prompt 4 | return {"prompt": info["relpath"]} -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/Flow1dVAE/models_gpt/models/tokenizer/structure.yaml: -------------------------------------------------------------------------------- 1 | - '[start]' 2 | - '[verse]' 3 | - '[chorus]' 4 | - '[outro]' 5 | - '[end]' 6 | - '[intro]' 7 | - '[solo]' 8 | - '[inst]' 9 | - '[bridge]' 10 | - '[break]' 11 | -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/models/mert/README.md: -------------------------------------------------------------------------------- 1 | add cauchy extension from https://github.com/HazyResearch/state-spaces 2 | ```shell 3 | cd state-spaces/extensions/cauchy 4 | python setup.py install 5 | ``` 6 | -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/models/eat/__init__.py: -------------------------------------------------------------------------------- 1 | try: 2 | from .EAT_pretraining import * 3 | except: 4 | import sys, os 5 | sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), '.')) 6 | from EAT_pretraining import * -------------------------------------------------------------------------------- /codec_evaluation/codecs/config/hubert_preprocessor_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "do_normalize": true, 3 | "feature_extractor_type": "Wav2Vec2FeatureExtractor", 4 | "feature_size": 1, 5 | "padding_side": "right", 6 | "padding_value": 0, 7 | "return_attention_mask": true, 8 | "sampling_rate": 16000 9 | } 10 | -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/models/mert/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | from .mert_model import * # noqa -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/stable_audio_tools/configs/dataset_configs/s3_wds_example.json: -------------------------------------------------------------------------------- 1 | { 2 | "dataset_type": "s3", 3 | "datasets": [ 4 | { 5 | "id": "s3-test", 6 | "s3_path": "s3://my-bucket/datasets/webdataset/audio/" 7 | } 8 | ], 9 | "random_crop": true 10 | } -------------------------------------------------------------------------------- /env_build.sh: -------------------------------------------------------------------------------- 1 | pip install -e . 
-i https://pypi.tuna.tsinghua.edu.cn/simple 2 | pip install git+https://github.com/haoheliu/SemantiCodec-inference@8dc464c3385d2389a695ed3f718f4a0caf3ed33f#egg=semanticodec 3 | pip install git+https://github.com/lucadellalib/WavTokenizer.git@main 4 | pip install git+https://github.com/pengzhendong/asr-decoder.git@master -------------------------------------------------------------------------------- /codec_evaluation/codecs/YuE/descriptaudiocodec/dac/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "1.0.0" 2 | 3 | # preserved here for legacy reasons 4 | __model_version__ = "latest" 5 | 6 | import audiotools 7 | 8 | audiotools.ml.BaseModel.INTERN += ["dac.**"] 9 | audiotools.ml.BaseModel.EXTERN += ["einops"] 10 | 11 | 12 | from . import nn 13 | from . import model 14 | from .model import DAC 15 | -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/stable_audio_tools/configs/dataset_configs/local_training_example.json: -------------------------------------------------------------------------------- 1 | { 2 | "dataset_type": "audio_dir", 3 | "datasets": [ 4 | { 5 | "id": "my_audio", 6 | "path": "train.jsonl", 7 | "custom_metadata_module": "custom_md_example.py" 8 | } 9 | ], 10 | "random_crop": true 11 | } 12 | -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/Flow1dVAE/compare_model_weight.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import sys 3 | from safetensors.torch import load_file 4 | 5 | if __name__ == "__main__": 6 | m0, m1 = sys.argv[1], sys.argv[2] 7 | m0 = load_file(m0) 8 | m1 = load_file(m1) 9 | 10 | ks = [k for k in m0.keys() if 'bestrq' in k] 11 | for k in ks: 12 | print(k, (m0[k] - m1[k]).abs().sum()) 13 | -------------------------------------------------------------------------------- /codec_evaluation/codecs/xcodec/descriptaudiocodec/dac/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "1.0.0" 2 | 3 | # preserved here for legacy reasons 4 | __model_version__ = "latest" 5 | 6 | import audiotools 7 | 8 | audiotools.ml.BaseModel.INTERN += ["dac.**"] 9 | audiotools.ml.BaseModel.EXTERN += ["einops"] 10 | 11 | 12 | from . import nn 13 | from . 
import model 14 | from .model import DAC 15 | # from .model import DACFile 16 | -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/Flow1dVAE/tools/extract_rvq.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import sys 3 | 4 | if __name__=="__main__": 5 | p = sys.argv[1] 6 | bd = '/'.join(p.split('/')[:-1]) 7 | bn = p.split('/')[-1] 8 | 9 | d = {} 10 | m = torch.load(p, map_location='cpu') 11 | for k in m.keys(): 12 | if('rvq' in k): 13 | d[k] = m[k] 14 | 15 | torch.save(d, '{}/rvq.bin'.format(bd)) -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/Flow1dVAE/tools/safetensor2torch.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from safetensors import safe_open 3 | import torch 4 | 5 | if __name__=="__main__": 6 | inname = sys.argv[1] 7 | outname = sys.argv[2] 8 | 9 | main_weights = {} 10 | with safe_open(inname, framework="pt", device="cpu") as f: 11 | for key in f.keys(): 12 | main_weights[key] = f.get_tensor(key) 13 | 14 | torch.save(main_weights, outname) -------------------------------------------------------------------------------- /codec_evaluation/codecs/config/qwen2audioencoder_preprocessor_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "chunk_length": 30, 3 | "feature_extractor_type": "WhisperFeatureExtractor", 4 | "feature_size": 128, 5 | "hop_length": 160, 6 | "n_fft": 400, 7 | "n_samples": 480000, 8 | "nb_max_frames": 3000, 9 | "padding_side": "right", 10 | "padding_value": 0.0, 11 | "processor_class": "WhisperProcessor", 12 | "return_attention_mask": true, 13 | "sampling_rate": 16000 14 | } 15 | -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/Flow1dVAE/configs/scheduler/stable_diffusion_2.1_largenoise_sample.json: -------------------------------------------------------------------------------- 1 | { 2 | "_class_name": "DDIMScheduler", 3 | "_diffusers_version": "0.8.0", 4 | "beta_end": 0.02, 5 | "beta_schedule": "scaled_linear", 6 | "beta_start": 0.0015, 7 | "clip_sample": false, 8 | "num_train_timesteps": 1000, 9 | "prediction_type": "sample", 10 | "set_alpha_to_one": false, 11 | "skip_prk_steps": true, 12 | "steps_offset": 1, 13 | "trained_betas": null 14 | } 15 | -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/stable_audio_tools/configs/model_configs/dance_diffusion/dance_diffusion_base.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_type": "diffusion_uncond", 3 | "sample_size": 65536, 4 | "sample_rate": 48000, 5 | "model": { 6 | "type": "DAU1d", 7 | "config": { 8 | "n_attn_layers": 5 9 | } 10 | }, 11 | "training": { 12 | "learning_rate": 1e-4, 13 | "demo": { 14 | "demo_every": 2000, 15 | "demo_steps": 250 16 | } 17 | } 18 | } -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/stable_audio_tools/configs/model_configs/dance_diffusion/dance_diffusion_large.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_type": "diffusion_uncond", 3 | "sample_size": 131072, 4 | "sample_rate": 48000, 5 | "model": { 6 | "type": "DAU1d", 7 | "config": { 8 | "n_attn_layers": 5 9 | 
} 10 | }, 11 | "training": { 12 | "learning_rate": 1e-4, 13 | "demo": { 14 | "demo_every": 2000, 15 | "demo_steps": 250 16 | } 17 | } 18 | } -------------------------------------------------------------------------------- /codec_evaluation/codecs/YuE/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | """Torch modules.""" 8 | 9 | # flake8: noqa 10 | from .conv import ( 11 | pad1d, 12 | unpad1d, 13 | NormConv1d, 14 | NormConvTranspose1d, 15 | NormConv2d, 16 | NormConvTranspose2d, 17 | SConv1d, 18 | SConvTranspose1d, 19 | ) 20 | 21 | 22 | -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/stable_audio_tools/configs/model_configs/dance_diffusion/dance_diffusion_base_16k.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_type": "diffusion_uncond", 3 | "sample_size": 65536, 4 | "sample_rate": 16000, 5 | "model": { 6 | "type": "DAU1d", 7 | "config": { 8 | "n_attn_layers": 5 9 | } 10 | }, 11 | "training": { 12 | "learning_rate": 1e-4, 13 | "demo": { 14 | "demo_every": 2000, 15 | "demo_steps": 250 16 | } 17 | } 18 | } -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/stable_audio_tools/configs/model_configs/dance_diffusion/dance_diffusion_base_44k.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_type": "diffusion_uncond", 3 | "sample_size": 65536, 4 | "sample_rate": 44100, 5 | "model": { 6 | "type": "DAU1d", 7 | "config": { 8 | "n_attn_layers": 5 9 | } 10 | }, 11 | "training": { 12 | "learning_rate": 4e-5, 13 | "demo": { 14 | "demo_every": 2000, 15 | "demo_steps": 250 16 | } 17 | } 18 | } -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/Flow1dVAE/our_MERT_BESTRQ/modify_env.md: -------------------------------------------------------------------------------- 1 | cp -r fairseq/fairseq/model_parallel/megatron /opt/conda/envs/map/lib/python3.8/site-packages/fairseq/model_parallel/ 2 | vi /opt/conda/envs/map/lib/python3.8/site-packages/apex/amp/_initialize.py # string_classes = str 3 | vi /opt/conda/envs/map/lib/python3.8/site-packages/fairseq/modules/layer_norm.py 4 | vi /opt/conda/envs/map/lib/python3.8/site-packages/fairseq/distributed/utils.py # import datetime; timeout=datetime.timedelta(seconds=51200); logger.info("add nccl time to 51200") 5 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | beartype==0.20.0 2 | descript-audio-codec==1.0.0 3 | huggingface_hub==0.29.1 4 | speechtokenizer==1.0.1 5 | tensorboard==2.19.0 6 | torch==2.6.0 7 | torchaudio==2.6.0 8 | transformers==4.49.0 9 | vocos==0.1.0 10 | einops==0.8.1 11 | numpy==1.26.4 12 | descript-audiotools>=0.7.2 13 | scipy==1.10.1 14 | torchmetrics==1.4.1 15 | pytorch-lightning==2.4.0 16 | hydra-core==1.3.2 17 | omegaconf==2.3.0 18 | jiwer==3.1.0 19 | conformer==0.3.2 20 | pandas==2.2.3 21 | sentencepiece==0.2.0 22 | pesq==0.0.4 23 | speechbrain==1.0.2 24 | 
-------------------------------------------------------------------------------- /doc/chore.md: -------------------------------------------------------------------------------- 1 | ## Software Packaging and Distribution 2 | 3 | Install `build` and generate the built distribution. 4 | ``` 5 | pip install build 6 | python -m build 7 | ``` 8 | 9 | Install `twine` and upload the .whl and .tar.gz files. 10 | 11 | > To securely upload your project, you'll need a PyPI API token. Create one at https://pypi.org/manage/account/#api-tokens, setting the "Scope" to "Entire account". Don't close the page until you have copied and saved the token — you won't see that token again. 12 | 13 | ``` 14 | pip install twine 15 | twine upload --repository pypi dist/* 16 | ``` 17 | -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/Flow1dVAE/tools/get_1dvae.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from tqdm import tqdm 3 | import torchaudio 4 | from third_party.stable_audio_tools.stable_audio_tools.models.autoencoders import create_autoencoder_from_config 5 | import numpy as np 6 | import os 7 | import json 8 | 9 | def get_model(model_config, path): 10 | with open(model_config) as f: 11 | model_config = json.load(f) 12 | state_dict = torch.load(path) 13 | model = create_autoencoder_from_config(model_config) 14 | model.load_state_dict(state_dict['state_dict']) 15 | return model -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/Flow1dVAE/tools/get_1dvae_1920.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from tqdm import tqdm 3 | import torchaudio 4 | from third_party.stable_audio_tools.stable_audio_tools.models.autoencoders import create_autoencoder_from_config 5 | import numpy as np 6 | import os 7 | import json 8 | 9 | def get_model(model_config, path): 10 | with open(model_config) as f: 11 | model_config = json.load(f) 12 | state_dict = torch.load(path) 13 | model = create_autoencoder_from_config(model_config) 14 | model.load_state_dict(state_dict['state_dict']) 15 | return model -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/Flow1dVAE/tools/get_1dvae_large_melvae.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from tqdm import tqdm 3 | import torchaudio 4 | from third_party.stable_audio_tools.stable_audio_tools.models.autoencoders import create_autoencoder_from_config 5 | import numpy as np 6 | import os 7 | import json 8 | 9 | def get_model(model_config, path): 10 | with open(model_config) as f: 11 | model_config = json.load(f) 12 | state_dict = torch.load(path, map_location='cpu') 13 | model = create_autoencoder_from_config(model_config) 14 | model.load_state_dict(state_dict['state_dict'], strict=False) 15 | return model -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/Flow1dVAE/tools/get_1dvae_large.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from tqdm import tqdm 3 | import torchaudio 4 | from codec_evaluation.codecs.levo_modules.stable_audio_tools.models.autoencoders import create_autoencoder_from_config 5 | import numpy as np 6 | import os 7 | import json 8 | 9 | def get_model(model_config, path): 10 | 
with open(model_config) as f: 11 | model_config = json.load(f) 12 | state_dict = torch.load(path, map_location='cpu') 13 | model = create_autoencoder_from_config(model_config) 14 | model.load_state_dict(state_dict['state_dict'], strict=False) 15 | return model 16 | -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/config/pretrain/run/submitit_reg.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | hydra: 4 | launcher: 5 | cpus_per_task: 8 6 | gpus_per_node: 8 7 | tasks_per_node: ${hydra.launcher.gpus_per_node} 8 | nodes: 4 9 | comment: null 10 | mem_gb: 384 11 | timeout_min: 4320 12 | max_num_timeout: 100 13 | constraint: volta32gb 14 | name: ${hydra.job.config_name}/${hydra.job.override_dirname} 15 | submitit_folder: ${hydra.sweep.dir}/submitit/%j 16 | 17 | distributed_training: 18 | distributed_world_size: 32 19 | distributed_port: 29671 20 | nprocs_per_node: 8 21 | -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/data/eat_data/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | try: 6 | from .mae_image_dataset import MaeImageDataset 7 | from .raw_audio_dataset import FileAudioDataset 8 | except: 9 | import sys, os 10 | sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), '.')) 11 | from mae_image_dataset import MaeImageDataset 12 | from raw_audio_dataset import FileAudioDataset 13 | 14 | __all__ = [ 15 | "MaeImageDataset", 16 | "FileAudioDataset", 17 | ] -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/Flow1dVAE/tools/compare_2models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import sys 3 | 4 | if __name__=="__main__": 5 | m1, m2 = sys.argv[1:3] 6 | m1 = torch.load(m1, map_location = 'cpu') 7 | m2 = torch.load(m2, map_location = 'cpu') 8 | m1_keys = set(m1.keys()) 9 | m2_keys = set(m2.keys()) 10 | 11 | m1_uniq_keys = m1_keys - m2_keys 12 | m2_uniq_keys = m2_keys - m1_keys 13 | m12_shared_keys = m1_keys & m2_keys 14 | 15 | print("m1_uniq_keys: ", m1_uniq_keys) 16 | print("m2_uniq_keys: ", m2_uniq_keys) 17 | print("m12_shared_keys but different: ") 18 | for k in m12_shared_keys: 19 | if(m1[k].numel() != m2[k].numel()): 20 | print(k,m1[k].shape,m2[k].shape) 21 | -------------------------------------------------------------------------------- /codec_evaluation/perplexity/config/ppl_model_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "attention_dropout": 0.0, 3 | "bos_token_id": 151643, 4 | "eos_token_id": 151643, 5 | "hidden_act": "silu", 6 | "hidden_size": 768, 7 | "initializer_range": 0.02, 8 | "intermediate_size": 3072, 9 | "max_position_embeddings": 32768, 10 | "max_window_layers": 24, 11 | "model_type": "qwen2", 12 | "num_attention_heads": 12, 13 | "num_hidden_layers": 10, 14 | "num_key_value_heads": 2, 15 | "rms_norm_eps": 1e-06, 16 | "rope_theta": 1000000.0, 17 | "tie_word_embeddings": true, 18 | "torch_dtype": "bfloat16", 19 | "use_cache": true, 20 | "use_mrope": 
false, 21 | "use_sliding_window": false, 22 | "vocab_size": 151936 23 | } -------------------------------------------------------------------------------- /codec_evaluation/codecs/YuE/modules/norm.py: -------------------------------------------------------------------------------- 1 | """Normalization modules.""" 2 | import typing as tp 3 | import einops 4 | import torch 5 | from torch import nn 6 | 7 | class ConvLayerNorm(nn.LayerNorm): 8 | """ 9 | Convolution-friendly LayerNorm that moves channels to the last dimension 10 | before running the normalization and moves them back to their original position right after. 11 | """ 12 | def __init__(self, normalized_shape: tp.Union[int, tp.List[int], torch.Size], **kwargs): 13 | super().__init__(normalized_shape, **kwargs) 14 | 15 | def forward(self, x): 16 | x = einops.rearrange(x, 'b ... t -> b t ...') 17 | x = super().forward(x) 18 | x = einops.rearrange(x, 'b t ... -> b ... t') 19 | return x 20 | -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/Flow1dVAE/cal_token_stat.py: -------------------------------------------------------------------------------- 1 | import kaldiio 2 | from tqdm import tqdm 3 | import torch 4 | 5 | if __name__ == "__main__": 6 | bar = torch.zeros(1, 16384) 7 | with open('token.scp', 'r') as f: 8 | for item_idx, line in tqdm(enumerate(f)): 9 | idx, pos = line.strip().split() 10 | codes = kaldiio.load_mat(pos) 11 | for i0 in range(codes.shape[-1]): 12 | bar[0, codes[0, 0, i0]] += 1 13 | if(item_idx % 1000 == 0): 14 | print("=========") 15 | print(1 - (bar[0]==0).sum() / bar.shape[-1]) 16 | print("=========") 17 | print("=========") 18 | print(1 - (bar[0]==0).sum() / bar.shape[-1]) 19 | print("=========") -------------------------------------------------------------------------------- /codec_evaluation/codecs/version.py: -------------------------------------------------------------------------------- 1 | # ============================================================================== 2 | # Copyright 2024 Luca Della Libera. All Rights Reserved. 
3 | # ============================================================================== 4 | 5 | """Version according to SemVer versioning system (https://semver.org/).""" 6 | 7 | 8 | __all__ = [ 9 | "VERSION", 10 | ] 11 | 12 | 13 | _MAJOR = "0" # Major version to increment in case of incompatible API changes 14 | 15 | _MINOR = ( 16 | "0" # Minor version to increment in case of backward compatible new functionality 17 | ) 18 | 19 | _PATCH = "1" # Patch version to increment in case of backward compatible bug fixes 20 | 21 | VERSION = f"{_MAJOR}.{_MINOR}.{_PATCH}" 22 | """The package version.""" 23 | -------------------------------------------------------------------------------- /codec_evaluation/utils/schedule.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | 4 | def get_cosine_schedule_with_warmup_lr_lambda( 5 | current_step: int, 6 | *, 7 | num_warmup_steps: int | float, 8 | num_training_steps: int, 9 | num_cycles: float = 0.5, 10 | final_lr_ratio: float = 0.0, 11 | ): 12 | if 0 < num_warmup_steps < 1: # float mode 13 | num_warmup_steps = int(num_warmup_steps * num_training_steps) 14 | 15 | if current_step < num_warmup_steps: 16 | return float(current_step) / float(max(1, num_warmup_steps)) 17 | 18 | progress = float(current_step - num_warmup_steps) / float( 19 | max(1, num_training_steps - num_warmup_steps) 20 | ) 21 | 22 | return max( 23 | final_lr_ratio, 24 | 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress)), 25 | ) -------------------------------------------------------------------------------- /codec_evaluation/codecs/YuE/descriptaudiocodec/dac/nn/layers.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.nn.utils import weight_norm 4 | 5 | def WNConv1d(*args, **kwargs): 6 | return weight_norm(nn.Conv1d(*args, **kwargs)) 7 | 8 | def WNConvTranspose1d(*args, **kwargs): 9 | return weight_norm(nn.ConvTranspose1d(*args, **kwargs)) 10 | 11 | # Scripting this brings model speed up 1.4x 12 | @torch.jit.script 13 | def snake(x, alpha): 14 | shape = x.shape 15 | x = x.reshape(shape[0], shape[1], -1) 16 | x = x + (alpha + 1e-9).reciprocal() * torch.sin(alpha * x).pow(2) 17 | x = x.reshape(shape) 18 | return x 19 | 20 | class Snake1d(nn.Module): 21 | def __init__(self, channels): 22 | super().__init__() 23 | self.alpha = nn.Parameter(torch.ones(1, channels, 1)) 24 | 25 | def forward(self, x): 26 | return snake(x, self.alpha) 27 | -------------------------------------------------------------------------------- /codec_evaluation/codecs/xcodec/descriptaudiocodec/dac/nn/layers.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.nn.utils import weight_norm 4 | 5 | def WNConv1d(*args, **kwargs): 6 | return weight_norm(nn.Conv1d(*args, **kwargs)) 7 | 8 | def WNConvTranspose1d(*args, **kwargs): 9 | return weight_norm(nn.ConvTranspose1d(*args, **kwargs)) 10 | 11 | # Scripting this brings model speed up 1.4x 12 | @torch.jit.script 13 | def snake(x, alpha): 14 | shape = x.shape 15 | x = x.reshape(shape[0], shape[1], -1) 16 | x = x + (alpha + 1e-9).reciprocal() * torch.sin(alpha * x).pow(2) 17 | x = x.reshape(shape) 18 | return x 19 | 20 | class Snake1d(nn.Module): 21 | def __init__(self, channels): 22 | super().__init__() 23 | self.alpha = nn.Parameter(torch.ones(1, channels, 1)) 24 | 25 | def forward(self, x): 26 | return snake(x, self.alpha) 27 | 
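Note on `utils/schedule.py` above: the function returns the learning-rate multiplier for a single step, so it is meant to be bound with `functools.partial` and handed to `torch.optim.lr_scheduler.LambdaLR`. A minimal wiring sketch, with a placeholder model, base learning rate, and step counts:

```python
from functools import partial

import torch

from codec_evaluation.utils.schedule import get_cosine_schedule_with_warmup_lr_lambda

model = torch.nn.Linear(8, 8)  # placeholder model
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

lr_lambda = partial(
    get_cosine_schedule_with_warmup_lr_lambda,
    num_warmup_steps=0.1,        # a float in (0, 1) is a fraction of num_training_steps
    num_training_steps=10_000,
    final_lr_ratio=0.05,         # cosine decay is floored at 5% of the base lr
)
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)

for step in range(10_000):
    optimizer.step()             # forward/backward omitted in this sketch
    scheduler.step()             # base lr is multiplied by lr_lambda(current step)
```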
-------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/Flow1dVAE/configs/models/transformer2D_wocross_inch112_1x4_multi_large.json: -------------------------------------------------------------------------------- 1 | { 2 | "_class_name": "Transformer2DModel", 3 | "_diffusers_version": "0.22.0.dev0", 4 | "activation_fn": "gelu-approximate", 5 | "attention_bias": true, 6 | "attention_head_dim": 72, 7 | "attention_type": "default", 8 | "cross_attention_dim": null, 9 | "double_self_attention": false, 10 | "dropout": 0.0, 11 | "in_channels": 96, 12 | "norm_elementwise_affine": false, 13 | "norm_eps": 1e-06, 14 | "norm_num_groups": 32, 15 | "norm_type": "ada_norm_single", 16 | "num_attention_heads": 22, 17 | "num_embeds_ada_norm": 1000, 18 | "num_layers": 24, 19 | "num_vector_embeds": null, 20 | "only_cross_attention": false, 21 | "out_channels": 32, 22 | "patch_size": 2, 23 | "sample_size": 384, 24 | "upcast_attention": false, 25 | "use_linear_projection": false 26 | } -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/Flow1dVAE/tools/get_whisper_encoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import WhisperProcessor, WhisperForConditionalGeneration 3 | 4 | def get_whisper_encoder(): 5 | processor = WhisperProcessor.from_pretrained("openai/whisper-large-v3") 6 | model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v3").model.encoder 7 | return processor, model.eval() 8 | 9 | if __name__=="__main__": 10 | import numpy as np 11 | processor, model = get_whisper_encoder() 12 | model = model.cuda() 13 | 14 | with torch.no_grad(): 15 | input_features = processor(np.random.rand(16000*30,), sampling_rate=16000, return_tensors="pt").input_features.cuda() 16 | print(input_features.shape) 17 | out = model(input_features.repeat(10,1,1)) 18 | import pdb;pdb.set_trace() 19 | print(list(out.values())[0].shape) 20 | -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/stable_audio_tools/models/pretrained.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from .factory import create_model_from_config 4 | from .utils import load_ckpt_state_dict 5 | 6 | from huggingface_hub import hf_hub_download 7 | 8 | def get_pretrained_model(name: str): 9 | 10 | model_config_path = hf_hub_download(name, filename="model_config.json", repo_type='model') 11 | 12 | with open(model_config_path) as f: 13 | model_config = json.load(f) 14 | 15 | model = create_model_from_config(model_config) 16 | 17 | # Try to download the model.safetensors file first, if it doesn't exist, download the model.ckpt file 18 | try: 19 | model_ckpt_path = hf_hub_download(name, filename="model.safetensors", repo_type='model') 20 | except Exception as e: 21 | model_ckpt_path = hf_hub_download(name, filename="model.ckpt", repo_type='model') 22 | 23 | model.load_state_dict(load_ckpt_state_dict(model_ckpt_path)) 24 | 25 | return model, model_config -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/Flow1dVAE/tools/transmodelnorm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | if __name__=="__main__": 4 | src_ckpt = 
'saved/train_mulan_v3_48k_everything3/latest/pytorch_model_2.bin' 5 | tgt_ckpt = 'saved/train_mulan_v3_48k_everything3_sepnorm/src_pytorch_model_2.bin' 6 | # src_ckpt = 'saved/train_enhcodec2D_again/latest/pytorch_model_3.bin' 7 | # tgt_ckpt = 'saved/train_enhcodec2D_again_sepnorm/pytorch_model_3.bin' 8 | 9 | ckpt = torch.load(src_ckpt, map_location='cpu') 10 | 11 | ckpt['normfeat.sum_x'] = torch.ones(16, 32, dtype=ckpt['normfeat.sum_x'].dtype) * ckpt['normfeat.sum_x'] / ckpt['normfeat.counts'] 12 | ckpt['normfeat.sum_x2'] = torch.ones(16, 32, dtype=ckpt['normfeat.sum_x2'].dtype) * ckpt['normfeat.sum_x2'] / ckpt['normfeat.counts'] 13 | ckpt['normfeat.sum_target_x2'] = torch.ones(16, 32, dtype=ckpt['normfeat.sum_target_x2'].dtype) * ckpt['normfeat.sum_target_x2'] / ckpt['normfeat.counts'] 14 | ckpt['normfeat.counts'] = torch.ones_like(ckpt['normfeat.counts']) 15 | torch.save(ckpt, tgt_ckpt) 16 | -------------------------------------------------------------------------------- /codec_evaluation/codecs/YuE/RepCodec/repcodec/modules/residual_unit.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | from codec_evaluation.codecs.YuE.RepCodec.repcodec.layers.conv_layer import Conv1d, Conv1d1x1 3 | 4 | class ResidualUnit(nn.Module): 5 | def __init__( 6 | self, 7 | in_channels: int, 8 | out_channels: int, 9 | kernel_size=3, 10 | dilation=1, 11 | bias=False, 12 | nonlinear_activation="ELU", 13 | nonlinear_activation_params={}, 14 | ): 15 | super().__init__() 16 | self.activation = getattr(nn, nonlinear_activation)(**nonlinear_activation_params) 17 | self.conv1 = Conv1d( 18 | in_channels=in_channels, 19 | out_channels=out_channels, 20 | kernel_size=kernel_size, 21 | stride=1, 22 | dilation=dilation, 23 | bias=bias, 24 | ) 25 | self.conv2 = Conv1d1x1(out_channels, out_channels, bias) 26 | 27 | def forward(self, x): 28 | y = self.conv1(self.activation(x)) 29 | y = self.conv2(self.activation(y)) 30 | return x + y 31 | -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/stable_audio_tools/inference/utils.py: -------------------------------------------------------------------------------- 1 | from ..data.utils import PadCrop 2 | 3 | from torchaudio import transforms as T 4 | 5 | def set_audio_channels(audio, target_channels): 6 | if target_channels == 1: 7 | # Convert to mono 8 | audio = audio.mean(1, keepdim=True) 9 | elif target_channels == 2: 10 | # Convert to stereo 11 | if audio.shape[1] == 1: 12 | audio = audio.repeat(1, 2, 1) 13 | elif audio.shape[1] > 2: 14 | audio = audio[:, :2, :] 15 | return audio 16 | 17 | def prepare_audio(audio, in_sr, target_sr, target_length, target_channels, device): 18 | 19 | audio = audio.to(device) 20 | 21 | if in_sr != target_sr: 22 | resample_tf = T.Resample(in_sr, target_sr).to(device) 23 | audio = resample_tf(audio) 24 | 25 | audio = PadCrop(target_length, randomize=False)(audio) 26 | 27 | # Add batch dimension 28 | if audio.dim() == 1: 29 | audio = audio.unsqueeze(0).unsqueeze(0) 30 | elif audio.dim() == 2: 31 | audio = audio.unsqueeze(0) 32 | 33 | audio = set_audio_channels(audio, target_channels) 34 | 35 | return audio -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/Flow1dVAE/models_gpt/models/tokenizer/pinyin/symbols.py: -------------------------------------------------------------------------------- 1 | _pause = ["sil", "eos", "sp", "#0", "#1", "#2", "#3"] 2 | 3 | 
_initials = [ 4 | "^", 5 | "b", 6 | "c", 7 | "ch", 8 | "d", 9 | "f", 10 | "g", 11 | "h", 12 | "j", 13 | "k", 14 | "l", 15 | "m", 16 | "n", 17 | "p", 18 | "q", 19 | "r", 20 | "s", 21 | "sh", 22 | "t", 23 | "x", 24 | "z", 25 | "zh", 26 | ] 27 | 28 | _tones = ["1", "2", "3", "4", "5"] 29 | 30 | _finals = [ 31 | "a", 32 | "ai", 33 | "an", 34 | "ang", 35 | "ao", 36 | "e", 37 | "ei", 38 | "en", 39 | "eng", 40 | "er", 41 | "i", 42 | "ia", 43 | "ian", 44 | "iang", 45 | "iao", 46 | "ie", 47 | "ii", 48 | "iii", 49 | "in", 50 | "ing", 51 | "iong", 52 | "iou", 53 | "o", 54 | "ong", 55 | "ou", 56 | "u", 57 | "ua", 58 | "uai", 59 | "uan", 60 | "uang", 61 | "uei", 62 | "uen", 63 | "ueng", 64 | "uo", 65 | "v", 66 | "van", 67 | "ve", 68 | "vn", 69 | ] 70 | 71 | symbols = _pause + _initials + [i + j for i in _finals for j in _tones] 72 | -------------------------------------------------------------------------------- /codec_evaluation/utils/demucs/models/pretrained.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | @File : pretrained.py 5 | @Time : 2023/8/8 7:22 PM 6 | @Author : waytan 7 | @Contact : waytan@tencent.com 8 | @License : (C)Copyright 2023, Tencent 9 | @Desc : Loading pretrained models. 10 | """ 11 | from pathlib import Path 12 | 13 | import yaml 14 | 15 | from .apply import BagOfModels 16 | from .htdemucs import HTDemucs 17 | from .states import load_state_dict 18 | 19 | 20 | def add_model_flags(parser): 21 | group = parser.add_mutually_exclusive_group(required=False) 22 | group.add_argument("-s", "--sig", help="Locally trained XP signature.") 23 | group.add_argument("-n", "--name", default=None, 24 | help="Pretrained model name or signature. Default is htdemucs.") 25 | parser.add_argument("--repo", type=Path, 26 | help="Folder containing all pre-trained models for use with -n.") 27 | 28 | 29 | def get_model_from_yaml(yaml_file, model_file): 30 | bag = yaml.safe_load(open(yaml_file)) 31 | model = load_state_dict(HTDemucs, model_file) 32 | weights = bag.get('weights') 33 | segment = bag.get('segment') 34 | return BagOfModels([model], weights, segment) 35 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | from pathlib import Path 3 | 4 | 5 | requirements_path = Path(__file__).parent / "requirements.txt" 6 | readme_path = Path(__file__).parent / "README.md" 7 | 8 | # Read requirements.txt 9 | install_requires = [] 10 | with open(requirements_path, encoding="utf-8") as f: 11 | install_requires.extend([item for item in f.read().splitlines() if item.strip()]) 12 | 13 | setup( 14 | name="codec_evaluation", 15 | version="0.1.0", 16 | packages=find_packages(), 17 | install_requires=install_requires, 18 | description="A benchmark for codec evaluation", 19 | long_description=readme_path.read_text(encoding="utf-8") 20 | if readme_path.exists() 21 | else "", 22 | long_description_content_type="text/markdown", 23 | python_requires=">=3.10", 24 | entry_points={ 25 | "console_scripts": [ 26 | "codec_eval_probe = codec_evaluation.probe.train.train_inference:cli", 27 | "codec_eval_id_sensitive = codec_evaluation.id_sensitive.eval:cli", 28 | "codec_eval_reconstruction_speech = codec_evaluation.reconstruction_eval.reconstruction_speech_eval:cli", 29 | "codec_eval_reconstruction_music = 
codec_evaluation.reconstruction_eval.reconstruction_music_eval:cli", 30 | "codec_eval_ppl = codec_evaluation.perplexity.train_inference:cli", 31 | ] 32 | }, 33 | ) 34 | -------------------------------------------------------------------------------- /codec_evaluation/utils/demucs/models/spec.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | @File : spec.py 5 | @Time : 2023/8/8 5:10 PM 6 | @Author : waytan 7 | @Contact : waytan@tencent.com 8 | @License : (C)Copyright 2023, Tencent 9 | @Desc : Spec 10 | """ 11 | 12 | import torch as th 13 | 14 | 15 | def spectro(x, n_fft=512, hop_length=None, pad=0): 16 | *other, length = x.shape 17 | x = x.reshape(-1, length) 18 | is_mps = x.device.type == 'mps' 19 | if is_mps: 20 | x = x.cpu() 21 | z = th.stft(x, 22 | n_fft * (1 + pad), 23 | hop_length or n_fft // 4, 24 | window=th.hann_window(n_fft).to(x), 25 | win_length=n_fft, 26 | normalized=True, 27 | center=True, 28 | return_complex=True, 29 | pad_mode='reflect') 30 | _, freqs, frame = z.shape 31 | return z.view(*other, freqs, frame) 32 | 33 | 34 | def ispectro(z, hop_length=None, length=None, pad=0): 35 | *other, freqs, frames = z.shape 36 | n_fft = 2 * freqs - 2 37 | z = z.view(-1, freqs, frames) 38 | win_length = n_fft // (1 + pad) 39 | is_mps = z.device.type == 'mps' 40 | if is_mps: 41 | z = z.cpu() 42 | x = th.istft(z, 43 | n_fft, 44 | hop_length, 45 | window=th.hann_window(win_length).to(z.real), 46 | win_length=win_length, 47 | normalized=True, 48 | length=length, 49 | center=True) 50 | _, length = x.shape 51 | return x.view(*other, length) 52 | -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/Flow1dVAE/our_MERT_BESTRQ/test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import torch 4 | from dataclasses import dataclass 5 | from logging import getLogger 6 | import fairseq.utils 7 | from fairseq.checkpoint_utils import load_model_ensemble_and_task 8 | 9 | logger = getLogger(__name__) 10 | 11 | @dataclass 12 | class UserDirModule: 13 | user_dir: str 14 | 15 | def find_project_root(current_path: str, target_folder: str = "Codec-Evaluation"): 16 | path = os.path.abspath(current_path) 17 | while True: 18 | if os.path.basename(path) == target_folder: 19 | return path 20 | parent = os.path.dirname(path) 21 | if parent == path: 22 | raise FileNotFoundError(f"Cannot find project root folder '{target_folder}' from {current_path}") 23 | path = parent 24 | 25 | def load_model(model_dir, checkpoint_dir): 26 | '''Load Fairseq SSL model''' 27 | project_root = find_project_root(os.path.dirname(__file__), target_folder="Codec-Evaluation") 28 | mert_path = os.path.join(project_root, "codec_evaluation", "codecs", model_dir) 29 | # model_dir is already the full path to mert_fairseq 30 | mert_path = os.path.abspath(mert_path) 31 | 32 | if not os.path.exists(mert_path): 33 | raise FileNotFoundError(f"Cannot find mert_fairseq in {mert_path}") 34 | 35 | # add to sys.path 36 | if mert_path not in sys.path: 37 | sys.path.insert(0, mert_path) 38 | 39 | # import_user_module 40 | module_args = UserDirModule(user_dir=mert_path) 41 | fairseq.utils.import_user_module(module_args) 42 | 43 | # load the checkpoint 44 | model, cfg, task = load_model_ensemble_and_task([checkpoint_dir], strict=False) 45 | model = model[0] 46 | 47 | return model 48 | 49 | 
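Aside on `utils/demucs/models/spec.py` earlier in this listing: with a Hann window, a hop of `n_fft // 4`, and `normalized=True` on both sides, `ispectro(spectro(x))` should reconstruct the input up to numerical error. A small round-trip sketch with arbitrary test shapes:

```python
import torch

from codec_evaluation.utils.demucs.models.spec import spectro, ispectro

x = torch.randn(2, 2, 44100)                      # (batch, channels, time), arbitrary
z = spectro(x, n_fft=512)                         # complex STFT, default hop = 512 // 4
y = ispectro(z, hop_length=512 // 4, length=x.shape[-1])
print(torch.allclose(x, y, atol=1e-5))            # expected: True
```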
-------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/Flow1dVAE/extract_codes_stereo_7_1x4.py: -------------------------------------------------------------------------------- 1 | import torch,torchaudio 2 | import os,sys,json 3 | from tqdm import tqdm 4 | 5 | #from codeclm_song_v1.codeclm.semantic_extractor.SpeechDecoder_v01.generate import Tango 6 | from generate_4rvq import Tango 7 | import kaldiio 8 | from kaldiio import WriteHelper 9 | 10 | if __name__ == "__main__": 11 | # Define Model 12 | json_path = sys.argv[1] 13 | outdir = sys.argv[2] 14 | 15 | mus_infos = [] 16 | with open(json_path) as f: 17 | for line in f: 18 | item = json.loads(line) 19 | mus_infos.append(item) 20 | 21 | tango = Tango(model_path = './saved/model_4rvq/model_2_fixed.safetensors', rvq_num=4) 22 | 23 | 24 | # Feature extraction loop 25 | # for i in tqdm(range(2000)): 26 | with WriteHelper('ark,scp:{}/token.ark,{}/token.scp'.format(outdir, outdir), write_function="pickle") as writer: 27 | print('ark,scp:{}/token.ark,{}/token.scp'.format(outdir, outdir)) 28 | for item in tqdm(mus_infos): 29 | try: 30 | # if True: 31 | idx = item['idx'] 32 | # print(idx) 33 | with torch.autocast(device_type="cuda", dtype=torch.float16): 34 | if(os.path.exists(item['path'])): 35 | codes = tango.file2code(item['path']) 36 | else: 37 | codes = tango.file2code('/mnt/share/' + item['path']) 38 | writer(str(idx), codes.cpu()) 39 | except: 40 | print(item['path']) 41 | continue 42 | # idx = item['idx'] 43 | # # print(idx) 44 | # with torch.autocast(device_type="cuda", dtype=torch.float16): 45 | # codes = tango.file2code(item['path']) 46 | # writer(str(idx), codes.cpu()) -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/Flow1dVAE/tools/mix.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def a_weight(fs, n_fft, min_db=-80.0): 5 | freq = np.linspace(0, fs // 2, n_fft // 2 + 1) 6 | freq_sq = np.power(freq, 2) 7 | freq_sq[0] = 1.0 8 | weight = 2.0 + 20.0 * (2 * np.log10(12194) + 2 * np.log10(freq_sq) 9 | - np.log10(freq_sq + 12194 ** 2) 10 | - np.log10(freq_sq + 20.6 ** 2) 11 | - 0.5 * np.log10(freq_sq + 107.7 ** 2) 12 | - 0.5 * np.log10(freq_sq + 737.9 ** 2)) 13 | weight = np.maximum(weight, min_db) 14 | 15 | return weight 16 | 17 | 18 | def compute_gain(sound, fs, min_db=-80.0, mode="A_weighting"): 19 | if fs == 16000: 20 | n_fft = 2048 21 | elif fs == 44100: 22 | n_fft = 4096 23 | else: 24 | raise Exception("Invalid fs {}".format(fs)) 25 | stride = n_fft // 2 26 | 27 | gain = [] 28 | for i in range(0, len(sound) - n_fft + 1, stride): 29 | if mode == "RMSE": 30 | g = np.mean(sound[i: i + n_fft] ** 2) 31 | elif mode == "A_weighting": 32 | spec = np.fft.rfft(np.hanning(n_fft + 1)[:-1] * sound[i: i + n_fft]) 33 | power_spec = np.abs(spec) ** 2 34 | a_weighted_spec = power_spec * np.power(10, a_weight(fs, n_fft) / 10) 35 | g = np.sum(a_weighted_spec) 36 | else: 37 | raise Exception("Invalid mode {}".format(mode)) 38 | gain.append(g) 39 | 40 | gain = np.array(gain) 41 | gain = np.maximum(gain, np.power(10, min_db / 10)) 42 | gain_db = 10 * np.log10(gain) 43 | return gain_db 44 | 45 | 46 | def mix(sound1, sound2, r, fs): 47 | gain1 = np.max(compute_gain(sound1, fs)) # Decibel 48 | gain2 = np.max(compute_gain(sound2, fs)) 49 | t = 1.0 / (1 + np.power(10, (gain1 - gain2) / 20.) 
* (1 - r) / r) 50 | sound = ((sound1 * t + sound2 * (1 - t)) / np.sqrt(t ** 2 + (1 - t) ** 2)) 51 | return sound -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/Flow1dVAE/tools/check_stereo.py: -------------------------------------------------------------------------------- 1 | ''' 2 | TEMPLATE = { 3 | "path": "" 4 | "duration": "" 5 | "sample_rate": "" 6 | "amplitude": null, 7 | "weight": null, 8 | "info_path": null 9 | } 10 | ''' 11 | import torchaudio 12 | import json 13 | from tqdm import tqdm 14 | 15 | import torchaudio 16 | import numpy as np 17 | import torch, torch.nn as nn, random 18 | from torchaudio import transforms 19 | import os 20 | import argparse 21 | from tqdm import tqdm 22 | import torchaudio 23 | from torchaudio.transforms import Resample 24 | from multiprocessing import Pool 25 | 26 | def preprocess(args, wav_json, thread_id): 27 | # f = open("pretrain_tme_20230927.scp").readlines() 28 | f = open("out.{}".format(thread_id), 'w') 29 | for line in tqdm(wav_json): 30 | try: 31 | # import pdb; pdb.set_trace() 32 | line = line.strip() 33 | wav_info = json.loads(line) 34 | meta = torchaudio.info(wav_info["path"]) 35 | 36 | wav_info["num_channels"] = meta.num_channels 37 | json_string = json.dumps(wav_info) 38 | # print(json_string) 39 | f.write("{}\n".format(json_string)) 40 | except: 41 | print(line) 42 | 43 | if __name__ == "__main__": 44 | 45 | parser = argparse.ArgumentParser(description='Deep Speaker Embedding Inference') 46 | parser.add_argument('--wav_json', type=str) 47 | parser.add_argument('--num_thread', default=10, type=int, help='number of worker processes') 48 | args = parser.parse_args() 49 | 50 | wav_json_total = open(args.wav_json).readlines() 51 | args.num_thread = min(len(wav_json_total), args.num_thread) 52 | wav_json_list = np.array_split(wav_json_total, args.num_thread) 53 | 54 | p = Pool(args.num_thread) 55 | for thread_id, wav_json in enumerate(wav_json_list): 56 | r = p.apply_async(preprocess, (args, wav_json, thread_id)) 57 | p.close() 58 | p.join() 59 | r.get() 60 | -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/Flow1dVAE/tools/infer_encodec.py: -------------------------------------------------------------------------------- 1 | import json 2 | import torch 3 | from tqdm import tqdm 4 | from audiocraft.models.loaders import load_compression_model 5 | import torchaudio 6 | import librosa 7 | import os 8 | import math 9 | import numpy as np 10 | 11 | class Tango: 12 | def __init__(self, \ 13 | device="cuda:0"): 14 | 15 | self.sample_rate = 48000 16 | self.rsp48to32 = torchaudio.transforms.Resample(48000, 32000).to(device) 17 | self.rsp32to48 = torchaudio.transforms.Resample(32000, 48000).to(device) 18 | 19 | encodec = load_compression_model('compression_state_dict.bin', device='cpu').eval() 20 | encodec.set_num_codebooks(1) 21 | self.encodec = encodec.eval().to(device) 22 | self.device = torch.device(device) 23 | print ("Successfully loaded encodec model") 24 | 25 | @torch.no_grad() 26 | def remix(self, filename, start_step=1000, steps=999, disable_progress=False): 27 | """ Generate audio without condition. 
""" 28 | init_audio, _ = librosa.load(filename, sr=self.sample_rate, mono=False) 29 | if(len(init_audio.shape)>1):init_audio = init_audio[0] 30 | init_audio = torch.from_numpy(init_audio)[None,None,:].to(self.device) 31 | init_audio = init_audio[:,:,int(0*self.sample_rate):int(10.24*3*self.sample_rate)] 32 | if(init_audio.shape[-1]1):init_audio = init_audio[0] 33 | init_audio = torch.from_numpy(init_audio)[None,None,:].to(self.device) 34 | init_audio = init_audio[:,:,0:int(10.24*2*self.sample_rate)] 35 | if(init_audio.shape[-1] None: 16 | """Initializes a multi-GPU-friendly python command line logger that logs on all processes 17 | with their rank prefixed in the log message. 18 | 19 | :param name: The name of the logger. Default is ``__name__``. 20 | :param rank_zero_only: Whether to force all logs to only occur on the rank zero process. Default is `False`. 21 | :param extra: (Optional) A dict-like object which provides contextual information. See `logging.LoggerAdapter`. 22 | """ 23 | logger = logging.getLogger(name) 24 | super().__init__(logger=logger, extra=extra) 25 | self.rank_zero_only = rank_zero_only 26 | 27 | def log( 28 | self, level: int, msg: str, rank: Optional[int] = None, *args, **kwargs 29 | ) -> None: 30 | """Delegate a log call to the underlying logger, after prefixing its message with the rank 31 | of the process it's being logged from. If `'rank'` is provided, then the log will only 32 | occur on that rank/process. 33 | 34 | :param level: The level to log at. Look at `logging.__init__.py` for more information. 35 | :param msg: The message to log. 36 | :param rank: The rank to log at. 37 | :param args: Additional args to pass to the underlying logging function. 38 | :param kwargs: Any additional keyword args to pass to the underlying logging function. 39 | """ 40 | if self.isEnabledFor(level): 41 | msg, kwargs = self.process(msg, kwargs) 42 | current_rank = getattr(rank_zero_only, "rank", None) 43 | if current_rank is None: 44 | raise RuntimeError( 45 | "The `rank_zero_only.rank` needs to be set before use" 46 | ) 47 | msg = rank_prefixed_message(msg, current_rank) 48 | if self.rank_zero_only: 49 | if current_rank == 0: 50 | self.logger.log(level, msg, *args, **kwargs) 51 | else: 52 | if rank is None: 53 | self.logger.log(level, msg, *args, **kwargs) 54 | elif current_rank == rank: 55 | self.logger.log(level, msg, *args, **kwargs) -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/config/pretrain/MusicFM_95M_speech_multinodes.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | common: 3 | fp16: false 4 | log_format: json 5 | log_interval: 200 6 | seed: 1337 7 | # tensorboard_logdir: tblog_proj_name 8 | # wandb_project: wandb_proj_name 9 | 10 | checkpoint: 11 | save_interval_updates: 2500 12 | keep_interval_updates: 10000 13 | no_epoch_checkpoints: true 14 | 15 | 16 | distributed_training: 17 | ddp_backend: no_c10d 18 | distributed_backend: 'nccl' 19 | distributed_world_size: 64 20 | nprocs_per_node: 8 21 | find_unused_parameters: true 22 | 23 | task: 24 | _name: mert_pretraining 25 | data: ??? 26 | label_dir: ??? 27 | labels: ??? 
28 |   label_rate: ${model.label_rate}
29 |   sample_rate: 24000
30 |   # # crop to 5s
31 |   # max_sample_size: 120000
32 |   # min_sample_size: 72000
33 | 
34 |   # crop to 30s
35 |   max_sample_size: 720000
36 |   min_sample_size: 12000
37 |   # clip_secs: 30
38 | 
39 |   pad_audio: false
40 |   random_crop: true
41 |   normalize: false # must be consistent with extractor
42 | 
43 | 
44 | dataset:
45 |   num_workers: 6
46 |   max_tokens: 2000000
47 |   skip_invalid_size_inputs_valid_test: true
48 |   validate_interval: 1
49 |   validate_interval_updates: 10000
50 |   disable_validation: true
51 | 
52 | criterion:
53 |   _name: model
54 |   # log_keys:
55 |   #   - accuracies
56 | 
57 | optimization:
58 |   max_update: 400000
59 |   lr: [0.0005]
60 |   clip_norm: 10.0
61 |   update_freq: [1]
62 | 
63 | optimizer:
64 |   _name: adam
65 |   adam_betas: (0.9,0.98)
66 |   adam_eps: 1e-06
67 |   weight_decay: 0.01
68 | 
69 | lr_scheduler:
70 |   _name: polynomial_decay
71 |   warmup_updates: 32000
72 | 
73 | model:
74 |   _name: musicfm
75 |   label_rate: 25
76 |   num_codebooks: 1
77 |   codebook_dim: 16
78 |   codebook_size: 4096
79 |   features: ["melspec_2048"]
80 |   hop_length: 240
81 |   n_mels: 128
82 |   conv_dim: 512
83 |   encoder_dim: 1024
84 |   encoder_depth: 12
85 |   mask_hop: 0.4
86 |   mask_prob: 0.6
87 |   is_flash: false
88 |   stat_path: msd_stats.json
89 |   model_path: null
90 |   w2v2_config_path: models--facebook--wav2vec2-conformer-rope-large-960h-ft/snapshots/6b36ef01c6443c67ae7ed0822876d091ab50e4aa
91 | 
92 | hydra:
93 |   job:
94 |     config:
95 |       override_dirname:
96 |         kv_sep: '-'
97 |         item_sep: '__'
98 |         exclude_keys:
99 |           - run
100 |           - task.data
101 |           - task.label_dir
102 |   run:
103 |     dir: ???
104 |   sweep:
105 |     dir: ???
106 |     subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
107 | 
-------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/Flow1dVAE/tools/infer_bsrnnvae441k.py: --------------------------------------------------------------------------------
1 | import json
2 | import torch
3 | from tqdm import tqdm
4 | import torchaudio
5 | import librosa
6 | import os
7 | import math
8 | import numpy as np
9 | from tools.get_bsrnnvae import get_bsrnnvae
10 | import tools.torch_tools as torch_tools
11 | 
12 | class Tango:
13 |     def __init__(self, \
14 |                  device="cuda:0"):
15 | 
16 |         self.sample_rate = 44100
17 |         self.device = device
18 | 
19 |         self.vae = get_bsrnnvae()
20 |         self.vae = self.vae.eval().to(device)
21 | 
22 |     def sound2sound_generate_longterm(self, fname, batch_size=1, duration=15.36, steps=200, disable_progress=False):
23 |         """ Generate audio without condition. """
24 |         num_frames = math.ceil(duration * 100. / 8)
25 |         with torch.no_grad():
26 |             orig_samples, fs = torchaudio.load(fname)
27 |             if(fs!=44100):
28 |                 orig_samples = torchaudio.functional.resample(orig_samples, fs, 44100)
29 |                 fs = 44100
30 |             if(orig_samples.shape[-1] 25_000):
33 |             print("GPU memory {}, run matrix cal".format(free_mem))
34 |             break
35 |         else:
36 |             print("GPU memory {}, sleep 1min".format(free_mem))
37 |             time.sleep(60)
38 | 
39 |     mus_infos = []
40 |     with open(json_path) as f:
41 |         for line in f:
42 |             item = json.loads(line)
43 |             mus_infos.append(item)
44 | 
45 |     tango = Tango(model_path = './saved/model_2rvq/model_2_fixed.safetensors', rvq_num=2)
46 | 
47 | 
48 |     # Feature extraction loop
49 |     with WriteHelper('ark,scp:{}/token.ark,{}/token.scp'.format(outdir, outdir), write_function="pickle") as writer:
50 |         print('ark,scp:{}/token.ark,{}/token.scp'.format(outdir, outdir))
51 |         for item in tqdm(mus_infos):
52 |             try:
53 |                 idx = item['idx']
54 |                 with torch.autocast(device_type="cuda", dtype=torch.float16):
55 |                     if(os.path.exists(item['path'])):
56 |                         codes = tango.file2code(item['path'])
57 |                     else:
58 |                         codes = tango.file2code('/mnt/share/' + item['path'])
59 |                 writer(str(idx), codes.cpu())
60 |             except Exception:
61 |                 # log the failing file and keep going
62 |                 print(item['path'])
63 |                 continue
-------------------------------------------------------------------------------- /codec_evaluation/probe/config/Common_Voice_dataset/wavtokenizer.yaml: --------------------------------------------------------------------------------
1 | mode: encode
2 | sample_rate: 48000
3 | probe_ckpt_dir: ???
4 | seed: 666
5 | codec_name: wavtokenizer
6 | 
7 | trainer:
8 |   _target_: pytorch_lightning.Trainer
9 |   accelerator: gpu
10 |   devices: ???
11 |   precision: 32
12 |   max_epochs: 10
13 |   log_every_n_steps: 20
14 |   val_check_interval: 1.0
15 |   limit_val_batches: 5
16 | 
17 | data:
18 |   _target_: codec_evaluation.probe.dataset.Common_Voice_dataset.Common_voice_dataset.Common_voice_module
19 |   target_samplerate: ${sample_rate}
20 |   train_audio_dir: ???
21 |   val_audio_dir: ???
22 |   test_audio_dir: ???
23 |   base_audio_dir: /root/path/for/audio
24 |   train_batch_size: 4
25 |   val_batch_size: 4
26 |   test_batch_size: 4
27 |   train_num_workers: 4
28 |   val_num_workers: 1
29 |   test_num_workers: 1
30 | 
31 | model:
32 |   _target_: codec_evaluation.probe.model.ctc_lit_prober.CodecCTCProbe
33 |   codec_name: ${codec_name}
34 |   sample_rate: ${sample_rate}
35 |   mode: ${mode}
36 |   tokenizer:
37 |     _target_: transformers.Speech2TextProcessor.from_pretrained
38 |     pretrained_model_name_or_path: ???
39 |   probe_model_builder:
40 |     _target_: codec_evaluation.probe.model.ctc_model.Ctc_Probe
41 |     _partial_: true
42 |     vocab_size: 10000
43 |     codec_vocab_size: 4096
44 |     dropout: 0.1
45 |     lm_head_nums: 1
46 |     conformer_depth: 3
47 |     conformer_heads: 8
48 |   model_ckpt_dir: ???
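# NOTE (usage sketch, not part of the original config): every `???` above is an
# OmegaConf mandatory-value placeholder; resolution fails unless it is supplied,
# typically as a Hydra command-line override. Assuming a Hydra entry point for
# the prober (the module path below is hypothetical), a launch could look like:
#   python -m codec_evaluation.probe.train --config-name wavtokenizer \
#     probe_ckpt_dir=/tmp/probe_ckpts trainer.devices=[0] \
#     data.train_audio_dir=/data/cv/train data.val_audio_dir=/data/cv/dev \
#     data.test_audio_dir=/data/cv/test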
49 | 50 | optimizer_builder: 51 | _target_: torch.optim.AdamW 52 | _partial_: true 53 | lr: 1e-4 54 | betas: [0.8, 0.99] 55 | eps: 1e-5 56 | weight_decay: 0.08 57 | 58 | lr_scheduler_builder: 59 | _target_: torch.optim.lr_scheduler.LambdaLR 60 | _partial_: true 61 | lr_lambda: 62 | _target_: codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda 63 | _partial_: true 64 | num_warmup_steps: 200 65 | num_training_steps: 4000 66 | final_lr_ratio: 0.2 67 | 68 | callbacks: 69 | learning_rate_monitor: 70 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 71 | logging_interval: step 72 | 73 | rich_progress_bar: 74 | _target_: pytorch_lightning.callbacks.RichProgressBar 75 | 76 | model_summary: 77 | _target_: pytorch_lightning.callbacks.ModelSummary 78 | max_depth: 1 79 | 80 | model_checkpoint: 81 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 82 | monitor: val_loss 83 | dirpath: ${probe_ckpt_dir} 84 | every_n_epochs: 1 85 | mode: min 86 | save_top_k: 1 87 | filename: ${codec_name}_${mode}_{epoch}-{val_loss:.4f} 88 | verbose: True 89 | 90 | tensorboard: 91 | _target_: pytorch_lightning.loggers.TensorBoardLogger 92 | save_dir: ??? 93 | name: ${codec_name}_${mode} 94 | log_graph: true -------------------------------------------------------------------------------- /codec_evaluation/probe/config/Common_Voice_dataset/semanticodec.yaml: -------------------------------------------------------------------------------- 1 | mode: encode 2 | sample_rate: 48000 3 | probe_ckpt_dir: ??? 4 | seed: 666 5 | codec_name: semanticodec 6 | 7 | trainer: 8 | _target_: pytorch_lightning.Trainer 9 | accelerator: gpu 10 | devices: ??? 11 | precision: 32 12 | max_epochs: 10 13 | log_every_n_steps: 20 14 | val_check_interval: 1.0 15 | limit_val_batches: 5 16 | 17 | data: 18 | _target_: codec_evaluation.probe.dataset.Common_Voice_dataset.Common_voice_dataset.Common_voice_module 19 | target_samplerate: ${sample_rate} 20 | train_audio_dir: ??? 21 | val_audio_dir: ??? 22 | test_audio_dir: ??? 23 | base_audio_dir: /root/path/for/audio 24 | train_batch_size: 4 25 | val_batch_size: 4 26 | test_batch_size: 4 27 | train_num_workers: 4 28 | val_num_workers: 1 29 | test_num_workers: 1 30 | 31 | model: 32 | _target_: codec_evaluation.probe.model.ctc_lit_prober.CodecCTCProbe 33 | codec_name: ${codec_name} 34 | sample_rate: ${sample_rate} 35 | mode: ${mode} 36 | tokenizer: 37 | _target_: transformers.Speech2TextProcessor.from_pretrained 38 | pretrained_model_name_or_path: ??? 39 | probe_model_builder: 40 | _target_: codec_evaluation.probe.model.ctc_model.Ctc_Probe 41 | _partial_: true 42 | vocab_size: 10000 43 | codec_vocab_size: 8192 44 | dropout: 0.1 45 | lm_head_nums: 2 46 | conformer_depth: 3 47 | conformer_heads: 8 48 | model_ckpt_dir: ??? 
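# NOTE (added explanation, not in the original): relative to the wavtokenizer
# probe above, this config sets lm_head_nums: 2 and codec_vocab_size: 8192,
# which presumably mirrors SemantiCodec's two parallel token streams (a
# semantic and an acoustic codebook), with one CTC head per stream.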
49 | 50 | optimizer_builder: 51 | _target_: torch.optim.AdamW 52 | _partial_: true 53 | lr: 1e-4 54 | betas: [0.8, 0.99] 55 | eps: 1e-5 56 | weight_decay: 0.08 57 | 58 | lr_scheduler_builder: 59 | _target_: torch.optim.lr_scheduler.LambdaLR 60 | _partial_: true 61 | lr_lambda: 62 | _target_: codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda 63 | _partial_: true 64 | num_warmup_steps: 200 65 | num_training_steps: 4000 66 | final_lr_ratio: 0.2 67 | 68 | callbacks: 69 | learning_rate_monitor: 70 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 71 | logging_interval: step 72 | 73 | rich_progress_bar: 74 | _target_: pytorch_lightning.callbacks.RichProgressBar 75 | 76 | model_summary: 77 | _target_: pytorch_lightning.callbacks.ModelSummary 78 | max_depth: 1 79 | 80 | model_checkpoint: 81 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 82 | monitor: val_loss 83 | dirpath: ${probe_ckpt_dir} 84 | every_n_epochs: 1 85 | mode: min 86 | save_top_k: 1 87 | filename: ${codec_name}_${mode}_{epoch}-{val_loss:.4f} 88 | verbose: True 89 | 90 | tensorboard: 91 | _target_: pytorch_lightning.loggers.TensorBoardLogger 92 | save_dir: ??? 93 | name: ${codec_name}_${mode} 94 | log_graph: true 95 | -------------------------------------------------------------------------------- /codec_evaluation/probe/config/Common_Voice_dataset/speechtokenizer.yaml: -------------------------------------------------------------------------------- 1 | mode: encode 2 | sample_rate: 48000 3 | probe_ckpt_dir: ??? 4 | seed: 666 5 | codec_name: speechtokenizer 6 | 7 | trainer: 8 | _target_: pytorch_lightning.Trainer 9 | accelerator: gpu 10 | devices: ??? 11 | precision: 32 12 | max_epochs: 10 13 | log_every_n_steps: 20 14 | val_check_interval: 1.0 15 | limit_val_batches: 5 16 | 17 | data: 18 | _target_: codec_evaluation.probe.dataset.Common_Voice_dataset.Common_voice_dataset.Common_voice_module 19 | target_samplerate: ${sample_rate} 20 | train_audio_dir: ??? 21 | val_audio_dir: ??? 22 | test_audio_dir: ??? 23 | base_audio_dir: /root/path/for/audio 24 | train_batch_size: 4 25 | val_batch_size: 4 26 | test_batch_size: 4 27 | train_num_workers: 4 28 | val_num_workers: 1 29 | test_num_workers: 1 30 | 31 | model: 32 | _target_: codec_evaluation.probe.model.ctc_lit_prober.CodecCTCProbe 33 | codec_name: ${codec_name} 34 | sample_rate: ${sample_rate} 35 | mode: ${mode} 36 | tokenizer: 37 | _target_: transformers.Speech2TextProcessor.from_pretrained 38 | pretrained_model_name_or_path: ??? 39 | probe_model_builder: 40 | _target_: codec_evaluation.probe.model.ctc_model.Ctc_Probe 41 | _partial_: true 42 | vocab_size: 10000 43 | codec_vocab_size: 1024 44 | dropout: 0.1 45 | lm_head_nums: 8 46 | conformer_depth: 3 47 | conformer_heads: 8 48 | model_ckpt_dir: ??? 
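# NOTE (added explanation, not in the original): lm_head_nums: 8 with
# codec_vocab_size: 1024 matches SpeechTokenizer's eight RVQ levels of 1024
# entries each, so the probe appears to attach one CTC head per quantizer level.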
49 | 50 | optimizer_builder: 51 | _target_: torch.optim.AdamW 52 | _partial_: true 53 | lr: 1e-4 54 | betas: [0.8, 0.99] 55 | eps: 1e-5 56 | weight_decay: 0.08 57 | 58 | lr_scheduler_builder: 59 | _target_: torch.optim.lr_scheduler.LambdaLR 60 | _partial_: true 61 | lr_lambda: 62 | _target_: codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda 63 | _partial_: true 64 | num_warmup_steps: 200 65 | num_training_steps: 4000 66 | final_lr_ratio: 0.2 67 | 68 | callbacks: 69 | learning_rate_monitor: 70 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 71 | logging_interval: step 72 | 73 | rich_progress_bar: 74 | _target_: pytorch_lightning.callbacks.RichProgressBar 75 | 76 | model_summary: 77 | _target_: pytorch_lightning.callbacks.ModelSummary 78 | max_depth: 1 79 | 80 | model_checkpoint: 81 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 82 | monitor: val_loss 83 | dirpath: ${probe_ckpt_dir} 84 | every_n_epochs: 1 85 | mode: min 86 | save_top_k: 1 87 | filename: ${codec_name}_${mode}_{epoch}-{val_loss:.4f} 88 | verbose: True 89 | 90 | tensorboard: 91 | _target_: pytorch_lightning.loggers.TensorBoardLogger 92 | save_dir: ??? 93 | name: ${codec_name}_${mode} 94 | log_graph: true -------------------------------------------------------------------------------- /codec_evaluation/probe/config/MTT_dataset/encodec.yaml: -------------------------------------------------------------------------------- 1 | mode: quantized_emb 2 | sample_rate: 16000 3 | target_sec: 10 4 | num_outputs: 50 5 | probe_ckpt_dir: ??? 6 | seed: 666 7 | codec_name: encodec 8 | task: multilabel 9 | save_result: null 10 | 11 | trainer: 12 | _target_: pytorch_lightning.Trainer 13 | accelerator: gpu 14 | devices: ??? 15 | precision: 32 16 | max_epochs: 50 17 | limit_val_batches: 10 18 | log_every_n_steps: 5 19 | val_check_interval: 1.0 20 | 21 | data: 22 | _target_: codec_evaluation.probe.dataset.MTT_dataset.MTT_dataset.MTTdataModule 23 | dataset_args: 24 | sample_rate: ${sample_rate} 25 | target_sec: ${target_sec} 26 | train_audio_dir: ??? 27 | val_audio_dir: ??? 28 | test_audio_dir: ??? 29 | train_batch_size: 64 30 | val_batch_size: 8 31 | test_batch_size: 64 32 | train_num_workers: 16 33 | val_num_workers: 4 34 | test_num_workers: 16 35 | 36 | model: 37 | _target_: codec_evaluation.probe.model.lit_prober.Prober 38 | codec_name: ${codec_name} 39 | sample_rate: ${sample_rate} 40 | mode: ${mode} 41 | task: ${task} 42 | num_outputs: ${num_outputs} 43 | probe_model_builder: 44 | _target_: codec_evaluation.probe.model.multilabel_model.MultilabelProber 45 | _partial_: true 46 | num_outputs: ${num_outputs} 47 | drop_out: 0.2 48 | channel_reduction: 16 49 | padding: 1 50 | kernel_size: 3 51 | stride: 1 52 | target_sec: ${target_sec} 53 | model_ckpt_dir: ??? 
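# NOTE (minimal sketch, not part of the original config; the real implementation
# lives in codec_evaluation.utils.schedule): the lr_lambda below is expected to
# scale the base lr linearly during warmup, then decay along a cosine that
# bottoms out at final_lr_ratio, roughly:
#   def lr_lambda(step):
#       if step < num_warmup_steps:
#           return step / max(1, num_warmup_steps)
#       progress = (step - num_warmup_steps) / max(1, num_training_steps - num_warmup_steps)
#       return final_lr_ratio + (1 - final_lr_ratio) * 0.5 * (1 + math.cos(math.pi * progress))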
54 | 55 | optimizer_builder: 56 | _target_: torch.optim.AdamW 57 | _partial_: true 58 | lr: 1e-4 59 | betas: [0.8, 0.99] 60 | eps: 1e-5 61 | weight_decay: 0.08 62 | 63 | lr_scheduler_builder: 64 | _target_: torch.optim.lr_scheduler.LambdaLR 65 | _partial_: true 66 | lr_lambda: 67 | _target_: codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda 68 | _partial_: true 69 | num_warmup_steps: 10 70 | num_training_steps: 10000 71 | final_lr_ratio: 0.2 72 | 73 | callbacks: 74 | learning_rate_monitor: 75 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 76 | logging_interval: step 77 | 78 | rich_progress_bar: 79 | _target_: pytorch_lightning.callbacks.RichProgressBar 80 | 81 | model_summary: 82 | _target_: pytorch_lightning.callbacks.ModelSummary 83 | max_depth: 1 84 | 85 | model_checkpoint: 86 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 87 | monitor: val_loss 88 | dirpath: ${probe_ckpt_dir} 89 | every_n_epochs: 1 90 | mode: min 91 | save_top_k: 1 92 | filename: ${codec_name}_${mode}_{epoch}-{val_loss:.4f} 93 | verbose: True 94 | 95 | tensorboard: 96 | _target_: pytorch_lightning.loggers.TensorBoardLogger 97 | save_dir: ??? 98 | name: ${codec_name}_${mode} 99 | log_graph: true -------------------------------------------------------------------------------- /codec_evaluation/codecs/YuE/RepCodec/repcodec/layers/conv_layer.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | 3 | class Conv1d1x1(nn.Conv1d): 4 | """1x1 Conv1d.""" 5 | 6 | def __init__(self, in_channels, out_channels, bias=True): 7 | super(Conv1d1x1, self).__init__(in_channels, out_channels, kernel_size=1, bias=bias) 8 | 9 | class Conv1d(nn.Module): 10 | def __init__( 11 | self, 12 | in_channels: int, 13 | out_channels: int, 14 | kernel_size: int, 15 | stride: int = 1, 16 | padding: int = -1, 17 | dilation: int = 1, 18 | groups: int = 1, 19 | bias: bool = True 20 | ): 21 | super().__init__() 22 | self.in_channels = in_channels 23 | self.out_channels = out_channels 24 | self.kernel_size = kernel_size 25 | if padding < 0: 26 | padding = (kernel_size - 1) // 2 * dilation 27 | self.dilation = dilation 28 | self.conv = nn.Conv1d( 29 | in_channels=in_channels, 30 | out_channels=out_channels, 31 | kernel_size=kernel_size, 32 | stride=stride, 33 | padding=padding, 34 | dilation=dilation, 35 | groups=groups, 36 | bias=bias, 37 | ) 38 | 39 | def forward(self, x): 40 | """ 41 | Args: 42 | x (Tensor): Float tensor variable with the shape (B, C, T). 43 | Returns: 44 | Tensor: Float tensor variable with the shape (B, C, T). 45 | """ 46 | x = self.conv(x) 47 | return x 48 | 49 | 50 | class ConvTranspose1d(nn.Module): 51 | def __init__( 52 | self, 53 | in_channels: int, 54 | out_channels: int, 55 | kernel_size: int, 56 | stride: int, 57 | padding=-1, 58 | output_padding=-1, 59 | groups=1, 60 | bias=True, 61 | ): 62 | super().__init__() 63 | if padding < 0: 64 | padding = (stride + 1) // 2 65 | if output_padding < 0: 66 | output_padding = 1 if stride % 2 else 0 67 | self.deconv = nn.ConvTranspose1d( 68 | in_channels=in_channels, 69 | out_channels=out_channels, 70 | kernel_size=kernel_size, 71 | stride=stride, 72 | padding=padding, 73 | output_padding=output_padding, 74 | groups=groups, 75 | bias=bias, 76 | ) 77 | 78 | def forward(self, x): 79 | """ 80 | Args: 81 | x (Tensor): Float tensor variable with the shape (B, C, T). 82 | Returns: 83 | Tensor: Float tensor variable with the shape (B, C', T'). 
84 | """ 85 | x = self.deconv(x) 86 | return x 87 | -------------------------------------------------------------------------------- /codec_evaluation/probe/config/NSynthI_dataset/dac.yaml: -------------------------------------------------------------------------------- 1 | mode: quantized_emb 2 | sample_rate: 16000 3 | target_sec: 4 4 | num_outputs: 11 5 | probe_ckpt_dir: ??? 6 | seed: 666 7 | codec_name: dac 8 | task: multiclass 9 | save_result: null 10 | 11 | trainer: 12 | _target_: pytorch_lightning.Trainer 13 | accelerator: gpu 14 | devices: ??? 15 | precision: 32 16 | max_epochs: 100 17 | limit_val_batches: 10 18 | log_every_n_steps: 5 19 | val_check_interval: 1.0 20 | 21 | data: 22 | _target_: codec_evaluation.probe.dataset.NSynthI_dataset.NSynthI_dataset.NSynthIdataModule 23 | dataset_args: 24 | sample_rate: ${sample_rate} 25 | target_sec: ${target_sec} 26 | train_audio_dir: ??? 27 | val_audio_dir: ??? 28 | test_audio_dir: ??? 29 | train_batch_size: 256 30 | val_batch_size: 32 31 | test_batch_size: 256 32 | train_num_workers: 8 33 | val_num_workers: 4 34 | test_num_workers: 4 35 | 36 | model: 37 | _target_: codec_evaluation.probe.model.lit_prober.Prober 38 | codec_name: ${codec_name} 39 | sample_rate: ${sample_rate} 40 | mode: ${mode} 41 | task: ${task} 42 | num_outputs: ${num_outputs} 43 | probe_model_builder: 44 | _target_: codec_evaluation.probe.model.multiclass_model.MulticlassProber 45 | _partial_: true 46 | num_outputs: ${num_outputs} 47 | drop_out: 0.1 48 | channel_reduction: 16 49 | padding: 1 50 | kernel_size: 3 51 | stride: 1 52 | target_sec: ${target_sec} 53 | model_ckpt_dir: ??? 54 | 55 | optimizer_builder: 56 | _target_: torch.optim.AdamW 57 | _partial_: true 58 | lr: 1e-4 59 | betas: [0.8, 0.99] 60 | eps: 1e-5 61 | weight_decay: 0.08 62 | 63 | lr_scheduler_builder: 64 | _target_: torch.optim.lr_scheduler.LambdaLR 65 | _partial_: true 66 | lr_lambda: 67 | _target_: codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda 68 | _partial_: true 69 | num_warmup_steps: 10 70 | num_training_steps: 10000 71 | final_lr_ratio: 0.2 72 | 73 | callbacks: 74 | learning_rate_monitor: 75 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 76 | logging_interval: step 77 | 78 | rich_progress_bar: 79 | _target_: pytorch_lightning.callbacks.RichProgressBar 80 | 81 | model_summary: 82 | _target_: pytorch_lightning.callbacks.ModelSummary 83 | max_depth: 1 84 | 85 | model_checkpoint: 86 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 87 | monitor: val_loss 88 | dirpath: ${probe_ckpt_dir} 89 | every_n_epochs: 1 90 | mode: min 91 | save_top_k: 1 92 | filename: ${codec_name}_${mode}_{epoch}-{val_loss:.4f} 93 | verbose: True 94 | 95 | tensorboard: 96 | _target_: pytorch_lightning.loggers.TensorBoardLogger 97 | save_dir: ??? 98 | name: ${codec_name}_${mode} 99 | log_graph: true -------------------------------------------------------------------------------- /codec_evaluation/probe/config/NSynthP_dataset/dac.yaml: -------------------------------------------------------------------------------- 1 | mode: quantized_emb 2 | sample_rate: 16000 3 | target_sec: 4 4 | num_outputs: 128 5 | probe_ckpt_dir: ??? 6 | seed: 666 7 | codec_name: dac 8 | task: multiclass 9 | save_result: null 10 | 11 | trainer: 12 | _target_: pytorch_lightning.Trainer 13 | accelerator: gpu 14 | devices: ??? 
15 | precision: 32 16 | max_epochs: 100 17 | limit_val_batches: 10 18 | log_every_n_steps: 5 19 | val_check_interval: 1.0 20 | 21 | data: 22 | _target_: codec_evaluation.probe.dataset.NSynthP_dataset.NSynthP_dataset.NSynthPdataModule 23 | dataset_args: 24 | sample_rate: ${sample_rate} 25 | target_sec: ${target_sec} 26 | train_audio_dir: ??? 27 | val_audio_dir: ??? 28 | test_audio_dir: ??? 29 | train_batch_size: 256 30 | val_batch_size: 32 31 | test_batch_size: 128 32 | train_num_workers: 128 33 | val_num_workers: 8 34 | test_num_workers: 64 35 | 36 | model: 37 | _target_: codec_evaluation.probe.model.lit_prober.Prober 38 | codec_name: ${codec_name} 39 | sample_rate: ${sample_rate} 40 | mode: ${mode} 41 | task: ${task} 42 | num_outputs: ${num_outputs} 43 | probe_model_builder: 44 | _target_: codec_evaluation.probe.model.multiclass_model.MulticlassProber 45 | _partial_: true 46 | num_outputs: ${num_outputs} 47 | drop_out: 0.1 48 | channel_reduction: 16 49 | padding: 1 50 | kernel_size: 3 51 | stride: 1 52 | target_sec: ${target_sec} 53 | model_ckpt_dir: ??? 54 | 55 | optimizer_builder: 56 | _target_: torch.optim.AdamW 57 | _partial_: true 58 | lr: 1e-4 59 | betas: [0.8, 0.99] 60 | eps: 1e-5 61 | weight_decay: 0.08 62 | 63 | lr_scheduler_builder: 64 | _target_: torch.optim.lr_scheduler.LambdaLR 65 | _partial_: true 66 | lr_lambda: 67 | _target_: codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda 68 | _partial_: true 69 | num_warmup_steps: 10 70 | num_training_steps: 10000 71 | final_lr_ratio: 0.2 72 | 73 | callbacks: 74 | learning_rate_monitor: 75 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 76 | logging_interval: step 77 | 78 | rich_progress_bar: 79 | _target_: pytorch_lightning.callbacks.RichProgressBar 80 | 81 | model_summary: 82 | _target_: pytorch_lightning.callbacks.ModelSummary 83 | max_depth: 1 84 | 85 | model_checkpoint: 86 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 87 | monitor: val_loss 88 | dirpath: ${probe_ckpt_dir} 89 | every_n_epochs: 1 90 | mode: min 91 | save_top_k: 1 92 | filename: ${codec_name}_${mode}_{epoch}-{val_loss:.4f} 93 | verbose: True 94 | 95 | tensorboard: 96 | _target_: pytorch_lightning.loggers.TensorBoardLogger 97 | save_dir: ??? 98 | name: ${codec_name}_${mode} 99 | log_graph: true -------------------------------------------------------------------------------- /codec_evaluation/probe/config/GTZAN_dataset/wavtokenizer.yaml: -------------------------------------------------------------------------------- 1 | mode: quantized_emb 2 | sample_rate: 22050 3 | target_sec: 10 4 | num_outputs: 10 5 | probe_ckpt_dir: ??? 6 | seed: 666 7 | codec_name: wavtokenizer 8 | task: multiclass 9 | 10 | trainer: 11 | _target_: pytorch_lightning.Trainer 12 | accelerator: gpu 13 | devices: ??? 14 | precision: 32 15 | max_epochs: 100 16 | limit_val_batches: 10 17 | log_every_n_steps: 5 18 | val_check_interval: 1.0 19 | 20 | data: 21 | _target_: codec_evaluation.probe.dataset.GTZAN_dataset.GTZAN_dataset.GTZANdataModule 22 | dataset_args: 23 | sample_rate: ${sample_rate} 24 | target_sec: ${target_sec} 25 | train_audio_dir: ??? 26 | val_audio_dir: ??? 27 | test_audio_dir: ??? 
28 | train_batch_size: 8 29 | val_batch_size: 1 30 | test_batch_size: 8 31 | train_num_workers: 8 32 | val_num_workers: 4 33 | test_num_workers: 4 34 | 35 | model: 36 | _target_: codec_evaluation.probe.model.lit_prober.Prober 37 | codec_name: ${codec_name} 38 | sample_rate: ${sample_rate} 39 | mode: ${mode} 40 | task: ${task} 41 | num_outputs: ${num_outputs} 42 | probe_model_builder: 43 | _target_: codec_evaluation.probe.model.multiclass_model.MulticlassProber 44 | _partial_: true 45 | num_outputs: ${num_outputs} 46 | drop_out: 0.1 47 | channel_reduction: 16 48 | padding: 1 49 | kernel_size: 3 50 | stride: 1 51 | target_sec: ${target_sec} 52 | model_ckpt_dir: ??? 53 | 54 | optimizer_builder: 55 | _target_: torch.optim.AdamW 56 | _partial_: true 57 | lr: 1e-4 58 | betas: [0.8, 0.99] 59 | eps: 1e-5 60 | weight_decay: 0.08 61 | 62 | lr_scheduler_builder: 63 | _target_: torch.optim.lr_scheduler.LambdaLR 64 | _partial_: true 65 | lr_lambda: 66 | _target_: codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda 67 | _partial_: true 68 | num_warmup_steps: 10 69 | num_training_steps: 10000 70 | final_lr_ratio: 0.2 71 | 72 | callbacks: 73 | learning_rate_monitor: 74 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 75 | logging_interval: step 76 | 77 | rich_progress_bar: 78 | _target_: pytorch_lightning.callbacks.RichProgressBar 79 | 80 | model_summary: 81 | _target_: pytorch_lightning.callbacks.ModelSummary 82 | max_depth: 1 83 | 84 | model_checkpoint: 85 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 86 | monitor: val_loss 87 | dirpath: ${probe_ckpt_dir} 88 | every_n_epochs: 1 89 | mode: min 90 | save_top_k: 1 91 | save_last: False 92 | filename: ${codec_name}_${mode}_{epoch}-{val_loss:.4f} 93 | verbose: True 94 | 95 | tensorboard: 96 | _target_: pytorch_lightning.loggers.TensorBoardLogger 97 | save_dir: ??? 98 | name: ${codec_name}_${mode} 99 | log_graph: true -------------------------------------------------------------------------------- /codec_evaluation/probe/config/NSynthI_dataset/encodec.yaml: -------------------------------------------------------------------------------- 1 | mode: quantized_emb 2 | sample_rate: 16000 3 | target_sec: 4 4 | num_outputs: 11 5 | probe_ckpt_dir: ??? 6 | seed: 666 7 | codec_name: encodec 8 | task: multiclass 9 | save_result: null 10 | 11 | trainer: 12 | _target_: pytorch_lightning.Trainer 13 | accelerator: gpu 14 | devices: ??? 15 | precision: 32 16 | max_epochs: 100 17 | limit_val_batches: 10 18 | log_every_n_steps: 5 19 | val_check_interval: 1.0 20 | 21 | data: 22 | _target_: codec_evaluation.probe.dataset.NSynthI_dataset.NSynthI_dataset.NSynthIdataModule 23 | dataset_args: 24 | sample_rate: ${sample_rate} 25 | target_sec: ${target_sec} 26 | train_audio_dir: ??? 27 | val_audio_dir: ??? 28 | test_audio_dir: ??? 29 | train_batch_size: 256 30 | val_batch_size: 32 31 | test_batch_size: 256 32 | train_num_workers: 8 33 | val_num_workers: 4 34 | test_num_workers: 4 35 | 36 | model: 37 | _target_: codec_evaluation.probe.model.lit_prober.Prober 38 | codec_name: ${codec_name} 39 | sample_rate: ${sample_rate} 40 | mode: ${mode} 41 | task: ${task} 42 | num_outputs: ${num_outputs} 43 | probe_model_builder: 44 | _target_: codec_evaluation.probe.model.multiclass_model.MulticlassProber 45 | _partial_: true 46 | num_outputs: ${num_outputs} 47 | drop_out: 0.1 48 | channel_reduction: 16 49 | padding: 1 50 | kernel_size: 3 51 | stride: 1 52 | target_sec: ${target_sec} 53 | model_ckpt_dir: ??? 
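# NOTE (added explanation, not in the original): mode: quantized_emb suggests the
# prober consumes the codec's post-quantization embeddings rather than discrete
# token ids, and num_outputs: 11 corresponds to NSynth's 11 instrument-family
# classes.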
54 | 55 | optimizer_builder: 56 | _target_: torch.optim.AdamW 57 | _partial_: true 58 | lr: 1e-4 59 | betas: [0.8, 0.99] 60 | eps: 1e-5 61 | weight_decay: 0.08 62 | 63 | lr_scheduler_builder: 64 | _target_: torch.optim.lr_scheduler.LambdaLR 65 | _partial_: true 66 | lr_lambda: 67 | _target_: codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda 68 | _partial_: true 69 | num_warmup_steps: 10 70 | num_training_steps: 10000 71 | final_lr_ratio: 0.2 72 | 73 | callbacks: 74 | learning_rate_monitor: 75 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 76 | logging_interval: step 77 | 78 | rich_progress_bar: 79 | _target_: pytorch_lightning.callbacks.RichProgressBar 80 | 81 | model_summary: 82 | _target_: pytorch_lightning.callbacks.ModelSummary 83 | max_depth: 1 84 | 85 | model_checkpoint: 86 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 87 | monitor: val_loss 88 | dirpath: ${probe_ckpt_dir} 89 | every_n_epochs: 1 90 | mode: min 91 | save_top_k: 1 92 | filename: ${codec_name}_${mode}_{epoch}-{val_loss:.4f} 93 | verbose: True 94 | 95 | tensorboard: 96 | _target_: pytorch_lightning.loggers.TensorBoardLogger 97 | save_dir: ??? 98 | name: ${codec_name}_${mode} 99 | log_graph: true -------------------------------------------------------------------------------- /codec_evaluation/probe/config/NSynthI_dataset/semanticodec.yaml: -------------------------------------------------------------------------------- 1 | mode: quantized_emb 2 | sample_rate: 16000 3 | target_sec: 4 4 | num_outputs: 11 5 | probe_ckpt_dir: ??? 6 | seed: 666 7 | codec_name: semanticodec 8 | task: multiclass 9 | save_result: null 10 | 11 | trainer: 12 | _target_: pytorch_lightning.Trainer 13 | accelerator: gpu 14 | devices: ??? 15 | precision: 32 16 | max_epochs: 100 17 | limit_val_batches: 10 18 | log_every_n_steps: 5 19 | val_check_interval: 1.0 20 | 21 | data: 22 | _target_: codec_evaluation.probe.dataset.NSynthI_dataset.NSynthI_dataset.NSynthIdataModule 23 | dataset_args: 24 | sample_rate: ${sample_rate} 25 | target_sec: ${target_sec} 26 | train_audio_dir: ??? 27 | val_audio_dir: ??? 28 | test_audio_dir: ??? 29 | train_batch_size: 256 30 | val_batch_size: 32 31 | test_batch_size: 256 32 | train_num_workers: 8 33 | val_num_workers: 4 34 | test_num_workers: 4 35 | 36 | model: 37 | _target_: codec_evaluation.probe.model.lit_prober.Prober 38 | codec_name: ${codec_name} 39 | sample_rate: ${sample_rate} 40 | mode: ${mode} 41 | task: ${task} 42 | num_outputs: ${num_outputs} 43 | probe_model_builder: 44 | _target_: codec_evaluation.probe.model.multiclass_model.MulticlassProber 45 | _partial_: true 46 | num_outputs: ${num_outputs} 47 | drop_out: 0.1 48 | channel_reduction: 16 49 | padding: 1 50 | kernel_size: 3 51 | stride: 1 52 | target_sec: ${target_sec} 53 | model_ckpt_dir: ??? 
54 | 55 | optimizer_builder: 56 | _target_: torch.optim.AdamW 57 | _partial_: true 58 | lr: 1e-4 59 | betas: [0.8, 0.99] 60 | eps: 1e-5 61 | weight_decay: 0.08 62 | 63 | lr_scheduler_builder: 64 | _target_: torch.optim.lr_scheduler.LambdaLR 65 | _partial_: true 66 | lr_lambda: 67 | _target_: codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda 68 | _partial_: true 69 | num_warmup_steps: 10 70 | num_training_steps: 10000 71 | final_lr_ratio: 0.2 72 | 73 | callbacks: 74 | learning_rate_monitor: 75 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 76 | logging_interval: step 77 | 78 | rich_progress_bar: 79 | _target_: pytorch_lightning.callbacks.RichProgressBar 80 | 81 | model_summary: 82 | _target_: pytorch_lightning.callbacks.ModelSummary 83 | max_depth: 1 84 | 85 | model_checkpoint: 86 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 87 | monitor: val_loss 88 | dirpath: ${probe_ckpt_dir} 89 | every_n_epochs: 1 90 | mode: min 91 | save_top_k: 1 92 | filename: ${codec_name}_${mode}_{epoch}-{val_loss:.4f} 93 | verbose: True 94 | 95 | tensorboard: 96 | _target_: pytorch_lightning.loggers.TensorBoardLogger 97 | save_dir: ??? 98 | name: ${codec_name}_${mode} 99 | log_graph: true -------------------------------------------------------------------------------- /codec_evaluation/probe/config/NSynthI_dataset/wavtokenizer.yaml: -------------------------------------------------------------------------------- 1 | mode: quantized_emb 2 | sample_rate: 16000 3 | target_sec: 4 4 | num_outputs: 11 5 | probe_ckpt_dir: ??? 6 | seed: 666 7 | codec_name: wavtokenizer 8 | task: multiclass 9 | save_result: null 10 | 11 | trainer: 12 | _target_: pytorch_lightning.Trainer 13 | accelerator: gpu 14 | devices: ??? 15 | precision: 32 16 | max_epochs: 100 17 | limit_val_batches: 10 18 | log_every_n_steps: 5 19 | val_check_interval: 1.0 20 | 21 | data: 22 | _target_: codec_evaluation.probe.dataset.NSynthI_dataset.NSynthI_dataset.NSynthIdataModule 23 | dataset_args: 24 | sample_rate: ${sample_rate} 25 | target_sec: ${target_sec} 26 | train_audio_dir: ??? 27 | val_audio_dir: ??? 28 | test_audio_dir: ??? 29 | train_batch_size: 256 30 | val_batch_size: 32 31 | test_batch_size: 256 32 | train_num_workers: 8 33 | val_num_workers: 4 34 | test_num_workers: 4 35 | 36 | model: 37 | _target_: codec_evaluation.probe.model.lit_prober.Prober 38 | codec_name: ${codec_name} 39 | sample_rate: ${sample_rate} 40 | mode: ${mode} 41 | task: ${task} 42 | num_outputs: ${num_outputs} 43 | probe_model_builder: 44 | _target_: codec_evaluation.probe.model.multiclass_model.MulticlassProber 45 | _partial_: true 46 | num_outputs: ${num_outputs} 47 | drop_out: 0. 48 | channel_reduction: 16 49 | padding: 1 50 | kernel_size: 3 51 | stride: 1 52 | target_sec: ${target_sec} 53 | model_ckpt_dir: ??? 
54 | 55 | optimizer_builder: 56 | _target_: torch.optim.AdamW 57 | _partial_: true 58 | lr: 1e-4 59 | betas: [0.8, 0.99] 60 | eps: 1e-5 61 | weight_decay: 0.08 62 | 63 | lr_scheduler_builder: 64 | _target_: torch.optim.lr_scheduler.LambdaLR 65 | _partial_: true 66 | lr_lambda: 67 | _target_: codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda 68 | _partial_: true 69 | num_warmup_steps: 10 70 | num_training_steps: 10000 71 | final_lr_ratio: 0.2 72 | 73 | callbacks: 74 | learning_rate_monitor: 75 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 76 | logging_interval: step 77 | 78 | rich_progress_bar: 79 | _target_: pytorch_lightning.callbacks.RichProgressBar 80 | 81 | model_summary: 82 | _target_: pytorch_lightning.callbacks.ModelSummary 83 | max_depth: 1 84 | 85 | model_checkpoint: 86 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 87 | monitor: val_loss 88 | dirpath: ${probe_ckpt_dir} 89 | every_n_epochs: 1 90 | mode: min 91 | save_top_k: 1 92 | filename: ${codec_name}_${mode}_{epoch}-{val_loss:.4f} 93 | verbose: True 94 | 95 | tensorboard: 96 | _target_: pytorch_lightning.loggers.TensorBoardLogger 97 | save_dir: ??? 98 | name: ${codec_name}_${mode} 99 | log_graph: true -------------------------------------------------------------------------------- /codec_evaluation/probe/config/NSynthP_dataset/semanticodec.yaml: -------------------------------------------------------------------------------- 1 | mode: quantized_emb 2 | sample_rate: 16000 3 | target_sec: 4 4 | num_outputs: 128 5 | probe_ckpt_dir: ??? 6 | seed: 666 7 | codec_name: semanticodec 8 | task: multiclass 9 | save_result: null 10 | 11 | trainer: 12 | _target_: pytorch_lightning.Trainer 13 | accelerator: gpu 14 | devices: ??? 15 | precision: 32 16 | max_epochs: 100 17 | limit_val_batches: 10 18 | log_every_n_steps: 5 19 | val_check_interval: 1.0 20 | 21 | data: 22 | _target_: codec_evaluation.probe.dataset.NSynthP_dataset.NSynthP_dataset.NSynthPdataModule 23 | dataset_args: 24 | sample_rate: ${sample_rate} 25 | target_sec: ${target_sec} 26 | train_audio_dir: ??? 27 | val_audio_dir: ??? 28 | test_audio_dir: ??? 29 | train_batch_size: 256 30 | val_batch_size: 32 31 | test_batch_size: 128 32 | train_num_workers: 128 33 | val_num_workers: 8 34 | test_num_workers: 64 35 | 36 | model: 37 | _target_: codec_evaluation.probe.model.lit_prober.Prober 38 | codec_name: ${codec_name} 39 | sample_rate: ${sample_rate} 40 | mode: ${mode} 41 | task: ${task} 42 | num_outputs: ${num_outputs} 43 | probe_model_builder: 44 | _target_: codec_evaluation.probe.model.multiclass_model.MulticlassProber 45 | _partial_: true 46 | num_outputs: ${num_outputs} 47 | drop_out: 0.1 48 | channel_reduction: 16 49 | padding: 1 50 | kernel_size: 3 51 | stride: 1 52 | target_sec: ${target_sec} 53 | model_ckpt_dir: ??? 
54 | 55 | optimizer_builder: 56 | _target_: torch.optim.AdamW 57 | _partial_: true 58 | lr: 1e-4 59 | betas: [0.8, 0.99] 60 | eps: 1e-5 61 | weight_decay: 0.08 62 | 63 | lr_scheduler_builder: 64 | _target_: torch.optim.lr_scheduler.LambdaLR 65 | _partial_: true 66 | lr_lambda: 67 | _target_: codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda 68 | _partial_: true 69 | num_warmup_steps: 10 70 | num_training_steps: 10000 71 | final_lr_ratio: 0.2 72 | 73 | callbacks: 74 | learning_rate_monitor: 75 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 76 | logging_interval: step 77 | 78 | rich_progress_bar: 79 | _target_: pytorch_lightning.callbacks.RichProgressBar 80 | 81 | model_summary: 82 | _target_: pytorch_lightning.callbacks.ModelSummary 83 | max_depth: 1 84 | 85 | model_checkpoint: 86 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 87 | monitor: val_loss 88 | dirpath: ${probe_ckpt_dir} 89 | every_n_epochs: 1 90 | mode: min 91 | save_top_k: 1 92 | filename: ${codec_name}_${mode}_{epoch}-{val_loss:.4f} 93 | verbose: True 94 | 95 | tensorboard: 96 | _target_: pytorch_lightning.loggers.TensorBoardLogger 97 | save_dir: ??? 98 | name: ${codec_name}_${mode} 99 | log_graph: true -------------------------------------------------------------------------------- /codec_evaluation/probe/config/Common_Voice_dataset/xcodec.yaml: -------------------------------------------------------------------------------- 1 | mode: encode 2 | sample_rate: 48000 3 | probe_ckpt_dir: ??? 4 | seed: 666 5 | codec_name: xcodec 6 | 7 | trainer: 8 | _target_: pytorch_lightning.Trainer 9 | accelerator: gpu 10 | devices: ??? 11 | precision: 32 12 | max_epochs: 10 13 | limit_val_batches: 5 14 | log_every_n_steps: 20 15 | val_check_interval: 1.0 16 | 17 | data: 18 | _target_: codec_evaluation.probe.dataset.Common_Voice_dataset.Common_voice_dataset.Common_voice_module 19 | target_samplerate: ${sample_rate} 20 | train_audio_dir: ??? 21 | val_audio_dir: ??? 22 | test_audio_dir: ??? 23 | base_audio_dir: /root/path/for/audio 24 | train_batch_size: 4 25 | val_batch_size: 4 26 | test_batch_size: 4 27 | train_num_workers: 4 28 | val_num_workers: 1 29 | test_num_workers: 1 30 | 31 | model: 32 | _target_: codec_evaluation.probe.model.ctc_lit_prober.CodecCTCProbe 33 | codec_name: ${codec_name} 34 | sample_rate: ${sample_rate} 35 | mode: ${mode} 36 | tokenizer: 37 | _target_: transformers.Speech2TextProcessor.from_pretrained 38 | pretrained_model_name_or_path: ??? 39 | probe_model_builder: 40 | _target_: codec_evaluation.probe.model.ctc_model.Ctc_Probe 41 | _partial_: true 42 | vocab_size: 10000 43 | codec_vocab_size: 1024 44 | dropout: 0.1 45 | lm_head_nums: 8 46 | conformer_depth: 3 47 | conformer_heads: 8 48 | model_ckpt_dir: ??? 
49 |   teacher_ckpt_path: /codec_ckpt/path/for/xcodec/hubert_base_general_audio
50 | 
51 | optimizer_builder:
52 |   _target_: torch.optim.AdamW
53 |   _partial_: true
54 |   lr: 1e-4
55 |   betas: [0.8, 0.99]
56 |   eps: 1e-5
57 |   weight_decay: 0.08
58 | 
59 | lr_scheduler_builder:
60 |   _target_: torch.optim.lr_scheduler.LambdaLR
61 |   _partial_: true
62 |   lr_lambda:
63 |     _target_: codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda
64 |     _partial_: true
65 |     num_warmup_steps: 200
66 |     num_training_steps: 4000
67 |     final_lr_ratio: 0.2
68 | 
69 | callbacks:
70 |   learning_rate_monitor:
71 |     _target_: pytorch_lightning.callbacks.LearningRateMonitor
72 |     logging_interval: step
73 | 
74 |   rich_progress_bar:
75 |     _target_: pytorch_lightning.callbacks.RichProgressBar
76 | 
77 |   model_summary:
78 |     _target_: pytorch_lightning.callbacks.ModelSummary
79 |     max_depth: 1
80 | 
81 |   model_checkpoint:
82 |     _target_: pytorch_lightning.callbacks.ModelCheckpoint
83 |     monitor: val_loss
84 |     dirpath: ${probe_ckpt_dir}
85 |     every_n_epochs: 1
86 |     mode: min
87 |     save_top_k: 1
88 |     filename: ${codec_name}_${mode}_{epoch}-{val_loss:.4f}
89 |     verbose: True
90 | 
91 | tensorboard:
92 |   _target_: pytorch_lightning.loggers.TensorBoardLogger
93 |   save_dir: ???
94 |   name: ${codec_name}_${mode}
95 |   log_graph: true
-------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/Flow1dVAE/our_MERT_BESTRQ/run_training_eat.sh: --------------------------------------------------------------------------------
1 | WORKER_RANK=${1:-$INDEX}
2 | PLATFORM=${2:-'shef'}
3 | YAML_NAME_WITHOUT_EXT=${3:-'MERT_RVQ-VAE_CQT_95M'}
4 | TRAINING_SETTING=${4:-'MERT_RVQ-VAE_CQT'}
5 | MASTER_PROC_ADD=${5:-$CHIEF_IP}
6 | DIST_PORT=${6:-'25520'}
7 | # echo $PATH
8 | # export PATH=$PATH:./
9 | echo "worker rank ${WORKER_RANK}, master address ${MASTER_PROC_ADD}:${DIST_PORT}"
10 | 
11 | MAP_PROJ_DIR=$(pwd)
12 | echo $MAP_PROJ_DIR
13 | 
14 | NNODS=1
15 | BATCH_SIZE=12
16 | NUM_WOKERS=6
17 | 
18 | run_command_prefix=' '
19 | # Loading folders
20 | # 1. tsv files for audio paths
21 | # DATA_DIR=${MAP_PROJ_DIR}/data/audio_tsv
22 | DATA_DIR=${MAP_PROJ_DIR}/data/music4all_sh #audio_manifest
23 | # 2. working folder for saving checkpoints and loading config files
24 | CONFIG_DIR=${MAP_PROJ_DIR}/mert_fairseq/config/pretrain
25 | # 3. clustering labels for training data
26 | LABEL_ROOT_DIR=${MAP_PROJ_DIR}/data/encodec_labels/custom_audio_dataset
27 | 
28 | FAIRSEQ_PATH=${MAP_PROJ_DIR}/src/fairseq;
29 | SAVE_DIR=${MAP_PROJ_DIR}/data/fairseq_savedir/
30 | 
31 | case $YAML_NAME_WITHOUT_EXT in
32 |     EAT_pretraining_music_multinodes)
33 |         NNODS=4
34 |         NPROCES_PER_NODE=8
35 |         LABEL_RATE=25
36 |         BATCH_SIZE=12
37 |         ;;
38 |     *)
39 |         echo "Unknown running config: ${YAML_NAME_WITHOUT_EXT}"
40 |         exit 1
41 |         ;;
42 | esac
43 | 
44 | echo running $YAML_NAME_WITHOUT_EXT ..
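# NOTE (usage sketch, not part of the original script): all six arguments are
# positional with defaults, so a run on worker 0 might look like
#   bash run_training_eat.sh 0 shef EAT_pretraining_music_multinodes EAT 10.0.0.1 25520
# where 10.0.0.1 stands in for $CHIEF_IP; only EAT_pretraining_music_multinodes
# is accepted by the case block above, and the "EAT" tag is only used to name
# checkpoint and log directories.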
45 | 46 | mkdir -p ${SAVE_DIR} 47 | echo "checkpoint save at: ${SAVE_DIR}" 48 | cd ${SAVE_DIR} 49 | 50 | DISTRIBUTED_WORLD_SIZE=`expr ${NNODS} \* ${NPROCES_PER_NODE}` 51 | ACTUAL_WORKER_RANK=`expr ${WORKER_RANK} \* ${NPROCES_PER_NODE}` 52 | echo "worker rank ${WORKER_RANK}, master address ${MASTER_PROC_ADD}:${DIST_PORT}, actual rank ${ACTUAL_WORKER_RANK}" 53 | 54 | DATE_SUFFIX=`date +"%Y-%m-%d_%H-%M"` 55 | 56 | OMP_NUM_THREADS=6 ${run_command_prefix} \ 57 | python -u ${FAIRSEQ_PATH}/fairseq_cli/hydra_train.py \ 58 | --config-dir ${CONFIG_DIR} --config-name ${YAML_NAME_WITHOUT_EXT} \ 59 | common.user_dir=${MAP_PROJ_DIR}/mert_fairseq \ 60 | common.tensorboard_logdir=${MAP_PROJ_DIR}/logs/pretrain_tb_${TRAINING_SETTING}_${YAML_NAME_WITHOUT_EXT}_multinodes${NNODS} \ 61 | checkpoint.save_dir=${SAVE_DIR}/ckpt_${TRAINING_SETTING}_multinodes${NNODS}_${DATE_SUFFIX}/${YAML_NAME_WITHOUT_EXT} \ 62 | distributed_training.distributed_rank=${ACTUAL_WORKER_RANK} \ 63 | distributed_training.distributed_world_size=${DISTRIBUTED_WORLD_SIZE} \ 64 | distributed_training.distributed_num_procs=${DISTRIBUTED_WORLD_SIZE} \ 65 | distributed_training.nprocs_per_node=${NPROCES_PER_NODE} \ 66 | distributed_training.distributed_init_method="tcp://${CHIEF_IP}:${DIST_PORT}" \ 67 | task.data=${DATA_DIR} \ 68 | dataset.num_workers=${NUM_WOKERS} \ 69 | dataset.batch_size=${BATCH_SIZE} \ 70 | dataset.disable_validation=true \ 71 | 72 | # pip install h5py timm -i https://mirrors.tencent.com/pypi/simple/ -------------------------------------------------------------------------------- /codec_evaluation/probe/config/EMO_dataset/encodec.yaml: -------------------------------------------------------------------------------- 1 | mode: quantized_emb 2 | sample_rate: 44100 3 | target_sec: 9 4 | num_outputs: 2 5 | probe_ckpt_dir: ??? 6 | seed: 666 7 | codec_name: encodec 8 | task: regression 9 | save_result: null 10 | 11 | trainer: 12 | _target_: pytorch_lightning.Trainer 13 | accelerator: gpu 14 | devices: ??? 15 | precision: 32 16 | max_epochs: 100 17 | limit_val_batches: 10 18 | log_every_n_steps: 5 19 | val_check_interval: 1.0 20 | 21 | data: 22 | _target_: codec_evaluation.probe.dataset.EMO_dataset.EMO_dataset.EMOdataModule 23 | dataset_args: 24 | sample_rate: ${sample_rate} 25 | target_sec: ${target_sec} 26 | train_audio_dir: ??? 27 | val_audio_dir: ??? 28 | test_audio_dir: ??? 29 | train_batch_size: 32 30 | val_batch_size: 1 31 | test_batch_size: 32 32 | train_num_workers: 8 33 | val_num_workers: 4 34 | test_num_workers: 4 35 | 36 | model: 37 | _target_: codec_evaluation.probe.model.lit_prober.Prober 38 | codec_name: ${codec_name} 39 | sample_rate: ${sample_rate} 40 | mode: ${mode} 41 | task: ${task} 42 | num_outputs: ${num_outputs} 43 | probe_model_builder: 44 | _target_: codec_evaluation.probe.model.regression_model.RegressionProber 45 | _partial_: true 46 | num_outputs: ${num_outputs} 47 | drop_out: 0.1 48 | channel_reduction: 16 49 | padding: 1 50 | kernel_size: 3 51 | stride: 1 52 | target_sec: ${target_sec} 53 | model_ckpt_dir: ??? 
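# NOTE (added explanation, not in the original): task: regression with
# num_outputs: 2 matches the EMO (emomusic) benchmark, where the two targets
# are conventionally the clip's valence and arousal ratings.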
54 | 55 | optimizer_builder: 56 | _target_: torch.optim.AdamW 57 | _partial_: true 58 | lr: 1e-4 59 | betas: [0.8, 0.99] 60 | eps: 1e-5 61 | weight_decay: 0.08 62 | 63 | lr_scheduler_builder: 64 | _target_: torch.optim.lr_scheduler.LambdaLR 65 | _partial_: true 66 | lr_lambda: 67 | _target_: codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda 68 | _partial_: true 69 | num_warmup_steps: 10 70 | num_training_steps: 10000 71 | final_lr_ratio: 0.2 72 | 73 | callbacks: 74 | learning_rate_monitor: 75 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 76 | logging_interval: step 77 | 78 | rich_progress_bar: 79 | _target_: pytorch_lightning.callbacks.RichProgressBar 80 | 81 | model_summary: 82 | _target_: pytorch_lightning.callbacks.ModelSummary 83 | max_depth: 1 84 | 85 | model_checkpoint: 86 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 87 | monitor: val_loss 88 | dirpath: ${probe_ckpt_dir} 89 | every_n_epochs: 1 90 | mode: min 91 | save_top_k: 1 92 | save_last: False 93 | filename: ${codec_name}_${mode}_{epoch}-{val_loss:.4f} 94 | verbose: True 95 | 96 | tensorboard: 97 | _target_: pytorch_lightning.loggers.TensorBoardLogger 98 | save_dir: ??? 99 | name: ${codec_name}_${mode} 100 | log_graph: true -------------------------------------------------------------------------------- /codec_evaluation/probe/config/GS_dataset/YuE.yaml: -------------------------------------------------------------------------------- 1 | mode: quantized_emb 2 | sample_rate: 44100 3 | target_sec: 10 4 | num_outputs: 24 5 | probe_ckpt_dir: ??? 6 | seed: 666 7 | codec_name: yue 8 | task: multiclass 9 | save_result: null 10 | 11 | trainer: 12 | _target_: pytorch_lightning.Trainer 13 | accelerator: gpu 14 | devices: ??? 15 | precision: 32 16 | max_epochs: 100 17 | limit_val_batches: 10 18 | log_every_n_steps: 5 19 | val_check_interval: 1.0 20 | 21 | data: 22 | _target_: codec_evaluation.probe.dataset.GS_dataset.GS_dataset.GSdataModule 23 | dataset_args: 24 | sample_rate: ${sample_rate} 25 | target_sec: ${target_sec} 26 | train_audio_dir: ??? 27 | val_audio_dir: ??? 28 | test_audio_dir: ??? 29 | train_batch_size: 8 30 | val_batch_size: 1 31 | test_batch_size: 8 32 | train_num_workers: 4 33 | val_num_workers: 2 34 | test_num_workers: 2 35 | 36 | model: 37 | _target_: codec_evaluation.probe.model.lit_prober.Prober 38 | codec_name: ${codec_name} 39 | sample_rate: ${sample_rate} 40 | mode: ${mode} 41 | task: ${task} 42 | num_outputs: ${num_outputs} 43 | probe_model_builder: 44 | _target_: codec_evaluation.probe.model.multiclass_model.MulticlassProber 45 | _partial_: true 46 | num_outputs: ${num_outputs} 47 | drop_out: 0.1 48 | channel_reduction: 16 49 | padding: 1 50 | kernel_size: 3 51 | stride: 1 52 | target_sec: ${target_sec} 53 | model_ckpt_dir: ??? 
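# NOTE (added explanation, not in the original): num_outputs: 24 is consistent
# with GiantSteps key estimation treated as a flat multiclass problem over
# 12 tonics x {major, minor} = 24 key classes.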
54 | 
55 | optimizer_builder:
56 |   _target_: torch.optim.AdamW
57 |   _partial_: true
58 |   lr: 1e-4
59 |   betas: [0.8, 0.99]
60 |   eps: 1e-5
61 |   weight_decay: 0.08
62 | 
63 | lr_scheduler_builder:
64 |   _target_: torch.optim.lr_scheduler.LambdaLR
65 |   _partial_: true
66 |   lr_lambda:
67 |     _target_: codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda
68 |     _partial_: true
69 |     num_warmup_steps: 10
70 |     num_training_steps: 10000
71 |     final_lr_ratio: 0.2
72 | 
73 | callbacks:
74 |   learning_rate_monitor:
75 |     _target_: pytorch_lightning.callbacks.LearningRateMonitor
76 |     logging_interval: step
77 | 
78 |   rich_progress_bar:
79 |     _target_: pytorch_lightning.callbacks.RichProgressBar
80 | 
81 |   model_summary:
82 |     _target_: pytorch_lightning.callbacks.ModelSummary
83 |     max_depth: 1
84 | 
85 |   model_checkpoint:
86 |     _target_: pytorch_lightning.callbacks.ModelCheckpoint
87 |     monitor: val_loss
88 |     dirpath: ${probe_ckpt_dir}
89 |     every_n_epochs: 1
90 |     mode: min
91 |     save_top_k: 1
92 |     save_last: False
93 |     filename: ${codec_name}_${mode}_{epoch}-{val_loss:.4f}
94 |     verbose: True
95 | 
96 | tensorboard:
97 |   _target_: pytorch_lightning.loggers.TensorBoardLogger
98 |   save_dir: ???
99 |   name: ${codec_name}_${mode}
100 |   log_graph: true
-------------------------------------------------------------------------------- /codec_evaluation/probe/config/GS_dataset/dac.yaml: --------------------------------------------------------------------------------
1 | mode: quantized_emb
2 | sample_rate: 44100
3 | target_sec: 10
4 | num_outputs: 24
5 | probe_ckpt_dir: ???
6 | seed: 666
7 | codec_name: dac # change as needed
8 | task: multiclass
9 | save_result: null
10 | 
11 | trainer:
12 |   _target_: pytorch_lightning.Trainer
13 |   accelerator: gpu
14 |   devices: ???
15 |   precision: 32
16 |   max_epochs: 100
17 |   limit_val_batches: 10
18 |   log_every_n_steps: 5
19 |   val_check_interval: 1.0
20 | 
21 | data:
22 |   _target_: codec_evaluation.probe.dataset.GS_dataset.GS_dataset.GSdataModule
23 |   dataset_args:
24 |     sample_rate: ${sample_rate}
25 |     target_sec: ${target_sec}
26 |     train_audio_dir: ???
27 |     val_audio_dir: ???
28 |     test_audio_dir: ???
29 |   train_batch_size: 8
30 |   val_batch_size: 1
31 |   test_batch_size: 8
32 |   train_num_workers: 4
33 |   val_num_workers: 2
34 |   test_num_workers: 2
35 | 
36 | model:
37 |   _target_: codec_evaluation.probe.model.lit_prober.Prober
38 |   codec_name: ${codec_name}
39 |   sample_rate: ${sample_rate}
40 |   mode: ${mode}
41 |   task: ${task}
42 |   num_outputs: ${num_outputs}
43 |   probe_model_builder:
44 |     _target_: codec_evaluation.probe.model.multiclass_model.MulticlassProber
45 |     _partial_: true
46 |     num_outputs: ${num_outputs}
47 |     drop_out: 0.1
48 |     channel_reduction: 16
49 |     padding: 1
50 |     kernel_size: 3
51 |     stride: 1
52 |     target_sec: ${target_sec}
53 |   model_ckpt_dir: ???
54 | 55 | optimizer_builder: 56 | _target_: torch.optim.AdamW 57 | _partial_: true 58 | lr: 1e-4 59 | betas: [0.8, 0.99] 60 | eps: 1e-5 61 | weight_decay: 0.08 62 | 63 | lr_scheduler_builder: 64 | _target_: torch.optim.lr_scheduler.LambdaLR 65 | _partial_: true 66 | lr_lambda: 67 | _target_: codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda 68 | _partial_: true 69 | num_warmup_steps: 10 70 | num_training_steps: 10000 71 | final_lr_ratio: 0.2 72 | 73 | callbacks: 74 | learning_rate_monitor: 75 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 76 | logging_interval: step 77 | 78 | rich_progress_bar: 79 | _target_: pytorch_lightning.callbacks.RichProgressBar 80 | 81 | model_summary: 82 | _target_: pytorch_lightning.callbacks.ModelSummary 83 | max_depth: 1 84 | 85 | model_checkpoint: 86 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 87 | monitor: val_loss 88 | dirpath: ${probe_ckpt_dir} 89 | every_n_epochs: 1 90 | mode: min 91 | save_top_k: 1 92 | save_last: False 93 | filename: ${codec_name}_${mode}_{epoch}-{val_loss:.4f} 94 | verbose: True 95 | 96 | tensorboard: 97 | _target_: pytorch_lightning.loggers.TensorBoardLogger 98 | save_dir: ??? 99 | name: ${codec_name}_${mode} 100 | log_graph: true -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/stable_audio_tools/models/diffusion_prior.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | import typing as tp 3 | 4 | from .diffusion import ConditionedDiffusionModelWrapper 5 | from ..inference.generation import generate_diffusion_cond 6 | from ..inference.utils import prepare_audio 7 | 8 | import torch 9 | from torch.nn import functional as F 10 | from torchaudio import transforms as T 11 | 12 | # Define prior types enum 13 | class PriorType(Enum): 14 | MonoToStereo = 1 15 | 16 | class DiffusionPrior(ConditionedDiffusionModelWrapper): 17 | def __init__(self, *args, prior_type: PriorType=None, **kwargs): 18 | super().__init__(*args, **kwargs) 19 | self.prior_type = prior_type 20 | 21 | class MonoToStereoDiffusionPrior(DiffusionPrior): 22 | def __init__(self, *args, **kwargs): 23 | super().__init__(*args, prior_type=PriorType.MonoToStereo, **kwargs) 24 | 25 | def stereoize( 26 | self, 27 | audio: torch.Tensor, # (batch, channels, time) 28 | in_sr: int, 29 | steps: int, 30 | sampler_kwargs: dict = {}, 31 | ): 32 | """ 33 | Generate stereo audio from mono audio using a pre-trained diffusion prior 34 | 35 | Args: 36 | audio: The mono audio to convert to stereo 37 | in_sr: The sample rate of the input audio 38 | steps: The number of diffusion steps to run 39 | sampler_kwargs: Keyword arguments to pass to the diffusion sampler 40 | """ 41 | 42 | device = audio.device 43 | 44 | sample_rate = self.sample_rate 45 | 46 | # Resample input audio if necessary 47 | if in_sr != sample_rate: 48 | resample_tf = T.Resample(in_sr, sample_rate).to(audio.device) 49 | audio = resample_tf(audio) 50 | 51 | audio_length = audio.shape[-1] 52 | 53 | # Pad input audio to be compatible with the model 54 | min_length = self.min_input_length 55 | padded_input_length = audio_length + (min_length - (audio_length % min_length)) % min_length 56 | 57 | # Pad input audio to be compatible with the model 58 | if padded_input_length > audio_length: 59 | audio = F.pad(audio, (0, padded_input_length - audio_length)) 60 | 61 | # Make audio mono, duplicate to stereo 62 | dual_mono = audio.mean(1, keepdim=True).repeat(1, 
2, 1) 63 | 64 | if self.pretransform is not None: 65 | dual_mono = self.pretransform.encode(dual_mono) 66 | 67 | conditioning = {"source": [dual_mono]} 68 | 69 | stereo_audio = generate_diffusion_cond( 70 | self, 71 | conditioning_tensors=conditioning, 72 | steps=steps, 73 | sample_size=padded_input_length, 74 | sample_rate=sample_rate, 75 | device=device, 76 | **sampler_kwargs, 77 | ) 78 | 79 | return stereo_audio -------------------------------------------------------------------------------- /codec_evaluation/probe/config/EMO_dataset/YuE.yaml: -------------------------------------------------------------------------------- 1 | mode: quantized_emb 2 | sample_rate: 44100 3 | target_sec: 9 4 | num_outputs: 2 5 | probe_ckpt_dir: ??? 6 | seed: 666 7 | codec_name: yue # change as needed 8 | task: regression 9 | save_result: null 10 | 11 | trainer: 12 | _target_: pytorch_lightning.Trainer 13 | accelerator: gpu 14 | devices: ??? 15 | precision: 32 16 | max_epochs: 100 17 | limit_val_batches: 10 18 | log_every_n_steps: 5 19 | val_check_interval: 1.0 20 | 21 | data: 22 | _target_: codec_evaluation.probe.dataset.EMO_dataset.EMO_dataset.EMOdataModule 23 | dataset_args: 24 | sample_rate: ${sample_rate} 25 | target_sec: ${target_sec} 26 | train_audio_dir: ??? 27 | val_audio_dir: ??? 28 | test_audio_dir: ??? 29 | train_batch_size: 32 30 | val_batch_size: 1 31 | test_batch_size: 32 32 | train_num_workers: 8 33 | val_num_workers: 4 34 | test_num_workers: 4 35 | 36 | model: 37 | _target_: codec_evaluation.probe.model.lit_prober.Prober 38 | codec_name: ${codec_name} 39 | sample_rate: ${sample_rate} 40 | mode: ${mode} 41 | task: ${task} 42 | num_outputs: ${num_outputs} 43 | probe_model_builder: 44 | _target_: codec_evaluation.probe.model.regression_model.RegressionProber 45 | _partial_: true 46 | num_outputs: ${num_outputs} 47 | drop_out: 0.1 48 | channel_reduction: 16 49 | padding: 1 50 | kernel_size: 3 51 | stride: 1 52 | target_sec: ${target_sec} 53 | model_ckpt_dir: ??? 54 | 55 | optimizer_builder: 56 | _target_: torch.optim.AdamW 57 | _partial_: true 58 | lr: 1e-4 59 | betas: [0.8, 0.99] 60 | eps: 1e-5 61 | weight_decay: 0.08 62 | 63 | lr_scheduler_builder: 64 | _target_: torch.optim.lr_scheduler.LambdaLR 65 | _partial_: true 66 | lr_lambda: 67 | _target_: codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda 68 | _partial_: true 69 | num_warmup_steps: 10 70 | num_training_steps: 10000 71 | final_lr_ratio: 0.2 72 | 73 | callbacks: 74 | learning_rate_monitor: 75 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 76 | logging_interval: step 77 | 78 | rich_progress_bar: 79 | _target_: pytorch_lightning.callbacks.RichProgressBar 80 | 81 | model_summary: 82 | _target_: pytorch_lightning.callbacks.ModelSummary 83 | max_depth: 1 84 | 85 | model_checkpoint: 86 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 87 | monitor: val_loss 88 | dirpath: ${probe_ckpt_dir} 89 | every_n_epochs: 1 90 | mode: min 91 | save_top_k: 1 92 | save_last: False 93 | filename: ${codec_name}_${mode}_{epoch}-{val_loss:.4f} 94 | verbose: True 95 | 96 | tensorboard: 97 | _target_: pytorch_lightning.loggers.TensorBoardLogger 98 | save_dir: ???
99 | name: ${codec_name}_${mode} 100 | log_graph: true -------------------------------------------------------------------------------- /codec_evaluation/probe/config/EMO_dataset/dac.yaml: -------------------------------------------------------------------------------- 1 | mode: quantized_emb 2 | sample_rate: 44100 3 | target_sec: 9 4 | num_outputs: 2 5 | probe_ckpt_dir: ??? 6 | seed: 666 7 | codec_name: dac # change as needed 8 | task: regression 9 | save_result: null 10 | 11 | trainer: 12 | _target_: pytorch_lightning.Trainer 13 | accelerator: gpu 14 | devices: ??? 15 | precision: 32 16 | max_epochs: 100 17 | limit_val_batches: 10 18 | log_every_n_steps: 5 19 | val_check_interval: 1.0 20 | 21 | data: 22 | _target_: codec_evaluation.probe.dataset.EMO_dataset.EMO_dataset.EMOdataModule 23 | dataset_args: 24 | sample_rate: ${sample_rate} 25 | target_sec: ${target_sec} 26 | train_audio_dir: ??? 27 | val_audio_dir: ??? 28 | test_audio_dir: ??? 29 | train_batch_size: 32 30 | val_batch_size: 1 31 | test_batch_size: 32 32 | train_num_workers: 8 33 | val_num_workers: 4 34 | test_num_workers: 4 35 | 36 | model: 37 | _target_: codec_evaluation.probe.model.lit_prober.Prober 38 | codec_name: ${codec_name} 39 | sample_rate: ${sample_rate} 40 | mode: ${mode} 41 | task: ${task} 42 | num_outputs: ${num_outputs} 43 | probe_model_builder: 44 | _target_: codec_evaluation.probe.model.regression_model.RegressionProber 45 | _partial_: true 46 | num_outputs: ${num_outputs} 47 | drop_out: 0.1 48 | channel_reduction: 16 49 | padding: 1 50 | kernel_size: 3 51 | stride: 1 52 | target_sec: ${target_sec} 53 | model_ckpt_dir: ??? 54 | 55 | optimizer_builder: 56 | _target_: torch.optim.AdamW 57 | _partial_: true 58 | lr: 1e-4 59 | betas: [0.8, 0.99] 60 | eps: 1e-5 61 | weight_decay: 0.08 62 | 63 | lr_scheduler_builder: 64 | _target_: torch.optim.lr_scheduler.LambdaLR 65 | _partial_: true 66 | lr_lambda: 67 | _target_: codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda 68 | _partial_: true 69 | num_warmup_steps: 10 70 | num_training_steps: 10000 71 | final_lr_ratio: 0.2 72 | 73 | callbacks: 74 | learning_rate_monitor: 75 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 76 | logging_interval: step 77 | 78 | rich_progress_bar: 79 | _target_: pytorch_lightning.callbacks.RichProgressBar 80 | 81 | model_summary: 82 | _target_: pytorch_lightning.callbacks.ModelSummary 83 | max_depth: 1 84 | 85 | model_checkpoint: 86 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 87 | monitor: val_loss 88 | dirpath: ${probe_ckpt_dir} 89 | every_n_epochs: 1 90 | mode: min 91 | save_top_k: 1 92 | save_last: False 93 | filename: ${codec_name}_${mode}_{epoch}-{val_loss:.4f} 94 | verbose: True 95 | 96 | tensorboard: 97 | _target_: pytorch_lightning.loggers.TensorBoardLogger 98 | save_dir: ??? 99 | name: ${codec_name}_${mode} 100 | log_graph: true -------------------------------------------------------------------------------- /codec_evaluation/probe/config/EMO_dataset/semanticodec.yaml: -------------------------------------------------------------------------------- 1 | mode: quantized_emb 2 | sample_rate: 44100 3 | target_sec: 9 4 | num_outputs: 2 5 | probe_ckpt_dir: ??? 6 | seed: 666 7 | codec_name: semanticodec 8 | task: regression 9 | save_result: null 10 | 11 | trainer: 12 | _target_: pytorch_lightning.Trainer 13 | accelerator: gpu 14 | devices: ???
15 | precision: 32 16 | max_epochs: 100 17 | limit_val_batches: 10 18 | log_every_n_steps: 5 19 | val_check_interval: 1.0 20 | 21 | data: 22 | _target_: codec_evaluation.probe.dataset.EMO_dataset.EMO_dataset.EMOdataModule 23 | dataset_args: 24 | sample_rate: ${sample_rate} 25 | target_sec: ${target_sec} 26 | train_audio_dir: ??? 27 | val_audio_dir: ??? 28 | test_audio_dir: ??? 29 | train_batch_size: 32 30 | val_batch_size: 1 31 | test_batch_size: 32 32 | train_num_workers: 8 33 | val_num_workers: 4 34 | test_num_workers: 4 35 | 36 | model: 37 | _target_: codec_evaluation.probe.model.lit_prober.Prober 38 | codec_name: ${codec_name} 39 | sample_rate: ${sample_rate} 40 | mode: ${mode} 41 | task: ${task} 42 | num_outputs: ${num_outputs} 43 | probe_model_builder: 44 | _target_: codec_evaluation.probe.model.regression_model.RegressionProber 45 | _partial_: true 46 | num_outputs: ${num_outputs} 47 | drop_out: 0.1 48 | channel_reduction: 16 49 | padding: 1 50 | kernel_size: 3 51 | stride: 1 52 | target_sec: ${target_sec} 53 | model_ckpt_dir: ??? 54 | 55 | optimizer_builder: 56 | _target_: torch.optim.AdamW 57 | _partial_: true 58 | lr: 1e-4 59 | betas: [0.8, 0.99] 60 | eps: 1e-5 61 | weight_decay: 0.08 62 | 63 | lr_scheduler_builder: 64 | _target_: torch.optim.lr_scheduler.LambdaLR 65 | _partial_: true 66 | lr_lambda: 67 | _target_: codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda 68 | _partial_: true 69 | num_warmup_steps: 10 70 | num_training_steps: 10000 71 | final_lr_ratio: 0.2 72 | 73 | callbacks: 74 | learning_rate_monitor: 75 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 76 | logging_interval: step 77 | 78 | rich_progress_bar: 79 | _target_: pytorch_lightning.callbacks.RichProgressBar 80 | 81 | model_summary: 82 | _target_: pytorch_lightning.callbacks.ModelSummary 83 | max_depth: 1 84 | 85 | model_checkpoint: 86 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 87 | monitor: val_loss 88 | dirpath: ${probe_ckpt_dir} 89 | every_n_epochs: 1 90 | mode: min 91 | save_top_k: 1 92 | save_last: False 93 | filename: ${codec_name}_${mode}_{epoch}-{val_loss:.4f} 94 | verbose: True 95 | 96 | tensorboard: 97 | _target_: pytorch_lightning.loggers.TensorBoardLogger 98 | save_dir: ??? 99 | name: ${codec_name}_${mode} 100 | log_graph: true -------------------------------------------------------------------------------- /codec_evaluation/probe/config/EMO_dataset/wavtokenizer.yaml: -------------------------------------------------------------------------------- 1 | mode: quantized_emb 2 | sample_rate: 44100 3 | target_sec: 9 4 | num_outputs: 2 5 | probe_ckpt_dir: ??? 6 | seed: 666 7 | codec_name: wavtokenizer 8 | task: regression 9 | save_result: null 10 | 11 | trainer: 12 | _target_: pytorch_lightning.Trainer 13 | accelerator: gpu 14 | devices: ??? 15 | precision: 32 16 | max_epochs: 100 17 | limit_val_batches: 10 18 | log_every_n_steps: 5 19 | val_check_interval: 1.0 20 | 21 | data: 22 | _target_: codec_evaluation.probe.dataset.EMO_dataset.EMO_dataset.EMOdataModule 23 | dataset_args: 24 | sample_rate: ${sample_rate} 25 | target_sec: ${target_sec} 26 | train_audio_dir: ??? 27 | val_audio_dir: ??? 28 | test_audio_dir: ??? 
29 | train_batch_size: 32 30 | val_batch_size: 1 31 | test_batch_size: 32 32 | train_num_workers: 8 33 | val_num_workers: 4 34 | test_num_workers: 4 35 | 36 | model: 37 | _target_: codec_evaluation.probe.model.lit_prober.Prober 38 | codec_name: ${codec_name} 39 | sample_rate: ${sample_rate} 40 | mode: ${mode} 41 | task: ${task} 42 | num_outputs: ${num_outputs} 43 | probe_model_builder: 44 | _target_: codec_evaluation.probe.model.regression_model.RegressionProber 45 | _partial_: true 46 | num_outputs: ${num_outputs} 47 | drop_out: 0.1 48 | channel_reduction: 16 49 | padding: 1 50 | kernel_size: 3 51 | stride: 1 52 | target_sec: ${target_sec} 53 | model_ckpt_dir: ??? 54 | 55 | optimizer_builder: 56 | _target_: torch.optim.AdamW 57 | _partial_: true 58 | lr: 1e-4 59 | betas: [0.8, 0.99] 60 | eps: 1e-5 61 | weight_decay: 0.08 62 | 63 | lr_scheduler_builder: 64 | _target_: torch.optim.lr_scheduler.LambdaLR 65 | _partial_: true 66 | lr_lambda: 67 | _target_: codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda 68 | _partial_: true 69 | num_warmup_steps: 10 70 | num_training_steps: 10000 71 | final_lr_ratio: 0.2 72 | 73 | callbacks: 74 | learning_rate_monitor: 75 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 76 | logging_interval: step 77 | 78 | rich_progress_bar: 79 | _target_: pytorch_lightning.callbacks.RichProgressBar 80 | 81 | model_summary: 82 | _target_: pytorch_lightning.callbacks.ModelSummary 83 | max_depth: 1 84 | 85 | model_checkpoint: 86 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 87 | monitor: val_loss 88 | dirpath: ${probe_ckpt_dir} 89 | every_n_epochs: 1 90 | mode: min 91 | save_top_k: 1 92 | save_last: False 93 | filename: ${codec_name}_${mode}_{epoch}-{val_loss:.4f} 94 | verbose: True 95 | 96 | tensorboard: 97 | _target_: pytorch_lightning.loggers.TensorBoardLogger 98 | save_dir: ??? 99 | name: ${codec_name}_${mode} 100 | log_graph: true -------------------------------------------------------------------------------- /codec_evaluation/probe/config/GTZAN_dataset/YuE.yaml: -------------------------------------------------------------------------------- 1 | mode: quantized_emb 2 | sample_rate: 22050 3 | target_sec: 10 4 | num_outputs: 10 5 | probe_ckpt_dir: ??? 6 | seed: 666 7 | codec_name: yue 8 | task: multiclass 9 | save_result: null 10 | 11 | trainer: 12 | _target_: pytorch_lightning.Trainer 13 | accelerator: gpu 14 | devices: ??? 15 | precision: 32 16 | max_epochs: 100 17 | limit_val_batches: 10 18 | log_every_n_steps: 5 19 | val_check_interval: 1.0 20 | 21 | data: 22 | _target_: codec_evaluation.probe.dataset.GTZAN_dataset.GTZAN_dataset.GTZANdataModule 23 | dataset_args: 24 | sample_rate: ${sample_rate} 25 | target_sec: ${target_sec} 26 | train_audio_dir: ??? 27 | val_audio_dir: ??? 28 | test_audio_dir: ??? 29 | train_batch_size: 8 30 | val_batch_size: 1 31 | test_batch_size: 8 32 | train_num_workers: 8 33 | val_num_workers: 4 34 | test_num_workers: 4 35 | 36 | model: 37 | _target_: codec_evaluation.probe.model.lit_prober.Prober 38 | codec_name: ${codec_name} 39 | sample_rate: ${sample_rate} 40 | mode: ${mode} 41 | task: ${task} 42 | num_outputs: ${num_outputs} 43 | probe_model_builder: 44 | _target_: codec_evaluation.probe.model.multiclass_model.MulticlassProber 45 | _partial_: true 46 | num_outputs: ${num_outputs} 47 | drop_out: 0.1 48 | channel_reduction: 16 49 | padding: 1 50 | kernel_size: 3 51 | stride: 1 52 | target_sec: ${target_sec} 53 | model_ckpt_dir: ??? 
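# The *_builder blocks that follow use the Hydra/OmegaConf instantiation convention: _target_
# names the callable to construct, and _partial_: true makes instantiation return a
# functools.partial rather than a finished object, so arguments that only exist at runtime can
# be bound later. A minimal sketch of how the optimizer side would be consumed, assuming these
# configs are resolved with hydra.utils.instantiate (the actual Prober code may differ):
#   import hydra
#   optim_builder = hydra.utils.instantiate(cfg.model.optimizer_builder)  # functools.partial(AdamW, lr=1e-4, ...)
#   optimizer = optim_builder(prober.parameters())  # model parameters are only known at runtime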
54 | 55 | optimizer_builder: 56 | _target_: torch.optim.AdamW 57 | _partial_: true 58 | lr: 1e-4 59 | betas: [0.8, 0.99] 60 | eps: 1e-5 61 | weight_decay: 0.08 62 | 63 | lr_scheduler_builder: 64 | _target_: torch.optim.lr_scheduler.LambdaLR 65 | _partial_: true 66 | lr_lambda: 67 | _target_: codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda 68 | _partial_: true 69 | num_warmup_steps: 10 70 | num_training_steps: 10000 71 | final_lr_ratio: 0.2 72 | 73 | callbacks: 74 | learning_rate_monitor: 75 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 76 | logging_interval: step 77 | 78 | rich_progress_bar: 79 | _target_: pytorch_lightning.callbacks.RichProgressBar 80 | 81 | model_summary: 82 | _target_: pytorch_lightning.callbacks.ModelSummary 83 | max_depth: 1 84 | 85 | model_checkpoint: 86 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 87 | monitor: val_loss 88 | dirpath: ${probe_ckpt_dir} 89 | every_n_epochs: 1 90 | mode: min 91 | save_top_k: 1 92 | save_last: False 93 | filename: ${codec_name}_${mode}_{epoch}-{val_loss:.4f} 94 | verbose: True 95 | 96 | tensorboard: 97 | _target_: pytorch_lightning.loggers.TensorBoardLogger 98 | save_dir: ??? 99 | name: ${codec_name}_${mode} 100 | log_graph: true -------------------------------------------------------------------------------- /codec_evaluation/probe/config/GTZAN_dataset/dac.yaml: -------------------------------------------------------------------------------- 1 | mode: quantized_emb 2 | sample_rate: 22050 3 | target_sec: 10 4 | num_outputs: 10 5 | probe_ckpt_dir: ??? 6 | seed: 666 7 | codec_name: dac 8 | task: multiclass 9 | save_result: null 10 | 11 | trainer: 12 | _target_: pytorch_lightning.Trainer 13 | accelerator: gpu 14 | devices: ??? 15 | precision: 32 16 | max_epochs: 100 17 | limit_val_batches: 10 18 | log_every_n_steps: 5 19 | val_check_interval: 1.0 20 | 21 | data: 22 | _target_: codec_evaluation.probe.dataset.GTZAN_dataset.GTZAN_dataset.GTZANdataModule 23 | dataset_args: 24 | sample_rate: ${sample_rate} 25 | target_sec: ${target_sec} 26 | train_audio_dir: ??? 27 | val_audio_dir: ??? 28 | test_audio_dir: ??? 29 | train_batch_size: 8 30 | val_batch_size: 1 31 | test_batch_size: 8 32 | train_num_workers: 8 33 | val_num_workers: 4 34 | test_num_workers: 4 35 | 36 | model: 37 | _target_: codec_evaluation.probe.model.lit_prober.Prober 38 | codec_name: ${codec_name} 39 | sample_rate: ${sample_rate} 40 | mode: ${mode} 41 | task: ${task} 42 | num_outputs: ${num_outputs} 43 | probe_model_builder: 44 | _target_: codec_evaluation.probe.model.multiclass_model.MulticlassProber 45 | _partial_: true 46 | num_outputs: ${num_outputs} 47 | drop_out: 0.1 48 | channel_reduction: 16 49 | padding: 1 50 | kernel_size: 3 51 | stride: 1 52 | target_sec: ${target_sec} 53 | model_ckpt_dir: ??? 
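# probe_model_builder above is likewise partial: the config pins the probe hyperparameters
# (kernel_size, stride, drop_out, and the 10 GTZAN genre outputs), while any codec-dependent
# argument such as the embedding dimension can be bound when Prober constructs the probe.
# A sketch of that wiring, as an assumption about lit_prober rather than a copy of it:
#   probe = self.probe_model_builder()  # codec-dependent args, if any, are bound here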
54 | 55 | optimizer_builder: 56 | _target_: torch.optim.AdamW 57 | _partial_: true 58 | lr: 1e-4 59 | betas: [0.8, 0.99] 60 | eps: 1e-5 61 | weight_decay: 0.08 62 | 63 | lr_scheduler_builder: 64 | _target_: torch.optim.lr_scheduler.LambdaLR 65 | _partial_: true 66 | lr_lambda: 67 | _target_: codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda 68 | _partial_: true 69 | num_warmup_steps: 10 70 | num_training_steps: 10000 71 | final_lr_ratio: 0.2 72 | 73 | callbacks: 74 | learning_rate_monitor: 75 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 76 | logging_interval: step 77 | 78 | rich_progress_bar: 79 | _target_: pytorch_lightning.callbacks.RichProgressBar 80 | 81 | model_summary: 82 | _target_: pytorch_lightning.callbacks.ModelSummary 83 | max_depth: 1 84 | 85 | model_checkpoint: 86 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 87 | monitor: val_loss 88 | dirpath: ${probe_ckpt_dir} 89 | every_n_epochs: 1 90 | mode: min 91 | save_top_k: 1 92 | save_last: False 93 | filename: ${codec_name}_${mode}_{epoch}-{val_loss:.4f} 94 | verbose: True 95 | 96 | tensorboard: 97 | _target_: pytorch_lightning.loggers.TensorBoardLogger 98 | save_dir: ??? 99 | name: ${codec_name}_${mode} 100 | log_graph: true -------------------------------------------------------------------------------- /codec_evaluation/probe/config/MELD_dataset/YuE.yaml: -------------------------------------------------------------------------------- 1 | mode: quantized_emb 2 | sample_rate: 48000 3 | target_sec: 8 4 | num_outputs: 7 5 | probe_ckpt_dir: ??? 6 | seed: 666 7 | codec_name: yue 8 | task: multiclass 9 | save_result: null 10 | 11 | trainer: 12 | _target_: pytorch_lightning.Trainer 13 | accelerator: gpu 14 | devices: ??? 15 | precision: 32 16 | max_epochs: 100 17 | limit_val_batches: 10 18 | log_every_n_steps: 5 19 | val_check_interval: 1.0 20 | 21 | data: 22 | _target_: codec_evaluation.probe.dataset.MELD_dataset.MELD_dataset.MELDdataModule 23 | dataset_args: 24 | sample_rate: ${sample_rate} 25 | target_sec: ${target_sec} 26 | train_audio_dir: ??? 27 | val_audio_dir: ??? 28 | test_audio_dir: ??? 29 | train_batch_size: 64 30 | val_batch_size: 2 31 | test_batch_size: 32 32 | train_num_workers: 8 33 | val_num_workers: 4 34 | test_num_workers: 4 35 | 36 | model: 37 | _target_: codec_evaluation.probe.model.lit_prober.Prober 38 | codec_name: ${codec_name} 39 | sample_rate: ${sample_rate} 40 | mode: ${mode} 41 | task: ${task} 42 | num_outputs: ${num_outputs} 43 | probe_model_builder: 44 | _target_: codec_evaluation.probe.model.multiclass_model.MulticlassProber 45 | _partial_: true 46 | num_outputs: ${num_outputs} 47 | drop_out: 0.2 48 | channel_reduction: 16 49 | padding: 1 50 | kernel_size: 3 51 | stride: 1 52 | target_sec: ${target_sec} 53 | model_ckpt_dir: ??? 
54 | 55 | optimizer_builder: 56 | _target_: torch.optim.AdamW 57 | _partial_: true 58 | lr: 1e-4 59 | betas: [0.8, 0.99] 60 | eps: 1e-5 61 | weight_decay: 0.08 62 | 63 | lr_scheduler_builder: 64 | _target_: torch.optim.lr_scheduler.LambdaLR 65 | _partial_: true 66 | lr_lambda: 67 | _target_: codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda 68 | _partial_: true 69 | num_warmup_steps: 10 70 | num_training_steps: 10000 71 | final_lr_ratio: 0.2 72 | 73 | callbacks: 74 | learning_rate_monitor: 75 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 76 | logging_interval: step 77 | 78 | rich_progress_bar: 79 | _target_: pytorch_lightning.callbacks.RichProgressBar 80 | 81 | model_summary: 82 | _target_: pytorch_lightning.callbacks.ModelSummary 83 | max_depth: 1 84 | 85 | model_checkpoint: 86 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 87 | monitor: val_loss 88 | dirpath: ${probe_ckpt_dir} 89 | every_n_epochs: 1 90 | mode: min 91 | save_top_k: 1 92 | save_last: False 93 | filename: ${codec_name}_${mode}_{epoch}-{val_loss:.4f} 94 | verbose: True 95 | 96 | tensorboard: 97 | _target_: pytorch_lightning.loggers.TensorBoardLogger 98 | save_dir: ??? 99 | name: ${codec_name}_${mode} 100 | log_graph: true -------------------------------------------------------------------------------- /codec_evaluation/probe/config/NSynthI_dataset/YuE.yaml: -------------------------------------------------------------------------------- 1 | mode: quantized_emb 2 | sample_rate: 16000 3 | target_sec: 4 4 | num_outputs: 11 5 | probe_ckpt_dir: ??? 6 | seed: 666 7 | codec_name: yue 8 | task: multiclass 9 | save_result: null 10 | 11 | 12 | trainer: 13 | _target_: pytorch_lightning.Trainer 14 | accelerator: gpu 15 | devices: ??? 16 | precision: 32 17 | max_epochs: 50 18 | limit_val_batches: 10 19 | log_every_n_steps: 5 20 | val_check_interval: 1.0 21 | 22 | data: 23 | _target_: codec_evaluation.probe.dataset.NSynthI_dataset.NSynthI_dataset.NSynthIdataModule 24 | dataset_args: 25 | sample_rate: ${sample_rate} 26 | target_sec: ${target_sec} 27 | train_audio_dir: ??? 28 | val_audio_dir: ??? 29 | test_audio_dir: ??? 30 | train_batch_size: 256 31 | val_batch_size: 32 32 | test_batch_size: 256 33 | train_num_workers: 8 34 | val_num_workers: 4 35 | test_num_workers: 4 36 | 37 | 38 | model: 39 | _target_: codec_evaluation.probe.model.lit_prober.Prober 40 | codec_name: ${codec_name} 41 | sample_rate: ${sample_rate} 42 | mode: ${mode} 43 | task: ${task} 44 | num_outputs: ${num_outputs} 45 | probe_model_builder: 46 | _target_: codec_evaluation.probe.model.multiclass_model.MulticlassProber 47 | _partial_: true 48 | num_outputs: ${num_outputs} 49 | drop_out: 0.1 50 | channel_reduction: 16 51 | padding: 1 52 | kernel_size: 3 53 | stride: 1 54 | target_sec: ${target_sec} 55 | model_ckpt_dir: ??? 
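# In the lr_scheduler_builder that follows, lr_lambda is itself a partial that LambdaLR calls
# with the step index; the base lr is multiplied by the returned factor. The function name
# suggests a linear warmup followed by a cosine decay to final_lr_ratio * lr. A sketch of that
# shape, as an assumption about codec_evaluation.utils.schedule rather than a copy of it:
#   import math
#   def factor(step, warmup=10, total=10000, floor=0.2):
#       if step < warmup:
#           return step / max(1, warmup)                # linear ramp from 0 to 1
#       t = (step - warmup) / max(1, total - warmup)    # decay progress in [0, 1]
#       return floor + 0.5 * (1.0 - floor) * (1.0 + math.cos(math.pi * t))  # 1.0 down to floor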
56 | 57 | optimizer_builder: 58 | _target_: torch.optim.AdamW 59 | _partial_: true 60 | lr: 1e-4 61 | betas: [0.8, 0.99] 62 | eps: 1e-5 63 | weight_decay: 0.08 64 | 65 | lr_scheduler_builder: 66 | _target_: torch.optim.lr_scheduler.LambdaLR 67 | _partial_: true 68 | lr_lambda: 69 | _target_: codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda 70 | _partial_: true 71 | num_warmup_steps: 10 72 | num_training_steps: 10000 73 | final_lr_ratio: 0.2 74 | 75 | callbacks: 76 | learning_rate_monitor: 77 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 78 | logging_interval: step 79 | 80 | rich_progress_bar: 81 | _target_: pytorch_lightning.callbacks.RichProgressBar 82 | 83 | model_summary: 84 | _target_: pytorch_lightning.callbacks.ModelSummary 85 | max_depth: 1 86 | 87 | model_checkpoint: 88 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 89 | monitor: val_loss 90 | dirpath: ${probe_ckpt_dir} 91 | every_n_epochs: 1 92 | mode: min 93 | save_top_k: 1 94 | filename: ${codec_name}_${mode}_{epoch}-{val_loss:.4f} 95 | verbose: True 96 | 97 | tensorboard: 98 | _target_: pytorch_lightning.loggers.TensorBoardLogger 99 | save_dir: ??? 100 | name: ${codec_name}_${mode} 101 | log_graph: true -------------------------------------------------------------------------------- /codec_evaluation/probe/config/GS_dataset/encodec.yaml: -------------------------------------------------------------------------------- 1 | mode: quantized_emb 2 | sample_rate: 44100 3 | target_sec: 10 4 | num_outputs: 24 5 | probe_ckpt_dir: ??? 6 | seed: 666 7 | codec_name: encodec # change as needed 8 | task: multiclass 9 | save_result: null 10 | 11 | trainer: 12 | _target_: pytorch_lightning.Trainer 13 | accelerator: gpu 14 | devices: ??? 15 | precision: 32 16 | max_epochs: 100 17 | limit_val_batches: 10 18 | log_every_n_steps: 5 19 | val_check_interval: 1.0 20 | 21 | data: 22 | _target_: codec_evaluation.probe.dataset.GS_dataset.GS_dataset.GSdataModule 23 | dataset_args: 24 | sample_rate: ${sample_rate} 25 | target_sec: ${target_sec} 26 | train_audio_dir: ??? 27 | val_audio_dir: ??? 28 | test_audio_dir: ??? 29 | train_batch_size: 8 30 | val_batch_size: 1 31 | test_batch_size: 8 32 | train_num_workers: 4 33 | val_num_workers: 2 34 | test_num_workers: 2 35 | 36 | model: 37 | _target_: codec_evaluation.probe.model.lit_prober.Prober 38 | codec_name: ${codec_name} 39 | sample_rate: ${sample_rate} 40 | mode: ${mode} 41 | task: ${task} 42 | num_outputs: ${num_outputs} 43 | probe_model_builder: 44 | _target_: codec_evaluation.probe.model.multiclass_model.MulticlassProber 45 | _partial_: true 46 | num_outputs: ${num_outputs} 47 | drop_out: 0.1 48 | channel_reduction: 16 49 | padding: 1 50 | kernel_size: 3 51 | stride: 1 52 | target_sec: ${target_sec} 53 | model_ckpt_dir: ???
54 | 55 | optimizer_builder: 56 | _target_: torch.optim.AdamW 57 | _partial_: true 58 | lr: 1e-4 59 | betas: [0.8, 0.99] 60 | eps: 1e-5 61 | weight_decay: 0.08 62 | 63 | lr_scheduler_builder: 64 | _target_: torch.optim.lr_scheduler.LambdaLR 65 | _partial_: true 66 | lr_lambda: 67 | _target_: codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda 68 | _partial_: true 69 | num_warmup_steps: 10 70 | num_training_steps: 10000 71 | final_lr_ratio: 0.2 72 | 73 | callbacks: 74 | learning_rate_monitor: 75 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 76 | logging_interval: step 77 | 78 | rich_progress_bar: 79 | _target_: pytorch_lightning.callbacks.RichProgressBar 80 | 81 | model_summary: 82 | _target_: pytorch_lightning.callbacks.ModelSummary 83 | max_depth: 1 84 | 85 | model_checkpoint: 86 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 87 | monitor: val_loss 88 | dirpath: ${probe_ckpt_dir} 89 | every_n_epochs: 1 90 | mode: min 91 | save_top_k: 1 92 | save_last: False 93 | filename: ${codec_name}_${mode}_{epoch}-{val_loss:.4f} 94 | verbose: True 95 | 96 | tensorboard: 97 | _target_: pytorch_lightning.loggers.TensorBoardLogger 98 | save_dir: ??? 99 | name: ${codec_name}_${mode} 100 | log_graph: true -------------------------------------------------------------------------------- /codec_evaluation/probe/config/MELD_dataset/dac.yaml: -------------------------------------------------------------------------------- 1 | mode: quantized_emb 2 | sample_rate: 48000 3 | target_sec: 8 4 | num_outputs: 7 5 | probe_ckpt_dir: ??? 6 | seed: 666 7 | codec_name: dac # change as needed 8 | task: multiclass 9 | save_result: null 10 | 11 | trainer: 12 | _target_: pytorch_lightning.Trainer 13 | accelerator: gpu 14 | devices: ??? 15 | precision: 32 16 | max_epochs: 100 17 | limit_val_batches: 10 18 | log_every_n_steps: 5 19 | val_check_interval: 1.0 20 | 21 | data: 22 | _target_: codec_evaluation.probe.dataset.MELD_dataset.MELD_dataset.MELDdataModule 23 | dataset_args: 24 | sample_rate: ${sample_rate} 25 | target_sec: ${target_sec} 26 | train_audio_dir: ??? 27 | val_audio_dir: ??? 28 | test_audio_dir: ??? 29 | train_batch_size: 64 30 | val_batch_size: 2 31 | test_batch_size: 32 32 | train_num_workers: 8 33 | val_num_workers: 4 34 | test_num_workers: 4 35 | 36 | model: 37 | _target_: codec_evaluation.probe.model.lit_prober.Prober 38 | codec_name: ${codec_name} 39 | sample_rate: ${sample_rate} 40 | mode: ${mode} 41 | task: ${task} 42 | num_outputs: ${num_outputs} 43 | probe_model_builder: 44 | _target_: codec_evaluation.probe.model.multiclass_model.MulticlassProber 45 | _partial_: true 46 | num_outputs: ${num_outputs} 47 | drop_out: 0.2 48 | channel_reduction: 16 49 | padding: 1 50 | kernel_size: 3 51 | stride: 1 52 | target_sec: ${target_sec} 53 | model_ckpt_dir: ???
54 | 55 | optimizer_builder: 56 | _target_: torch.optim.AdamW 57 | _partial_: true 58 | lr: 1e-4 59 | betas: [0.8, 0.99] 60 | eps: 1e-5 61 | weight_decay: 0.08 62 | 63 | lr_scheduler_builder: 64 | _target_: torch.optim.lr_scheduler.LambdaLR 65 | _partial_: true 66 | lr_lambda: 67 | _target_: codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda 68 | _partial_: true 69 | num_warmup_steps: 10 70 | num_training_steps: 10000 71 | final_lr_ratio: 0.2 72 | 73 | callbacks: 74 | learning_rate_monitor: 75 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 76 | logging_interval: step 77 | 78 | rich_progress_bar: 79 | _target_: pytorch_lightning.callbacks.RichProgressBar 80 | 81 | model_summary: 82 | _target_: pytorch_lightning.callbacks.ModelSummary 83 | max_depth: 1 84 | 85 | model_checkpoint: 86 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 87 | monitor: val_loss 88 | dirpath: ${probe_ckpt_dir} 89 | every_n_epochs: 1 90 | mode: min 91 | save_top_k: 1 92 | save_last: False 93 | filename: ${codec_name}_${mode}_{epoch}-{val_loss:.4f} 94 | verbose: True 95 | 96 | tensorboard: 97 | _target_: pytorch_lightning.loggers.TensorBoardLogger 98 | save_dir: ??? 99 | name: ${codec_name}_${mode} 100 | log_graph: true -------------------------------------------------------------------------------- /codec_evaluation/probe/config/NSynthP_dataset/encodec.yaml: -------------------------------------------------------------------------------- 1 | mode: quantized_emb 2 | sample_rate: 16000 3 | target_sec: 4 4 | num_outputs: 128 5 | probe_ckpt_dir: ??? 6 | seed: 666 7 | codec_name: encodec 8 | task: multiclass 9 | save_result: null 10 | 11 | 12 | trainer: 13 | _target_: pytorch_lightning.Trainer 14 | accelerator: gpu 15 | devices: ??? 16 | precision: 32 17 | max_epochs: 100 18 | limit_val_batches: 10 19 | log_every_n_steps: 5 20 | val_check_interval: 1.0 21 | 22 | data: 23 | _target_: codec_evaluation.probe.dataset.NSynthP_dataset.NSynthP_dataset.NSynthPdataModule 24 | dataset_args: 25 | sample_rate: ${sample_rate} 26 | target_sec: ${target_sec} 27 | train_audio_dir: ??? 28 | val_audio_dir: ??? 29 | test_audio_dir: ??? 30 | train_batch_size: 256 31 | val_batch_size: 32 32 | test_batch_size: 128 33 | train_num_workers: 128 34 | val_num_workers: 8 35 | test_num_workers: 64 36 | 37 | model: 38 | _target_: codec_evaluation.probe.model.lit_prober.Prober 39 | codec_name: ${codec_name} 40 | sample_rate: ${sample_rate} 41 | mode: ${mode} 42 | task: ${task} 43 | num_outputs: ${num_outputs} 44 | probe_model_builder: 45 | _target_: codec_evaluation.probe.model.multiclass_model.MulticlassProber 46 | _partial_: true 47 | num_outputs: ${num_outputs} 48 | drop_out: 0.1 49 | channel_reduction: 16 50 | padding: 1 51 | kernel_size: 3 52 | stride: 1 53 | target_sec: ${target_sec} 54 | model_ckpt_dir: ??? 
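# ??? is OmegaConf's mandatory-missing marker: resolution fails until a value is supplied,
# typically as a command-line override. A hypothetical invocation (the entry point and config
# name are assumptions, not taken from this repo):
#   python -m codec_evaluation.probe.train --config-name encodec \
#       probe_ckpt_dir=/path/to/ckpts trainer.devices=1 \
#       data.dataset_args.train_audio_dir=/path/to/train \
#       data.dataset_args.val_audio_dir=/path/to/val \
#       data.dataset_args.test_audio_dir=/path/to/test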
55 | 56 | optimizer_builder: 57 | _target_: torch.optim.AdamW 58 | _partial_: true 59 | lr: 1e-4 60 | betas: [0.8, 0.99] 61 | eps: 1e-5 62 | weight_decay: 0.08 63 | 64 | lr_scheduler_builder: 65 | _target_: torch.optim.lr_scheduler.LambdaLR 66 | _partial_: true 67 | lr_lambda: 68 | _target_: codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda 69 | _partial_: true 70 | num_warmup_steps: 10 71 | num_training_steps: 10000 72 | final_lr_ratio: 0.2 73 | 74 | callbacks: 75 | learning_rate_monitor: 76 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 77 | logging_interval: step 78 | 79 | rich_progress_bar: 80 | _target_: pytorch_lightning.callbacks.RichProgressBar 81 | 82 | model_summary: 83 | _target_: pytorch_lightning.callbacks.ModelSummary 84 | max_depth: 1 85 | 86 | model_checkpoint: 87 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 88 | monitor: val_loss 89 | dirpath: ${probe_ckpt_dir} 90 | every_n_epochs: 1 91 | mode: min 92 | save_top_k: 1 93 | filename: ${codec_name}_${mode}_{epoch}-{val_loss:.4f} 94 | verbose: True 95 | 96 | tensorboard: 97 | _target_: pytorch_lightning.loggers.TensorBoardLogger 98 | save_dir: ??? 99 | name: ${codec_name}_${mode} 100 | log_graph: true -------------------------------------------------------------------------------- /codec_evaluation/probe/config/MELD_dataset/mimi.yaml: -------------------------------------------------------------------------------- 1 | mode: quantized_emb 2 | sample_rate: 48000 3 | target_sec: 8 4 | num_outputs: 7 5 | probe_ckpt_dir: ??? 6 | seed: 666 7 | codec_name: mimi # change as needed 8 | task: multiclass 9 | save_result: null 10 | 11 | trainer: 12 | _target_: pytorch_lightning.Trainer 13 | accelerator: gpu 14 | devices: ??? 15 | precision: 32 16 | max_epochs: 100 17 | limit_val_batches: 10 18 | log_every_n_steps: 5 19 | val_check_interval: 1.0 20 | 21 | data: 22 | _target_: codec_evaluation.probe.dataset.MELD_dataset.MELD_dataset.MELDdataModule 23 | dataset_args: 24 | sample_rate: ${sample_rate} 25 | target_sec: ${target_sec} 26 | train_audio_dir: ??? 27 | val_audio_dir: ??? 28 | test_audio_dir: ??? 29 | train_batch_size: 64 30 | val_batch_size: 2 31 | test_batch_size: 32 32 | train_num_workers: 8 33 | val_num_workers: 4 34 | test_num_workers: 4 35 | 36 | model: 37 | _target_: codec_evaluation.probe.model.lit_prober.Prober 38 | codec_name: ${codec_name} 39 | sample_rate: ${sample_rate} 40 | mode: ${mode} 41 | task: ${task} 42 | num_outputs: ${num_outputs} 43 | probe_model_builder: 44 | _target_: codec_evaluation.probe.model.multiclass_model.MulticlassProber 45 | _partial_: true 46 | num_outputs: ${num_outputs} 47 | drop_out: 0.2 48 | channel_reduction: 16 49 | padding: 1 50 | kernel_size: 3 51 | stride: 1 52 | target_sec: ${target_sec} 53 | model_ckpt_dir: ???
54 | 55 | optimizer_builder: 56 | _target_: torch.optim.AdamW 57 | _partial_: true 58 | lr: 1e-4 59 | betas: [0.8, 0.99] 60 | eps: 1e-5 61 | weight_decay: 0.08 62 | 63 | lr_scheduler_builder: 64 | _target_: torch.optim.lr_scheduler.LambdaLR 65 | _partial_: true 66 | lr_lambda: 67 | _target_: codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda 68 | _partial_: true 69 | num_warmup_steps: 10 70 | num_training_steps: 10000 71 | final_lr_ratio: 0.2 72 | 73 | callbacks: 74 | learning_rate_monitor: 75 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 76 | logging_interval: step 77 | 78 | rich_progress_bar: 79 | _target_: pytorch_lightning.callbacks.RichProgressBar 80 | 81 | model_summary: 82 | _target_: pytorch_lightning.callbacks.ModelSummary 83 | max_depth: 1 84 | 85 | model_checkpoint: 86 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 87 | monitor: val_loss 88 | dirpath: ${probe_ckpt_dir} 89 | every_n_epochs: 1 90 | mode: min 91 | save_top_k: 1 92 | save_last: False 93 | filename: ${codec_name}_${mode}_{epoch}-{val_loss:.4f} 94 | verbose: True 95 | 96 | tensorboard: 97 | _target_: pytorch_lightning.loggers.TensorBoardLogger 98 | save_dir: ??? 99 | name: ${codec_name}_${mode} 100 | log_graph: true -------------------------------------------------------------------------------- /codec_evaluation/probe/config/NSynthP_dataset/YuE.yaml: -------------------------------------------------------------------------------- 1 | mode: quantized_emb 2 | sample_rate: 16000 3 | target_sec: 4 4 | n_segments: 1 5 | num_outputs: 128 6 | probe_ckpt_dir: ??? 7 | seed: 666 8 | codec_name: yue 9 | task: multiclass 10 | save_result: null 11 | 12 | trainer: 13 | _target_: pytorch_lightning.Trainer 14 | accelerator: gpu 15 | devices: ??? 16 | precision: 32 17 | max_epochs: 100 18 | limit_val_batches: 10 19 | log_every_n_steps: 5 20 | val_check_interval: 1.0 21 | 22 | data: 23 | _target_: codec_evaluation.probe.dataset.NSynthP_dataset.NSynthP_dataset.NSynthPdataModule 24 | dataset_args: 25 | sample_rate: ${sample_rate} 26 | target_sec: ${target_sec} 27 | train_audio_dir: ??? 28 | val_audio_dir: ??? 29 | test_audio_dir: ??? 30 | train_batch_size: 256 31 | val_batch_size: 32 32 | test_batch_size: 128 33 | train_num_workers: 128 34 | val_num_workers: 8 35 | test_num_workers: 64 36 | 37 | model: 38 | _target_: codec_evaluation.probe.model.lit_prober.Prober 39 | codec_name: ${codec_name} 40 | sample_rate: ${sample_rate} 41 | mode: ${mode} 42 | task: ${task} 43 | num_outputs: ${num_outputs} 44 | probe_model_builder: 45 | _target_: codec_evaluation.probe.model.multiclass_model.MulticlassProber 46 | _partial_: true 47 | num_outputs: ${num_outputs} 48 | drop_out: 0.2 49 | channel_reduction: 16 50 | padding: 1 51 | kernel_size: 3 52 | stride: 1 53 | target_sec: ${target_sec} 54 | model_ckpt_dir: ??? 
55 | 56 | optimizer_builder: 57 | _target_: torch.optim.AdamW 58 | _partial_: true 59 | lr: 1e-4 60 | betas: [0.8, 0.99] 61 | eps: 1e-5 62 | weight_decay: 0.08 63 | 64 | lr_scheduler_builder: 65 | _target_: torch.optim.lr_scheduler.LambdaLR 66 | _partial_: true 67 | lr_lambda: 68 | _target_: codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda 69 | _partial_: true 70 | num_warmup_steps: 10 71 | num_training_steps: 10000 72 | final_lr_ratio: 0.2 73 | 74 | callbacks: 75 | learning_rate_monitor: 76 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 77 | logging_interval: step 78 | 79 | rich_progress_bar: 80 | _target_: pytorch_lightning.callbacks.RichProgressBar 81 | 82 | model_summary: 83 | _target_: pytorch_lightning.callbacks.ModelSummary 84 | max_depth: 1 85 | 86 | model_checkpoint: 87 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 88 | monitor: val_loss 89 | dirpath: ${probe_ckpt_dir} 90 | every_n_epochs: 1 91 | mode: min 92 | save_top_k: 1 93 | filename: ${codec_name}_${mode}_{epoch}-{val_loss:.4f} 94 | verbose: True 95 | 96 | tensorboard: 97 | _target_: pytorch_lightning.loggers.TensorBoardLogger 98 | save_dir: ??? 99 | name: ${codec_name}_${mode} 100 | log_graph: true -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/models/musicfm/model/w2v2_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "activation_dropout": 0.1, 3 | "adapter_kernel_size": 3, 4 | "adapter_stride": 2, 5 | "add_adapter": false, 6 | "apply_spec_augment": true, 7 | "architectures": [ 8 | "Wav2Vec2ConformerForCTC" 9 | ], 10 | "attention_dropout": 0.1, 11 | "bos_token_id": 1, 12 | "classifier_proj_size": 256, 13 | "codevector_dim": 768, 14 | "conformer_conv_dropout": 0.1, 15 | "contrastive_logits_temperature": 0.1, 16 | "conv_bias": true, 17 | "conv_depthwise_kernel_size": 31, 18 | "conv_dim": [ 19 | 512, 20 | 512, 21 | 512, 22 | 512, 23 | 512, 24 | 512, 25 | 512 26 | ], 27 | "conv_kernel": [ 28 | 10, 29 | 3, 30 | 3, 31 | 3, 32 | 3, 33 | 2, 34 | 2 35 | ], 36 | "conv_stride": [ 37 | 5, 38 | 2, 39 | 2, 40 | 2, 41 | 2, 42 | 2, 43 | 2 44 | ], 45 | "ctc_loss_reduction": "sum", 46 | "ctc_zero_infinity": false, 47 | "diversity_loss_weight": 0.1, 48 | "do_stable_layer_norm": true, 49 | "eos_token_id": 2, 50 | "feat_extract_activation": "gelu", 51 | "feat_extract_dropout": 0.0, 52 | "feat_extract_norm": "layer", 53 | "feat_proj_dropout": 0.1, 54 | "feat_quantizer_dropout": 0.0, 55 | "final_dropout": 0.1, 56 | "gradient_checkpointing": false, 57 | "hidden_act": "swish", 58 | "hidden_dropout": 0.1, 59 | "hidden_dropout_prob": 0.1, 60 | "hidden_size": 1024, 61 | "initializer_range": 0.02, 62 | "intermediate_size": 4096, 63 | "layer_norm_eps": 1e-05, 64 | "layerdrop": 0.0, 65 | "mask_feature_length": 10, 66 | "mask_feature_min_masks": 0, 67 | "mask_feature_prob": 0.0, 68 | "mask_time_length": 10, 69 | "mask_time_min_masks": 2, 70 | "mask_time_prob": 0.05, 71 | "max_source_positions": 5000, 72 | "model_type": "wav2vec2-conformer", 73 | "num_adapter_layers": 3, 74 | "num_attention_heads": 16, 75 | "num_codevector_groups": 2, 76 | "num_codevectors_per_group": 320, 77 | "num_conv_pos_embedding_groups": 16, 78 | "num_conv_pos_embeddings": 128, 79 | "num_feat_extract_layers": 7, 80 | "num_hidden_layers": 24, 81 | "num_negatives": 100, 82 | "output_hidden_size": 1024, 83 | "pad_token_id": 0, 84 | "position_embeddings_type": "rotary", 85 | 
"proj_codevector_dim": 768, 86 | "rotary_embedding_base": 10000, 87 | "tdnn_dilation": [ 88 | 1, 89 | 2, 90 | 3, 91 | 1, 92 | 1 93 | ], 94 | "tdnn_dim": [ 95 | 512, 96 | 512, 97 | 512, 98 | 512, 99 | 1500 100 | ], 101 | "tdnn_kernel": [ 102 | 5, 103 | 3, 104 | 3, 105 | 1, 106 | 1 107 | ], 108 | "torch_dtype": "float32", 109 | "transformers_version": "4.19.0.dev0", 110 | "use_weighted_layer_sum": false, 111 | "vocab_size": 32, 112 | "xvector_output_dim": 512 113 | } 114 | -------------------------------------------------------------------------------- /codec_evaluation/probe/config/ESC50_dataset/YuE.yaml: -------------------------------------------------------------------------------- 1 | mode: quantized_emb 2 | sample_rate: 44100 3 | target_sec: 5 4 | num_outputs: 50 5 | probe_ckpt_dir: ??? 6 | seed: 666 7 | codec_name: yue # 需要更改 8 | task: multiclass 9 | save_result: null 10 | 11 | trainer: 12 | _target_: pytorch_lightning.Trainer 13 | accelerator: gpu 14 | devices: ??? 15 | precision: 32 16 | max_epochs: 100 17 | limit_val_batches: 10 18 | log_every_n_steps: 5 19 | val_check_interval: 1.0 20 | 21 | data: 22 | _target_: codec_evaluation.probe.dataset.ESC50_dataset.ESC50_dataset.ESC50dataModule 23 | dataset_args: 24 | sample_rate: ${sample_rate} 25 | target_sec: ${target_sec} 26 | train_audio_dir: ??? 27 | val_audio_dir: ??? 28 | test_audio_dir: ??? 29 | train_batch_size: 128 30 | val_batch_size: 2 31 | test_batch_size: 128 32 | train_num_workers: 32 33 | val_num_workers: 4 34 | test_num_workers: 32 35 | 36 | model: 37 | _target_: codec_evaluation.probe.model.lit_prober.Prober 38 | codec_name: ${codec_name} 39 | sample_rate: ${sample_rate} 40 | mode: ${mode} 41 | task: ${task} 42 | num_outputs: ${num_outputs} 43 | probe_model_builder: 44 | _target_: codec_evaluation.probe.model.multiclass_model.MulticlassProber 45 | _partial_: true 46 | num_outputs: ${num_outputs} 47 | drop_out: 0.1 48 | channel_reduction: 16 49 | padding: 1 50 | kernel_size: 3 51 | stride: 1 52 | target_sec: ${target_sec} 53 | model_ckpt_dir: ??? 54 | 55 | optimizer_builder: 56 | _target_: torch.optim.AdamW 57 | _partial_: true 58 | lr: 1e-4 59 | betas: [0.8, 0.99] 60 | eps: 1e-5 61 | weight_decay: 0.08 62 | 63 | lr_scheduler_builder: 64 | _target_: torch.optim.lr_scheduler.LambdaLR 65 | _partial_: true 66 | lr_lambda: 67 | _target_: codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda 68 | _partial_: true 69 | num_warmup_steps: 10 70 | num_training_steps: 10000 71 | final_lr_ratio: 0.2 72 | 73 | callbacks: 74 | learning_rate_monitor: 75 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 76 | logging_interval: step 77 | 78 | rich_progress_bar: 79 | _target_: pytorch_lightning.callbacks.RichProgressBar 80 | 81 | model_summary: 82 | _target_: pytorch_lightning.callbacks.ModelSummary 83 | max_depth: 1 84 | 85 | model_checkpoint: 86 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 87 | monitor: val_loss 88 | dirpath: ${probe_ckpt_dir} 89 | every_n_epochs: 1 90 | mode: min 91 | save_top_k: 1 92 | save_last: False 93 | filename: ${codec_name}_${mode}_{epoch}-{val_loss:.4f} 94 | verbose: True 95 | 96 | tensorboard: 97 | _target_: pytorch_lightning.loggers.TensorBoardLogger 98 | save_dir: ??? 
99 | name: ${codec_name}_${mode} 100 | log_graph: true -------------------------------------------------------------------------------- /codec_evaluation/probe/config/ESC50_dataset/dac.yaml: -------------------------------------------------------------------------------- 1 | mode: quantized_emb 2 | sample_rate: 44100 3 | target_sec: 5 4 | num_outputs: 50 5 | probe_ckpt_dir: ??? 6 | seed: 666 7 | codec_name: dac # change as needed 8 | task: multiclass 9 | save_result: null 10 | 11 | trainer: 12 | _target_: pytorch_lightning.Trainer 13 | accelerator: gpu 14 | devices: ??? 15 | precision: 32 16 | max_epochs: 100 17 | limit_val_batches: 10 18 | log_every_n_steps: 5 19 | val_check_interval: 1.0 20 | 21 | data: 22 | _target_: codec_evaluation.probe.dataset.ESC50_dataset.ESC50_dataset.ESC50dataModule 23 | dataset_args: 24 | sample_rate: ${sample_rate} 25 | target_sec: ${target_sec} 26 | train_audio_dir: ??? 27 | val_audio_dir: ??? 28 | test_audio_dir: ??? 29 | train_batch_size: 128 30 | val_batch_size: 2 31 | test_batch_size: 128 32 | train_num_workers: 32 33 | val_num_workers: 4 34 | test_num_workers: 32 35 | 36 | model: 37 | _target_: codec_evaluation.probe.model.lit_prober.Prober 38 | codec_name: ${codec_name} 39 | sample_rate: ${sample_rate} 40 | mode: ${mode} 41 | task: ${task} 42 | num_outputs: ${num_outputs} 43 | probe_model_builder: 44 | _target_: codec_evaluation.probe.model.multiclass_model.MulticlassProber 45 | _partial_: true 46 | num_outputs: ${num_outputs} 47 | drop_out: 0.1 48 | channel_reduction: 16 49 | padding: 1 50 | kernel_size: 3 51 | stride: 1 52 | target_sec: ${target_sec} 53 | model_ckpt_dir: ??? 54 | 55 | optimizer_builder: 56 | _target_: torch.optim.AdamW 57 | _partial_: true 58 | lr: 1e-4 59 | betas: [0.8, 0.99] 60 | eps: 1e-5 61 | weight_decay: 0.08 62 | 63 | lr_scheduler_builder: 64 | _target_: torch.optim.lr_scheduler.LambdaLR 65 | _partial_: true 66 | lr_lambda: 67 | _target_: codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda 68 | _partial_: true 69 | num_warmup_steps: 10 70 | num_training_steps: 10000 71 | final_lr_ratio: 0.2 72 | 73 | callbacks: 74 | learning_rate_monitor: 75 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 76 | logging_interval: step 77 | 78 | rich_progress_bar: 79 | _target_: pytorch_lightning.callbacks.RichProgressBar 80 | 81 | model_summary: 82 | _target_: pytorch_lightning.callbacks.ModelSummary 83 | max_depth: 1 84 | 85 | model_checkpoint: 86 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 87 | monitor: val_loss 88 | dirpath: ${probe_ckpt_dir} 89 | every_n_epochs: 1 90 | mode: min 91 | save_top_k: 1 92 | save_last: False 93 | filename: ${codec_name}_${mode}_{epoch}-{val_loss:.4f} 94 | verbose: True 95 | 96 | tensorboard: 97 | _target_: pytorch_lightning.loggers.TensorBoardLogger 98 | save_dir: ??? 99 | name: ${codec_name}_${mode} 100 | log_graph: true -------------------------------------------------------------------------------- /codec_evaluation/probe/config/ESC50_dataset/encodec.yaml: -------------------------------------------------------------------------------- 1 | mode: quantized_emb 2 | sample_rate: 44100 3 | target_sec: 5 4 | num_outputs: 50 5 | probe_ckpt_dir: ??? 6 | seed: 666 7 | codec_name: encodec 8 | task: multiclass 9 | save_result: null 10 | 11 | 12 | trainer: 13 | _target_: pytorch_lightning.Trainer 14 | accelerator: gpu 15 | devices: ???
16 | precision: 32 17 | max_epochs: 100 18 | limit_val_batches: 10 19 | log_every_n_steps: 5 20 | val_check_interval: 1.0 21 | 22 | data: 23 | _target_: codec_evaluation.probe.dataset.ESC50_dataset.ESC50_dataset.ESC50dataModule 24 | dataset_args: 25 | sample_rate: ${sample_rate} 26 | target_sec: ${target_sec} 27 | train_audio_dir: ??? 28 | val_audio_dir: ??? 29 | test_audio_dir: ??? 30 | train_batch_size: 128 31 | val_batch_size: 2 32 | test_batch_size: 128 33 | train_num_workers: 32 34 | val_num_workers: 4 35 | test_num_workers: 32 36 | 37 | 38 | model: 39 | _target_: codec_evaluation.probe.model.lit_prober.Prober 40 | codec_name: ${codec_name} 41 | sample_rate: ${sample_rate} 42 | mode: ${mode} 43 | task: ${task} 44 | num_outputs: ${num_outputs} 45 | probe_model_builder: 46 | _target_: codec_evaluation.probe.model.multiclass_model.MulticlassProber 47 | _partial_: true 48 | num_outputs: ${num_outputs} 49 | drop_out: 0.1 50 | channel_reduction: 16 51 | padding: 1 52 | kernel_size: 3 53 | stride: 1 54 | target_sec: ${target_sec} 55 | model_ckpt_dir: ??? 56 | 57 | optimizer_builder: 58 | _target_: torch.optim.AdamW 59 | _partial_: true 60 | lr: 1e-4 61 | betas: [0.8, 0.99] 62 | eps: 1e-5 63 | weight_decay: 0.08 64 | 65 | lr_scheduler_builder: 66 | _target_: torch.optim.lr_scheduler.LambdaLR 67 | _partial_: true 68 | lr_lambda: 69 | _target_: codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda 70 | _partial_: true 71 | num_warmup_steps: 10 72 | num_training_steps: 10000 73 | final_lr_ratio: 0.2 74 | 75 | callbacks: 76 | learning_rate_monitor: 77 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 78 | logging_interval: step 79 | 80 | rich_progress_bar: 81 | _target_: pytorch_lightning.callbacks.RichProgressBar 82 | 83 | model_summary: 84 | _target_: pytorch_lightning.callbacks.ModelSummary 85 | max_depth: 1 86 | 87 | model_checkpoint: 88 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 89 | monitor: val_loss 90 | dirpath: ${probe_ckpt_dir} 91 | every_n_epochs: 1 92 | mode: min 93 | save_top_k: 1 94 | save_last: False 95 | filename: ${codec_name}_${mode}_{epoch}-{val_loss:.4f} 96 | verbose: True 97 | 98 | tensorboard: 99 | _target_: pytorch_lightning.loggers.TensorBoardLogger 100 | save_dir: ??? 101 | name: ${codec_name}_${mode} 102 | log_graph: true -------------------------------------------------------------------------------- /codec_evaluation/probe/config/ESC50_dataset/mimi.yaml: -------------------------------------------------------------------------------- 1 | mode: quantized_emb 2 | sample_rate: 16000 3 | target_sec: 5 4 | num_outputs: 50 5 | probe_ckpt_dir: ??? 6 | seed: 666 7 | codec_name: mimi # change as needed 8 | task: multiclass 9 | save_result: null 10 | 11 | trainer: 12 | _target_: pytorch_lightning.Trainer 13 | accelerator: gpu 14 | devices: ??? 15 | precision: 32 16 | max_epochs: 100 17 | limit_val_batches: 10 18 | log_every_n_steps: 5 19 | val_check_interval: 1.0 20 | 21 | data: 22 | _target_: codec_evaluation.probe.dataset.ESC50_dataset.ESC50_dataset.ESC50dataModule 23 | dataset_args: 24 | sample_rate: ${sample_rate} 25 | target_sec: ${target_sec} 26 | train_audio_dir: ??? 27 | val_audio_dir: ??? 28 | test_audio_dir: ???
29 | train_batch_size: 128 30 | val_batch_size: 2 31 | test_batch_size: 128 32 | train_num_workers: 32 33 | val_num_workers: 4 34 | test_num_workers: 32 35 | 36 | model: 37 | _target_: codec_evaluation.probe.model.lit_prober.Prober 38 | codec_name: ${codec_name} 39 | sample_rate: ${sample_rate} 40 | mode: ${mode} 41 | task: ${task} 42 | num_outputs: ${num_outputs} 43 | probe_model_builder: 44 | _target_: codec_evaluation.probe.model.multiclass_model.MulticlassProber 45 | _partial_: true 46 | num_outputs: ${num_outputs} 47 | drop_out: 0.1 48 | channel_reduction: 16 49 | padding: 1 50 | kernel_size: 3 51 | stride: 1 52 | target_sec: ${target_sec} 53 | model_ckpt_dir: ??? 54 | 55 | optimizer_builder: 56 | _target_: torch.optim.AdamW 57 | _partial_: true 58 | lr: 1e-4 59 | betas: [0.8, 0.99] 60 | eps: 1e-5 61 | weight_decay: 0.08 62 | 63 | lr_scheduler_builder: 64 | _target_: torch.optim.lr_scheduler.LambdaLR 65 | _partial_: true 66 | lr_lambda: 67 | _target_: codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda 68 | _partial_: true 69 | num_warmup_steps: 10 70 | num_training_steps: 10000 71 | final_lr_ratio: 0.2 72 | 73 | callbacks: 74 | learning_rate_monitor: 75 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 76 | logging_interval: step 77 | 78 | rich_progress_bar: 79 | _target_: pytorch_lightning.callbacks.RichProgressBar 80 | 81 | model_summary: 82 | _target_: pytorch_lightning.callbacks.ModelSummary 83 | max_depth: 1 84 | 85 | model_checkpoint: 86 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 87 | monitor: val_loss 88 | dirpath: ${probe_ckpt_dir} 89 | every_n_epochs: 1 90 | mode: min 91 | save_top_k: 1 92 | save_last: False 93 | filename: ${codec_name}_${mode}_{epoch}-{val_loss:.4f} 94 | verbose: True 95 | 96 | tensorboard: 97 | _target_: pytorch_lightning.loggers.TensorBoardLogger 98 | save_dir: ??? 99 | name: ${codec_name}_${mode} 100 | log_graph: true -------------------------------------------------------------------------------- /codec_evaluation/probe/config/ESC50_dataset/semanticodec.yaml: -------------------------------------------------------------------------------- 1 | mode: quantized_emb 2 | sample_rate: 44100 3 | target_sec: 5 4 | num_outputs: 50 5 | probe_ckpt_dir: ??? 6 | seed: 666 7 | codec_name: semanticodec 8 | task: multiclass 9 | save_result: null 10 | 11 | trainer: 12 | _target_: pytorch_lightning.Trainer 13 | accelerator: gpu 14 | devices: ??? 15 | precision: 32 16 | max_epochs: 100 17 | limit_val_batches: 10 18 | log_every_n_steps: 5 19 | val_check_interval: 1.0 20 | 21 | data: 22 | _target_: codec_evaluation.probe.dataset.ESC50_dataset.ESC50_dataset.ESC50dataModule 23 | dataset_args: 24 | sample_rate: ${sample_rate} 25 | target_sec: ${target_sec} 26 | train_audio_dir: ??? 27 | val_audio_dir: ??? 28 | test_audio_dir: ??? 29 | train_batch_size: 128 30 | val_batch_size: 2 31 | test_batch_size: 128 32 | train_num_workers: 32 33 | val_num_workers: 4 34 | test_num_workers: 32 35 | 36 | model: 37 | _target_: codec_evaluation.probe.model.lit_prober.Prober 38 | codec_name: ${codec_name} 39 | sample_rate: ${sample_rate} 40 | mode: ${mode} 41 | task: ${task} 42 | num_outputs: ${num_outputs} 43 | probe_model_builder: 44 | _target_: codec_evaluation.probe.model.multiclass_model.MulticlassProber 45 | _partial_: true 46 | num_outputs: ${num_outputs} 47 | drop_out: 0.1 48 | channel_reduction: 16 49 | padding: 1 50 | kernel_size: 3 51 | stride: 1 52 | target_sec: ${target_sec} 53 | model_ckpt_dir: ??? 
54 | 55 | optimizer_builder: 56 | _target_: torch.optim.AdamW 57 | _partial_: true 58 | lr: 1e-4 59 | betas: [0.8, 0.99] 60 | eps: 1e-5 61 | weight_decay: 0.08 62 | 63 | lr_scheduler_builder: 64 | _target_: torch.optim.lr_scheduler.LambdaLR 65 | _partial_: true 66 | lr_lambda: 67 | _target_: codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda 68 | _partial_: true 69 | num_warmup_steps: 10 70 | num_training_steps: 10000 71 | final_lr_ratio: 0.2 72 | 73 | callbacks: 74 | learning_rate_monitor: 75 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 76 | logging_interval: step 77 | 78 | rich_progress_bar: 79 | _target_: pytorch_lightning.callbacks.RichProgressBar 80 | 81 | model_summary: 82 | _target_: pytorch_lightning.callbacks.ModelSummary 83 | max_depth: 1 84 | 85 | model_checkpoint: 86 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 87 | monitor: val_loss 88 | dirpath: ${probe_ckpt_dir} 89 | every_n_epochs: 1 90 | mode: min 91 | save_top_k: 1 92 | save_last: False 93 | filename: ${codec_name}_${mode}_{epoch}-{val_loss:.4f} 94 | verbose: True 95 | 96 | tensorboard: 97 | _target_: pytorch_lightning.loggers.TensorBoardLogger 98 | save_dir: ??? 99 | name: ${codec_name}_${mode} 100 | log_graph: true -------------------------------------------------------------------------------- /codec_evaluation/probe/config/ESC50_dataset/wavtokenizer.yaml: -------------------------------------------------------------------------------- 1 | mode: quantized_emb 2 | sample_rate: 44100 3 | target_sec: 5 4 | num_outputs: 50 5 | probe_ckpt_dir: ??? 6 | seed: 666 7 | codec_name: wavtokenizer 8 | task: multiclass 9 | save_result: null 10 | 11 | trainer: 12 | _target_: pytorch_lightning.Trainer 13 | accelerator: gpu 14 | devices: ??? 15 | precision: 32 16 | max_epochs: 100 17 | limit_val_batches: 10 18 | log_every_n_steps: 5 19 | val_check_interval: 1.0 20 | 21 | data: 22 | _target_: codec_evaluation.probe.dataset.ESC50_dataset.ESC50_dataset.ESC50dataModule 23 | dataset_args: 24 | sample_rate: ${sample_rate} 25 | target_sec: ${target_sec} 26 | train_audio_dir: ??? 27 | val_audio_dir: ??? 28 | test_audio_dir: ??? 29 | train_batch_size: 128 30 | val_batch_size: 2 31 | test_batch_size: 128 32 | train_num_workers: 32 33 | val_num_workers: 4 34 | test_num_workers: 32 35 | 36 | model: 37 | _target_: codec_evaluation.probe.model.lit_prober.Prober 38 | codec_name: ${codec_name} 39 | sample_rate: ${sample_rate} 40 | mode: ${mode} 41 | task: ${task} 42 | num_outputs: ${num_outputs} 43 | probe_model_builder: 44 | _target_: codec_evaluation.probe.model.multiclass_model.MulticlassProber 45 | _partial_: true 46 | num_outputs: ${num_outputs} 47 | drop_out: 0.1 48 | channel_reduction: 16 49 | padding: 1 50 | kernel_size: 3 51 | stride: 1 52 | target_sec: ${target_sec} 53 | model_ckpt_dir: ??? 
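# In the model_checkpoint filename below, ${codec_name} and ${mode} are resolved by OmegaConf
# when the config is loaded, while {epoch} and {val_loss:.4f} are filled in by Lightning's
# ModelCheckpoint at save time, yielding names like
# wavtokenizer_quantized_emb_epoch=12-val_loss=0.1234.ckpt; with save_top_k: 1 and mode: min,
# only the best checkpoint by val_loss is kept.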
54 | 55 | optimizer_builder: 56 | _target_: torch.optim.AdamW 57 | _partial_: true 58 | lr: 1e-4 59 | betas: [0.8, 0.99] 60 | eps: 1e-5 61 | weight_decay: 0.08 62 | 63 | lr_scheduler_builder: 64 | _target_: torch.optim.lr_scheduler.LambdaLR 65 | _partial_: true 66 | lr_lambda: 67 | _target_: codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda 68 | _partial_: true 69 | num_warmup_steps: 10 70 | num_training_steps: 10000 71 | final_lr_ratio: 0.2 72 | 73 | callbacks: 74 | learning_rate_monitor: 75 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 76 | logging_interval: step 77 | 78 | rich_progress_bar: 79 | _target_: pytorch_lightning.callbacks.RichProgressBar 80 | 81 | model_summary: 82 | _target_: pytorch_lightning.callbacks.ModelSummary 83 | max_depth: 1 84 | 85 | model_checkpoint: 86 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 87 | monitor: val_loss 88 | dirpath: ${probe_ckpt_dir} 89 | every_n_epochs: 1 90 | mode: min 91 | save_top_k: 1 92 | save_last: False 93 | filename: ${codec_name}_${mode}_{epoch}-{val_loss:.4f} 94 | verbose: True 95 | 96 | tensorboard: 97 | _target_: pytorch_lightning.loggers.TensorBoardLogger 98 | save_dir: ??? 99 | name: ${codec_name}_${mode} 100 | log_graph: true -------------------------------------------------------------------------------- /codec_evaluation/probe/config/GS_dataset/semanticodec.yaml: -------------------------------------------------------------------------------- 1 | mode: quantized_emb 2 | sample_rate: 44100 3 | target_sec: 10 4 | num_outputs: 24 5 | probe_ckpt_dir: ??? 6 | seed: 666 7 | codec_name: semanticodec # change as needed 8 | task: multiclass 9 | save_result: null 10 | 11 | trainer: 12 | _target_: pytorch_lightning.Trainer 13 | accelerator: gpu 14 | devices: ??? 15 | precision: 32 16 | max_epochs: 100 17 | limit_val_batches: 10 18 | log_every_n_steps: 5 19 | val_check_interval: 1.0 20 | 21 | data: 22 | _target_: codec_evaluation.probe.dataset.GS_dataset.GS_dataset.GSdataModule 23 | dataset_args: 24 | sample_rate: ${sample_rate} 25 | target_sec: ${target_sec} 26 | train_audio_dir: ??? 27 | val_audio_dir: ??? 28 | test_audio_dir: ??? 29 | train_batch_size: 8 30 | val_batch_size: 1 31 | test_batch_size: 8 32 | train_num_workers: 4 33 | val_num_workers: 2 34 | test_num_workers: 2 35 | 36 | model: 37 | _target_: codec_evaluation.probe.model.lit_prober.Prober 38 | codec_name: ${codec_name} 39 | sample_rate: ${sample_rate} 40 | mode: ${mode} 41 | task: ${task} 42 | num_outputs: ${num_outputs} 43 | probe_model_builder: 44 | _target_: codec_evaluation.probe.model.multiclass_model.MulticlassProber 45 | _partial_: true 46 | num_outputs: ${num_outputs} 47 | drop_out: 0.1 48 | channel_reduction: 16 49 | padding: 1 50 | kernel_size: 3 51 | stride: 1 52 | target_sec: ${target_sec} 53 | model_ckpt_dir: ???
54 | 55 | optimizer_builder: 56 | _target_: torch.optim.AdamW 57 | _partial_: true 58 | lr: 1e-4 59 | betas: [0.8, 0.99] 60 | eps: 1e-5 61 | weight_decay: 0.08 62 | 63 | lr_scheduler_builder: 64 | _target_: torch.optim.lr_scheduler.LambdaLR 65 | _partial_: true 66 | lr_lambda: 67 | _target_: codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda 68 | _partial_: true 69 | num_warmup_steps: 10 70 | num_training_steps: 10000 71 | final_lr_ratio: 0.2 72 | 73 | callbacks: 74 | learning_rate_monitor: 75 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 76 | logging_interval: step 77 | 78 | rich_progress_bar: 79 | _target_: pytorch_lightning.callbacks.RichProgressBar 80 | 81 | model_summary: 82 | _target_: pytorch_lightning.callbacks.ModelSummary 83 | max_depth: 1 84 | 85 | model_checkpoint: 86 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 87 | monitor: val_loss 88 | dirpath: ${probe_ckpt_dir} 89 | every_n_epochs: 1 90 | mode: min 91 | save_top_k: 1 92 | save_last: False 93 | filename: ${codec_name}_${mode}_{epoch}-{val_loss:.4f} 94 | verbose: True 95 | 96 | tensorboard: 97 | _target_: pytorch_lightning.loggers.TensorBoardLogger 98 | save_dir: ??? 99 | name: ${codec_name}_${mode} 100 | log_graph: true -------------------------------------------------------------------------------- /codec_evaluation/probe/config/GS_dataset/wavtokenizer.yaml: -------------------------------------------------------------------------------- 1 | mode: quantized_emb 2 | sample_rate: 44100 3 | target_sec: 10 4 | num_outputs: 24 5 | probe_ckpt_dir: ??? 6 | seed: 666 7 | codec_name: wavtokenizer # needs to be changed 8 | task: multiclass 9 | save_result: null 10 | 11 | trainer: 12 | _target_: pytorch_lightning.Trainer 13 | accelerator: gpu 14 | devices: ??? 15 | precision: 32 16 | max_epochs: 100 17 | limit_val_batches: 10 18 | log_every_n_steps: 5 19 | val_check_interval: 1.0 20 | 21 | data: 22 | _target_: codec_evaluation.probe.dataset.GS_dataset.GS_dataset.GSdataModule 23 | dataset_args: 24 | sample_rate: ${sample_rate} 25 | target_sec: ${target_sec} 26 | train_audio_dir: ??? 27 | val_audio_dir: ??? 28 | test_audio_dir: ??? 29 | train_batch_size: 8 30 | val_batch_size: 1 31 | test_batch_size: 8 32 | train_num_workers: 4 33 | val_num_workers: 2 34 | test_num_workers: 2 35 | 36 | model: 37 | _target_: codec_evaluation.probe.model.lit_prober.Prober 38 | codec_name: ${codec_name} 39 | sample_rate: ${sample_rate} 40 | mode: ${mode} 41 | task: ${task} 42 | num_outputs: ${num_outputs} 43 | probe_model_builder: 44 | _target_: codec_evaluation.probe.model.multiclass_model.MulticlassProber 45 | _partial_: true 46 | num_outputs: ${num_outputs} 47 | drop_out: 0.1 48 | channel_reduction: 16 49 | padding: 1 50 | kernel_size: 3 51 | stride: 1 52 | target_sec: ${target_sec} 53 | model_ckpt_dir: ???
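# Annotation (not part of the original file): the lr_lambda node that follows
# points at codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda,
# which by its name and parameters is presumably a linear warmup over
# num_warmup_steps followed by a cosine decay that bottoms out at
# final_lr_ratio * lr (here 0.2 * 1e-4) by num_training_steps; the actual
# implementation lives in codec_evaluation/utils/schedule.py and is not shown
# in this dump. A Python sketch of the assumed schedule appears at the end of
# this section.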
54 | 55 | optimizer_builder: 56 | _target_: torch.optim.AdamW 57 | _partial_: true 58 | lr: 1e-4 59 | betas: [0.8, 0.99] 60 | eps: 1e-5 61 | weight_decay: 0.08 62 | 63 | lr_scheduler_builder: 64 | _target_: torch.optim.lr_scheduler.LambdaLR 65 | _partial_: true 66 | lr_lambda: 67 | _target_: codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda 68 | _partial_: true 69 | num_warmup_steps: 10 70 | num_training_steps: 10000 71 | final_lr_ratio: 0.2 72 | 73 | callbacks: 74 | learning_rate_monitor: 75 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 76 | logging_interval: step 77 | 78 | rich_progress_bar: 79 | _target_: pytorch_lightning.callbacks.RichProgressBar 80 | 81 | model_summary: 82 | _target_: pytorch_lightning.callbacks.ModelSummary 83 | max_depth: 1 84 | 85 | model_checkpoint: 86 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 87 | monitor: val_loss 88 | dirpath: ${probe_ckpt_dir} 89 | every_n_epochs: 1 90 | mode: min 91 | save_top_k: 1 92 | save_last: False 93 | filename: ${codec_name}_${mode}_{epoch}-{val_loss:.4f} 94 | verbose: True 95 | 96 | tensorboard: 97 | _target_: pytorch_lightning.loggers.TensorBoardLogger 98 | save_dir: ??? 99 | name: ${codec_name}_${mode} 100 | log_graph: true -------------------------------------------------------------------------------- /codec_evaluation/probe/config/GTZAN_dataset/encodec.yaml: -------------------------------------------------------------------------------- 1 | mode: quantized_emb 2 | sample_rate: 22050 3 | target_sec: 10 4 | num_outputs: 10 5 | probe_ckpt_dir: ??? 6 | seed: 666 7 | codec_name: encodec 8 | task: multiclass 9 | save_result: null 10 | 11 | 12 | trainer: 13 | _target_: pytorch_lightning.Trainer 14 | accelerator: gpu 15 | devices: ??? 16 | precision: 32 17 | max_epochs: 100 18 | limit_val_batches: 10 19 | log_every_n_steps: 5 20 | val_check_interval: 1.0 21 | 22 | data: 23 | _target_: codec_evaluation.probe.dataset.GTZAN_dataset.GTZAN_dataset.GTZANdataModule 24 | dataset_args: 25 | sample_rate: ${sample_rate} 26 | target_sec: ${target_sec} 27 | train_audio_dir: ??? 28 | val_audio_dir: ??? 29 | test_audio_dir: ??? 30 | train_batch_size: 8 31 | val_batch_size: 1 32 | test_batch_size: 8 33 | train_num_workers: 8 34 | val_num_workers: 4 35 | test_num_workers: 4 36 | 37 | model: 38 | _target_: codec_evaluation.probe.model.lit_prober.Prober 39 | codec_name: ${codec_name} 40 | sample_rate: ${sample_rate} 41 | mode: ${mode} 42 | task: ${task} 43 | num_outputs: ${num_outputs} 44 | probe_model_builder: 45 | _target_: codec_evaluation.probe.model.multiclass_model.MulticlassProber 46 | _partial_: true 47 | num_outputs: ${num_outputs} 48 | drop_out: 0.1 49 | channel_reduction: 16 50 | padding: 1 51 | kernel_size: 3 52 | stride: 1 53 | target_sec: ${target_sec} 54 | model_ckpt_dir: ??? 
55 | 56 | optimizer_builder: 57 | _target_: torch.optim.AdamW 58 | _partial_: true 59 | lr: 1e-4 60 | betas: [0.8, 0.99] 61 | eps: 1e-5 62 | weight_decay: 0.08 63 | 64 | lr_scheduler_builder: 65 | _target_: torch.optim.lr_scheduler.LambdaLR 66 | _partial_: true 67 | lr_lambda: 68 | _target_: codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda 69 | _partial_: true 70 | num_warmup_steps: 10 71 | num_training_steps: 10000 72 | final_lr_ratio: 0.2 73 | 74 | callbacks: 75 | learning_rate_monitor: 76 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 77 | logging_interval: step 78 | 79 | rich_progress_bar: 80 | _target_: pytorch_lightning.callbacks.RichProgressBar 81 | 82 | model_summary: 83 | _target_: pytorch_lightning.callbacks.ModelSummary 84 | max_depth: 1 85 | 86 | model_checkpoint: 87 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 88 | monitor: val_loss 89 | dirpath: ${probe_ckpt_dir} 90 | every_n_epochs: 1 91 | mode: min 92 | save_top_k: 1 93 | save_last: False 94 | filename: ${codec_name}_${mode}_{epoch}-{val_loss:.4f} 95 | verbose: True 96 | 97 | tensorboard: 98 | _target_: pytorch_lightning.loggers.TensorBoardLogger 99 | save_dir: ??? 100 | name: ${codec_name}_${mode} 101 | log_graph: true -------------------------------------------------------------------------------- /codec_evaluation/probe/config/MELD_dataset/encodec.yaml: -------------------------------------------------------------------------------- 1 | mode: quantized_emb 2 | sample_rate: 48000 3 | target_sec: 8 4 | num_outputs: 7 5 | probe_ckpt_dir: ??? 6 | seed: 666 7 | codec_name: encodec # needs to be changed 8 | task: multiclass 9 | save_result: null 10 | 11 | trainer: 12 | _target_: pytorch_lightning.Trainer 13 | accelerator: gpu 14 | devices: ??? 15 | precision: 32 16 | max_epochs: 100 17 | limit_val_batches: 10 18 | log_every_n_steps: 5 19 | val_check_interval: 1.0 20 | 21 | data: 22 | _target_: codec_evaluation.probe.dataset.MELD_dataset.MELD_dataset.MELDdataModule 23 | dataset_args: 24 | sample_rate: ${sample_rate} 25 | target_sec: ${target_sec} 26 | train_audio_dir: ??? 27 | val_audio_dir: ??? 28 | test_audio_dir: ??? 29 | train_batch_size: 64 30 | val_batch_size: 2 31 | test_batch_size: 32 32 | train_num_workers: 8 33 | val_num_workers: 4 34 | test_num_workers: 4 35 | 36 | model: 37 | _target_: codec_evaluation.probe.model.lit_prober.Prober 38 | codec_name: ${codec_name} 39 | sample_rate: ${sample_rate} 40 | mode: ${mode} 41 | task: ${task} 42 | num_outputs: ${num_outputs} 43 | probe_model_builder: 44 | _target_: codec_evaluation.probe.model.multiclass_model.MulticlassProber 45 | _partial_: true 46 | num_outputs: ${num_outputs} 47 | drop_out: 0.2 48 | channel_reduction: 16 49 | padding: 1 50 | kernel_size: 3 51 | stride: 1 52 | target_sec: ${target_sec} 53 | model_ckpt_dir: ???
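# Annotation (not part of the original file): the *_builder nodes below rely
# on Hydra's _partial_: true, so instantiate() yields functools.partial
# objects (e.g. partial(torch.optim.AdamW, lr=1e-4, ...)) rather than
# constructed instances; the Prober can then call them later with arguments
# that only exist at runtime, such as model.parameters() for the optimizer
# and the optimizer instance for LambdaLR.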
54 | 55 | optimizer_builder: 56 | _target_: torch.optim.AdamW 57 | _partial_: true 58 | lr: 1e-4 59 | betas: [0.8, 0.99] 60 | eps: 1e-5 61 | weight_decay: 0.08 62 | 63 | lr_scheduler_builder: 64 | _target_: torch.optim.lr_scheduler.LambdaLR 65 | _partial_: true 66 | lr_lambda: 67 | _target_: codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda 68 | _partial_: true 69 | num_warmup_steps: 10 70 | num_training_steps: 10000 71 | final_lr_ratio: 0.2 72 | 73 | callbacks: 74 | learning_rate_monitor: 75 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 76 | logging_interval: step 77 | 78 | rich_progress_bar: 79 | _target_: pytorch_lightning.callbacks.RichProgressBar 80 | 81 | model_summary: 82 | _target_: pytorch_lightning.callbacks.ModelSummary 83 | max_depth: 1 84 | 85 | model_checkpoint: 86 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 87 | monitor: val_loss 88 | dirpath: ${probe_ckpt_dir} 89 | every_n_epochs: 1 90 | mode: min 91 | save_top_k: 1 92 | save_last: False 93 | filename: ${codec_name}_${mode}_{epoch}-{val_loss:.4f} 94 | verbose: True 95 | 96 | tensorboard: 97 | _target_: pytorch_lightning.loggers.TensorBoardLogger 98 | save_dir: ??? 99 | name: ${codec_name}_${mode} 100 | log_graph: true -------------------------------------------------------------------------------- /codec_evaluation/probe/config/MTT_dataset/dac.yaml: -------------------------------------------------------------------------------- 1 | mode: quantized_emb 2 | sample_rate: 16000 3 | target_sec: 5 4 | num_outputs: 50 5 | probe_ckpt_dir: ??? 6 | seed: 666 7 | codec_name: dac 8 | task: multilabel 9 | save_result: null 10 | 11 | trainer: 12 | _target_: pytorch_lightning.Trainer 13 | accelerator: gpu 14 | devices: ??? 15 | precision: 32 16 | max_epochs: 50 17 | limit_val_batches: 10 18 | log_every_n_steps: 5 19 | val_check_interval: 1.0 20 | 21 | data: 22 | _target_: codec_evaluation.probe.dataset.MTT_dataset.MTT_dataset.MTTdataModule 23 | dataset_args: 24 | sample_rate: ${sample_rate} 25 | target_sec: ${target_sec} 26 | base_audio_dir: /root/path/for/audio 27 | train_audio_dir: ??? 28 | val_audio_dir: ??? 29 | test_audio_dir: ??? 30 | train_batch_size: 64 31 | val_batch_size: 8 32 | test_batch_size: 64 33 | train_num_workers: 16 34 | val_num_workers: 4 35 | test_num_workers: 16 36 | 37 | model: 38 | _target_: codec_evaluation.probe.model.lit_prober.Prober 39 | codec_name: ${codec_name} 40 | sample_rate: ${sample_rate} 41 | mode: ${mode} 42 | task: ${task} 43 | num_outputs: ${num_outputs} 44 | probe_model_builder: 45 | _target_: codec_evaluation.probe.model.multilabel_model.MultilabelProber 46 | _partial_: true 47 | num_outputs: ${num_outputs} 48 | drop_out: 0.2 49 | channel_reduction: 16 50 | padding: 1 51 | kernel_size: 3 52 | stride: 1 53 | target_sec: ${target_sec} 54 | model_ckpt_dir: ??? 
55 | 56 | optimizer_builder: 57 | _target_: torch.optim.AdamW 58 | _partial_: true 59 | lr: 1e-4 60 | betas: [0.8, 0.99] 61 | eps: 1e-5 62 | weight_decay: 0.08 63 | 64 | lr_scheduler_builder: 65 | _target_: torch.optim.lr_scheduler.LambdaLR 66 | _partial_: true 67 | lr_lambda: 68 | _target_: codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda 69 | _partial_: true 70 | num_warmup_steps: 10 71 | num_training_steps: 10000 72 | final_lr_ratio: 0.2 73 | 74 | callbacks: 75 | learning_rate_monitor: 76 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 77 | logging_interval: step 78 | 79 | rich_progress_bar: 80 | _target_: pytorch_lightning.callbacks.RichProgressBar 81 | 82 | model_summary: 83 | _target_: pytorch_lightning.callbacks.ModelSummary 84 | max_depth: 1 85 | 86 | model_checkpoint: 87 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 88 | monitor: val_loss 89 | dirpath: ${probe_ckpt_dir} 90 | every_n_epochs: 1 91 | mode: min 92 | save_top_k: 1 93 | filename: ${codec_name}_${mode}_{epoch}-{val_loss:.4f} 94 | verbose: True 95 | 96 | tensorboard: 97 | _target_: pytorch_lightning.loggers.TensorBoardLogger 98 | save_dir: ??? 99 | name: ${codec_name}_${mode} 100 | log_graph: true --------------------------------------------------------------------------------
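All of the probe configs above follow the same Hydra/OmegaConf recipe: plain scalars at the top, ${...} interpolations to reuse them, ??? for values that must be supplied at launch, and _target_ nodes with _partial_: true that instantiate() turns into functools.partial builders. The following is a minimal Python sketch of those mechanics, assuming only omegaconf, hydra-core, and torch are installed. It is not the repo's actual entry point: the schedule function is a guess at what codec_evaluation/utils/schedule.py implements (that file is not part of this dump), the override values are illustrative, the probe network is a stand-in, and the builders are assumed to sit under the model node as the file layout suggests.

import math
from functools import partial

import torch
from hydra.utils import instantiate
from omegaconf import OmegaConf


def cosine_warmup_lr_lambda(step, *, num_warmup_steps, num_training_steps, final_lr_ratio):
    # Assumed behavior of get_cosine_schedule_with_warmup_lr_lambda:
    # linear warmup to 1.0, then cosine decay down to final_lr_ratio.
    if step < num_warmup_steps:
        return step / max(1, num_warmup_steps)
    progress = min(1.0, (step - num_warmup_steps) / max(1, num_training_steps - num_warmup_steps))
    return final_lr_ratio + (1.0 - final_lr_ratio) * 0.5 * (1.0 + math.cos(math.pi * progress))


cfg = OmegaConf.load("codec_evaluation/probe/config/MTT_dataset/dac.yaml")
# ??? fields are mandatory; fill them before anything reads them
# (illustrative values, not the repo's defaults).
cfg.probe_ckpt_dir = "/tmp/probe_ckpt"
cfg.trainer.devices = 1

# _partial_: true => instantiate() returns partial(AdamW, lr=1e-4, ...)
# rather than an optimizer, so parameters can be bound later.
probe = torch.nn.Linear(8, 2)  # stand-in for the real probe network
optimizer = instantiate(cfg.model.optimizer_builder)(probe.parameters())

# Build the scheduler with the locally defined lambda instead of
# instantiating lr_scheduler_builder, which would import the real
# codec_evaluation.utils.schedule module.
sched_cfg = cfg.model.lr_scheduler_builder.lr_lambda
scheduler = torch.optim.lr_scheduler.LambdaLR(
    optimizer,
    lr_lambda=partial(
        cosine_warmup_lr_lambda,
        num_warmup_steps=sched_cfg.num_warmup_steps,
        num_training_steps=sched_cfg.num_training_steps,
        final_lr_ratio=sched_cfg.final_lr_ratio,
    ),
)

# With the values used throughout these configs (10 warmup steps, 10000
# training steps, final_lr_ratio 0.2), the multiplier warms 0.0 -> 1.0 over
# the first 10 steps, then decays to 0.2 by step 10000:
for step in (0, 5, 10, 5000, 10000):
    print(step, cosine_warmup_lr_lambda(step, num_warmup_steps=10, num_training_steps=10000, final_lr_ratio=0.2))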