├── codec_evaluation ├── __init__.py ├── codecs │ ├── __init__.py │ ├── YuE │ │ ├── models │ │ │ └── __init__.py │ │ ├── quantization │ │ │ └── __init__.py │ │ ├── descriptaudiocodec │ │ │ └── dac │ │ │ │ ├── nn │ │ │ │ ├── __init__.py │ │ │ │ └── layers.py │ │ │ │ ├── model │ │ │ │ ├── __init__.py │ │ │ │ └── base.py │ │ │ │ └── __init__.py │ │ ├── modules │ │ │ ├── __init__.py │ │ │ └── norm.py │ │ └── RepCodec │ │ │ └── repcodec │ │ │ ├── modules │ │ │ └── residual_unit.py │ │ │ └── layers │ │ │ └── conv_layer.py │ ├── xcodec │ │ ├── models │ │ │ └── __init__.py │ │ ├── modules │ │ │ └── __init__.py │ │ ├── quantization │ │ │ └── __init__.py │ │ └── descriptaudiocodec │ │ │ └── dac │ │ │ ├── nn │ │ │ ├── __init__.py │ │ │ └── layers.py │ │ │ ├── model │ │ │ ├── __init__.py │ │ │ └── base.py │ │ │ └── __init__.py │ ├── levo_modules │ │ ├── Flow1dVAE │ │ │ ├── models │ │ │ │ └── __init__.py │ │ │ ├── tools │ │ │ │ ├── __init__.py │ │ │ │ ├── extract_rvq.py │ │ │ │ ├── safetensor2torch.py │ │ │ │ ├── get_1dvae.py │ │ │ │ ├── get_1dvae_1920.py │ │ │ │ ├── get_1dvae_large_melvae.py │ │ │ │ ├── get_1dvae_large.py │ │ │ │ ├── compare_2models.py │ │ │ │ ├── get_whisper_encoder.py │ │ │ │ ├── transmodelnorm.py │ │ │ │ ├── mix.py │ │ │ │ ├── check_stereo.py │ │ │ │ ├── infer_encodec.py │ │ │ │ ├── infer_encodec_speech.py │ │ │ │ ├── infer_encodec_vocal.py │ │ │ │ ├── creat_jsonl.py │ │ │ │ ├── infer_bsrnnvae441k.py │ │ │ │ └── infer_bsrnnvae441k_vocal.py │ │ │ ├── our_MERT_BESTRQ │ │ │ │ ├── mert_fairseq │ │ │ │ │ ├── models │ │ │ │ │ │ ├── musicfm │ │ │ │ │ │ │ ├── model │ │ │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ │ │ └── w2v2_config.json │ │ │ │ │ │ │ ├── modules │ │ │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ │ │ └── features.py │ │ │ │ │ │ │ └── __init__.py │ │ │ │ │ │ ├── mert │ │ │ │ │ │ │ ├── README.md │ │ │ │ │ │ │ └── __init__.py │ │ │ │ │ │ └── eat │ │ │ │ │ │ │ └── __init__.py │ │ │ │ │ ├── config │ │ │ │ │ │ └── pretrain │ │ │ │ │ │ │ ├── MERT_RVQ-VAE_CQT_95M_dac.yaml │ │ │ │ │ │ │ ├── run │ │ │ │ │ │ │ └── submitit_reg.yaml │ │ │ │ │ │ │ ├── MusicFM_95M_multinodes.yaml │ │ │ │ │ │ │ ├── MusicFM_95M_speech_multinodes.yaml │ │ │ │ │ │ │ └── MusicFM_95M_bestrvq_multinodes.yaml │ │ │ │ │ └── data │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── eat_data │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── add_class_target_dataset.py │ │ │ │ ├── modify_env.md │ │ │ │ ├── test.py │ │ │ │ └── run_training_eat.sh │ │ │ ├── models_gpt │ │ │ │ └── models │ │ │ │ │ └── tokenizer │ │ │ │ │ ├── structure.yaml │ │ │ │ │ └── pinyin │ │ │ │ │ └── symbols.py │ │ │ ├── compare_model_weight.py │ │ │ ├── configs │ │ │ │ ├── scheduler │ │ │ │ │ └── stable_diffusion_2.1_largenoise_sample.json │ │ │ │ └── models │ │ │ │ │ └── transformer2D_wocross_inch112_1x4_multi_large.json │ │ │ ├── cal_token_stat.py │ │ │ ├── extract_codes_stereo_7_1x4.py │ │ │ └── extract_codes_stereo_7_1x2.py │ │ ├── __init__.py │ │ └── stable_audio_tools │ │ │ ├── data │ │ │ └── __init__.py │ │ │ ├── inference │ │ │ ├── __init__.py │ │ │ └── utils.py │ │ │ ├── interface │ │ │ └── __init__.py │ │ │ ├── training │ │ │ ├── losses │ │ │ │ └── __init__.py │ │ │ └── __init__.py │ │ │ ├── models │ │ │ ├── __init__.py │ │ │ ├── pretrained.py │ │ │ └── diffusion_prior.py │ │ │ ├── __init__.py │ │ │ └── configs │ │ │ ├── dataset_configs │ │ │ ├── custom_metadata │ │ │ │ └── custom_md_example.py │ │ │ ├── s3_wds_example.json │ │ │ └── local_training_example.json │ │ │ └── model_configs │ │ │ ├── dance_diffusion │ │ │ ├── dance_diffusion_base.json │ │ │ ├── 
dance_diffusion_large.json │ │ │ ├── dance_diffusion_base_16k.json │ │ │ └── dance_diffusion_base_44k.json │ │ │ └── autoencoders │ │ │ └── dac_2048_32_vae.json │ ├── config │ │ ├── hubert_preprocessor_config.json │ │ └── qwen2audioencoder_preprocessor_config.json │ └── version.py ├── utils │ ├── demucs │ │ ├── __init__.py │ │ ├── models │ │ │ ├── __init__.py │ │ │ ├── pretrained.py │ │ │ └── spec.py │ │ └── ckpt │ │ │ └── htdemucs.yaml │ ├── schedule.py │ └── logger.py ├── perplexity │ └── config │ │ └── ppl_model_config.json └── probe │ └── config │ ├── Common_Voice_dataset │ ├── dac.yaml │ ├── YuE.yaml │ ├── mimi.yaml │ ├── encodec.yaml │ ├── wavtokenizer.yaml │ ├── semanticodec.yaml │ ├── speechtokenizer.yaml │ └── xcodec.yaml │ ├── MTT_dataset │ ├── encodec.yaml │ └── dac.yaml │ ├── NSynthI_dataset │ ├── dac.yaml │ ├── encodec.yaml │ ├── semanticodec.yaml │ ├── wavtokenizer.yaml │ └── YuE.yaml │ ├── NSynthP_dataset │ ├── dac.yaml │ ├── semanticodec.yaml │ ├── encodec.yaml │ └── YuE.yaml │ ├── GTZAN_dataset │ ├── wavtokenizer.yaml │ ├── YuE.yaml │ ├── dac.yaml │ └── encodec.yaml │ ├── EMO_dataset │ ├── encodec.yaml │ ├── YuE.yaml │ ├── dac.yaml │ ├── semanticodec.yaml │ └── wavtokenizer.yaml │ ├── GS_dataset │ ├── YuE.yaml │ ├── dac.yaml │ ├── encodec.yaml │ ├── semanticodec.yaml │ └── wavtokenizer.yaml │ ├── MELD_dataset │ ├── YuE.yaml │ ├── dac.yaml │ ├── mimi.yaml │ └── encodec.yaml │ └── ESC50_dataset │ ├── YuE.yaml │ ├── dac.yaml │ ├── encodec.yaml │ ├── mimi.yaml │ ├── semanticodec.yaml │ └── wavtokenizer.yaml ├── MANIFEST.in ├── env_build.sh ├── requirements.txt ├── doc └── chore.md └── setup.py /codec_evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /codec_evaluation/codecs/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /codec_evaluation/utils/demucs/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /codec_evaluation/codecs/YuE/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /codec_evaluation/utils/demucs/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /codec_evaluation/codecs/xcodec/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /codec_evaluation/codecs/xcodec/modules/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/Flow1dVAE/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/Flow1dVAE/tools/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/__init__.py: -------------------------------------------------------------------------------- 1 | # no need for training -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/stable_audio_tools/data/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /codec_evaluation/utils/demucs/ckpt/htdemucs.yaml: -------------------------------------------------------------------------------- 1 | models: ['htdemucs'] 2 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include requirements.txt 2 | include README.md 3 | include LICENSE -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/stable_audio_tools/inference/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/stable_audio_tools/interface/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /codec_evaluation/codecs/YuE/quantization/__init__.py: -------------------------------------------------------------------------------- 1 | from .vq import ResidualVectorQuantizer 2 | -------------------------------------------------------------------------------- /codec_evaluation/codecs/xcodec/quantization/__init__.py: -------------------------------------------------------------------------------- 1 | from .vq import ResidualVectorQuantizer 2 | -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/stable_audio_tools/training/losses/__init__.py: -------------------------------------------------------------------------------- 1 | from .losses import * -------------------------------------------------------------------------------- /codec_evaluation/codecs/YuE/descriptaudiocodec/dac/nn/__init__.py: -------------------------------------------------------------------------------- 1 | from . import layers 2 | from . 
import quantize 3 | -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/models/musicfm/model/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/config/pretrain/MERT_RVQ-VAE_CQT_95M_dac.yaml: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/models/musicfm/modules/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /codec_evaluation/codecs/xcodec/descriptaudiocodec/dac/nn/__init__.py: -------------------------------------------------------------------------------- 1 | from . import layers 2 | from . import quantize 3 | -------------------------------------------------------------------------------- /codec_evaluation/codecs/YuE/descriptaudiocodec/dac/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import CodecMixin 2 | from .dac import DAC 3 | -------------------------------------------------------------------------------- /codec_evaluation/codecs/xcodec/descriptaudiocodec/dac/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import CodecMixin 2 | from .dac import DAC -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/models/musicfm/__init__.py: -------------------------------------------------------------------------------- 1 | from .musicfm_model import * -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/stable_audio_tools/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .factory import create_model_from_config, create_model_from_config_path -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/data/__init__.py: -------------------------------------------------------------------------------- 1 | from .mert_dataset import MERTDataset 2 | from .eat_data import * -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/stable_audio_tools/training/__init__.py: -------------------------------------------------------------------------------- 1 | from .factory import create_training_wrapper_from_config, create_demo_callback_from_config 2 | -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/stable_audio_tools/__init__.py: -------------------------------------------------------------------------------- 1 | from .models.factory import create_model_from_config, create_model_from_config_path 2 | from .models.pretrained import get_pretrained_model -------------------------------------------------------------------------------- 
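Note: the `stable_audio_tools/__init__.py` block above re-exports the model factory and the pretrained-model helper, so downstream code can import them from the package root. A minimal usage sketch follows; the local config filename and the Hub repo id are illustrative assumptions, and `get_pretrained_model` requires the repo to ship a `model_config.json` plus `model.safetensors` or `model.ckpt` (see `models/pretrained.py` later in this listing).

```python
import json

from codec_evaluation.codecs.levo_modules.stable_audio_tools import (
    create_model_from_config,
    get_pretrained_model,
)

# Build a model from an explicit config dict (schema defined by stable_audio_tools)...
with open("model_config.json") as f:  # hypothetical local config file
    model = create_model_from_config(json.load(f))

# ...or fetch config and weights from the Hugging Face Hub in one call.
# The repo id is an example; any repo carrying model_config.json + weights works.
model, model_config = get_pretrained_model("stabilityai/stable-audio-open-1.0")
```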
/codec_evaluation/codecs/levo_modules/stable_audio_tools/configs/dataset_configs/custom_metadata/custom_md_example.py: -------------------------------------------------------------------------------- 1 | def get_custom_metadata(info, audio): 2 | 3 | # Use relative path as the prompt 4 | return {"prompt": info["relpath"]} -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/Flow1dVAE/models_gpt/models/tokenizer/structure.yaml: -------------------------------------------------------------------------------- 1 | - '[start]' 2 | - '[verse]' 3 | - '[chorus]' 4 | - '[outro]' 5 | - '[end]' 6 | - '[intro]' 7 | - '[solo]' 8 | - '[inst]' 9 | - '[bridge]' 10 | - '[break]' 11 | -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/models/mert/README.md: -------------------------------------------------------------------------------- 1 | add cauchy extension from https://github.com/HazyResearch/state-spaces 2 | ```shell 3 | cd state-spaces/extensions/cauchy 4 | python setup.py install 5 | ``` 6 | -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/models/eat/__init__.py: -------------------------------------------------------------------------------- 1 | try: 2 | from .EAT_pretraining import * 3 | except: 4 | import sys, os 5 | sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), '.')) 6 | from EAT_pretraining import * -------------------------------------------------------------------------------- /codec_evaluation/codecs/config/hubert_preprocessor_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "do_normalize": true, 3 | "feature_extractor_type": "Wav2Vec2FeatureExtractor", 4 | "feature_size": 1, 5 | "padding_side": "right", 6 | "padding_value": 0, 7 | "return_attention_mask": true, 8 | "sampling_rate": 16000 9 | } 10 | -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/models/mert/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | from .mert_model import * # noqa -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/stable_audio_tools/configs/dataset_configs/s3_wds_example.json: -------------------------------------------------------------------------------- 1 | { 2 | "dataset_type": "s3", 3 | "datasets": [ 4 | { 5 | "id": "s3-test", 6 | "s3_path": "s3://my-bucket/datasets/webdataset/audio/" 7 | } 8 | ], 9 | "random_crop": true 10 | } -------------------------------------------------------------------------------- /env_build.sh: -------------------------------------------------------------------------------- 1 | pip install -e . 
-i https://pypi.tuna.tsinghua.edu.cn/simple 2 | pip install git+https://github.com/haoheliu/SemantiCodec-inference@8dc464c3385d2389a695ed3f718f4a0caf3ed33f#egg=semanticodec 3 | pip install git+https://github.com/lucadellalib/WavTokenizer.git@main 4 | pip install git+https://github.com/pengzhendong/asr-decoder.git@master -------------------------------------------------------------------------------- /codec_evaluation/codecs/YuE/descriptaudiocodec/dac/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "1.0.0" 2 | 3 | # preserved here for legacy reasons 4 | __model_version__ = "latest" 5 | 6 | import audiotools 7 | 8 | audiotools.ml.BaseModel.INTERN += ["dac.**"] 9 | audiotools.ml.BaseModel.EXTERN += ["einops"] 10 | 11 | 12 | from . import nn 13 | from . import model 14 | from .model import DAC 15 | -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/stable_audio_tools/configs/dataset_configs/local_training_example.json: -------------------------------------------------------------------------------- 1 | { 2 | "dataset_type": "audio_dir", 3 | "datasets": [ 4 | { 5 | "id": "my_audio", 6 | "path": "train.jsonl", 7 | "custom_metadata_module": "custom_md_example.py" 8 | } 9 | ], 10 | "random_crop": true 11 | } 12 | -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/Flow1dVAE/compare_model_weight.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import sys 3 | from safetensors.torch import load_file 4 | 5 | if __name__ == "__main__": 6 | m0, m1 = sys.argv[1], sys.argv[2] 7 | m0 = load_file(m0) 8 | m1 = load_file(m1) 9 | 10 | ks = [k for k in m0.keys() if 'bestrq' in k] 11 | for k in ks: 12 | print(k, (m0[k] - m1[k]).abs().sum()) 13 | -------------------------------------------------------------------------------- /codec_evaluation/codecs/xcodec/descriptaudiocodec/dac/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "1.0.0" 2 | 3 | # preserved here for legacy reasons 4 | __model_version__ = "latest" 5 | 6 | import audiotools 7 | 8 | audiotools.ml.BaseModel.INTERN += ["dac.**"] 9 | audiotools.ml.BaseModel.EXTERN += ["einops"] 10 | 11 | 12 | from . import nn 13 | from . 
import model 14 | from .model import DAC 15 | # from .model import DACFile 16 | -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/Flow1dVAE/tools/extract_rvq.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import sys 3 | 4 | if __name__=="__main__": 5 | p = sys.argv[1] 6 | bd = '/'.join(p.split('/')[:-1]) 7 | bn = p.split('/')[-1] 8 | 9 | d = {} 10 | m = torch.load(p, map_location='cpu') 11 | for k in m.keys(): 12 | if('rvq' in k): 13 | d[k] = m[k] 14 | 15 | torch.save(d, '{}/rvq.bin'.format(bd)) -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/Flow1dVAE/tools/safetensor2torch.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from safetensors import safe_open 3 | import torch 4 | 5 | if __name__=="__main__": 6 | inname = sys.argv[1] 7 | outname = sys.argv[2] 8 | 9 | main_weights = {} 10 | with safe_open(inname, framework="pt", device="cpu") as f: 11 | for key in f.keys(): 12 | main_weights[key] = f.get_tensor(key) 13 | 14 | torch.save(main_weights, outname) -------------------------------------------------------------------------------- /codec_evaluation/codecs/config/qwen2audioencoder_preprocessor_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "chunk_length": 30, 3 | "feature_extractor_type": "WhisperFeatureExtractor", 4 | "feature_size": 128, 5 | "hop_length": 160, 6 | "n_fft": 400, 7 | "n_samples": 480000, 8 | "nb_max_frames": 3000, 9 | "padding_side": "right", 10 | "padding_value": 0.0, 11 | "processor_class": "WhisperProcessor", 12 | "return_attention_mask": true, 13 | "sampling_rate": 16000 14 | } 15 | -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/Flow1dVAE/configs/scheduler/stable_diffusion_2.1_largenoise_sample.json: -------------------------------------------------------------------------------- 1 | { 2 | "_class_name": "DDIMScheduler", 3 | "_diffusers_version": "0.8.0", 4 | "beta_end": 0.02, 5 | "beta_schedule": "scaled_linear", 6 | "beta_start": 0.0015, 7 | "clip_sample": false, 8 | "num_train_timesteps": 1000, 9 | "prediction_type": "sample", 10 | "set_alpha_to_one": false, 11 | "skip_prk_steps": true, 12 | "steps_offset": 1, 13 | "trained_betas": null 14 | } 15 | -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/stable_audio_tools/configs/model_configs/dance_diffusion/dance_diffusion_base.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_type": "diffusion_uncond", 3 | "sample_size": 65536, 4 | "sample_rate": 48000, 5 | "model": { 6 | "type": "DAU1d", 7 | "config": { 8 | "n_attn_layers": 5 9 | } 10 | }, 11 | "training": { 12 | "learning_rate": 1e-4, 13 | "demo": { 14 | "demo_every": 2000, 15 | "demo_steps": 250 16 | } 17 | } 18 | } -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/stable_audio_tools/configs/model_configs/dance_diffusion/dance_diffusion_large.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_type": "diffusion_uncond", 3 | "sample_size": 131072, 4 | "sample_rate": 48000, 5 | "model": { 6 | "type": "DAU1d", 7 | "config": { 8 | "n_attn_layers": 5 9 | 
} 10 | }, 11 | "training": { 12 | "learning_rate": 1e-4, 13 | "demo": { 14 | "demo_every": 2000, 15 | "demo_steps": 250 16 | } 17 | } 18 | } -------------------------------------------------------------------------------- /codec_evaluation/codecs/YuE/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | """Torch modules.""" 8 | 9 | # flake8: noqa 10 | from .conv import ( 11 | pad1d, 12 | unpad1d, 13 | NormConv1d, 14 | NormConvTranspose1d, 15 | NormConv2d, 16 | NormConvTranspose2d, 17 | SConv1d, 18 | SConvTranspose1d, 19 | ) 20 | 21 | 22 | -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/stable_audio_tools/configs/model_configs/dance_diffusion/dance_diffusion_base_16k.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_type": "diffusion_uncond", 3 | "sample_size": 65536, 4 | "sample_rate": 16000, 5 | "model": { 6 | "type": "DAU1d", 7 | "config": { 8 | "n_attn_layers": 5 9 | } 10 | }, 11 | "training": { 12 | "learning_rate": 1e-4, 13 | "demo": { 14 | "demo_every": 2000, 15 | "demo_steps": 250 16 | } 17 | } 18 | } -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/stable_audio_tools/configs/model_configs/dance_diffusion/dance_diffusion_base_44k.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_type": "diffusion_uncond", 3 | "sample_size": 65536, 4 | "sample_rate": 44100, 5 | "model": { 6 | "type": "DAU1d", 7 | "config": { 8 | "n_attn_layers": 5 9 | } 10 | }, 11 | "training": { 12 | "learning_rate": 4e-5, 13 | "demo": { 14 | "demo_every": 2000, 15 | "demo_steps": 250 16 | } 17 | } 18 | } -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/Flow1dVAE/our_MERT_BESTRQ/modify_env.md: -------------------------------------------------------------------------------- 1 | cp -r fairseq/fairseq/model_parallel/megatron /opt/conda/envs/map/lib/python3.8/site-packages/fairseq/model_parallel/ 2 | vi /opt/conda/envs/map/lib/python3.8/site-packages/apex/amp/_initialize.py # string_classes = str 3 | vi /opt/conda/envs/map/lib/python3.8/site-packages/fairseq/modules/layer_norm.py 4 | vi /opt/conda/envs/map/lib/python3.8/site-packages/fairseq/distributed/utils.py # import datetime; timeout=datetime.timedelta(seconds=51200); logger.info("add nccl time to 51200") 5 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | beartype==0.20.0 2 | descript-audio-codec==1.0.0 3 | huggingface_hub==0.29.1 4 | speechtokenizer==1.0.1 5 | tensorboard==2.19.0 6 | torch==2.6.0 7 | torchaudio==2.6.0 8 | transformers==4.49.0 9 | vocos==0.1.0 10 | einops==0.8.1 11 | numpy==1.26.4 12 | descript-audiotools>=0.7.2 13 | scipy==1.10.1 14 | torchmetrics==1.4.1 15 | pytorch-lightning==2.4.0 16 | hydra-core==1.3.2 17 | omegaconf==2.3.0 18 | jiwer==3.1.0 19 | conformer==0.3.2 20 | pandas==2.2.3 21 | sentencepiece==0.2.0 22 | pesq==0.0.4 23 | speechbrain==1.0.2 24 | 
-------------------------------------------------------------------------------- /doc/chore.md: -------------------------------------------------------------------------------- 1 | ## Software Packaging and Distribution 2 | 3 | Install `build` and generate the built distribution. 4 | ``` 5 | pip install build 6 | python -m build 7 | ``` 8 | 9 | Install `twine` and upload the .whl and .tar.gz files. 10 | 11 | > To securely upload your project, you'll need a PyPI API token. Create one at https://pypi.org/manage/account/#api-tokens, setting the "Scope" to "Entire account". Don't close the page until you have copied and saved the token — you won't see that token again. 12 | 13 | ``` 14 | pip install twine 15 | twine upload --repository pypi dist/* 16 | ``` 17 | -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/Flow1dVAE/tools/get_1dvae.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from tqdm import tqdm 3 | import torchaudio 4 | from third_party.stable_audio_tools.stable_audio_tools.models.autoencoders import create_autoencoder_from_config 5 | import numpy as np 6 | import os 7 | import json 8 | 9 | def get_model(model_config, path): 10 | with open(model_config) as f: 11 | model_config = json.load(f) 12 | state_dict = torch.load(path) 13 | model = create_autoencoder_from_config(model_config) 14 | model.load_state_dict(state_dict['state_dict']) 15 | return model -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/Flow1dVAE/tools/get_1dvae_1920.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from tqdm import tqdm 3 | import torchaudio 4 | from third_party.stable_audio_tools.stable_audio_tools.models.autoencoders import create_autoencoder_from_config 5 | import numpy as np 6 | import os 7 | import json 8 | 9 | def get_model(model_config, path): 10 | with open(model_config) as f: 11 | model_config = json.load(f) 12 | state_dict = torch.load(path) 13 | model = create_autoencoder_from_config(model_config) 14 | model.load_state_dict(state_dict['state_dict']) 15 | return model -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/Flow1dVAE/tools/get_1dvae_large_melvae.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from tqdm import tqdm 3 | import torchaudio 4 | from third_party.stable_audio_tools.stable_audio_tools.models.autoencoders import create_autoencoder_from_config 5 | import numpy as np 6 | import os 7 | import json 8 | 9 | def get_model(model_config, path): 10 | with open(model_config) as f: 11 | model_config = json.load(f) 12 | state_dict = torch.load(path, map_location='cpu') 13 | model = create_autoencoder_from_config(model_config) 14 | model.load_state_dict(state_dict['state_dict'], strict=False) 15 | return model -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/Flow1dVAE/tools/get_1dvae_large.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from tqdm import tqdm 3 | import torchaudio 4 | from codec_evaluation.codecs.levo_modules.stable_audio_tools.models.autoencoders import create_autoencoder_from_config 5 | import numpy as np 6 | import os 7 | import json 8 | 9 | def get_model(model_config, path): 10 | 
with open(model_config) as f: 11 | model_config = json.load(f) 12 | state_dict = torch.load(path, map_location='cpu') 13 | model = create_autoencoder_from_config(model_config) 14 | model.load_state_dict(state_dict['state_dict'], strict=False) 15 | return model 16 | -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/config/pretrain/run/submitit_reg.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | hydra: 4 | launcher: 5 | cpus_per_task: 8 6 | gpus_per_node: 8 7 | tasks_per_node: ${hydra.launcher.gpus_per_node} 8 | nodes: 4 9 | comment: null 10 | mem_gb: 384 11 | timeout_min: 4320 12 | max_num_timeout: 100 13 | constraint: volta32gb 14 | name: ${hydra.job.config_name}/${hydra.job.override_dirname} 15 | submitit_folder: ${hydra.sweep.dir}/submitit/%j 16 | 17 | distributed_training: 18 | distributed_world_size: 32 19 | distributed_port: 29671 20 | nprocs_per_node: 8 21 | -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/data/eat_data/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | try: 6 | from .mae_image_dataset import MaeImageDataset 7 | from .raw_audio_dataset import FileAudioDataset 8 | except: 9 | import sys, os 10 | sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), '.')) 11 | from mae_image_dataset import MaeImageDataset 12 | from raw_audio_dataset import FileAudioDataset 13 | 14 | __all__ = [ 15 | "MaeImageDataset", 16 | "FileAudioDataset", 17 | ] -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/Flow1dVAE/tools/compare_2models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import sys 3 | 4 | if __name__=="__main__": 5 | m1, m2 = sys.argv[1:3] 6 | m1 = torch.load(m1, map_location = 'cpu') 7 | m2 = torch.load(m2, map_location = 'cpu') 8 | m1_keys = set(m1.keys()) 9 | m2_keys = set(m2.keys()) 10 | 11 | m1_uniq_keys = m1_keys - m2_keys 12 | m2_uniq_keys = m2_keys - m1_keys 13 | m12_shared_keys = m1_keys & m2_keys 14 | 15 | print("m1_uniq_keys: ", m1_uniq_keys) 16 | print("m2_uniq_keys: ", m2_uniq_keys) 17 | print("m12_shared_keys but different: ") 18 | for k in m12_shared_keys: 19 | if(m1[k].numel() != m2[k].numel()): 20 | print(k,m1[k].shape,m2[k].shape) 21 | -------------------------------------------------------------------------------- /codec_evaluation/perplexity/config/ppl_model_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "attention_dropout": 0.0, 3 | "bos_token_id": 151643, 4 | "eos_token_id": 151643, 5 | "hidden_act": "silu", 6 | "hidden_size": 768, 7 | "initializer_range": 0.02, 8 | "intermediate_size": 3072, 9 | "max_position_embeddings": 32768, 10 | "max_window_layers": 24, 11 | "model_type": "qwen2", 12 | "num_attention_heads": 12, 13 | "num_hidden_layers": 10, 14 | "num_key_value_heads": 2, 15 | "rms_norm_eps": 1e-06, 16 | "rope_theta": 1000000.0, 17 | "tie_word_embeddings": true, 18 | "torch_dtype": "bfloat16", 19 | "use_cache": true, 20 | "use_mrope": 
false, 21 | "use_sliding_window": false, 22 | "vocab_size": 151936 23 | } -------------------------------------------------------------------------------- /codec_evaluation/codecs/YuE/modules/norm.py: -------------------------------------------------------------------------------- 1 | """Normalization modules.""" 2 | import typing as tp 3 | import einops 4 | import torch 5 | from torch import nn 6 | 7 | class ConvLayerNorm(nn.LayerNorm): 8 | """ 9 | Convolution-friendly LayerNorm that moves channels to the last dimension 10 | before running the normalization and moves them back to their original position right after. 11 | """ 12 | def __init__(self, normalized_shape: tp.Union[int, tp.List[int], torch.Size], **kwargs): 13 | super().__init__(normalized_shape, **kwargs) 14 | 15 | def forward(self, x): 16 | x = einops.rearrange(x, 'b ... t -> b t ...') 17 | x = super().forward(x) 18 | x = einops.rearrange(x, 'b t ... -> b ... t') 19 | return x 20 | -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/Flow1dVAE/cal_token_stat.py: -------------------------------------------------------------------------------- 1 | import kaldiio 2 | from tqdm import tqdm 3 | import torch 4 | 5 | if __name__ == "__main__": 6 | bar = torch.zeros(1, 16384) 7 | with open('token.scp', 'r') as f: 8 | for item_idx, line in tqdm(enumerate(f)): 9 | idx, pos = line.strip().split() 10 | codes = kaldiio.load_mat(pos) 11 | for i0 in range(codes.shape[-1]): 12 | bar[0, codes[0, 0, i0]] += 1 13 | if(item_idx % 1000 == 0): 14 | print("=========") 15 | print(1 - (bar[0]==0).sum() / bar.shape[-1]) 16 | print("=========") 17 | print("=========") 18 | print(1 - (bar[0]==0).sum() / bar.shape[-1]) 19 | print("=========") -------------------------------------------------------------------------------- /codec_evaluation/codecs/version.py: -------------------------------------------------------------------------------- 1 | # ============================================================================== 2 | # Copyright 2024 Luca Della Libera. All Rights Reserved. 
3 | # ============================================================================== 4 | 5 | """Version according to SemVer versioning system (https://semver.org/).""" 6 | 7 | 8 | __all__ = [ 9 | "VERSION", 10 | ] 11 | 12 | 13 | _MAJOR = "0" # Major version to increment in case of incompatible API changes 14 | 15 | _MINOR = ( 16 | "0" # Minor version to increment in case of backward compatible new functionality 17 | ) 18 | 19 | _PATCH = "1" # Patch version to increment in case of backward compatible bug fixes 20 | 21 | VERSION = f"{_MAJOR}.{_MINOR}.{_PATCH}" 22 | """The package version.""" 23 | -------------------------------------------------------------------------------- /codec_evaluation/utils/schedule.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | 4 | def get_cosine_schedule_with_warmup_lr_lambda( 5 | current_step: int, 6 | *, 7 | num_warmup_steps: int | float, 8 | num_training_steps: int, 9 | num_cycles: float = 0.5, 10 | final_lr_ratio: float = 0.0, 11 | ): 12 | if 0 < num_warmup_steps < 1: # float mode 13 | num_warmup_steps = int(num_warmup_steps * num_training_steps) 14 | 15 | if current_step < num_warmup_steps: 16 | return float(current_step) / float(max(1, num_warmup_steps)) 17 | 18 | progress = float(current_step - num_warmup_steps) / float( 19 | max(1, num_training_steps - num_warmup_steps) 20 | ) 21 | 22 | return max( 23 | final_lr_ratio, 24 | 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress)), 25 | ) -------------------------------------------------------------------------------- /codec_evaluation/codecs/YuE/descriptaudiocodec/dac/nn/layers.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.nn.utils import weight_norm 4 | 5 | def WNConv1d(*args, **kwargs): 6 | return weight_norm(nn.Conv1d(*args, **kwargs)) 7 | 8 | def WNConvTranspose1d(*args, **kwargs): 9 | return weight_norm(nn.ConvTranspose1d(*args, **kwargs)) 10 | 11 | # Scripting this brings model speed up 1.4x 12 | @torch.jit.script 13 | def snake(x, alpha): 14 | shape = x.shape 15 | x = x.reshape(shape[0], shape[1], -1) 16 | x = x + (alpha + 1e-9).reciprocal() * torch.sin(alpha * x).pow(2) 17 | x = x.reshape(shape) 18 | return x 19 | 20 | class Snake1d(nn.Module): 21 | def __init__(self, channels): 22 | super().__init__() 23 | self.alpha = nn.Parameter(torch.ones(1, channels, 1)) 24 | 25 | def forward(self, x): 26 | return snake(x, self.alpha) 27 | -------------------------------------------------------------------------------- /codec_evaluation/codecs/xcodec/descriptaudiocodec/dac/nn/layers.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.nn.utils import weight_norm 4 | 5 | def WNConv1d(*args, **kwargs): 6 | return weight_norm(nn.Conv1d(*args, **kwargs)) 7 | 8 | def WNConvTranspose1d(*args, **kwargs): 9 | return weight_norm(nn.ConvTranspose1d(*args, **kwargs)) 10 | 11 | # Scripting this brings model speed up 1.4x 12 | @torch.jit.script 13 | def snake(x, alpha): 14 | shape = x.shape 15 | x = x.reshape(shape[0], shape[1], -1) 16 | x = x + (alpha + 1e-9).reciprocal() * torch.sin(alpha * x).pow(2) 17 | x = x.reshape(shape) 18 | return x 19 | 20 | class Snake1d(nn.Module): 21 | def __init__(self, channels): 22 | super().__init__() 23 | self.alpha = nn.Parameter(torch.ones(1, channels, 1)) 24 | 25 | def forward(self, x): 26 | return snake(x, self.alpha) 27 | 
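Note on `utils/schedule.py` above: the function returns the learning-rate multiplier for a single step, so it is meant to be bound with `functools.partial` and handed to `torch.optim.lr_scheduler.LambdaLR`. A minimal wiring sketch, with a placeholder model, base learning rate, and step counts:

```python
from functools import partial

import torch

from codec_evaluation.utils.schedule import get_cosine_schedule_with_warmup_lr_lambda

model = torch.nn.Linear(8, 8)  # placeholder model
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

lr_lambda = partial(
    get_cosine_schedule_with_warmup_lr_lambda,
    num_warmup_steps=0.1,        # a float in (0, 1) is a fraction of num_training_steps
    num_training_steps=10_000,
    final_lr_ratio=0.05,         # cosine decay is floored at 5% of the base lr
)
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)

for step in range(10_000):
    optimizer.step()             # forward/backward omitted in this sketch
    scheduler.step()             # base lr is multiplied by lr_lambda(current step)
```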
-------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/Flow1dVAE/configs/models/transformer2D_wocross_inch112_1x4_multi_large.json: -------------------------------------------------------------------------------- 1 | { 2 | "_class_name": "Transformer2DModel", 3 | "_diffusers_version": "0.22.0.dev0", 4 | "activation_fn": "gelu-approximate", 5 | "attention_bias": true, 6 | "attention_head_dim": 72, 7 | "attention_type": "default", 8 | "cross_attention_dim": null, 9 | "double_self_attention": false, 10 | "dropout": 0.0, 11 | "in_channels": 96, 12 | "norm_elementwise_affine": false, 13 | "norm_eps": 1e-06, 14 | "norm_num_groups": 32, 15 | "norm_type": "ada_norm_single", 16 | "num_attention_heads": 22, 17 | "num_embeds_ada_norm": 1000, 18 | "num_layers": 24, 19 | "num_vector_embeds": null, 20 | "only_cross_attention": false, 21 | "out_channels": 32, 22 | "patch_size": 2, 23 | "sample_size": 384, 24 | "upcast_attention": false, 25 | "use_linear_projection": false 26 | } -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/Flow1dVAE/tools/get_whisper_encoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import WhisperProcessor, WhisperForConditionalGeneration 3 | 4 | def get_whisper_encoder(): 5 | processor = WhisperProcessor.from_pretrained("openai/whisper-large-v3") 6 | model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v3").model.encoder 7 | return processor, model.eval() 8 | 9 | if __name__=="__main__": 10 | import numpy as np 11 | processor, model = get_whisper_encoder() 12 | model = model.cuda() 13 | 14 | with torch.no_grad(): 15 | input_features = processor(np.random.rand(16000*30,), sampling_rate=16000, return_tensors="pt").input_features.cuda() 16 | print(input_features.shape) 17 | out = model(input_features.repeat(10,1,1)) 18 | import pdb;pdb.set_trace() 19 | print(list(out.values())[0].shape) 20 | -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/stable_audio_tools/models/pretrained.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from .factory import create_model_from_config 4 | from .utils import load_ckpt_state_dict 5 | 6 | from huggingface_hub import hf_hub_download 7 | 8 | def get_pretrained_model(name: str): 9 | 10 | model_config_path = hf_hub_download(name, filename="model_config.json", repo_type='model') 11 | 12 | with open(model_config_path) as f: 13 | model_config = json.load(f) 14 | 15 | model = create_model_from_config(model_config) 16 | 17 | # Try to download the model.safetensors file first, if it doesn't exist, download the model.ckpt file 18 | try: 19 | model_ckpt_path = hf_hub_download(name, filename="model.safetensors", repo_type='model') 20 | except Exception as e: 21 | model_ckpt_path = hf_hub_download(name, filename="model.ckpt", repo_type='model') 22 | 23 | model.load_state_dict(load_ckpt_state_dict(model_ckpt_path)) 24 | 25 | return model, model_config -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/Flow1dVAE/tools/transmodelnorm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | if __name__=="__main__": 4 | src_ckpt = 
'saved/train_mulan_v3_48k_everything3/latest/pytorch_model_2.bin' 5 | tgt_ckpt = 'saved/train_mulan_v3_48k_everything3_sepnorm/src_pytorch_model_2.bin' 6 | # src_ckpt = 'saved/train_enhcodec2D_again/latest/pytorch_model_3.bin' 7 | # tgt_ckpt = 'saved/train_enhcodec2D_again_sepnorm/pytorch_model_3.bin' 8 | 9 | ckpt = torch.load(src_ckpt, map_location='cpu') 10 | 11 | ckpt['normfeat.sum_x'] = torch.ones(16, 32, dtype=ckpt['normfeat.sum_x'].dtype) * ckpt['normfeat.sum_x'] / ckpt['normfeat.counts'] 12 | ckpt['normfeat.sum_x2'] = torch.ones(16, 32, dtype=ckpt['normfeat.sum_x2'].dtype) * ckpt['normfeat.sum_x2'] / ckpt['normfeat.counts'] 13 | ckpt['normfeat.sum_target_x2'] = torch.ones(16, 32, dtype=ckpt['normfeat.sum_target_x2'].dtype) * ckpt['normfeat.sum_target_x2'] / ckpt['normfeat.counts'] 14 | ckpt['normfeat.counts'] = torch.ones_like(ckpt['normfeat.counts']) 15 | torch.save(ckpt, tgt_ckpt) 16 | -------------------------------------------------------------------------------- /codec_evaluation/codecs/YuE/RepCodec/repcodec/modules/residual_unit.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | from codec_evaluation.codecs.YuE.RepCodec.repcodec.layers.conv_layer import Conv1d, Conv1d1x1 3 | 4 | class ResidualUnit(nn.Module): 5 | def __init__( 6 | self, 7 | in_channels: int, 8 | out_channels: int, 9 | kernel_size=3, 10 | dilation=1, 11 | bias=False, 12 | nonlinear_activation="ELU", 13 | nonlinear_activation_params={}, 14 | ): 15 | super().__init__() 16 | self.activation = getattr(nn, nonlinear_activation)(**nonlinear_activation_params) 17 | self.conv1 = Conv1d( 18 | in_channels=in_channels, 19 | out_channels=out_channels, 20 | kernel_size=kernel_size, 21 | stride=1, 22 | dilation=dilation, 23 | bias=bias, 24 | ) 25 | self.conv2 = Conv1d1x1(out_channels, out_channels, bias) 26 | 27 | def forward(self, x): 28 | y = self.conv1(self.activation(x)) 29 | y = self.conv2(self.activation(y)) 30 | return x + y 31 | -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/stable_audio_tools/inference/utils.py: -------------------------------------------------------------------------------- 1 | from ..data.utils import PadCrop 2 | 3 | from torchaudio import transforms as T 4 | 5 | def set_audio_channels(audio, target_channels): 6 | if target_channels == 1: 7 | # Convert to mono 8 | audio = audio.mean(1, keepdim=True) 9 | elif target_channels == 2: 10 | # Convert to stereo 11 | if audio.shape[1] == 1: 12 | audio = audio.repeat(1, 2, 1) 13 | elif audio.shape[1] > 2: 14 | audio = audio[:, :2, :] 15 | return audio 16 | 17 | def prepare_audio(audio, in_sr, target_sr, target_length, target_channels, device): 18 | 19 | audio = audio.to(device) 20 | 21 | if in_sr != target_sr: 22 | resample_tf = T.Resample(in_sr, target_sr).to(device) 23 | audio = resample_tf(audio) 24 | 25 | audio = PadCrop(target_length, randomize=False)(audio) 26 | 27 | # Add batch dimension 28 | if audio.dim() == 1: 29 | audio = audio.unsqueeze(0).unsqueeze(0) 30 | elif audio.dim() == 2: 31 | audio = audio.unsqueeze(0) 32 | 33 | audio = set_audio_channels(audio, target_channels) 34 | 35 | return audio -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/Flow1dVAE/models_gpt/models/tokenizer/pinyin/symbols.py: -------------------------------------------------------------------------------- 1 | _pause = ["sil", "eos", "sp", "#0", "#1", "#2", "#3"] 2 | 3 | 
_initials = [ 4 | "^", 5 | "b", 6 | "c", 7 | "ch", 8 | "d", 9 | "f", 10 | "g", 11 | "h", 12 | "j", 13 | "k", 14 | "l", 15 | "m", 16 | "n", 17 | "p", 18 | "q", 19 | "r", 20 | "s", 21 | "sh", 22 | "t", 23 | "x", 24 | "z", 25 | "zh", 26 | ] 27 | 28 | _tones = ["1", "2", "3", "4", "5"] 29 | 30 | _finals = [ 31 | "a", 32 | "ai", 33 | "an", 34 | "ang", 35 | "ao", 36 | "e", 37 | "ei", 38 | "en", 39 | "eng", 40 | "er", 41 | "i", 42 | "ia", 43 | "ian", 44 | "iang", 45 | "iao", 46 | "ie", 47 | "ii", 48 | "iii", 49 | "in", 50 | "ing", 51 | "iong", 52 | "iou", 53 | "o", 54 | "ong", 55 | "ou", 56 | "u", 57 | "ua", 58 | "uai", 59 | "uan", 60 | "uang", 61 | "uei", 62 | "uen", 63 | "ueng", 64 | "uo", 65 | "v", 66 | "van", 67 | "ve", 68 | "vn", 69 | ] 70 | 71 | symbols = _pause + _initials + [i + j for i in _finals for j in _tones] 72 | -------------------------------------------------------------------------------- /codec_evaluation/utils/demucs/models/pretrained.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | @File : pretrained.py 5 | @Time : 2023/8/8 7:22 PM 6 | @Author : waytan 7 | @Contact : waytan@tencent.com 8 | @License : (C)Copyright 2023, Tencent 9 | @Desc : Loading pretrained models. 10 | """ 11 | from pathlib import Path 12 | 13 | import yaml 14 | 15 | from .apply import BagOfModels 16 | from .htdemucs import HTDemucs 17 | from .states import load_state_dict 18 | 19 | 20 | def add_model_flags(parser): 21 | group = parser.add_mutually_exclusive_group(required=False) 22 | group.add_argument("-s", "--sig", help="Locally trained XP signature.") 23 | group.add_argument("-n", "--name", default=None, 24 | help="Pretrained model name or signature. Default is htdemucs.") 25 | parser.add_argument("--repo", type=Path, 26 | help="Folder containing all pre-trained models for use with -n.") 27 | 28 | 29 | def get_model_from_yaml(yaml_file, model_file): 30 | bag = yaml.safe_load(open(yaml_file)) 31 | model = load_state_dict(HTDemucs, model_file) 32 | weights = bag.get('weights') 33 | segment = bag.get('segment') 34 | return BagOfModels([model], weights, segment) 35 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | from pathlib import Path 3 | 4 | 5 | requirements_path = Path(__file__).parent / "requirements.txt" 6 | readme_path = Path(__file__).parent / "README.md" 7 | 8 | # Read requirements.txt 9 | install_requires = [] 10 | with open(requirements_path, encoding="utf-8") as f: 11 | install_requires.extend([item for item in f.read().splitlines() if item.strip()]) 12 | 13 | setup( 14 | name="codec_evaluation", 15 | version="0.1.0", 16 | packages=find_packages(), 17 | install_requires=install_requires, 18 | description="A benchmark for codec evaluation", 19 | long_description=readme_path.read_text(encoding="utf-8") 20 | if readme_path.exists() 21 | else "", 22 | long_description_content_type="text/markdown", 23 | python_requires=">=3.10", 24 | entry_points={ 25 | "console_scripts": [ 26 | "codec_eval_probe = codec_evaluation.probe.train.train_inference:cli", 27 | "codec_eval_id_sensitive = codec_evaluation.id_sensitive.eval:cli", 28 | "codec_eval_reconstruction_speech = codec_evaluation.reconstruction_eval.reconstruction_speech_eval:cli", 29 | "codec_eval_reconstruction_music = 
codec_evaluation.reconstruction_eval.reconstruction_music_eval:cli", 30 | "codec_eval_ppl = codec_evaluation.perplexity.train_inference:cli", 31 | ] 32 | }, 33 | ) 34 | -------------------------------------------------------------------------------- /codec_evaluation/utils/demucs/models/spec.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | @File : spec.py 5 | @Time : 2023/8/8 5:10 PM 6 | @Author : waytan 7 | @Contact : waytan@tencent.com 8 | @License : (C)Copyright 2023, Tencent 9 | @Desc : Spec 10 | """ 11 | 12 | import torch as th 13 | 14 | 15 | def spectro(x, n_fft=512, hop_length=None, pad=0): 16 | *other, length = x.shape 17 | x = x.reshape(-1, length) 18 | is_mps = x.device.type == 'mps' 19 | if is_mps: 20 | x = x.cpu() 21 | z = th.stft(x, 22 | n_fft * (1 + pad), 23 | hop_length or n_fft // 4, 24 | window=th.hann_window(n_fft).to(x), 25 | win_length=n_fft, 26 | normalized=True, 27 | center=True, 28 | return_complex=True, 29 | pad_mode='reflect') 30 | _, freqs, frame = z.shape 31 | return z.view(*other, freqs, frame) 32 | 33 | 34 | def ispectro(z, hop_length=None, length=None, pad=0): 35 | *other, freqs, frames = z.shape 36 | n_fft = 2 * freqs - 2 37 | z = z.view(-1, freqs, frames) 38 | win_length = n_fft // (1 + pad) 39 | is_mps = z.device.type == 'mps' 40 | if is_mps: 41 | z = z.cpu() 42 | x = th.istft(z, 43 | n_fft, 44 | hop_length, 45 | window=th.hann_window(win_length).to(z.real), 46 | win_length=win_length, 47 | normalized=True, 48 | length=length, 49 | center=True) 50 | _, length = x.shape 51 | return x.view(*other, length) 52 | -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/Flow1dVAE/our_MERT_BESTRQ/test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import torch 4 | from dataclasses import dataclass 5 | from logging import getLogger 6 | import fairseq.utils 7 | from fairseq.checkpoint_utils import load_model_ensemble_and_task 8 | 9 | logger = getLogger(__name__) 10 | 11 | @dataclass 12 | class UserDirModule: 13 | user_dir: str 14 | 15 | def find_project_root(current_path: str, target_folder: str = "Codec-Evaluation"): 16 | path = os.path.abspath(current_path) 17 | while True: 18 | if os.path.basename(path) == target_folder: 19 | return path 20 | parent = os.path.dirname(path) 21 | if parent == path: 22 | raise FileNotFoundError(f"Cannot find project root folder '{target_folder}' from {current_path}") 23 | path = parent 24 | 25 | def load_model(model_dir, checkpoint_dir): 26 | '''Load Fairseq SSL model''' 27 | project_root = find_project_root(os.path.dirname(__file__), target_folder="Codec-Evaluation") 28 | mert_path = os.path.join(project_root, "codec_evaluation", "codecs", model_dir) 29 | # model_dir is already the full path to mert_fairseq 30 | mert_path = os.path.abspath(mert_path) 31 | 32 | if not os.path.exists(mert_path): 33 | raise FileNotFoundError(f"Cannot find mert_fairseq in {mert_path}") 34 | 35 | # add to sys.path 36 | if mert_path not in sys.path: 37 | sys.path.insert(0, mert_path) 38 | 39 | # import_user_module 40 | module_args = UserDirModule(user_dir=mert_path) 41 | fairseq.utils.import_user_module(module_args) 42 | 43 | # load the checkpoint 44 | model, cfg, task = load_model_ensemble_and_task([checkpoint_dir], strict=False) 45 | model = model[0] 46 | 47 | return model 48 | 49 | 
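Aside on `utils/demucs/models/spec.py` earlier in this listing: with a Hann window, a hop of `n_fft // 4`, and `normalized=True` on both sides, `ispectro(spectro(x))` should reconstruct the input up to numerical error. A small round-trip sketch with arbitrary test shapes:

```python
import torch

from codec_evaluation.utils.demucs.models.spec import spectro, ispectro

x = torch.randn(2, 2, 44100)                      # (batch, channels, time), arbitrary
z = spectro(x, n_fft=512)                         # complex STFT, default hop = 512 // 4
y = ispectro(z, hop_length=512 // 4, length=x.shape[-1])
print(torch.allclose(x, y, atol=1e-5))            # expected: True
```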
-------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/Flow1dVAE/extract_codes_stereo_7_1x4.py: -------------------------------------------------------------------------------- 1 | import torch,torchaudio 2 | import os,sys,json 3 | from tqdm import tqdm 4 | 5 | #from codeclm_song_v1.codeclm.semantic_extractor.SpeechDecoder_v01.generate import Tango 6 | from generate_4rvq import Tango 7 | import kaldiio 8 | from kaldiio import WriteHelper 9 | 10 | if __name__ == "__main__": 11 | # Define Model 12 | json_path = sys.argv[1] 13 | outdir = sys.argv[2] 14 | 15 | mus_infos = [] 16 | with open(json_path) as f: 17 | for line in f: 18 | item = json.loads(line) 19 | mus_infos.append(item) 20 | 21 | tango = Tango(model_path = './saved/model_4rvq/model_2_fixed.safetensors', rvq_num=4) 22 | 23 | 24 | # Feature extraction loop 25 | # for i in tqdm(range(2000)): 26 | with WriteHelper('ark,scp:{}/token.ark,{}/token.scp'.format(outdir, outdir), write_function="pickle") as writer: 27 | print('ark,scp:{}/token.ark,{}/token.scp'.format(outdir, outdir)) 28 | for item in tqdm(mus_infos): 29 | try: 30 | # if True: 31 | idx = item['idx'] 32 | # print(idx) 33 | with torch.autocast(device_type="cuda", dtype=torch.float16): 34 | if(os.path.exists(item['path'])): 35 | codes = tango.file2code(item['path']) 36 | else: 37 | codes = tango.file2code('/mnt/share/' + item['path']) 38 | writer(str(idx), codes.cpu()) 39 | except: 40 | print(item['path']) 41 | continue 42 | # idx = item['idx'] 43 | # # print(idx) 44 | # with torch.autocast(device_type="cuda", dtype=torch.float16): 45 | # codes = tango.file2code(item['path']) 46 | # writer(str(idx), codes.cpu()) -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/Flow1dVAE/tools/mix.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def a_weight(fs, n_fft, min_db=-80.0): 5 | freq = np.linspace(0, fs // 2, n_fft // 2 + 1) 6 | freq_sq = np.power(freq, 2) 7 | freq_sq[0] = 1.0 8 | weight = 2.0 + 20.0 * (2 * np.log10(12194) + 2 * np.log10(freq_sq) 9 | - np.log10(freq_sq + 12194 ** 2) 10 | - np.log10(freq_sq + 20.6 ** 2) 11 | - 0.5 * np.log10(freq_sq + 107.7 ** 2) 12 | - 0.5 * np.log10(freq_sq + 737.9 ** 2)) 13 | weight = np.maximum(weight, min_db) 14 | 15 | return weight 16 | 17 | 18 | def compute_gain(sound, fs, min_db=-80.0, mode="A_weighting"): 19 | if fs == 16000: 20 | n_fft = 2048 21 | elif fs == 44100: 22 | n_fft = 4096 23 | else: 24 | raise Exception("Invalid fs {}".format(fs)) 25 | stride = n_fft // 2 26 | 27 | gain = [] 28 | for i in range(0, len(sound) - n_fft + 1, stride): 29 | if mode == "RMSE": 30 | g = np.mean(sound[i: i + n_fft] ** 2) 31 | elif mode == "A_weighting": 32 | spec = np.fft.rfft(np.hanning(n_fft + 1)[:-1] * sound[i: i + n_fft]) 33 | power_spec = np.abs(spec) ** 2 34 | a_weighted_spec = power_spec * np.power(10, a_weight(fs, n_fft) / 10) 35 | g = np.sum(a_weighted_spec) 36 | else: 37 | raise Exception("Invalid mode {}".format(mode)) 38 | gain.append(g) 39 | 40 | gain = np.array(gain) 41 | gain = np.maximum(gain, np.power(10, min_db / 10)) 42 | gain_db = 10 * np.log10(gain) 43 | return gain_db 44 | 45 | 46 | def mix(sound1, sound2, r, fs): 47 | gain1 = np.max(compute_gain(sound1, fs)) # Decibel 48 | gain2 = np.max(compute_gain(sound2, fs)) 49 | t = 1.0 / (1 + np.power(10, (gain1 - gain2) / 20.) 
* (1 - r) / r) 50 | sound = ((sound1 * t + sound2 * (1 - t)) / np.sqrt(t ** 2 + (1 - t) ** 2)) 51 | return sound -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/Flow1dVAE/tools/check_stereo.py: -------------------------------------------------------------------------------- 1 | ''' 2 | TEMPLATE = { 3 | "path": "" 4 | "duration": "" 5 | "sample_rate": "" 6 | "amplitude": null, 7 | "weight": null, 8 | "info_path": null 9 | } 10 | ''' 11 | import torchaudio 12 | import json 13 | from tqdm import tqdm 14 | 15 | import torchaudio 16 | import numpy as np 17 | import torch, torch.nn as nn, random 18 | from torchaudio import transforms 19 | import os 20 | import argparse 21 | from tqdm import tqdm 22 | import torchaudio 23 | from torchaudio.transforms import Resample 24 | from multiprocessing import Pool 25 | 26 | def preprocess(args, wav_json, thread_id): 27 | # f = open("pretrain_tme_20230927.scp").readlines() 28 | f = open("out.{}".format(thread_id), 'w') 29 | for line in tqdm(wav_json): 30 | try: 31 | # import pdb; pdb.set_trace() 32 | line = line.strip() 33 | wav_info = json.loads(line) 34 | meta = torchaudio.info(wav_info["path"]) 35 | 36 | wav_info["num_channels"] = meta.num_channels 37 | json_string = json.dumps(wav_info) 38 | # print(json_string) 39 | f.write("{}\n".format(json_string)) 40 | except: 41 | print(line) 42 | 43 | if __name__ == "__main__": 44 | 45 | parser = argparse.ArgumentParser(description='Deep Speaker Embedding Inference') 46 | parser.add_argument('--wav_json', type=str) 47 | parser.add_argument('--num_thread', default=10, type=int, help='number of worker processes') 48 | args = parser.parse_args() 49 | 50 | wav_json_total = open(args.wav_json).readlines() 51 | args.num_thread = min(len(wav_json_total), args.num_thread) 52 | wav_json_list = np.array_split(wav_json_total, args.num_thread) 53 | 54 | p = Pool(args.num_thread) 55 | for thread_id, wav_json in enumerate(wav_json_list): 56 | r = p.apply_async(preprocess, (args, wav_json, thread_id)) 57 | p.close() 58 | p.join() 59 | r.get() 60 | -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/Flow1dVAE/tools/infer_encodec.py: -------------------------------------------------------------------------------- 1 | import json 2 | import torch 3 | from tqdm import tqdm 4 | from audiocraft.models.loaders import load_compression_model 5 | import torchaudio 6 | import librosa 7 | import os 8 | import math 9 | import numpy as np 10 | 11 | class Tango: 12 | def __init__(self, \ 13 | device="cuda:0"): 14 | 15 | self.sample_rate = 48000 16 | self.rsp48to32 = torchaudio.transforms.Resample(48000, 32000).to(device) 17 | self.rsp32to48 = torchaudio.transforms.Resample(32000, 48000).to(device) 18 | 19 | encodec = load_compression_model('compression_state_dict.bin', device='cpu').eval() 20 | encodec.set_num_codebooks(1) 21 | self.encodec = encodec.eval().to(device) 22 | self.device = torch.device(device) 23 | print ("Successfully loaded encodec model") 24 | 25 | @torch.no_grad() 26 | def remix(self, filename, start_step=1000, steps=999, disable_progress=False): 27 | """ Generate audio without condition. 
""" 28 | init_audio, _ = librosa.load(filename, sr=self.sample_rate, mono=False) 29 | if(len(init_audio.shape)>1):init_audio = init_audio[0] 30 | init_audio = torch.from_numpy(init_audio)[None,None,:].to(self.device) 31 | init_audio = init_audio[:,:,int(0*self.sample_rate):int(10.24*3*self.sample_rate)] 32 | if(init_audio.shape[-1]1):init_audio = init_audio[0] 33 | init_audio = torch.from_numpy(init_audio)[None,None,:].to(self.device) 34 | init_audio = init_audio[:,:,0:int(10.24*2*self.sample_rate)] 35 | if(init_audio.shape[-1] None: 16 | """Initializes a multi-GPU-friendly python command line logger that logs on all processes 17 | with their rank prefixed in the log message. 18 | 19 | :param name: The name of the logger. Default is ``__name__``. 20 | :param rank_zero_only: Whether to force all logs to only occur on the rank zero process. Default is `False`. 21 | :param extra: (Optional) A dict-like object which provides contextual information. See `logging.LoggerAdapter`. 22 | """ 23 | logger = logging.getLogger(name) 24 | super().__init__(logger=logger, extra=extra) 25 | self.rank_zero_only = rank_zero_only 26 | 27 | def log( 28 | self, level: int, msg: str, rank: Optional[int] = None, *args, **kwargs 29 | ) -> None: 30 | """Delegate a log call to the underlying logger, after prefixing its message with the rank 31 | of the process it's being logged from. If `'rank'` is provided, then the log will only 32 | occur on that rank/process. 33 | 34 | :param level: The level to log at. Look at `logging.__init__.py` for more information. 35 | :param msg: The message to log. 36 | :param rank: The rank to log at. 37 | :param args: Additional args to pass to the underlying logging function. 38 | :param kwargs: Any additional keyword args to pass to the underlying logging function. 39 | """ 40 | if self.isEnabledFor(level): 41 | msg, kwargs = self.process(msg, kwargs) 42 | current_rank = getattr(rank_zero_only, "rank", None) 43 | if current_rank is None: 44 | raise RuntimeError( 45 | "The `rank_zero_only.rank` needs to be set before use" 46 | ) 47 | msg = rank_prefixed_message(msg, current_rank) 48 | if self.rank_zero_only: 49 | if current_rank == 0: 50 | self.logger.log(level, msg, *args, **kwargs) 51 | else: 52 | if rank is None: 53 | self.logger.log(level, msg, *args, **kwargs) 54 | elif current_rank == rank: 55 | self.logger.log(level, msg, *args, **kwargs) -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/config/pretrain/MusicFM_95M_speech_multinodes.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | common: 3 | fp16: false 4 | log_format: json 5 | log_interval: 200 6 | seed: 1337 7 | # tensorboard_logdir: tblog_proj_name 8 | # wandb_project: wandb_proj_name 9 | 10 | checkpoint: 11 | save_interval_updates: 2500 12 | keep_interval_updates: 10000 13 | no_epoch_checkpoints: true 14 | 15 | 16 | distributed_training: 17 | ddp_backend: no_c10d 18 | distributed_backend: 'nccl' 19 | distributed_world_size: 64 20 | nprocs_per_node: 8 21 | find_unused_parameters: true 22 | 23 | task: 24 | _name: mert_pretraining 25 | data: ??? 26 | label_dir: ??? 27 | labels: ??? 
28 |   label_rate: ${model.label_rate}
29 |   sample_rate: 24000
30 |   # # crop to 5s
31 |   # max_sample_size: 120000
32 |   # min_sample_size: 72000
33 | 
34 |   # crop to 30s
35 |   max_sample_size: 720000
36 |   min_sample_size: 12000
37 |   # clip_secs: 30
38 | 
39 |   pad_audio: false
40 |   random_crop: true
41 |   normalize: false # must be consistent with extractor
42 | 
43 | 
44 | dataset:
45 |   num_workers: 6
46 |   max_tokens: 2000000
47 |   skip_invalid_size_inputs_valid_test: true
48 |   validate_interval: 1
49 |   validate_interval_updates: 10000
50 |   disable_validation: true
51 | 
52 | criterion:
53 |   _name: model
54 |   # log_keys:
55 |   #   - accuracies
56 | 
57 | optimization:
58 |   max_update: 400000
59 |   lr: [0.0005]
60 |   clip_norm: 10.0
61 |   update_freq: [1]
62 | 
63 | optimizer:
64 |   _name: adam
65 |   adam_betas: (0.9,0.98)
66 |   adam_eps: 1e-06
67 |   weight_decay: 0.01
68 | 
69 | lr_scheduler:
70 |   _name: polynomial_decay
71 |   warmup_updates: 32000
72 | 
73 | model:
74 |   _name: musicfm
75 |   label_rate: 25
76 |   num_codebooks: 1
77 |   codebook_dim: 16
78 |   codebook_size: 4096
79 |   features: ["melspec_2048"]
80 |   hop_length: 240
81 |   n_mels: 128
82 |   conv_dim: 512
83 |   encoder_dim: 1024
84 |   encoder_depth: 12
85 |   mask_hop: 0.4
86 |   mask_prob: 0.6
87 |   is_flash: false
88 |   stat_path: msd_stats.json
89 |   model_path: null
90 |   w2v2_config_path: models--facebook--wav2vec2-conformer-rope-large-960h-ft/snapshots/6b36ef01c6443c67ae7ed0822876d091ab50e4aa
91 | 
92 | hydra:
93 |   job:
94 |     config:
95 |       override_dirname:
96 |         kv_sep: '-'
97 |         item_sep: '__'
98 |         exclude_keys:
99 |           - run
100 |           - task.data
101 |           - task.label_dir
102 |   run:
103 |     dir: ???
104 |   sweep:
105 |     dir: ???
106 |     subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
107 | 
-------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/Flow1dVAE/tools/infer_bsrnnvae441k.py: --------------------------------------------------------------------------------
1 | import json
2 | import torch
3 | from tqdm import tqdm
4 | import torchaudio
5 | import librosa
6 | import os
7 | import math
8 | import numpy as np
9 | from tools.get_bsrnnvae import get_bsrnnvae
10 | import tools.torch_tools as torch_tools
11 | 
12 | class Tango:
13 |     def __init__(self, \
14 |                  device="cuda:0"):
15 | 
16 |         self.sample_rate = 44100
17 |         self.device = device
18 | 
19 |         self.vae = get_bsrnnvae()
20 |         self.vae = self.vae.eval().to(device)
21 | 
22 |     def sound2sound_generate_longterm(self, fname, batch_size=1, duration=15.36, steps=200, disable_progress=False):
23 |         """ Generate audio without condition. """
24 |         num_frames = math.ceil(duration * 100. / 8)
25 |         with torch.no_grad():
26 |             orig_samples, fs = torchaudio.load(fname)
27 |             if(fs!=44100):
28 |                 orig_samples = torchaudio.functional.resample(orig_samples, fs, 44100)
29 |                 fs = 44100
30 |             if(orig_samples.shape[-1] 25_000):
33 |             print("GPU memory {}, run matrix cal".format(free_mem))
34 |             break
35 |         else:
36 |             print("GPU memory {}, sleep 1min".format(free_mem))
37 |             time.sleep(60)
38 | 
39 |     mus_infos = []
40 |     with open(json_path) as f:
41 |         for line in f:
42 |             item = json.loads(line)
43 |             mus_infos.append(item)
44 | 
45 |     tango = Tango(model_path = './saved/model_2rvq/model_2_fixed.safetensors', rvq_num=2)
46 | 
47 | 
48 |     # Feature extraction loop
49 |     with WriteHelper('ark,scp:{}/token.ark,{}/token.scp'.format(outdir, outdir), write_function="pickle") as writer:
50 |         print('ark,scp:{}/token.ark,{}/token.scp'.format(outdir, outdir))
51 |         for item in tqdm(mus_infos):
52 |             try:
53 |                 idx = item['idx']
54 |                 with torch.autocast(device_type="cuda", dtype=torch.float16):
55 |                     if(os.path.exists(item['path'])):
56 |                         codes = tango.file2code(item['path'])
57 |                     else:
58 |                         codes = tango.file2code('/mnt/share/' + item['path'])
59 |                 writer(str(idx), codes.cpu())
60 |             except Exception:
61 |                 # log the failing file and keep going
62 |                 print(item['path'])
63 |                 continue
-------------------------------------------------------------------------------- /codec_evaluation/probe/config/Common_Voice_dataset/wavtokenizer.yaml: --------------------------------------------------------------------------------
1 | mode: encode
2 | sample_rate: 48000
3 | probe_ckpt_dir: ???
4 | seed: 666
5 | codec_name: wavtokenizer
6 | 
7 | trainer:
8 |   _target_: pytorch_lightning.Trainer
9 |   accelerator: gpu
10 |   devices: ???
11 |   precision: 32
12 |   max_epochs: 10
13 |   log_every_n_steps: 20
14 |   val_check_interval: 1.0
15 |   limit_val_batches: 5
16 | 
17 | data:
18 |   _target_: codec_evaluation.probe.dataset.Common_Voice_dataset.Common_voice_dataset.Common_voice_module
19 |   target_samplerate: ${sample_rate}
20 |   train_audio_dir: ???
21 |   val_audio_dir: ???
22 |   test_audio_dir: ???
23 |   base_audio_dir: /root/path/for/audio
24 |   train_batch_size: 4
25 |   val_batch_size: 4
26 |   test_batch_size: 4
27 |   train_num_workers: 4
28 |   val_num_workers: 1
29 |   test_num_workers: 1
30 | 
31 | model:
32 |   _target_: codec_evaluation.probe.model.ctc_lit_prober.CodecCTCProbe
33 |   codec_name: ${codec_name}
34 |   sample_rate: ${sample_rate}
35 |   mode: ${mode}
36 |   tokenizer:
37 |     _target_: transformers.Speech2TextProcessor.from_pretrained
38 |     pretrained_model_name_or_path: ???
39 |   probe_model_builder:
40 |     _target_: codec_evaluation.probe.model.ctc_model.Ctc_Probe
41 |     _partial_: true
42 |     vocab_size: 10000
43 |     codec_vocab_size: 4096
44 |     dropout: 0.1
45 |     lm_head_nums: 1
46 |     conformer_depth: 3
47 |     conformer_heads: 8
48 |   model_ckpt_dir: ???
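# NOTE (usage sketch, not part of the original config): every `???` above is an
# OmegaConf mandatory-value placeholder; resolution fails unless it is supplied,
# typically as a Hydra command-line override. Assuming a Hydra entry point for
# the prober (the module path below is hypothetical), a launch could look like:
#   python -m codec_evaluation.probe.train --config-name wavtokenizer \
#     probe_ckpt_dir=/tmp/probe_ckpts trainer.devices=[0] \
#     data.train_audio_dir=/data/cv/train data.val_audio_dir=/data/cv/dev \
#     data.test_audio_dir=/data/cv/test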
49 | 50 | optimizer_builder: 51 | _target_: torch.optim.AdamW 52 | _partial_: true 53 | lr: 1e-4 54 | betas: [0.8, 0.99] 55 | eps: 1e-5 56 | weight_decay: 0.08 57 | 58 | lr_scheduler_builder: 59 | _target_: torch.optim.lr_scheduler.LambdaLR 60 | _partial_: true 61 | lr_lambda: 62 | _target_: codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda 63 | _partial_: true 64 | num_warmup_steps: 200 65 | num_training_steps: 4000 66 | final_lr_ratio: 0.2 67 | 68 | callbacks: 69 | learning_rate_monitor: 70 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 71 | logging_interval: step 72 | 73 | rich_progress_bar: 74 | _target_: pytorch_lightning.callbacks.RichProgressBar 75 | 76 | model_summary: 77 | _target_: pytorch_lightning.callbacks.ModelSummary 78 | max_depth: 1 79 | 80 | model_checkpoint: 81 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 82 | monitor: val_loss 83 | dirpath: ${probe_ckpt_dir} 84 | every_n_epochs: 1 85 | mode: min 86 | save_top_k: 1 87 | filename: ${codec_name}_${mode}_{epoch}-{val_loss:.4f} 88 | verbose: True 89 | 90 | tensorboard: 91 | _target_: pytorch_lightning.loggers.TensorBoardLogger 92 | save_dir: ??? 93 | name: ${codec_name}_${mode} 94 | log_graph: true -------------------------------------------------------------------------------- /codec_evaluation/probe/config/Common_Voice_dataset/semanticodec.yaml: -------------------------------------------------------------------------------- 1 | mode: encode 2 | sample_rate: 48000 3 | probe_ckpt_dir: ??? 4 | seed: 666 5 | codec_name: semanticodec 6 | 7 | trainer: 8 | _target_: pytorch_lightning.Trainer 9 | accelerator: gpu 10 | devices: ??? 11 | precision: 32 12 | max_epochs: 10 13 | log_every_n_steps: 20 14 | val_check_interval: 1.0 15 | limit_val_batches: 5 16 | 17 | data: 18 | _target_: codec_evaluation.probe.dataset.Common_Voice_dataset.Common_voice_dataset.Common_voice_module 19 | target_samplerate: ${sample_rate} 20 | train_audio_dir: ??? 21 | val_audio_dir: ??? 22 | test_audio_dir: ??? 23 | base_audio_dir: /root/path/for/audio 24 | train_batch_size: 4 25 | val_batch_size: 4 26 | test_batch_size: 4 27 | train_num_workers: 4 28 | val_num_workers: 1 29 | test_num_workers: 1 30 | 31 | model: 32 | _target_: codec_evaluation.probe.model.ctc_lit_prober.CodecCTCProbe 33 | codec_name: ${codec_name} 34 | sample_rate: ${sample_rate} 35 | mode: ${mode} 36 | tokenizer: 37 | _target_: transformers.Speech2TextProcessor.from_pretrained 38 | pretrained_model_name_or_path: ??? 39 | probe_model_builder: 40 | _target_: codec_evaluation.probe.model.ctc_model.Ctc_Probe 41 | _partial_: true 42 | vocab_size: 10000 43 | codec_vocab_size: 8192 44 | dropout: 0.1 45 | lm_head_nums: 2 46 | conformer_depth: 3 47 | conformer_heads: 8 48 | model_ckpt_dir: ??? 
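# NOTE (added explanation, not in the original): relative to the wavtokenizer
# probe above, this config sets lm_head_nums: 2 and codec_vocab_size: 8192,
# which presumably mirrors SemantiCodec's two parallel token streams (a
# semantic and an acoustic codebook), with one CTC head per stream.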
49 | 50 | optimizer_builder: 51 | _target_: torch.optim.AdamW 52 | _partial_: true 53 | lr: 1e-4 54 | betas: [0.8, 0.99] 55 | eps: 1e-5 56 | weight_decay: 0.08 57 | 58 | lr_scheduler_builder: 59 | _target_: torch.optim.lr_scheduler.LambdaLR 60 | _partial_: true 61 | lr_lambda: 62 | _target_: codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda 63 | _partial_: true 64 | num_warmup_steps: 200 65 | num_training_steps: 4000 66 | final_lr_ratio: 0.2 67 | 68 | callbacks: 69 | learning_rate_monitor: 70 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 71 | logging_interval: step 72 | 73 | rich_progress_bar: 74 | _target_: pytorch_lightning.callbacks.RichProgressBar 75 | 76 | model_summary: 77 | _target_: pytorch_lightning.callbacks.ModelSummary 78 | max_depth: 1 79 | 80 | model_checkpoint: 81 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 82 | monitor: val_loss 83 | dirpath: ${probe_ckpt_dir} 84 | every_n_epochs: 1 85 | mode: min 86 | save_top_k: 1 87 | filename: ${codec_name}_${mode}_{epoch}-{val_loss:.4f} 88 | verbose: True 89 | 90 | tensorboard: 91 | _target_: pytorch_lightning.loggers.TensorBoardLogger 92 | save_dir: ??? 93 | name: ${codec_name}_${mode} 94 | log_graph: true 95 | -------------------------------------------------------------------------------- /codec_evaluation/probe/config/Common_Voice_dataset/speechtokenizer.yaml: -------------------------------------------------------------------------------- 1 | mode: encode 2 | sample_rate: 48000 3 | probe_ckpt_dir: ??? 4 | seed: 666 5 | codec_name: speechtokenizer 6 | 7 | trainer: 8 | _target_: pytorch_lightning.Trainer 9 | accelerator: gpu 10 | devices: ??? 11 | precision: 32 12 | max_epochs: 10 13 | log_every_n_steps: 20 14 | val_check_interval: 1.0 15 | limit_val_batches: 5 16 | 17 | data: 18 | _target_: codec_evaluation.probe.dataset.Common_Voice_dataset.Common_voice_dataset.Common_voice_module 19 | target_samplerate: ${sample_rate} 20 | train_audio_dir: ??? 21 | val_audio_dir: ??? 22 | test_audio_dir: ??? 23 | base_audio_dir: /root/path/for/audio 24 | train_batch_size: 4 25 | val_batch_size: 4 26 | test_batch_size: 4 27 | train_num_workers: 4 28 | val_num_workers: 1 29 | test_num_workers: 1 30 | 31 | model: 32 | _target_: codec_evaluation.probe.model.ctc_lit_prober.CodecCTCProbe 33 | codec_name: ${codec_name} 34 | sample_rate: ${sample_rate} 35 | mode: ${mode} 36 | tokenizer: 37 | _target_: transformers.Speech2TextProcessor.from_pretrained 38 | pretrained_model_name_or_path: ??? 39 | probe_model_builder: 40 | _target_: codec_evaluation.probe.model.ctc_model.Ctc_Probe 41 | _partial_: true 42 | vocab_size: 10000 43 | codec_vocab_size: 1024 44 | dropout: 0.1 45 | lm_head_nums: 8 46 | conformer_depth: 3 47 | conformer_heads: 8 48 | model_ckpt_dir: ??? 
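# NOTE (added explanation, not in the original): lm_head_nums: 8 with
# codec_vocab_size: 1024 matches SpeechTokenizer's eight RVQ levels of 1024
# entries each, so the probe appears to attach one CTC head per quantizer level.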
49 | 50 | optimizer_builder: 51 | _target_: torch.optim.AdamW 52 | _partial_: true 53 | lr: 1e-4 54 | betas: [0.8, 0.99] 55 | eps: 1e-5 56 | weight_decay: 0.08 57 | 58 | lr_scheduler_builder: 59 | _target_: torch.optim.lr_scheduler.LambdaLR 60 | _partial_: true 61 | lr_lambda: 62 | _target_: codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda 63 | _partial_: true 64 | num_warmup_steps: 200 65 | num_training_steps: 4000 66 | final_lr_ratio: 0.2 67 | 68 | callbacks: 69 | learning_rate_monitor: 70 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 71 | logging_interval: step 72 | 73 | rich_progress_bar: 74 | _target_: pytorch_lightning.callbacks.RichProgressBar 75 | 76 | model_summary: 77 | _target_: pytorch_lightning.callbacks.ModelSummary 78 | max_depth: 1 79 | 80 | model_checkpoint: 81 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 82 | monitor: val_loss 83 | dirpath: ${probe_ckpt_dir} 84 | every_n_epochs: 1 85 | mode: min 86 | save_top_k: 1 87 | filename: ${codec_name}_${mode}_{epoch}-{val_loss:.4f} 88 | verbose: True 89 | 90 | tensorboard: 91 | _target_: pytorch_lightning.loggers.TensorBoardLogger 92 | save_dir: ??? 93 | name: ${codec_name}_${mode} 94 | log_graph: true -------------------------------------------------------------------------------- /codec_evaluation/probe/config/MTT_dataset/encodec.yaml: -------------------------------------------------------------------------------- 1 | mode: quantized_emb 2 | sample_rate: 16000 3 | target_sec: 10 4 | num_outputs: 50 5 | probe_ckpt_dir: ??? 6 | seed: 666 7 | codec_name: encodec 8 | task: multilabel 9 | save_result: null 10 | 11 | trainer: 12 | _target_: pytorch_lightning.Trainer 13 | accelerator: gpu 14 | devices: ??? 15 | precision: 32 16 | max_epochs: 50 17 | limit_val_batches: 10 18 | log_every_n_steps: 5 19 | val_check_interval: 1.0 20 | 21 | data: 22 | _target_: codec_evaluation.probe.dataset.MTT_dataset.MTT_dataset.MTTdataModule 23 | dataset_args: 24 | sample_rate: ${sample_rate} 25 | target_sec: ${target_sec} 26 | train_audio_dir: ??? 27 | val_audio_dir: ??? 28 | test_audio_dir: ??? 29 | train_batch_size: 64 30 | val_batch_size: 8 31 | test_batch_size: 64 32 | train_num_workers: 16 33 | val_num_workers: 4 34 | test_num_workers: 16 35 | 36 | model: 37 | _target_: codec_evaluation.probe.model.lit_prober.Prober 38 | codec_name: ${codec_name} 39 | sample_rate: ${sample_rate} 40 | mode: ${mode} 41 | task: ${task} 42 | num_outputs: ${num_outputs} 43 | probe_model_builder: 44 | _target_: codec_evaluation.probe.model.multilabel_model.MultilabelProber 45 | _partial_: true 46 | num_outputs: ${num_outputs} 47 | drop_out: 0.2 48 | channel_reduction: 16 49 | padding: 1 50 | kernel_size: 3 51 | stride: 1 52 | target_sec: ${target_sec} 53 | model_ckpt_dir: ??? 
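# NOTE (minimal sketch, not part of the original config; the real implementation
# lives in codec_evaluation.utils.schedule): the lr_lambda below is expected to
# scale the base lr linearly during warmup, then decay along a cosine that
# bottoms out at final_lr_ratio, roughly:
#   def lr_lambda(step):
#       if step < num_warmup_steps:
#           return step / max(1, num_warmup_steps)
#       progress = (step - num_warmup_steps) / max(1, num_training_steps - num_warmup_steps)
#       return final_lr_ratio + (1 - final_lr_ratio) * 0.5 * (1 + math.cos(math.pi * progress))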
54 | 55 | optimizer_builder: 56 | _target_: torch.optim.AdamW 57 | _partial_: true 58 | lr: 1e-4 59 | betas: [0.8, 0.99] 60 | eps: 1e-5 61 | weight_decay: 0.08 62 | 63 | lr_scheduler_builder: 64 | _target_: torch.optim.lr_scheduler.LambdaLR 65 | _partial_: true 66 | lr_lambda: 67 | _target_: codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda 68 | _partial_: true 69 | num_warmup_steps: 10 70 | num_training_steps: 10000 71 | final_lr_ratio: 0.2 72 | 73 | callbacks: 74 | learning_rate_monitor: 75 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 76 | logging_interval: step 77 | 78 | rich_progress_bar: 79 | _target_: pytorch_lightning.callbacks.RichProgressBar 80 | 81 | model_summary: 82 | _target_: pytorch_lightning.callbacks.ModelSummary 83 | max_depth: 1 84 | 85 | model_checkpoint: 86 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 87 | monitor: val_loss 88 | dirpath: ${probe_ckpt_dir} 89 | every_n_epochs: 1 90 | mode: min 91 | save_top_k: 1 92 | filename: ${codec_name}_${mode}_{epoch}-{val_loss:.4f} 93 | verbose: True 94 | 95 | tensorboard: 96 | _target_: pytorch_lightning.loggers.TensorBoardLogger 97 | save_dir: ??? 98 | name: ${codec_name}_${mode} 99 | log_graph: true -------------------------------------------------------------------------------- /codec_evaluation/codecs/YuE/RepCodec/repcodec/layers/conv_layer.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | 3 | class Conv1d1x1(nn.Conv1d): 4 | """1x1 Conv1d.""" 5 | 6 | def __init__(self, in_channels, out_channels, bias=True): 7 | super(Conv1d1x1, self).__init__(in_channels, out_channels, kernel_size=1, bias=bias) 8 | 9 | class Conv1d(nn.Module): 10 | def __init__( 11 | self, 12 | in_channels: int, 13 | out_channels: int, 14 | kernel_size: int, 15 | stride: int = 1, 16 | padding: int = -1, 17 | dilation: int = 1, 18 | groups: int = 1, 19 | bias: bool = True 20 | ): 21 | super().__init__() 22 | self.in_channels = in_channels 23 | self.out_channels = out_channels 24 | self.kernel_size = kernel_size 25 | if padding < 0: 26 | padding = (kernel_size - 1) // 2 * dilation 27 | self.dilation = dilation 28 | self.conv = nn.Conv1d( 29 | in_channels=in_channels, 30 | out_channels=out_channels, 31 | kernel_size=kernel_size, 32 | stride=stride, 33 | padding=padding, 34 | dilation=dilation, 35 | groups=groups, 36 | bias=bias, 37 | ) 38 | 39 | def forward(self, x): 40 | """ 41 | Args: 42 | x (Tensor): Float tensor variable with the shape (B, C, T). 43 | Returns: 44 | Tensor: Float tensor variable with the shape (B, C, T). 45 | """ 46 | x = self.conv(x) 47 | return x 48 | 49 | 50 | class ConvTranspose1d(nn.Module): 51 | def __init__( 52 | self, 53 | in_channels: int, 54 | out_channels: int, 55 | kernel_size: int, 56 | stride: int, 57 | padding=-1, 58 | output_padding=-1, 59 | groups=1, 60 | bias=True, 61 | ): 62 | super().__init__() 63 | if padding < 0: 64 | padding = (stride + 1) // 2 65 | if output_padding < 0: 66 | output_padding = 1 if stride % 2 else 0 67 | self.deconv = nn.ConvTranspose1d( 68 | in_channels=in_channels, 69 | out_channels=out_channels, 70 | kernel_size=kernel_size, 71 | stride=stride, 72 | padding=padding, 73 | output_padding=output_padding, 74 | groups=groups, 75 | bias=bias, 76 | ) 77 | 78 | def forward(self, x): 79 | """ 80 | Args: 81 | x (Tensor): Float tensor variable with the shape (B, C, T). 82 | Returns: 83 | Tensor: Float tensor variable with the shape (B, C', T'). 
84 | """ 85 | x = self.deconv(x) 86 | return x 87 | -------------------------------------------------------------------------------- /codec_evaluation/probe/config/NSynthI_dataset/dac.yaml: -------------------------------------------------------------------------------- 1 | mode: quantized_emb 2 | sample_rate: 16000 3 | target_sec: 4 4 | num_outputs: 11 5 | probe_ckpt_dir: ??? 6 | seed: 666 7 | codec_name: dac 8 | task: multiclass 9 | save_result: null 10 | 11 | trainer: 12 | _target_: pytorch_lightning.Trainer 13 | accelerator: gpu 14 | devices: ??? 15 | precision: 32 16 | max_epochs: 100 17 | limit_val_batches: 10 18 | log_every_n_steps: 5 19 | val_check_interval: 1.0 20 | 21 | data: 22 | _target_: codec_evaluation.probe.dataset.NSynthI_dataset.NSynthI_dataset.NSynthIdataModule 23 | dataset_args: 24 | sample_rate: ${sample_rate} 25 | target_sec: ${target_sec} 26 | train_audio_dir: ??? 27 | val_audio_dir: ??? 28 | test_audio_dir: ??? 29 | train_batch_size: 256 30 | val_batch_size: 32 31 | test_batch_size: 256 32 | train_num_workers: 8 33 | val_num_workers: 4 34 | test_num_workers: 4 35 | 36 | model: 37 | _target_: codec_evaluation.probe.model.lit_prober.Prober 38 | codec_name: ${codec_name} 39 | sample_rate: ${sample_rate} 40 | mode: ${mode} 41 | task: ${task} 42 | num_outputs: ${num_outputs} 43 | probe_model_builder: 44 | _target_: codec_evaluation.probe.model.multiclass_model.MulticlassProber 45 | _partial_: true 46 | num_outputs: ${num_outputs} 47 | drop_out: 0.1 48 | channel_reduction: 16 49 | padding: 1 50 | kernel_size: 3 51 | stride: 1 52 | target_sec: ${target_sec} 53 | model_ckpt_dir: ??? 54 | 55 | optimizer_builder: 56 | _target_: torch.optim.AdamW 57 | _partial_: true 58 | lr: 1e-4 59 | betas: [0.8, 0.99] 60 | eps: 1e-5 61 | weight_decay: 0.08 62 | 63 | lr_scheduler_builder: 64 | _target_: torch.optim.lr_scheduler.LambdaLR 65 | _partial_: true 66 | lr_lambda: 67 | _target_: codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda 68 | _partial_: true 69 | num_warmup_steps: 10 70 | num_training_steps: 10000 71 | final_lr_ratio: 0.2 72 | 73 | callbacks: 74 | learning_rate_monitor: 75 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 76 | logging_interval: step 77 | 78 | rich_progress_bar: 79 | _target_: pytorch_lightning.callbacks.RichProgressBar 80 | 81 | model_summary: 82 | _target_: pytorch_lightning.callbacks.ModelSummary 83 | max_depth: 1 84 | 85 | model_checkpoint: 86 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 87 | monitor: val_loss 88 | dirpath: ${probe_ckpt_dir} 89 | every_n_epochs: 1 90 | mode: min 91 | save_top_k: 1 92 | filename: ${codec_name}_${mode}_{epoch}-{val_loss:.4f} 93 | verbose: True 94 | 95 | tensorboard: 96 | _target_: pytorch_lightning.loggers.TensorBoardLogger 97 | save_dir: ??? 98 | name: ${codec_name}_${mode} 99 | log_graph: true -------------------------------------------------------------------------------- /codec_evaluation/probe/config/NSynthP_dataset/dac.yaml: -------------------------------------------------------------------------------- 1 | mode: quantized_emb 2 | sample_rate: 16000 3 | target_sec: 4 4 | num_outputs: 128 5 | probe_ckpt_dir: ??? 6 | seed: 666 7 | codec_name: dac 8 | task: multiclass 9 | save_result: null 10 | 11 | trainer: 12 | _target_: pytorch_lightning.Trainer 13 | accelerator: gpu 14 | devices: ??? 
15 | precision: 32 16 | max_epochs: 100 17 | limit_val_batches: 10 18 | log_every_n_steps: 5 19 | val_check_interval: 1.0 20 | 21 | data: 22 | _target_: codec_evaluation.probe.dataset.NSynthP_dataset.NSynthP_dataset.NSynthPdataModule 23 | dataset_args: 24 | sample_rate: ${sample_rate} 25 | target_sec: ${target_sec} 26 | train_audio_dir: ??? 27 | val_audio_dir: ??? 28 | test_audio_dir: ??? 29 | train_batch_size: 256 30 | val_batch_size: 32 31 | test_batch_size: 128 32 | train_num_workers: 128 33 | val_num_workers: 8 34 | test_num_workers: 64 35 | 36 | model: 37 | _target_: codec_evaluation.probe.model.lit_prober.Prober 38 | codec_name: ${codec_name} 39 | sample_rate: ${sample_rate} 40 | mode: ${mode} 41 | task: ${task} 42 | num_outputs: ${num_outputs} 43 | probe_model_builder: 44 | _target_: codec_evaluation.probe.model.multiclass_model.MulticlassProber 45 | _partial_: true 46 | num_outputs: ${num_outputs} 47 | drop_out: 0.1 48 | channel_reduction: 16 49 | padding: 1 50 | kernel_size: 3 51 | stride: 1 52 | target_sec: ${target_sec} 53 | model_ckpt_dir: ??? 54 | 55 | optimizer_builder: 56 | _target_: torch.optim.AdamW 57 | _partial_: true 58 | lr: 1e-4 59 | betas: [0.8, 0.99] 60 | eps: 1e-5 61 | weight_decay: 0.08 62 | 63 | lr_scheduler_builder: 64 | _target_: torch.optim.lr_scheduler.LambdaLR 65 | _partial_: true 66 | lr_lambda: 67 | _target_: codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda 68 | _partial_: true 69 | num_warmup_steps: 10 70 | num_training_steps: 10000 71 | final_lr_ratio: 0.2 72 | 73 | callbacks: 74 | learning_rate_monitor: 75 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 76 | logging_interval: step 77 | 78 | rich_progress_bar: 79 | _target_: pytorch_lightning.callbacks.RichProgressBar 80 | 81 | model_summary: 82 | _target_: pytorch_lightning.callbacks.ModelSummary 83 | max_depth: 1 84 | 85 | model_checkpoint: 86 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 87 | monitor: val_loss 88 | dirpath: ${probe_ckpt_dir} 89 | every_n_epochs: 1 90 | mode: min 91 | save_top_k: 1 92 | filename: ${codec_name}_${mode}_{epoch}-{val_loss:.4f} 93 | verbose: True 94 | 95 | tensorboard: 96 | _target_: pytorch_lightning.loggers.TensorBoardLogger 97 | save_dir: ??? 98 | name: ${codec_name}_${mode} 99 | log_graph: true -------------------------------------------------------------------------------- /codec_evaluation/probe/config/GTZAN_dataset/wavtokenizer.yaml: -------------------------------------------------------------------------------- 1 | mode: quantized_emb 2 | sample_rate: 22050 3 | target_sec: 10 4 | num_outputs: 10 5 | probe_ckpt_dir: ??? 6 | seed: 666 7 | codec_name: wavtokenizer 8 | task: multiclass 9 | 10 | trainer: 11 | _target_: pytorch_lightning.Trainer 12 | accelerator: gpu 13 | devices: ??? 14 | precision: 32 15 | max_epochs: 100 16 | limit_val_batches: 10 17 | log_every_n_steps: 5 18 | val_check_interval: 1.0 19 | 20 | data: 21 | _target_: codec_evaluation.probe.dataset.GTZAN_dataset.GTZAN_dataset.GTZANdataModule 22 | dataset_args: 23 | sample_rate: ${sample_rate} 24 | target_sec: ${target_sec} 25 | train_audio_dir: ??? 26 | val_audio_dir: ??? 27 | test_audio_dir: ??? 
28 | train_batch_size: 8 29 | val_batch_size: 1 30 | test_batch_size: 8 31 | train_num_workers: 8 32 | val_num_workers: 4 33 | test_num_workers: 4 34 | 35 | model: 36 | _target_: codec_evaluation.probe.model.lit_prober.Prober 37 | codec_name: ${codec_name} 38 | sample_rate: ${sample_rate} 39 | mode: ${mode} 40 | task: ${task} 41 | num_outputs: ${num_outputs} 42 | probe_model_builder: 43 | _target_: codec_evaluation.probe.model.multiclass_model.MulticlassProber 44 | _partial_: true 45 | num_outputs: ${num_outputs} 46 | drop_out: 0.1 47 | channel_reduction: 16 48 | padding: 1 49 | kernel_size: 3 50 | stride: 1 51 | target_sec: ${target_sec} 52 | model_ckpt_dir: ??? 53 | 54 | optimizer_builder: 55 | _target_: torch.optim.AdamW 56 | _partial_: true 57 | lr: 1e-4 58 | betas: [0.8, 0.99] 59 | eps: 1e-5 60 | weight_decay: 0.08 61 | 62 | lr_scheduler_builder: 63 | _target_: torch.optim.lr_scheduler.LambdaLR 64 | _partial_: true 65 | lr_lambda: 66 | _target_: codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda 67 | _partial_: true 68 | num_warmup_steps: 10 69 | num_training_steps: 10000 70 | final_lr_ratio: 0.2 71 | 72 | callbacks: 73 | learning_rate_monitor: 74 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 75 | logging_interval: step 76 | 77 | rich_progress_bar: 78 | _target_: pytorch_lightning.callbacks.RichProgressBar 79 | 80 | model_summary: 81 | _target_: pytorch_lightning.callbacks.ModelSummary 82 | max_depth: 1 83 | 84 | model_checkpoint: 85 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 86 | monitor: val_loss 87 | dirpath: ${probe_ckpt_dir} 88 | every_n_epochs: 1 89 | mode: min 90 | save_top_k: 1 91 | save_last: False 92 | filename: ${codec_name}_${mode}_{epoch}-{val_loss:.4f} 93 | verbose: True 94 | 95 | tensorboard: 96 | _target_: pytorch_lightning.loggers.TensorBoardLogger 97 | save_dir: ??? 98 | name: ${codec_name}_${mode} 99 | log_graph: true -------------------------------------------------------------------------------- /codec_evaluation/probe/config/NSynthI_dataset/encodec.yaml: -------------------------------------------------------------------------------- 1 | mode: quantized_emb 2 | sample_rate: 16000 3 | target_sec: 4 4 | num_outputs: 11 5 | probe_ckpt_dir: ??? 6 | seed: 666 7 | codec_name: encodec 8 | task: multiclass 9 | save_result: null 10 | 11 | trainer: 12 | _target_: pytorch_lightning.Trainer 13 | accelerator: gpu 14 | devices: ??? 15 | precision: 32 16 | max_epochs: 100 17 | limit_val_batches: 10 18 | log_every_n_steps: 5 19 | val_check_interval: 1.0 20 | 21 | data: 22 | _target_: codec_evaluation.probe.dataset.NSynthI_dataset.NSynthI_dataset.NSynthIdataModule 23 | dataset_args: 24 | sample_rate: ${sample_rate} 25 | target_sec: ${target_sec} 26 | train_audio_dir: ??? 27 | val_audio_dir: ??? 28 | test_audio_dir: ??? 29 | train_batch_size: 256 30 | val_batch_size: 32 31 | test_batch_size: 256 32 | train_num_workers: 8 33 | val_num_workers: 4 34 | test_num_workers: 4 35 | 36 | model: 37 | _target_: codec_evaluation.probe.model.lit_prober.Prober 38 | codec_name: ${codec_name} 39 | sample_rate: ${sample_rate} 40 | mode: ${mode} 41 | task: ${task} 42 | num_outputs: ${num_outputs} 43 | probe_model_builder: 44 | _target_: codec_evaluation.probe.model.multiclass_model.MulticlassProber 45 | _partial_: true 46 | num_outputs: ${num_outputs} 47 | drop_out: 0.1 48 | channel_reduction: 16 49 | padding: 1 50 | kernel_size: 3 51 | stride: 1 52 | target_sec: ${target_sec} 53 | model_ckpt_dir: ??? 
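# NOTE (added explanation, not in the original): mode: quantized_emb suggests the
# prober consumes the codec's post-quantization embeddings rather than discrete
# token ids, and num_outputs: 11 corresponds to NSynth's 11 instrument-family
# classes.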
54 | 55 | optimizer_builder: 56 | _target_: torch.optim.AdamW 57 | _partial_: true 58 | lr: 1e-4 59 | betas: [0.8, 0.99] 60 | eps: 1e-5 61 | weight_decay: 0.08 62 | 63 | lr_scheduler_builder: 64 | _target_: torch.optim.lr_scheduler.LambdaLR 65 | _partial_: true 66 | lr_lambda: 67 | _target_: codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda 68 | _partial_: true 69 | num_warmup_steps: 10 70 | num_training_steps: 10000 71 | final_lr_ratio: 0.2 72 | 73 | callbacks: 74 | learning_rate_monitor: 75 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 76 | logging_interval: step 77 | 78 | rich_progress_bar: 79 | _target_: pytorch_lightning.callbacks.RichProgressBar 80 | 81 | model_summary: 82 | _target_: pytorch_lightning.callbacks.ModelSummary 83 | max_depth: 1 84 | 85 | model_checkpoint: 86 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 87 | monitor: val_loss 88 | dirpath: ${probe_ckpt_dir} 89 | every_n_epochs: 1 90 | mode: min 91 | save_top_k: 1 92 | filename: ${codec_name}_${mode}_{epoch}-{val_loss:.4f} 93 | verbose: True 94 | 95 | tensorboard: 96 | _target_: pytorch_lightning.loggers.TensorBoardLogger 97 | save_dir: ??? 98 | name: ${codec_name}_${mode} 99 | log_graph: true -------------------------------------------------------------------------------- /codec_evaluation/probe/config/NSynthI_dataset/semanticodec.yaml: -------------------------------------------------------------------------------- 1 | mode: quantized_emb 2 | sample_rate: 16000 3 | target_sec: 4 4 | num_outputs: 11 5 | probe_ckpt_dir: ??? 6 | seed: 666 7 | codec_name: semanticodec 8 | task: multiclass 9 | save_result: null 10 | 11 | trainer: 12 | _target_: pytorch_lightning.Trainer 13 | accelerator: gpu 14 | devices: ??? 15 | precision: 32 16 | max_epochs: 100 17 | limit_val_batches: 10 18 | log_every_n_steps: 5 19 | val_check_interval: 1.0 20 | 21 | data: 22 | _target_: codec_evaluation.probe.dataset.NSynthI_dataset.NSynthI_dataset.NSynthIdataModule 23 | dataset_args: 24 | sample_rate: ${sample_rate} 25 | target_sec: ${target_sec} 26 | train_audio_dir: ??? 27 | val_audio_dir: ??? 28 | test_audio_dir: ??? 29 | train_batch_size: 256 30 | val_batch_size: 32 31 | test_batch_size: 256 32 | train_num_workers: 8 33 | val_num_workers: 4 34 | test_num_workers: 4 35 | 36 | model: 37 | _target_: codec_evaluation.probe.model.lit_prober.Prober 38 | codec_name: ${codec_name} 39 | sample_rate: ${sample_rate} 40 | mode: ${mode} 41 | task: ${task} 42 | num_outputs: ${num_outputs} 43 | probe_model_builder: 44 | _target_: codec_evaluation.probe.model.multiclass_model.MulticlassProber 45 | _partial_: true 46 | num_outputs: ${num_outputs} 47 | drop_out: 0.1 48 | channel_reduction: 16 49 | padding: 1 50 | kernel_size: 3 51 | stride: 1 52 | target_sec: ${target_sec} 53 | model_ckpt_dir: ??? 
54 | 55 | optimizer_builder: 56 | _target_: torch.optim.AdamW 57 | _partial_: true 58 | lr: 1e-4 59 | betas: [0.8, 0.99] 60 | eps: 1e-5 61 | weight_decay: 0.08 62 | 63 | lr_scheduler_builder: 64 | _target_: torch.optim.lr_scheduler.LambdaLR 65 | _partial_: true 66 | lr_lambda: 67 | _target_: codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda 68 | _partial_: true 69 | num_warmup_steps: 10 70 | num_training_steps: 10000 71 | final_lr_ratio: 0.2 72 | 73 | callbacks: 74 | learning_rate_monitor: 75 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 76 | logging_interval: step 77 | 78 | rich_progress_bar: 79 | _target_: pytorch_lightning.callbacks.RichProgressBar 80 | 81 | model_summary: 82 | _target_: pytorch_lightning.callbacks.ModelSummary 83 | max_depth: 1 84 | 85 | model_checkpoint: 86 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 87 | monitor: val_loss 88 | dirpath: ${probe_ckpt_dir} 89 | every_n_epochs: 1 90 | mode: min 91 | save_top_k: 1 92 | filename: ${codec_name}_${mode}_{epoch}-{val_loss:.4f} 93 | verbose: True 94 | 95 | tensorboard: 96 | _target_: pytorch_lightning.loggers.TensorBoardLogger 97 | save_dir: ??? 98 | name: ${codec_name}_${mode} 99 | log_graph: true -------------------------------------------------------------------------------- /codec_evaluation/probe/config/NSynthI_dataset/wavtokenizer.yaml: -------------------------------------------------------------------------------- 1 | mode: quantized_emb 2 | sample_rate: 16000 3 | target_sec: 4 4 | num_outputs: 11 5 | probe_ckpt_dir: ??? 6 | seed: 666 7 | codec_name: wavtokenizer 8 | task: multiclass 9 | save_result: null 10 | 11 | trainer: 12 | _target_: pytorch_lightning.Trainer 13 | accelerator: gpu 14 | devices: ??? 15 | precision: 32 16 | max_epochs: 100 17 | limit_val_batches: 10 18 | log_every_n_steps: 5 19 | val_check_interval: 1.0 20 | 21 | data: 22 | _target_: codec_evaluation.probe.dataset.NSynthI_dataset.NSynthI_dataset.NSynthIdataModule 23 | dataset_args: 24 | sample_rate: ${sample_rate} 25 | target_sec: ${target_sec} 26 | train_audio_dir: ??? 27 | val_audio_dir: ??? 28 | test_audio_dir: ??? 29 | train_batch_size: 256 30 | val_batch_size: 32 31 | test_batch_size: 256 32 | train_num_workers: 8 33 | val_num_workers: 4 34 | test_num_workers: 4 35 | 36 | model: 37 | _target_: codec_evaluation.probe.model.lit_prober.Prober 38 | codec_name: ${codec_name} 39 | sample_rate: ${sample_rate} 40 | mode: ${mode} 41 | task: ${task} 42 | num_outputs: ${num_outputs} 43 | probe_model_builder: 44 | _target_: codec_evaluation.probe.model.multiclass_model.MulticlassProber 45 | _partial_: true 46 | num_outputs: ${num_outputs} 47 | drop_out: 0. 48 | channel_reduction: 16 49 | padding: 1 50 | kernel_size: 3 51 | stride: 1 52 | target_sec: ${target_sec} 53 | model_ckpt_dir: ??? 
54 | 55 | optimizer_builder: 56 | _target_: torch.optim.AdamW 57 | _partial_: true 58 | lr: 1e-4 59 | betas: [0.8, 0.99] 60 | eps: 1e-5 61 | weight_decay: 0.08 62 | 63 | lr_scheduler_builder: 64 | _target_: torch.optim.lr_scheduler.LambdaLR 65 | _partial_: true 66 | lr_lambda: 67 | _target_: codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda 68 | _partial_: true 69 | num_warmup_steps: 10 70 | num_training_steps: 10000 71 | final_lr_ratio: 0.2 72 | 73 | callbacks: 74 | learning_rate_monitor: 75 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 76 | logging_interval: step 77 | 78 | rich_progress_bar: 79 | _target_: pytorch_lightning.callbacks.RichProgressBar 80 | 81 | model_summary: 82 | _target_: pytorch_lightning.callbacks.ModelSummary 83 | max_depth: 1 84 | 85 | model_checkpoint: 86 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 87 | monitor: val_loss 88 | dirpath: ${probe_ckpt_dir} 89 | every_n_epochs: 1 90 | mode: min 91 | save_top_k: 1 92 | filename: ${codec_name}_${mode}_{epoch}-{val_loss:.4f} 93 | verbose: True 94 | 95 | tensorboard: 96 | _target_: pytorch_lightning.loggers.TensorBoardLogger 97 | save_dir: ??? 98 | name: ${codec_name}_${mode} 99 | log_graph: true -------------------------------------------------------------------------------- /codec_evaluation/probe/config/NSynthP_dataset/semanticodec.yaml: -------------------------------------------------------------------------------- 1 | mode: quantized_emb 2 | sample_rate: 16000 3 | target_sec: 4 4 | num_outputs: 128 5 | probe_ckpt_dir: ??? 6 | seed: 666 7 | codec_name: semanticodec 8 | task: multiclass 9 | save_result: null 10 | 11 | trainer: 12 | _target_: pytorch_lightning.Trainer 13 | accelerator: gpu 14 | devices: ??? 15 | precision: 32 16 | max_epochs: 100 17 | limit_val_batches: 10 18 | log_every_n_steps: 5 19 | val_check_interval: 1.0 20 | 21 | data: 22 | _target_: codec_evaluation.probe.dataset.NSynthP_dataset.NSynthP_dataset.NSynthPdataModule 23 | dataset_args: 24 | sample_rate: ${sample_rate} 25 | target_sec: ${target_sec} 26 | train_audio_dir: ??? 27 | val_audio_dir: ??? 28 | test_audio_dir: ??? 29 | train_batch_size: 256 30 | val_batch_size: 32 31 | test_batch_size: 128 32 | train_num_workers: 128 33 | val_num_workers: 8 34 | test_num_workers: 64 35 | 36 | model: 37 | _target_: codec_evaluation.probe.model.lit_prober.Prober 38 | codec_name: ${codec_name} 39 | sample_rate: ${sample_rate} 40 | mode: ${mode} 41 | task: ${task} 42 | num_outputs: ${num_outputs} 43 | probe_model_builder: 44 | _target_: codec_evaluation.probe.model.multiclass_model.MulticlassProber 45 | _partial_: true 46 | num_outputs: ${num_outputs} 47 | drop_out: 0.1 48 | channel_reduction: 16 49 | padding: 1 50 | kernel_size: 3 51 | stride: 1 52 | target_sec: ${target_sec} 53 | model_ckpt_dir: ??? 
54 | 55 | optimizer_builder: 56 | _target_: torch.optim.AdamW 57 | _partial_: true 58 | lr: 1e-4 59 | betas: [0.8, 0.99] 60 | eps: 1e-5 61 | weight_decay: 0.08 62 | 63 | lr_scheduler_builder: 64 | _target_: torch.optim.lr_scheduler.LambdaLR 65 | _partial_: true 66 | lr_lambda: 67 | _target_: codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda 68 | _partial_: true 69 | num_warmup_steps: 10 70 | num_training_steps: 10000 71 | final_lr_ratio: 0.2 72 | 73 | callbacks: 74 | learning_rate_monitor: 75 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 76 | logging_interval: step 77 | 78 | rich_progress_bar: 79 | _target_: pytorch_lightning.callbacks.RichProgressBar 80 | 81 | model_summary: 82 | _target_: pytorch_lightning.callbacks.ModelSummary 83 | max_depth: 1 84 | 85 | model_checkpoint: 86 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 87 | monitor: val_loss 88 | dirpath: ${probe_ckpt_dir} 89 | every_n_epochs: 1 90 | mode: min 91 | save_top_k: 1 92 | filename: ${codec_name}_${mode}_{epoch}-{val_loss:.4f} 93 | verbose: True 94 | 95 | tensorboard: 96 | _target_: pytorch_lightning.loggers.TensorBoardLogger 97 | save_dir: ??? 98 | name: ${codec_name}_${mode} 99 | log_graph: true -------------------------------------------------------------------------------- /codec_evaluation/probe/config/Common_Voice_dataset/xcodec.yaml: -------------------------------------------------------------------------------- 1 | mode: encode 2 | sample_rate: 48000 3 | probe_ckpt_dir: ??? 4 | seed: 666 5 | codec_name: xcodec 6 | 7 | trainer: 8 | _target_: pytorch_lightning.Trainer 9 | accelerator: gpu 10 | devices: ??? 11 | precision: 32 12 | max_epochs: 10 13 | limit_val_batches: 5 14 | log_every_n_steps: 20 15 | val_check_interval: 1.0 16 | 17 | data: 18 | _target_: codec_evaluation.probe.dataset.Common_Voice_dataset.Common_voice_dataset.Common_voice_module 19 | target_samplerate: ${sample_rate} 20 | train_audio_dir: ??? 21 | val_audio_dir: ??? 22 | test_audio_dir: ??? 23 | base_audio_dir: /root/path/for/audio 24 | train_batch_size: 4 25 | val_batch_size: 4 26 | test_batch_size: 4 27 | train_num_workers: 4 28 | val_num_workers: 1 29 | test_num_workers: 1 30 | 31 | model: 32 | _target_: codec_evaluation.probe.model.ctc_lit_prober.CodecCTCProbe 33 | codec_name: ${codec_name} 34 | sample_rate: ${sample_rate} 35 | mode: ${mode} 36 | tokenizer: 37 | _target_: transformers.Speech2TextProcessor.from_pretrained 38 | pretrained_model_name_or_path: ??? 39 | probe_model_builder: 40 | _target_: codec_evaluation.probe.model.ctc_model.Ctc_Probe 41 | _partial_: true 42 | vocab_size: 10000 43 | codec_vocab_size: 1024 44 | dropout: 0.1 45 | lm_head_nums: 8 46 | conformer_depth: 3 47 | conformer_heads: 8 48 | model_ckpt_dir: ??? 
49 |   teacher_ckpt_path: /codec_ckpt/path/for/xcodec/hubert_base_general_audio
50 | 
51 | optimizer_builder:
52 |   _target_: torch.optim.AdamW
53 |   _partial_: true
54 |   lr: 1e-4
55 |   betas: [0.8, 0.99]
56 |   eps: 1e-5
57 |   weight_decay: 0.08
58 | 
59 | lr_scheduler_builder:
60 |   _target_: torch.optim.lr_scheduler.LambdaLR
61 |   _partial_: true
62 |   lr_lambda:
63 |     _target_: codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda
64 |     _partial_: true
65 |     num_warmup_steps: 200
66 |     num_training_steps: 4000
67 |     final_lr_ratio: 0.2
68 | 
69 | callbacks:
70 |   learning_rate_monitor:
71 |     _target_: pytorch_lightning.callbacks.LearningRateMonitor
72 |     logging_interval: step
73 | 
74 |   rich_progress_bar:
75 |     _target_: pytorch_lightning.callbacks.RichProgressBar
76 | 
77 |   model_summary:
78 |     _target_: pytorch_lightning.callbacks.ModelSummary
79 |     max_depth: 1
80 | 
81 |   model_checkpoint:
82 |     _target_: pytorch_lightning.callbacks.ModelCheckpoint
83 |     monitor: val_loss
84 |     dirpath: ${probe_ckpt_dir}
85 |     every_n_epochs: 1
86 |     mode: min
87 |     save_top_k: 1
88 |     filename: ${codec_name}_${mode}_{epoch}-{val_loss:.4f}
89 |     verbose: True
90 | 
91 | tensorboard:
92 |   _target_: pytorch_lightning.loggers.TensorBoardLogger
93 |   save_dir: ???
94 |   name: ${codec_name}_${mode}
95 |   log_graph: true
-------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/Flow1dVAE/our_MERT_BESTRQ/run_training_eat.sh: --------------------------------------------------------------------------------
1 | WORKER_RANK=${1:-$INDEX}
2 | PLATFORM=${2:-'shef'}
3 | YAML_NAME_WITHOUT_EXT=${3:-'MERT_RVQ-VAE_CQT_95M'}
4 | TRAINING_SETTING=${4:-'MERT_RVQ-VAE_CQT'}
5 | MASTER_PROC_ADD=${5:-$CHIEF_IP}
6 | DIST_PORT=${6:-'25520'}
7 | # echo $PATH
8 | # export PATH=$PATH:./
9 | echo "worker rank ${WORKER_RANK}, master address ${MASTER_PROC_ADD}:${DIST_PORT}"
10 | 
11 | MAP_PROJ_DIR=$(pwd)
12 | echo $MAP_PROJ_DIR
13 | 
14 | NNODS=1
15 | BATCH_SIZE=12
16 | NUM_WOKERS=6
17 | 
18 | run_command_prefix=' '
19 | # Loading folders
20 | # 1. tsv files for audio paths
21 | # DATA_DIR=${MAP_PROJ_DIR}/data/audio_tsv
22 | DATA_DIR=${MAP_PROJ_DIR}/data/music4all_sh #audio_manifest
23 | # 2. working folder for saving checkpoints and loading config files
24 | CONFIG_DIR=${MAP_PROJ_DIR}/mert_fairseq/config/pretrain
25 | # 3. clustering labels for training data
26 | LABEL_ROOT_DIR=${MAP_PROJ_DIR}/data/encodec_labels/custom_audio_dataset
27 | 
28 | FAIRSEQ_PATH=${MAP_PROJ_DIR}/src/fairseq;
29 | SAVE_DIR=${MAP_PROJ_DIR}/data/fairseq_savedir/
30 | 
31 | case $YAML_NAME_WITHOUT_EXT in
32 |     EAT_pretraining_music_multinodes)
33 |         NNODS=4
34 |         NPROCES_PER_NODE=8
35 |         LABEL_RATE=25
36 |         BATCH_SIZE=12
37 |         ;;
38 |     *)
39 |         echo "Unknown running config: ${YAML_NAME_WITHOUT_EXT}"
40 |         exit 1
41 |         ;;
42 | esac
43 | 
44 | echo running $YAML_NAME_WITHOUT_EXT ..
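# NOTE (usage sketch, not part of the original script): all six arguments are
# positional with defaults, so a run on worker 0 might look like
#   bash run_training_eat.sh 0 shef EAT_pretraining_music_multinodes EAT 10.0.0.1 25520
# where 10.0.0.1 stands in for $CHIEF_IP; only EAT_pretraining_music_multinodes
# is accepted by the case block above, and the "EAT" tag is only used to name
# checkpoint and log directories.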
45 | 46 | mkdir -p ${SAVE_DIR} 47 | echo "checkpoint save at: ${SAVE_DIR}" 48 | cd ${SAVE_DIR} 49 | 50 | DISTRIBUTED_WORLD_SIZE=`expr ${NNODS} \* ${NPROCES_PER_NODE}` 51 | ACTUAL_WORKER_RANK=`expr ${WORKER_RANK} \* ${NPROCES_PER_NODE}` 52 | echo "worker rank ${WORKER_RANK}, master address ${MASTER_PROC_ADD}:${DIST_PORT}, actual rank ${ACTUAL_WORKER_RANK}" 53 | 54 | DATE_SUFFIX=`date +"%Y-%m-%d_%H-%M"` 55 | 56 | OMP_NUM_THREADS=6 ${run_command_prefix} \ 57 | python -u ${FAIRSEQ_PATH}/fairseq_cli/hydra_train.py \ 58 | --config-dir ${CONFIG_DIR} --config-name ${YAML_NAME_WITHOUT_EXT} \ 59 | common.user_dir=${MAP_PROJ_DIR}/mert_fairseq \ 60 | common.tensorboard_logdir=${MAP_PROJ_DIR}/logs/pretrain_tb_${TRAINING_SETTING}_${YAML_NAME_WITHOUT_EXT}_multinodes${NNODS} \ 61 | checkpoint.save_dir=${SAVE_DIR}/ckpt_${TRAINING_SETTING}_multinodes${NNODS}_${DATE_SUFFIX}/${YAML_NAME_WITHOUT_EXT} \ 62 | distributed_training.distributed_rank=${ACTUAL_WORKER_RANK} \ 63 | distributed_training.distributed_world_size=${DISTRIBUTED_WORLD_SIZE} \ 64 | distributed_training.distributed_num_procs=${DISTRIBUTED_WORLD_SIZE} \ 65 | distributed_training.nprocs_per_node=${NPROCES_PER_NODE} \ 66 | distributed_training.distributed_init_method="tcp://${CHIEF_IP}:${DIST_PORT}" \ 67 | task.data=${DATA_DIR} \ 68 | dataset.num_workers=${NUM_WOKERS} \ 69 | dataset.batch_size=${BATCH_SIZE} \ 70 | dataset.disable_validation=true \ 71 | 72 | # pip install h5py timm -i https://mirrors.tencent.com/pypi/simple/ -------------------------------------------------------------------------------- /codec_evaluation/probe/config/EMO_dataset/encodec.yaml: -------------------------------------------------------------------------------- 1 | mode: quantized_emb 2 | sample_rate: 44100 3 | target_sec: 9 4 | num_outputs: 2 5 | probe_ckpt_dir: ??? 6 | seed: 666 7 | codec_name: encodec 8 | task: regression 9 | save_result: null 10 | 11 | trainer: 12 | _target_: pytorch_lightning.Trainer 13 | accelerator: gpu 14 | devices: ??? 15 | precision: 32 16 | max_epochs: 100 17 | limit_val_batches: 10 18 | log_every_n_steps: 5 19 | val_check_interval: 1.0 20 | 21 | data: 22 | _target_: codec_evaluation.probe.dataset.EMO_dataset.EMO_dataset.EMOdataModule 23 | dataset_args: 24 | sample_rate: ${sample_rate} 25 | target_sec: ${target_sec} 26 | train_audio_dir: ??? 27 | val_audio_dir: ??? 28 | test_audio_dir: ??? 29 | train_batch_size: 32 30 | val_batch_size: 1 31 | test_batch_size: 32 32 | train_num_workers: 8 33 | val_num_workers: 4 34 | test_num_workers: 4 35 | 36 | model: 37 | _target_: codec_evaluation.probe.model.lit_prober.Prober 38 | codec_name: ${codec_name} 39 | sample_rate: ${sample_rate} 40 | mode: ${mode} 41 | task: ${task} 42 | num_outputs: ${num_outputs} 43 | probe_model_builder: 44 | _target_: codec_evaluation.probe.model.regression_model.RegressionProber 45 | _partial_: true 46 | num_outputs: ${num_outputs} 47 | drop_out: 0.1 48 | channel_reduction: 16 49 | padding: 1 50 | kernel_size: 3 51 | stride: 1 52 | target_sec: ${target_sec} 53 | model_ckpt_dir: ??? 
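# NOTE (added explanation, not in the original): task: regression with
# num_outputs: 2 matches the EMO (emomusic) benchmark, where the two targets
# are conventionally the clip's valence and arousal ratings.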
54 | 55 | optimizer_builder: 56 | _target_: torch.optim.AdamW 57 | _partial_: true 58 | lr: 1e-4 59 | betas: [0.8, 0.99] 60 | eps: 1e-5 61 | weight_decay: 0.08 62 | 63 | lr_scheduler_builder: 64 | _target_: torch.optim.lr_scheduler.LambdaLR 65 | _partial_: true 66 | lr_lambda: 67 | _target_: codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda 68 | _partial_: true 69 | num_warmup_steps: 10 70 | num_training_steps: 10000 71 | final_lr_ratio: 0.2 72 | 73 | callbacks: 74 | learning_rate_monitor: 75 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 76 | logging_interval: step 77 | 78 | rich_progress_bar: 79 | _target_: pytorch_lightning.callbacks.RichProgressBar 80 | 81 | model_summary: 82 | _target_: pytorch_lightning.callbacks.ModelSummary 83 | max_depth: 1 84 | 85 | model_checkpoint: 86 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 87 | monitor: val_loss 88 | dirpath: ${probe_ckpt_dir} 89 | every_n_epochs: 1 90 | mode: min 91 | save_top_k: 1 92 | save_last: False 93 | filename: ${codec_name}_${mode}_{epoch}-{val_loss:.4f} 94 | verbose: True 95 | 96 | tensorboard: 97 | _target_: pytorch_lightning.loggers.TensorBoardLogger 98 | save_dir: ??? 99 | name: ${codec_name}_${mode} 100 | log_graph: true -------------------------------------------------------------------------------- /codec_evaluation/probe/config/GS_dataset/YuE.yaml: -------------------------------------------------------------------------------- 1 | mode: quantized_emb 2 | sample_rate: 44100 3 | target_sec: 10 4 | num_outputs: 24 5 | probe_ckpt_dir: ??? 6 | seed: 666 7 | codec_name: yue 8 | task: multiclass 9 | save_result: null 10 | 11 | trainer: 12 | _target_: pytorch_lightning.Trainer 13 | accelerator: gpu 14 | devices: ??? 15 | precision: 32 16 | max_epochs: 100 17 | limit_val_batches: 10 18 | log_every_n_steps: 5 19 | val_check_interval: 1.0 20 | 21 | data: 22 | _target_: codec_evaluation.probe.dataset.GS_dataset.GS_dataset.GSdataModule 23 | dataset_args: 24 | sample_rate: ${sample_rate} 25 | target_sec: ${target_sec} 26 | train_audio_dir: ??? 27 | val_audio_dir: ??? 28 | test_audio_dir: ??? 29 | train_batch_size: 8 30 | val_batch_size: 1 31 | test_batch_size: 8 32 | train_num_workers: 4 33 | val_num_workers: 2 34 | test_num_workers: 2 35 | 36 | model: 37 | _target_: codec_evaluation.probe.model.lit_prober.Prober 38 | codec_name: ${codec_name} 39 | sample_rate: ${sample_rate} 40 | mode: ${mode} 41 | task: ${task} 42 | num_outputs: ${num_outputs} 43 | probe_model_builder: 44 | _target_: codec_evaluation.probe.model.multiclass_model.MulticlassProber 45 | _partial_: true 46 | num_outputs: ${num_outputs} 47 | drop_out: 0.1 48 | channel_reduction: 16 49 | padding: 1 50 | kernel_size: 3 51 | stride: 1 52 | target_sec: ${target_sec} 53 | model_ckpt_dir: ??? 
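# NOTE (added explanation, not in the original): num_outputs: 24 is consistent
# with GiantSteps key estimation treated as a flat multiclass problem over
# 12 tonics x {major, minor} = 24 key classes.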
54 | 
55 | optimizer_builder:
56 |   _target_: torch.optim.AdamW
57 |   _partial_: true
58 |   lr: 1e-4
59 |   betas: [0.8, 0.99]
60 |   eps: 1e-5
61 |   weight_decay: 0.08
62 | 
63 | lr_scheduler_builder:
64 |   _target_: torch.optim.lr_scheduler.LambdaLR
65 |   _partial_: true
66 |   lr_lambda:
67 |     _target_: codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda
68 |     _partial_: true
69 |     num_warmup_steps: 10
70 |     num_training_steps: 10000
71 |     final_lr_ratio: 0.2
72 | 
73 | callbacks:
74 |   learning_rate_monitor:
75 |     _target_: pytorch_lightning.callbacks.LearningRateMonitor
76 |     logging_interval: step
77 | 
78 |   rich_progress_bar:
79 |     _target_: pytorch_lightning.callbacks.RichProgressBar
80 | 
81 |   model_summary:
82 |     _target_: pytorch_lightning.callbacks.ModelSummary
83 |     max_depth: 1
84 | 
85 |   model_checkpoint:
86 |     _target_: pytorch_lightning.callbacks.ModelCheckpoint
87 |     monitor: val_loss
88 |     dirpath: ${probe_ckpt_dir}
89 |     every_n_epochs: 1
90 |     mode: min
91 |     save_top_k: 1
92 |     save_last: False
93 |     filename: ${codec_name}_${mode}_{epoch}-{val_loss:.4f}
94 |     verbose: True
95 | 
96 | tensorboard:
97 |   _target_: pytorch_lightning.loggers.TensorBoardLogger
98 |   save_dir: ???
99 |   name: ${codec_name}_${mode}
100 |   log_graph: true
-------------------------------------------------------------------------------- /codec_evaluation/probe/config/GS_dataset/dac.yaml: --------------------------------------------------------------------------------
1 | mode: quantized_emb
2 | sample_rate: 44100
3 | target_sec: 10
4 | num_outputs: 24
5 | probe_ckpt_dir: ???
6 | seed: 666
7 | codec_name: dac # change as needed
8 | task: multiclass
9 | save_result: null
10 | 
11 | trainer:
12 |   _target_: pytorch_lightning.Trainer
13 |   accelerator: gpu
14 |   devices: ???
15 |   precision: 32
16 |   max_epochs: 100
17 |   limit_val_batches: 10
18 |   log_every_n_steps: 5
19 |   val_check_interval: 1.0
20 | 
21 | data:
22 |   _target_: codec_evaluation.probe.dataset.GS_dataset.GS_dataset.GSdataModule
23 |   dataset_args:
24 |     sample_rate: ${sample_rate}
25 |     target_sec: ${target_sec}
26 |     train_audio_dir: ???
27 |     val_audio_dir: ???
28 |     test_audio_dir: ???
29 |   train_batch_size: 8
30 |   val_batch_size: 1
31 |   test_batch_size: 8
32 |   train_num_workers: 4
33 |   val_num_workers: 2
34 |   test_num_workers: 2
35 | 
36 | model:
37 |   _target_: codec_evaluation.probe.model.lit_prober.Prober
38 |   codec_name: ${codec_name}
39 |   sample_rate: ${sample_rate}
40 |   mode: ${mode}
41 |   task: ${task}
42 |   num_outputs: ${num_outputs}
43 |   probe_model_builder:
44 |     _target_: codec_evaluation.probe.model.multiclass_model.MulticlassProber
45 |     _partial_: true
46 |     num_outputs: ${num_outputs}
47 |     drop_out: 0.1
48 |     channel_reduction: 16
49 |     padding: 1
50 |     kernel_size: 3
51 |     stride: 1
52 |     target_sec: ${target_sec}
53 |   model_ckpt_dir: ???
54 | 55 | optimizer_builder: 56 | _target_: torch.optim.AdamW 57 | _partial_: true 58 | lr: 1e-4 59 | betas: [0.8, 0.99] 60 | eps: 1e-5 61 | weight_decay: 0.08 62 | 63 | lr_scheduler_builder: 64 | _target_: torch.optim.lr_scheduler.LambdaLR 65 | _partial_: true 66 | lr_lambda: 67 | _target_: codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda 68 | _partial_: true 69 | num_warmup_steps: 10 70 | num_training_steps: 10000 71 | final_lr_ratio: 0.2 72 | 73 | callbacks: 74 | learning_rate_monitor: 75 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 76 | logging_interval: step 77 | 78 | rich_progress_bar: 79 | _target_: pytorch_lightning.callbacks.RichProgressBar 80 | 81 | model_summary: 82 | _target_: pytorch_lightning.callbacks.ModelSummary 83 | max_depth: 1 84 | 85 | model_checkpoint: 86 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 87 | monitor: val_loss 88 | dirpath: ${probe_ckpt_dir} 89 | every_n_epochs: 1 90 | mode: min 91 | save_top_k: 1 92 | save_last: False 93 | filename: ${codec_name}_${mode}_{epoch}-{val_loss:.4f} 94 | verbose: True 95 | 96 | tensorboard: 97 | _target_: pytorch_lightning.loggers.TensorBoardLogger 98 | save_dir: ??? 99 | name: ${codec_name}_${mode} 100 | log_graph: true -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/stable_audio_tools/models/diffusion_prior.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | import typing as tp 3 | 4 | from .diffusion import ConditionedDiffusionModelWrapper 5 | from ..inference.generation import generate_diffusion_cond 6 | from ..inference.utils import prepare_audio 7 | 8 | import torch 9 | from torch.nn import functional as F 10 | from torchaudio import transforms as T 11 | 12 | # Define prior types enum 13 | class PriorType(Enum): 14 | MonoToStereo = 1 15 | 16 | class DiffusionPrior(ConditionedDiffusionModelWrapper): 17 | def __init__(self, *args, prior_type: PriorType=None, **kwargs): 18 | super().__init__(*args, **kwargs) 19 | self.prior_type = prior_type 20 | 21 | class MonoToStereoDiffusionPrior(DiffusionPrior): 22 | def __init__(self, *args, **kwargs): 23 | super().__init__(*args, prior_type=PriorType.MonoToStereo, **kwargs) 24 | 25 | def stereoize( 26 | self, 27 | audio: torch.Tensor, # (batch, channels, time) 28 | in_sr: int, 29 | steps: int, 30 | sampler_kwargs: dict = {}, 31 | ): 32 | """ 33 | Generate stereo audio from mono audio using a pre-trained diffusion prior 34 | 35 | Args: 36 | audio: The mono audio to convert to stereo 37 | in_sr: The sample rate of the input audio 38 | steps: The number of diffusion steps to run 39 | sampler_kwargs: Keyword arguments to pass to the diffusion sampler 40 | """ 41 | 42 | device = audio.device 43 | 44 | sample_rate = self.sample_rate 45 | 46 | # Resample input audio if necessary 47 | if in_sr != sample_rate: 48 | resample_tf = T.Resample(in_sr, sample_rate).to(audio.device) 49 | audio = resample_tf(audio) 50 | 51 | audio_length = audio.shape[-1] 52 | 53 | # Pad input audio to be compatible with the model 54 | min_length = self.min_input_length 55 | padded_input_length = audio_length + (min_length - (audio_length % min_length)) % min_length 56 | 57 | # Pad input audio to be compatible with the model 58 | if padded_input_length > audio_length: 59 | audio = F.pad(audio, (0, padded_input_length - audio_length)) 60 | 61 | # Make audio mono, duplicate to stereo 62 | dual_mono = audio.mean(1, keepdim=True).repeat(1, 
2, 1) 63 | 64 | if self.pretransform is not None: 65 | dual_mono = self.pretransform.encode(dual_mono) 66 | 67 | conditioning = {"source": [dual_mono]} 68 | 69 | stereo_audio = generate_diffusion_cond( 70 | self, 71 | conditioning_tensors=conditioning, 72 | steps=steps, 73 | sample_size=padded_input_length, 74 | sample_rate=sample_rate, 75 | device=device, 76 | **sampler_kwargs, 77 | ) 78 | 79 | return stereo_audio -------------------------------------------------------------------------------- /codec_evaluation/probe/config/EMO_dataset/YuE.yaml: -------------------------------------------------------------------------------- 1 | mode: quantized_emb 2 | sample_rate: 44100 3 | target_sec: 9 4 | num_outputs: 2 5 | probe_ckpt_dir: ??? 6 | seed: 666 7 | codec_name: yue # change as needed 8 | task: regression 9 | save_result: null 10 | 11 | trainer: 12 | _target_: pytorch_lightning.Trainer 13 | accelerator: gpu 14 | devices: ??? 15 | precision: 32 16 | max_epochs: 100 17 | limit_val_batches: 10 18 | log_every_n_steps: 5 19 | val_check_interval: 1.0 20 | 21 | data: 22 | _target_: codec_evaluation.probe.dataset.EMO_dataset.EMO_dataset.EMOdataModule 23 | dataset_args: 24 | sample_rate: ${sample_rate} 25 | target_sec: ${target_sec} 26 | train_audio_dir: ??? 27 | val_audio_dir: ??? 28 | test_audio_dir: ??? 29 | train_batch_size: 32 30 | val_batch_size: 1 31 | test_batch_size: 32 32 | train_num_workers: 8 33 | val_num_workers: 4 34 | test_num_workers: 4 35 | 36 | model: 37 | _target_: codec_evaluation.probe.model.lit_prober.Prober 38 | codec_name: ${codec_name} 39 | sample_rate: ${sample_rate} 40 | mode: ${mode} 41 | task: ${task} 42 | num_outputs: ${num_outputs} 43 | probe_model_builder: 44 | _target_: codec_evaluation.probe.model.regression_model.RegressionProber 45 | _partial_: true 46 | num_outputs: ${num_outputs} 47 | drop_out: 0.1 48 | channel_reduction: 16 49 | padding: 1 50 | kernel_size: 3 51 | stride: 1 52 | target_sec: ${target_sec} 53 | model_ckpt_dir: ??? 54 | 55 | optimizer_builder: 56 | _target_: torch.optim.AdamW 57 | _partial_: true 58 | lr: 1e-4 59 | betas: [0.8, 0.99] 60 | eps: 1e-5 61 | weight_decay: 0.08 62 | 63 | lr_scheduler_builder: 64 | _target_: torch.optim.lr_scheduler.LambdaLR 65 | _partial_: true 66 | lr_lambda: 67 | _target_: codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda 68 | _partial_: true 69 | num_warmup_steps: 10 70 | num_training_steps: 10000 71 | final_lr_ratio: 0.2 72 | 73 | callbacks: 74 | learning_rate_monitor: 75 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 76 | logging_interval: step 77 | 78 | rich_progress_bar: 79 | _target_: pytorch_lightning.callbacks.RichProgressBar 80 | 81 | model_summary: 82 | _target_: pytorch_lightning.callbacks.ModelSummary 83 | max_depth: 1 84 | 85 | model_checkpoint: 86 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 87 | monitor: val_loss 88 | dirpath: ${probe_ckpt_dir} 89 | every_n_epochs: 1 90 | mode: min 91 | save_top_k: 1 92 | save_last: False 93 | filename: ${codec_name}_${mode}_{epoch}-{val_loss:.4f} 94 | verbose: True 95 | 96 | tensorboard: 97 | _target_: pytorch_lightning.loggers.TensorBoardLogger 98 | save_dir: ???
99 | name: ${codec_name}_${mode} 100 | log_graph: true -------------------------------------------------------------------------------- /codec_evaluation/probe/config/EMO_dataset/dac.yaml: -------------------------------------------------------------------------------- 1 | mode: quantized_emb 2 | sample_rate: 44100 3 | target_sec: 9 4 | num_outputs: 2 5 | probe_ckpt_dir: ??? 6 | seed: 666 7 | codec_name: dac # change as needed 8 | task: regression 9 | save_result: null 10 | 11 | trainer: 12 | _target_: pytorch_lightning.Trainer 13 | accelerator: gpu 14 | devices: ??? 15 | precision: 32 16 | max_epochs: 100 17 | limit_val_batches: 10 18 | log_every_n_steps: 5 19 | val_check_interval: 1.0 20 | 21 | data: 22 | _target_: codec_evaluation.probe.dataset.EMO_dataset.EMO_dataset.EMOdataModule 23 | dataset_args: 24 | sample_rate: ${sample_rate} 25 | target_sec: ${target_sec} 26 | train_audio_dir: ??? 27 | val_audio_dir: ??? 28 | test_audio_dir: ??? 29 | train_batch_size: 32 30 | val_batch_size: 1 31 | test_batch_size: 32 32 | train_num_workers: 8 33 | val_num_workers: 4 34 | test_num_workers: 4 35 | 36 | model: 37 | _target_: codec_evaluation.probe.model.lit_prober.Prober 38 | codec_name: ${codec_name} 39 | sample_rate: ${sample_rate} 40 | mode: ${mode} 41 | task: ${task} 42 | num_outputs: ${num_outputs} 43 | probe_model_builder: 44 | _target_: codec_evaluation.probe.model.regression_model.RegressionProber 45 | _partial_: true 46 | num_outputs: ${num_outputs} 47 | drop_out: 0.1 48 | channel_reduction: 16 49 | padding: 1 50 | kernel_size: 3 51 | stride: 1 52 | target_sec: ${target_sec} 53 | model_ckpt_dir: ??? 54 | 55 | optimizer_builder: 56 | _target_: torch.optim.AdamW 57 | _partial_: true 58 | lr: 1e-4 59 | betas: [0.8, 0.99] 60 | eps: 1e-5 61 | weight_decay: 0.08 62 | 63 | lr_scheduler_builder: 64 | _target_: torch.optim.lr_scheduler.LambdaLR 65 | _partial_: true 66 | lr_lambda: 67 | _target_: codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda 68 | _partial_: true 69 | num_warmup_steps: 10 70 | num_training_steps: 10000 71 | final_lr_ratio: 0.2 72 | 73 | callbacks: 74 | learning_rate_monitor: 75 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 76 | logging_interval: step 77 | 78 | rich_progress_bar: 79 | _target_: pytorch_lightning.callbacks.RichProgressBar 80 | 81 | model_summary: 82 | _target_: pytorch_lightning.callbacks.ModelSummary 83 | max_depth: 1 84 | 85 | model_checkpoint: 86 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 87 | monitor: val_loss 88 | dirpath: ${probe_ckpt_dir} 89 | every_n_epochs: 1 90 | mode: min 91 | save_top_k: 1 92 | save_last: False 93 | filename: ${codec_name}_${mode}_{epoch}-{val_loss:.4f} 94 | verbose: True 95 | 96 | tensorboard: 97 | _target_: pytorch_lightning.loggers.TensorBoardLogger 98 | save_dir: ??? 99 | name: ${codec_name}_${mode} 100 | log_graph: true -------------------------------------------------------------------------------- /codec_evaluation/probe/config/EMO_dataset/semanticodec.yaml: -------------------------------------------------------------------------------- 1 | mode: quantized_emb 2 | sample_rate: 44100 3 | target_sec: 9 4 | num_outputs: 2 5 | probe_ckpt_dir: ??? 6 | seed: 666 7 | codec_name: semanticodec 8 | task: regression 9 | save_result: null 10 | 11 | trainer: 12 | _target_: pytorch_lightning.Trainer 13 | accelerator: gpu 14 | devices: ???
15 | precision: 32 16 | max_epochs: 100 17 | limit_val_batches: 10 18 | log_every_n_steps: 5 19 | val_check_interval: 1.0 20 | 21 | data: 22 | _target_: codec_evaluation.probe.dataset.EMO_dataset.EMO_dataset.EMOdataModule 23 | dataset_args: 24 | sample_rate: ${sample_rate} 25 | target_sec: ${target_sec} 26 | train_audio_dir: ??? 27 | val_audio_dir: ??? 28 | test_audio_dir: ??? 29 | train_batch_size: 32 30 | val_batch_size: 1 31 | test_batch_size: 32 32 | train_num_workers: 8 33 | val_num_workers: 4 34 | test_num_workers: 4 35 | 36 | model: 37 | _target_: codec_evaluation.probe.model.lit_prober.Prober 38 | codec_name: ${codec_name} 39 | sample_rate: ${sample_rate} 40 | mode: ${mode} 41 | task: ${task} 42 | num_outputs: ${num_outputs} 43 | probe_model_builder: 44 | _target_: codec_evaluation.probe.model.regression_model.RegressionProber 45 | _partial_: true 46 | num_outputs: ${num_outputs} 47 | drop_out: 0.1 48 | channel_reduction: 16 49 | padding: 1 50 | kernel_size: 3 51 | stride: 1 52 | target_sec: ${target_sec} 53 | model_ckpt_dir: ??? 54 | 55 | optimizer_builder: 56 | _target_: torch.optim.AdamW 57 | _partial_: true 58 | lr: 1e-4 59 | betas: [0.8, 0.99] 60 | eps: 1e-5 61 | weight_decay: 0.08 62 | 63 | lr_scheduler_builder: 64 | _target_: torch.optim.lr_scheduler.LambdaLR 65 | _partial_: true 66 | lr_lambda: 67 | _target_: codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda 68 | _partial_: true 69 | num_warmup_steps: 10 70 | num_training_steps: 10000 71 | final_lr_ratio: 0.2 72 | 73 | callbacks: 74 | learning_rate_monitor: 75 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 76 | logging_interval: step 77 | 78 | rich_progress_bar: 79 | _target_: pytorch_lightning.callbacks.RichProgressBar 80 | 81 | model_summary: 82 | _target_: pytorch_lightning.callbacks.ModelSummary 83 | max_depth: 1 84 | 85 | model_checkpoint: 86 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 87 | monitor: val_loss 88 | dirpath: ${probe_ckpt_dir} 89 | every_n_epochs: 1 90 | mode: min 91 | save_top_k: 1 92 | save_last: False 93 | filename: ${codec_name}_${mode}_{epoch}-{val_loss:.4f} 94 | verbose: True 95 | 96 | tensorboard: 97 | _target_: pytorch_lightning.loggers.TensorBoardLogger 98 | save_dir: ??? 99 | name: ${codec_name}_${mode} 100 | log_graph: true -------------------------------------------------------------------------------- /codec_evaluation/probe/config/EMO_dataset/wavtokenizer.yaml: -------------------------------------------------------------------------------- 1 | mode: quantized_emb 2 | sample_rate: 44100 3 | target_sec: 9 4 | num_outputs: 2 5 | probe_ckpt_dir: ??? 6 | seed: 666 7 | codec_name: wavtokenizer 8 | task: regression 9 | save_result: null 10 | 11 | trainer: 12 | _target_: pytorch_lightning.Trainer 13 | accelerator: gpu 14 | devices: ??? 15 | precision: 32 16 | max_epochs: 100 17 | limit_val_batches: 10 18 | log_every_n_steps: 5 19 | val_check_interval: 1.0 20 | 21 | data: 22 | _target_: codec_evaluation.probe.dataset.EMO_dataset.EMO_dataset.EMOdataModule 23 | dataset_args: 24 | sample_rate: ${sample_rate} 25 | target_sec: ${target_sec} 26 | train_audio_dir: ??? 27 | val_audio_dir: ??? 28 | test_audio_dir: ??? 
29 | train_batch_size: 32 30 | val_batch_size: 1 31 | test_batch_size: 32 32 | train_num_workers: 8 33 | val_num_workers: 4 34 | test_num_workers: 4 35 | 36 | model: 37 | _target_: codec_evaluation.probe.model.lit_prober.Prober 38 | codec_name: ${codec_name} 39 | sample_rate: ${sample_rate} 40 | mode: ${mode} 41 | task: ${task} 42 | num_outputs: ${num_outputs} 43 | probe_model_builder: 44 | _target_: codec_evaluation.probe.model.regression_model.RegressionProber 45 | _partial_: true 46 | num_outputs: ${num_outputs} 47 | drop_out: 0.1 48 | channel_reduction: 16 49 | padding: 1 50 | kernel_size: 3 51 | stride: 1 52 | target_sec: ${target_sec} 53 | model_ckpt_dir: ??? 54 | 55 | optimizer_builder: 56 | _target_: torch.optim.AdamW 57 | _partial_: true 58 | lr: 1e-4 59 | betas: [0.8, 0.99] 60 | eps: 1e-5 61 | weight_decay: 0.08 62 | 63 | lr_scheduler_builder: 64 | _target_: torch.optim.lr_scheduler.LambdaLR 65 | _partial_: true 66 | lr_lambda: 67 | _target_: codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda 68 | _partial_: true 69 | num_warmup_steps: 10 70 | num_training_steps: 10000 71 | final_lr_ratio: 0.2 72 | 73 | callbacks: 74 | learning_rate_monitor: 75 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 76 | logging_interval: step 77 | 78 | rich_progress_bar: 79 | _target_: pytorch_lightning.callbacks.RichProgressBar 80 | 81 | model_summary: 82 | _target_: pytorch_lightning.callbacks.ModelSummary 83 | max_depth: 1 84 | 85 | model_checkpoint: 86 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 87 | monitor: val_loss 88 | dirpath: ${probe_ckpt_dir} 89 | every_n_epochs: 1 90 | mode: min 91 | save_top_k: 1 92 | save_last: False 93 | filename: ${codec_name}_${mode}_{epoch}-{val_loss:.4f} 94 | verbose: True 95 | 96 | tensorboard: 97 | _target_: pytorch_lightning.loggers.TensorBoardLogger 98 | save_dir: ??? 99 | name: ${codec_name}_${mode} 100 | log_graph: true -------------------------------------------------------------------------------- /codec_evaluation/probe/config/GTZAN_dataset/YuE.yaml: -------------------------------------------------------------------------------- 1 | mode: quantized_emb 2 | sample_rate: 22050 3 | target_sec: 10 4 | num_outputs: 10 5 | probe_ckpt_dir: ??? 6 | seed: 666 7 | codec_name: yue 8 | task: multiclass 9 | save_result: null 10 | 11 | trainer: 12 | _target_: pytorch_lightning.Trainer 13 | accelerator: gpu 14 | devices: ??? 15 | precision: 32 16 | max_epochs: 100 17 | limit_val_batches: 10 18 | log_every_n_steps: 5 19 | val_check_interval: 1.0 20 | 21 | data: 22 | _target_: codec_evaluation.probe.dataset.GTZAN_dataset.GTZAN_dataset.GTZANdataModule 23 | dataset_args: 24 | sample_rate: ${sample_rate} 25 | target_sec: ${target_sec} 26 | train_audio_dir: ??? 27 | val_audio_dir: ??? 28 | test_audio_dir: ??? 29 | train_batch_size: 8 30 | val_batch_size: 1 31 | test_batch_size: 8 32 | train_num_workers: 8 33 | val_num_workers: 4 34 | test_num_workers: 4 35 | 36 | model: 37 | _target_: codec_evaluation.probe.model.lit_prober.Prober 38 | codec_name: ${codec_name} 39 | sample_rate: ${sample_rate} 40 | mode: ${mode} 41 | task: ${task} 42 | num_outputs: ${num_outputs} 43 | probe_model_builder: 44 | _target_: codec_evaluation.probe.model.multiclass_model.MulticlassProber 45 | _partial_: true 46 | num_outputs: ${num_outputs} 47 | drop_out: 0.1 48 | channel_reduction: 16 49 | padding: 1 50 | kernel_size: 3 51 | stride: 1 52 | target_sec: ${target_sec} 53 | model_ckpt_dir: ??? 
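# The *_builder blocks that follow use the Hydra/OmegaConf instantiation convention: _target_
# names the callable to construct, and _partial_: true makes instantiation return a
# functools.partial rather than a finished object, so arguments that only exist at runtime can
# be bound later. A minimal sketch of how the optimizer side would be consumed, assuming these
# configs are resolved with hydra.utils.instantiate (the actual Prober code may differ):
#   import hydra
#   optim_builder = hydra.utils.instantiate(cfg.model.optimizer_builder)  # functools.partial(AdamW, lr=1e-4, ...)
#   optimizer = optim_builder(prober.parameters())  # model parameters are only known at runtime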
54 | 55 | optimizer_builder: 56 | _target_: torch.optim.AdamW 57 | _partial_: true 58 | lr: 1e-4 59 | betas: [0.8, 0.99] 60 | eps: 1e-5 61 | weight_decay: 0.08 62 | 63 | lr_scheduler_builder: 64 | _target_: torch.optim.lr_scheduler.LambdaLR 65 | _partial_: true 66 | lr_lambda: 67 | _target_: codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda 68 | _partial_: true 69 | num_warmup_steps: 10 70 | num_training_steps: 10000 71 | final_lr_ratio: 0.2 72 | 73 | callbacks: 74 | learning_rate_monitor: 75 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 76 | logging_interval: step 77 | 78 | rich_progress_bar: 79 | _target_: pytorch_lightning.callbacks.RichProgressBar 80 | 81 | model_summary: 82 | _target_: pytorch_lightning.callbacks.ModelSummary 83 | max_depth: 1 84 | 85 | model_checkpoint: 86 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 87 | monitor: val_loss 88 | dirpath: ${probe_ckpt_dir} 89 | every_n_epochs: 1 90 | mode: min 91 | save_top_k: 1 92 | save_last: False 93 | filename: ${codec_name}_${mode}_{epoch}-{val_loss:.4f} 94 | verbose: True 95 | 96 | tensorboard: 97 | _target_: pytorch_lightning.loggers.TensorBoardLogger 98 | save_dir: ??? 99 | name: ${codec_name}_${mode} 100 | log_graph: true -------------------------------------------------------------------------------- /codec_evaluation/probe/config/GTZAN_dataset/dac.yaml: -------------------------------------------------------------------------------- 1 | mode: quantized_emb 2 | sample_rate: 22050 3 | target_sec: 10 4 | num_outputs: 10 5 | probe_ckpt_dir: ??? 6 | seed: 666 7 | codec_name: dac 8 | task: multiclass 9 | save_result: null 10 | 11 | trainer: 12 | _target_: pytorch_lightning.Trainer 13 | accelerator: gpu 14 | devices: ??? 15 | precision: 32 16 | max_epochs: 100 17 | limit_val_batches: 10 18 | log_every_n_steps: 5 19 | val_check_interval: 1.0 20 | 21 | data: 22 | _target_: codec_evaluation.probe.dataset.GTZAN_dataset.GTZAN_dataset.GTZANdataModule 23 | dataset_args: 24 | sample_rate: ${sample_rate} 25 | target_sec: ${target_sec} 26 | train_audio_dir: ??? 27 | val_audio_dir: ??? 28 | test_audio_dir: ??? 29 | train_batch_size: 8 30 | val_batch_size: 1 31 | test_batch_size: 8 32 | train_num_workers: 8 33 | val_num_workers: 4 34 | test_num_workers: 4 35 | 36 | model: 37 | _target_: codec_evaluation.probe.model.lit_prober.Prober 38 | codec_name: ${codec_name} 39 | sample_rate: ${sample_rate} 40 | mode: ${mode} 41 | task: ${task} 42 | num_outputs: ${num_outputs} 43 | probe_model_builder: 44 | _target_: codec_evaluation.probe.model.multiclass_model.MulticlassProber 45 | _partial_: true 46 | num_outputs: ${num_outputs} 47 | drop_out: 0.1 48 | channel_reduction: 16 49 | padding: 1 50 | kernel_size: 3 51 | stride: 1 52 | target_sec: ${target_sec} 53 | model_ckpt_dir: ??? 
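# probe_model_builder above is likewise partial: the config pins the probe hyperparameters
# (kernel_size, stride, drop_out, and the 10 GTZAN genre outputs), while any codec-dependent
# argument such as the embedding dimension can be bound when Prober constructs the probe.
# A sketch of that wiring, as an assumption about lit_prober rather than a copy of it:
#   probe = self.probe_model_builder()  # codec-dependent args, if any, are bound here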
54 | 55 | optimizer_builder: 56 | _target_: torch.optim.AdamW 57 | _partial_: true 58 | lr: 1e-4 59 | betas: [0.8, 0.99] 60 | eps: 1e-5 61 | weight_decay: 0.08 62 | 63 | lr_scheduler_builder: 64 | _target_: torch.optim.lr_scheduler.LambdaLR 65 | _partial_: true 66 | lr_lambda: 67 | _target_: codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda 68 | _partial_: true 69 | num_warmup_steps: 10 70 | num_training_steps: 10000 71 | final_lr_ratio: 0.2 72 | 73 | callbacks: 74 | learning_rate_monitor: 75 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 76 | logging_interval: step 77 | 78 | rich_progress_bar: 79 | _target_: pytorch_lightning.callbacks.RichProgressBar 80 | 81 | model_summary: 82 | _target_: pytorch_lightning.callbacks.ModelSummary 83 | max_depth: 1 84 | 85 | model_checkpoint: 86 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 87 | monitor: val_loss 88 | dirpath: ${probe_ckpt_dir} 89 | every_n_epochs: 1 90 | mode: min 91 | save_top_k: 1 92 | save_last: False 93 | filename: ${codec_name}_${mode}_{epoch}-{val_loss:.4f} 94 | verbose: True 95 | 96 | tensorboard: 97 | _target_: pytorch_lightning.loggers.TensorBoardLogger 98 | save_dir: ??? 99 | name: ${codec_name}_${mode} 100 | log_graph: true -------------------------------------------------------------------------------- /codec_evaluation/probe/config/MELD_dataset/YuE.yaml: -------------------------------------------------------------------------------- 1 | mode: quantized_emb 2 | sample_rate: 48000 3 | target_sec: 8 4 | num_outputs: 7 5 | probe_ckpt_dir: ??? 6 | seed: 666 7 | codec_name: yue 8 | task: multiclass 9 | save_result: null 10 | 11 | trainer: 12 | _target_: pytorch_lightning.Trainer 13 | accelerator: gpu 14 | devices: ??? 15 | precision: 32 16 | max_epochs: 100 17 | limit_val_batches: 10 18 | log_every_n_steps: 5 19 | val_check_interval: 1.0 20 | 21 | data: 22 | _target_: codec_evaluation.probe.dataset.MELD_dataset.MELD_dataset.MELDdataModule 23 | dataset_args: 24 | sample_rate: ${sample_rate} 25 | target_sec: ${target_sec} 26 | train_audio_dir: ??? 27 | val_audio_dir: ??? 28 | test_audio_dir: ??? 29 | train_batch_size: 64 30 | val_batch_size: 2 31 | test_batch_size: 32 32 | train_num_workers: 8 33 | val_num_workers: 4 34 | test_num_workers: 4 35 | 36 | model: 37 | _target_: codec_evaluation.probe.model.lit_prober.Prober 38 | codec_name: ${codec_name} 39 | sample_rate: ${sample_rate} 40 | mode: ${mode} 41 | task: ${task} 42 | num_outputs: ${num_outputs} 43 | probe_model_builder: 44 | _target_: codec_evaluation.probe.model.multiclass_model.MulticlassProber 45 | _partial_: true 46 | num_outputs: ${num_outputs} 47 | drop_out: 0.2 48 | channel_reduction: 16 49 | padding: 1 50 | kernel_size: 3 51 | stride: 1 52 | target_sec: ${target_sec} 53 | model_ckpt_dir: ??? 
54 | 55 | optimizer_builder: 56 | _target_: torch.optim.AdamW 57 | _partial_: true 58 | lr: 1e-4 59 | betas: [0.8, 0.99] 60 | eps: 1e-5 61 | weight_decay: 0.08 62 | 63 | lr_scheduler_builder: 64 | _target_: torch.optim.lr_scheduler.LambdaLR 65 | _partial_: true 66 | lr_lambda: 67 | _target_: codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda 68 | _partial_: true 69 | num_warmup_steps: 10 70 | num_training_steps: 10000 71 | final_lr_ratio: 0.2 72 | 73 | callbacks: 74 | learning_rate_monitor: 75 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 76 | logging_interval: step 77 | 78 | rich_progress_bar: 79 | _target_: pytorch_lightning.callbacks.RichProgressBar 80 | 81 | model_summary: 82 | _target_: pytorch_lightning.callbacks.ModelSummary 83 | max_depth: 1 84 | 85 | model_checkpoint: 86 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 87 | monitor: val_loss 88 | dirpath: ${probe_ckpt_dir} 89 | every_n_epochs: 1 90 | mode: min 91 | save_top_k: 1 92 | save_last: False 93 | filename: ${codec_name}_${mode}_{epoch}-{val_loss:.4f} 94 | verbose: True 95 | 96 | tensorboard: 97 | _target_: pytorch_lightning.loggers.TensorBoardLogger 98 | save_dir: ??? 99 | name: ${codec_name}_${mode} 100 | log_graph: true -------------------------------------------------------------------------------- /codec_evaluation/probe/config/NSynthI_dataset/YuE.yaml: -------------------------------------------------------------------------------- 1 | mode: quantized_emb 2 | sample_rate: 16000 3 | target_sec: 4 4 | num_outputs: 11 5 | probe_ckpt_dir: ??? 6 | seed: 666 7 | codec_name: yue 8 | task: multiclass 9 | save_result: null 10 | 11 | 12 | trainer: 13 | _target_: pytorch_lightning.Trainer 14 | accelerator: gpu 15 | devices: ??? 16 | precision: 32 17 | max_epochs: 50 18 | limit_val_batches: 10 19 | log_every_n_steps: 5 20 | val_check_interval: 1.0 21 | 22 | data: 23 | _target_: codec_evaluation.probe.dataset.NSynthI_dataset.NSynthI_dataset.NSynthIdataModule 24 | dataset_args: 25 | sample_rate: ${sample_rate} 26 | target_sec: ${target_sec} 27 | train_audio_dir: ??? 28 | val_audio_dir: ??? 29 | test_audio_dir: ??? 30 | train_batch_size: 256 31 | val_batch_size: 32 32 | test_batch_size: 256 33 | train_num_workers: 8 34 | val_num_workers: 4 35 | test_num_workers: 4 36 | 37 | 38 | model: 39 | _target_: codec_evaluation.probe.model.lit_prober.Prober 40 | codec_name: ${codec_name} 41 | sample_rate: ${sample_rate} 42 | mode: ${mode} 43 | task: ${task} 44 | num_outputs: ${num_outputs} 45 | probe_model_builder: 46 | _target_: codec_evaluation.probe.model.multiclass_model.MulticlassProber 47 | _partial_: true 48 | num_outputs: ${num_outputs} 49 | drop_out: 0.1 50 | channel_reduction: 16 51 | padding: 1 52 | kernel_size: 3 53 | stride: 1 54 | target_sec: ${target_sec} 55 | model_ckpt_dir: ??? 
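# In the lr_scheduler_builder that follows, lr_lambda is itself a partial that LambdaLR calls
# with the step index; the base lr is multiplied by the returned factor. The function name
# suggests a linear warmup followed by a cosine decay to final_lr_ratio * lr. A sketch of that
# shape, as an assumption about codec_evaluation.utils.schedule rather than a copy of it:
#   import math
#   def factor(step, warmup=10, total=10000, floor=0.2):
#       if step < warmup:
#           return step / max(1, warmup)                # linear ramp from 0 to 1
#       t = (step - warmup) / max(1, total - warmup)    # decay progress in [0, 1]
#       return floor + 0.5 * (1.0 - floor) * (1.0 + math.cos(math.pi * t))  # 1.0 down to floor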
56 | 57 | optimizer_builder: 58 | _target_: torch.optim.AdamW 59 | _partial_: true 60 | lr: 1e-4 61 | betas: [0.8, 0.99] 62 | eps: 1e-5 63 | weight_decay: 0.08 64 | 65 | lr_scheduler_builder: 66 | _target_: torch.optim.lr_scheduler.LambdaLR 67 | _partial_: true 68 | lr_lambda: 69 | _target_: codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda 70 | _partial_: true 71 | num_warmup_steps: 10 72 | num_training_steps: 10000 73 | final_lr_ratio: 0.2 74 | 75 | callbacks: 76 | learning_rate_monitor: 77 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 78 | logging_interval: step 79 | 80 | rich_progress_bar: 81 | _target_: pytorch_lightning.callbacks.RichProgressBar 82 | 83 | model_summary: 84 | _target_: pytorch_lightning.callbacks.ModelSummary 85 | max_depth: 1 86 | 87 | model_checkpoint: 88 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 89 | monitor: val_loss 90 | dirpath: ${probe_ckpt_dir} 91 | every_n_epochs: 1 92 | mode: min 93 | save_top_k: 1 94 | filename: ${codec_name}_${mode}_{epoch}-{val_loss:.4f} 95 | verbose: True 96 | 97 | tensorboard: 98 | _target_: pytorch_lightning.loggers.TensorBoardLogger 99 | save_dir: ??? 100 | name: ${codec_name}_${mode} 101 | log_graph: true -------------------------------------------------------------------------------- /codec_evaluation/probe/config/GS_dataset/encodec.yaml: -------------------------------------------------------------------------------- 1 | mode: quantized_emb 2 | sample_rate: 44100 3 | target_sec: 10 4 | num_outputs: 24 5 | probe_ckpt_dir: ??? 6 | seed: 666 7 | codec_name: encodec # change as needed 8 | task: multiclass 9 | save_result: null 10 | 11 | trainer: 12 | _target_: pytorch_lightning.Trainer 13 | accelerator: gpu 14 | devices: ??? 15 | precision: 32 16 | max_epochs: 100 17 | limit_val_batches: 10 18 | log_every_n_steps: 5 19 | val_check_interval: 1.0 20 | 21 | data: 22 | _target_: codec_evaluation.probe.dataset.GS_dataset.GS_dataset.GSdataModule 23 | dataset_args: 24 | sample_rate: ${sample_rate} 25 | target_sec: ${target_sec} 26 | train_audio_dir: ??? 27 | val_audio_dir: ??? 28 | test_audio_dir: ??? 29 | train_batch_size: 8 30 | val_batch_size: 1 31 | test_batch_size: 8 32 | train_num_workers: 4 33 | val_num_workers: 2 34 | test_num_workers: 2 35 | 36 | model: 37 | _target_: codec_evaluation.probe.model.lit_prober.Prober 38 | codec_name: ${codec_name} 39 | sample_rate: ${sample_rate} 40 | mode: ${mode} 41 | task: ${task} 42 | num_outputs: ${num_outputs} 43 | probe_model_builder: 44 | _target_: codec_evaluation.probe.model.multiclass_model.MulticlassProber 45 | _partial_: true 46 | num_outputs: ${num_outputs} 47 | drop_out: 0.1 48 | channel_reduction: 16 49 | padding: 1 50 | kernel_size: 3 51 | stride: 1 52 | target_sec: ${target_sec} 53 | model_ckpt_dir: ???
54 | 55 | optimizer_builder: 56 | _target_: torch.optim.AdamW 57 | _partial_: true 58 | lr: 1e-4 59 | betas: [0.8, 0.99] 60 | eps: 1e-5 61 | weight_decay: 0.08 62 | 63 | lr_scheduler_builder: 64 | _target_: torch.optim.lr_scheduler.LambdaLR 65 | _partial_: true 66 | lr_lambda: 67 | _target_: codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda 68 | _partial_: true 69 | num_warmup_steps: 10 70 | num_training_steps: 10000 71 | final_lr_ratio: 0.2 72 | 73 | callbacks: 74 | learning_rate_monitor: 75 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 76 | logging_interval: step 77 | 78 | rich_progress_bar: 79 | _target_: pytorch_lightning.callbacks.RichProgressBar 80 | 81 | model_summary: 82 | _target_: pytorch_lightning.callbacks.ModelSummary 83 | max_depth: 1 84 | 85 | model_checkpoint: 86 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 87 | monitor: val_loss 88 | dirpath: ${probe_ckpt_dir} 89 | every_n_epochs: 1 90 | mode: min 91 | save_top_k: 1 92 | save_last: False 93 | filename: ${codec_name}_${mode}_{epoch}-{val_loss:.4f} 94 | verbose: True 95 | 96 | tensorboard: 97 | _target_: pytorch_lightning.loggers.TensorBoardLogger 98 | save_dir: ??? 99 | name: ${codec_name}_${mode} 100 | log_graph: true -------------------------------------------------------------------------------- /codec_evaluation/probe/config/MELD_dataset/dac.yaml: -------------------------------------------------------------------------------- 1 | mode: quantized_emb 2 | sample_rate: 48000 3 | target_sec: 8 4 | num_outputs: 7 5 | probe_ckpt_dir: ??? 6 | seed: 666 7 | codec_name: dac # change as needed 8 | task: multiclass 9 | save_result: null 10 | 11 | trainer: 12 | _target_: pytorch_lightning.Trainer 13 | accelerator: gpu 14 | devices: ??? 15 | precision: 32 16 | max_epochs: 100 17 | limit_val_batches: 10 18 | log_every_n_steps: 5 19 | val_check_interval: 1.0 20 | 21 | data: 22 | _target_: codec_evaluation.probe.dataset.MELD_dataset.MELD_dataset.MELDdataModule 23 | dataset_args: 24 | sample_rate: ${sample_rate} 25 | target_sec: ${target_sec} 26 | train_audio_dir: ??? 27 | val_audio_dir: ??? 28 | test_audio_dir: ??? 29 | train_batch_size: 64 30 | val_batch_size: 2 31 | test_batch_size: 32 32 | train_num_workers: 8 33 | val_num_workers: 4 34 | test_num_workers: 4 35 | 36 | model: 37 | _target_: codec_evaluation.probe.model.lit_prober.Prober 38 | codec_name: ${codec_name} 39 | sample_rate: ${sample_rate} 40 | mode: ${mode} 41 | task: ${task} 42 | num_outputs: ${num_outputs} 43 | probe_model_builder: 44 | _target_: codec_evaluation.probe.model.multiclass_model.MulticlassProber 45 | _partial_: true 46 | num_outputs: ${num_outputs} 47 | drop_out: 0.2 48 | channel_reduction: 16 49 | padding: 1 50 | kernel_size: 3 51 | stride: 1 52 | target_sec: ${target_sec} 53 | model_ckpt_dir: ???
54 | 55 | optimizer_builder: 56 | _target_: torch.optim.AdamW 57 | _partial_: true 58 | lr: 1e-4 59 | betas: [0.8, 0.99] 60 | eps: 1e-5 61 | weight_decay: 0.08 62 | 63 | lr_scheduler_builder: 64 | _target_: torch.optim.lr_scheduler.LambdaLR 65 | _partial_: true 66 | lr_lambda: 67 | _target_: codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda 68 | _partial_: true 69 | num_warmup_steps: 10 70 | num_training_steps: 10000 71 | final_lr_ratio: 0.2 72 | 73 | callbacks: 74 | learning_rate_monitor: 75 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 76 | logging_interval: step 77 | 78 | rich_progress_bar: 79 | _target_: pytorch_lightning.callbacks.RichProgressBar 80 | 81 | model_summary: 82 | _target_: pytorch_lightning.callbacks.ModelSummary 83 | max_depth: 1 84 | 85 | model_checkpoint: 86 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 87 | monitor: val_loss 88 | dirpath: ${probe_ckpt_dir} 89 | every_n_epochs: 1 90 | mode: min 91 | save_top_k: 1 92 | save_last: False 93 | filename: ${codec_name}_${mode}_{epoch}-{val_loss:.4f} 94 | verbose: True 95 | 96 | tensorboard: 97 | _target_: pytorch_lightning.loggers.TensorBoardLogger 98 | save_dir: ??? 99 | name: ${codec_name}_${mode} 100 | log_graph: true -------------------------------------------------------------------------------- /codec_evaluation/probe/config/NSynthP_dataset/encodec.yaml: -------------------------------------------------------------------------------- 1 | mode: quantized_emb 2 | sample_rate: 16000 3 | target_sec: 4 4 | num_outputs: 128 5 | probe_ckpt_dir: ??? 6 | seed: 666 7 | codec_name: encodec 8 | task: multiclass 9 | save_result: null 10 | 11 | 12 | trainer: 13 | _target_: pytorch_lightning.Trainer 14 | accelerator: gpu 15 | devices: ??? 16 | precision: 32 17 | max_epochs: 100 18 | limit_val_batches: 10 19 | log_every_n_steps: 5 20 | val_check_interval: 1.0 21 | 22 | data: 23 | _target_: codec_evaluation.probe.dataset.NSynthP_dataset.NSynthP_dataset.NSynthPdataModule 24 | dataset_args: 25 | sample_rate: ${sample_rate} 26 | target_sec: ${target_sec} 27 | train_audio_dir: ??? 28 | val_audio_dir: ??? 29 | test_audio_dir: ??? 30 | train_batch_size: 256 31 | val_batch_size: 32 32 | test_batch_size: 128 33 | train_num_workers: 128 34 | val_num_workers: 8 35 | test_num_workers: 64 36 | 37 | model: 38 | _target_: codec_evaluation.probe.model.lit_prober.Prober 39 | codec_name: ${codec_name} 40 | sample_rate: ${sample_rate} 41 | mode: ${mode} 42 | task: ${task} 43 | num_outputs: ${num_outputs} 44 | probe_model_builder: 45 | _target_: codec_evaluation.probe.model.multiclass_model.MulticlassProber 46 | _partial_: true 47 | num_outputs: ${num_outputs} 48 | drop_out: 0.1 49 | channel_reduction: 16 50 | padding: 1 51 | kernel_size: 3 52 | stride: 1 53 | target_sec: ${target_sec} 54 | model_ckpt_dir: ??? 
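# ??? is OmegaConf's mandatory-missing marker: resolution fails until a value is supplied,
# typically as a command-line override. A hypothetical invocation (the entry point and config
# name are assumptions, not taken from this repo):
#   python -m codec_evaluation.probe.train --config-name encodec \
#       probe_ckpt_dir=/path/to/ckpts trainer.devices=1 \
#       data.dataset_args.train_audio_dir=/path/to/train \
#       data.dataset_args.val_audio_dir=/path/to/val \
#       data.dataset_args.test_audio_dir=/path/to/test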
55 | 56 | optimizer_builder: 57 | _target_: torch.optim.AdamW 58 | _partial_: true 59 | lr: 1e-4 60 | betas: [0.8, 0.99] 61 | eps: 1e-5 62 | weight_decay: 0.08 63 | 64 | lr_scheduler_builder: 65 | _target_: torch.optim.lr_scheduler.LambdaLR 66 | _partial_: true 67 | lr_lambda: 68 | _target_: codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda 69 | _partial_: true 70 | num_warmup_steps: 10 71 | num_training_steps: 10000 72 | final_lr_ratio: 0.2 73 | 74 | callbacks: 75 | learning_rate_monitor: 76 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 77 | logging_interval: step 78 | 79 | rich_progress_bar: 80 | _target_: pytorch_lightning.callbacks.RichProgressBar 81 | 82 | model_summary: 83 | _target_: pytorch_lightning.callbacks.ModelSummary 84 | max_depth: 1 85 | 86 | model_checkpoint: 87 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 88 | monitor: val_loss 89 | dirpath: ${probe_ckpt_dir} 90 | every_n_epochs: 1 91 | mode: min 92 | save_top_k: 1 93 | filename: ${codec_name}_${mode}_{epoch}-{val_loss:.4f} 94 | verbose: True 95 | 96 | tensorboard: 97 | _target_: pytorch_lightning.loggers.TensorBoardLogger 98 | save_dir: ??? 99 | name: ${codec_name}_${mode} 100 | log_graph: true -------------------------------------------------------------------------------- /codec_evaluation/probe/config/MELD_dataset/mimi.yaml: -------------------------------------------------------------------------------- 1 | mode: quantized_emb 2 | sample_rate: 48000 3 | target_sec: 8 4 | num_outputs: 7 5 | probe_ckpt_dir: ??? 6 | seed: 666 7 | codec_name: mimi # change as needed 8 | task: multiclass 9 | save_result: null 10 | 11 | trainer: 12 | _target_: pytorch_lightning.Trainer 13 | accelerator: gpu 14 | devices: ??? 15 | precision: 32 16 | max_epochs: 100 17 | limit_val_batches: 10 18 | log_every_n_steps: 5 19 | val_check_interval: 1.0 20 | 21 | data: 22 | _target_: codec_evaluation.probe.dataset.MELD_dataset.MELD_dataset.MELDdataModule 23 | dataset_args: 24 | sample_rate: ${sample_rate} 25 | target_sec: ${target_sec} 26 | train_audio_dir: ??? 27 | val_audio_dir: ??? 28 | test_audio_dir: ??? 29 | train_batch_size: 64 30 | val_batch_size: 2 31 | test_batch_size: 32 32 | train_num_workers: 8 33 | val_num_workers: 4 34 | test_num_workers: 4 35 | 36 | model: 37 | _target_: codec_evaluation.probe.model.lit_prober.Prober 38 | codec_name: ${codec_name} 39 | sample_rate: ${sample_rate} 40 | mode: ${mode} 41 | task: ${task} 42 | num_outputs: ${num_outputs} 43 | probe_model_builder: 44 | _target_: codec_evaluation.probe.model.multiclass_model.MulticlassProber 45 | _partial_: true 46 | num_outputs: ${num_outputs} 47 | drop_out: 0.2 48 | channel_reduction: 16 49 | padding: 1 50 | kernel_size: 3 51 | stride: 1 52 | target_sec: ${target_sec} 53 | model_ckpt_dir: ???
54 | 55 | optimizer_builder: 56 | _target_: torch.optim.AdamW 57 | _partial_: true 58 | lr: 1e-4 59 | betas: [0.8, 0.99] 60 | eps: 1e-5 61 | weight_decay: 0.08 62 | 63 | lr_scheduler_builder: 64 | _target_: torch.optim.lr_scheduler.LambdaLR 65 | _partial_: true 66 | lr_lambda: 67 | _target_: codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda 68 | _partial_: true 69 | num_warmup_steps: 10 70 | num_training_steps: 10000 71 | final_lr_ratio: 0.2 72 | 73 | callbacks: 74 | learning_rate_monitor: 75 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 76 | logging_interval: step 77 | 78 | rich_progress_bar: 79 | _target_: pytorch_lightning.callbacks.RichProgressBar 80 | 81 | model_summary: 82 | _target_: pytorch_lightning.callbacks.ModelSummary 83 | max_depth: 1 84 | 85 | model_checkpoint: 86 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 87 | monitor: val_loss 88 | dirpath: ${probe_ckpt_dir} 89 | every_n_epochs: 1 90 | mode: min 91 | save_top_k: 1 92 | save_last: False 93 | filename: ${codec_name}_${mode}_{epoch}-{val_loss:.4f} 94 | verbose: True 95 | 96 | tensorboard: 97 | _target_: pytorch_lightning.loggers.TensorBoardLogger 98 | save_dir: ??? 99 | name: ${codec_name}_${mode} 100 | log_graph: true -------------------------------------------------------------------------------- /codec_evaluation/probe/config/NSynthP_dataset/YuE.yaml: -------------------------------------------------------------------------------- 1 | mode: quantized_emb 2 | sample_rate: 16000 3 | target_sec: 4 4 | n_segments: 1 5 | num_outputs: 128 6 | probe_ckpt_dir: ??? 7 | seed: 666 8 | codec_name: yue 9 | task: multiclass 10 | save_result: null 11 | 12 | trainer: 13 | _target_: pytorch_lightning.Trainer 14 | accelerator: gpu 15 | devices: ??? 16 | precision: 32 17 | max_epochs: 100 18 | limit_val_batches: 10 19 | log_every_n_steps: 5 20 | val_check_interval: 1.0 21 | 22 | data: 23 | _target_: codec_evaluation.probe.dataset.NSynthP_dataset.NSynthP_dataset.NSynthPdataModule 24 | dataset_args: 25 | sample_rate: ${sample_rate} 26 | target_sec: ${target_sec} 27 | train_audio_dir: ??? 28 | val_audio_dir: ??? 29 | test_audio_dir: ??? 30 | train_batch_size: 256 31 | val_batch_size: 32 32 | test_batch_size: 128 33 | train_num_workers: 128 34 | val_num_workers: 8 35 | test_num_workers: 64 36 | 37 | model: 38 | _target_: codec_evaluation.probe.model.lit_prober.Prober 39 | codec_name: ${codec_name} 40 | sample_rate: ${sample_rate} 41 | mode: ${mode} 42 | task: ${task} 43 | num_outputs: ${num_outputs} 44 | probe_model_builder: 45 | _target_: codec_evaluation.probe.model.multiclass_model.MulticlassProber 46 | _partial_: true 47 | num_outputs: ${num_outputs} 48 | drop_out: 0.2 49 | channel_reduction: 16 50 | padding: 1 51 | kernel_size: 3 52 | stride: 1 53 | target_sec: ${target_sec} 54 | model_ckpt_dir: ??? 
55 | 56 | optimizer_builder: 57 | _target_: torch.optim.AdamW 58 | _partial_: true 59 | lr: 1e-4 60 | betas: [0.8, 0.99] 61 | eps: 1e-5 62 | weight_decay: 0.08 63 | 64 | lr_scheduler_builder: 65 | _target_: torch.optim.lr_scheduler.LambdaLR 66 | _partial_: true 67 | lr_lambda: 68 | _target_: codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda 69 | _partial_: true 70 | num_warmup_steps: 10 71 | num_training_steps: 10000 72 | final_lr_ratio: 0.2 73 | 74 | callbacks: 75 | learning_rate_monitor: 76 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 77 | logging_interval: step 78 | 79 | rich_progress_bar: 80 | _target_: pytorch_lightning.callbacks.RichProgressBar 81 | 82 | model_summary: 83 | _target_: pytorch_lightning.callbacks.ModelSummary 84 | max_depth: 1 85 | 86 | model_checkpoint: 87 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 88 | monitor: val_loss 89 | dirpath: ${probe_ckpt_dir} 90 | every_n_epochs: 1 91 | mode: min 92 | save_top_k: 1 93 | filename: ${codec_name}_${mode}_{epoch}-{val_loss:.4f} 94 | verbose: True 95 | 96 | tensorboard: 97 | _target_: pytorch_lightning.loggers.TensorBoardLogger 98 | save_dir: ??? 99 | name: ${codec_name}_${mode} 100 | log_graph: true -------------------------------------------------------------------------------- /codec_evaluation/codecs/levo_modules/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/models/musicfm/model/w2v2_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "activation_dropout": 0.1, 3 | "adapter_kernel_size": 3, 4 | "adapter_stride": 2, 5 | "add_adapter": false, 6 | "apply_spec_augment": true, 7 | "architectures": [ 8 | "Wav2Vec2ConformerForCTC" 9 | ], 10 | "attention_dropout": 0.1, 11 | "bos_token_id": 1, 12 | "classifier_proj_size": 256, 13 | "codevector_dim": 768, 14 | "conformer_conv_dropout": 0.1, 15 | "contrastive_logits_temperature": 0.1, 16 | "conv_bias": true, 17 | "conv_depthwise_kernel_size": 31, 18 | "conv_dim": [ 19 | 512, 20 | 512, 21 | 512, 22 | 512, 23 | 512, 24 | 512, 25 | 512 26 | ], 27 | "conv_kernel": [ 28 | 10, 29 | 3, 30 | 3, 31 | 3, 32 | 3, 33 | 2, 34 | 2 35 | ], 36 | "conv_stride": [ 37 | 5, 38 | 2, 39 | 2, 40 | 2, 41 | 2, 42 | 2, 43 | 2 44 | ], 45 | "ctc_loss_reduction": "sum", 46 | "ctc_zero_infinity": false, 47 | "diversity_loss_weight": 0.1, 48 | "do_stable_layer_norm": true, 49 | "eos_token_id": 2, 50 | "feat_extract_activation": "gelu", 51 | "feat_extract_dropout": 0.0, 52 | "feat_extract_norm": "layer", 53 | "feat_proj_dropout": 0.1, 54 | "feat_quantizer_dropout": 0.0, 55 | "final_dropout": 0.1, 56 | "gradient_checkpointing": false, 57 | "hidden_act": "swish", 58 | "hidden_dropout": 0.1, 59 | "hidden_dropout_prob": 0.1, 60 | "hidden_size": 1024, 61 | "initializer_range": 0.02, 62 | "intermediate_size": 4096, 63 | "layer_norm_eps": 1e-05, 64 | "layerdrop": 0.0, 65 | "mask_feature_length": 10, 66 | "mask_feature_min_masks": 0, 67 | "mask_feature_prob": 0.0, 68 | "mask_time_length": 10, 69 | "mask_time_min_masks": 2, 70 | "mask_time_prob": 0.05, 71 | "max_source_positions": 5000, 72 | "model_type": "wav2vec2-conformer", 73 | "num_adapter_layers": 3, 74 | "num_attention_heads": 16, 75 | "num_codevector_groups": 2, 76 | "num_codevectors_per_group": 320, 77 | "num_conv_pos_embedding_groups": 16, 78 | "num_conv_pos_embeddings": 128, 79 | "num_feat_extract_layers": 7, 80 | "num_hidden_layers": 24, 81 | "num_negatives": 100, 82 | "output_hidden_size": 1024, 83 | "pad_token_id": 0, 84 | "position_embeddings_type": "rotary", 85 | 
"proj_codevector_dim": 768, 86 | "rotary_embedding_base": 10000, 87 | "tdnn_dilation": [ 88 | 1, 89 | 2, 90 | 3, 91 | 1, 92 | 1 93 | ], 94 | "tdnn_dim": [ 95 | 512, 96 | 512, 97 | 512, 98 | 512, 99 | 1500 100 | ], 101 | "tdnn_kernel": [ 102 | 5, 103 | 3, 104 | 3, 105 | 1, 106 | 1 107 | ], 108 | "torch_dtype": "float32", 109 | "transformers_version": "4.19.0.dev0", 110 | "use_weighted_layer_sum": false, 111 | "vocab_size": 32, 112 | "xvector_output_dim": 512 113 | } 114 | -------------------------------------------------------------------------------- /codec_evaluation/probe/config/ESC50_dataset/YuE.yaml: -------------------------------------------------------------------------------- 1 | mode: quantized_emb 2 | sample_rate: 44100 3 | target_sec: 5 4 | num_outputs: 50 5 | probe_ckpt_dir: ??? 6 | seed: 666 7 | codec_name: yue # 需要更改 8 | task: multiclass 9 | save_result: null 10 | 11 | trainer: 12 | _target_: pytorch_lightning.Trainer 13 | accelerator: gpu 14 | devices: ??? 15 | precision: 32 16 | max_epochs: 100 17 | limit_val_batches: 10 18 | log_every_n_steps: 5 19 | val_check_interval: 1.0 20 | 21 | data: 22 | _target_: codec_evaluation.probe.dataset.ESC50_dataset.ESC50_dataset.ESC50dataModule 23 | dataset_args: 24 | sample_rate: ${sample_rate} 25 | target_sec: ${target_sec} 26 | train_audio_dir: ??? 27 | val_audio_dir: ??? 28 | test_audio_dir: ??? 29 | train_batch_size: 128 30 | val_batch_size: 2 31 | test_batch_size: 128 32 | train_num_workers: 32 33 | val_num_workers: 4 34 | test_num_workers: 32 35 | 36 | model: 37 | _target_: codec_evaluation.probe.model.lit_prober.Prober 38 | codec_name: ${codec_name} 39 | sample_rate: ${sample_rate} 40 | mode: ${mode} 41 | task: ${task} 42 | num_outputs: ${num_outputs} 43 | probe_model_builder: 44 | _target_: codec_evaluation.probe.model.multiclass_model.MulticlassProber 45 | _partial_: true 46 | num_outputs: ${num_outputs} 47 | drop_out: 0.1 48 | channel_reduction: 16 49 | padding: 1 50 | kernel_size: 3 51 | stride: 1 52 | target_sec: ${target_sec} 53 | model_ckpt_dir: ??? 54 | 55 | optimizer_builder: 56 | _target_: torch.optim.AdamW 57 | _partial_: true 58 | lr: 1e-4 59 | betas: [0.8, 0.99] 60 | eps: 1e-5 61 | weight_decay: 0.08 62 | 63 | lr_scheduler_builder: 64 | _target_: torch.optim.lr_scheduler.LambdaLR 65 | _partial_: true 66 | lr_lambda: 67 | _target_: codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda 68 | _partial_: true 69 | num_warmup_steps: 10 70 | num_training_steps: 10000 71 | final_lr_ratio: 0.2 72 | 73 | callbacks: 74 | learning_rate_monitor: 75 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 76 | logging_interval: step 77 | 78 | rich_progress_bar: 79 | _target_: pytorch_lightning.callbacks.RichProgressBar 80 | 81 | model_summary: 82 | _target_: pytorch_lightning.callbacks.ModelSummary 83 | max_depth: 1 84 | 85 | model_checkpoint: 86 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 87 | monitor: val_loss 88 | dirpath: ${probe_ckpt_dir} 89 | every_n_epochs: 1 90 | mode: min 91 | save_top_k: 1 92 | save_last: False 93 | filename: ${codec_name}_${mode}_{epoch}-{val_loss:.4f} 94 | verbose: True 95 | 96 | tensorboard: 97 | _target_: pytorch_lightning.loggers.TensorBoardLogger 98 | save_dir: ??? 
99 | name: ${codec_name}_${mode} 100 | log_graph: true -------------------------------------------------------------------------------- /codec_evaluation/probe/config/ESC50_dataset/dac.yaml: -------------------------------------------------------------------------------- 1 | mode: quantized_emb 2 | sample_rate: 44100 3 | target_sec: 5 4 | num_outputs: 50 5 | probe_ckpt_dir: ??? 6 | seed: 666 7 | codec_name: dac # change as needed 8 | task: multiclass 9 | save_result: null 10 | 11 | trainer: 12 | _target_: pytorch_lightning.Trainer 13 | accelerator: gpu 14 | devices: ??? 15 | precision: 32 16 | max_epochs: 100 17 | limit_val_batches: 10 18 | log_every_n_steps: 5 19 | val_check_interval: 1.0 20 | 21 | data: 22 | _target_: codec_evaluation.probe.dataset.ESC50_dataset.ESC50_dataset.ESC50dataModule 23 | dataset_args: 24 | sample_rate: ${sample_rate} 25 | target_sec: ${target_sec} 26 | train_audio_dir: ??? 27 | val_audio_dir: ??? 28 | test_audio_dir: ??? 29 | train_batch_size: 128 30 | val_batch_size: 2 31 | test_batch_size: 128 32 | train_num_workers: 32 33 | val_num_workers: 4 34 | test_num_workers: 32 35 | 36 | model: 37 | _target_: codec_evaluation.probe.model.lit_prober.Prober 38 | codec_name: ${codec_name} 39 | sample_rate: ${sample_rate} 40 | mode: ${mode} 41 | task: ${task} 42 | num_outputs: ${num_outputs} 43 | probe_model_builder: 44 | _target_: codec_evaluation.probe.model.multiclass_model.MulticlassProber 45 | _partial_: true 46 | num_outputs: ${num_outputs} 47 | drop_out: 0.1 48 | channel_reduction: 16 49 | padding: 1 50 | kernel_size: 3 51 | stride: 1 52 | target_sec: ${target_sec} 53 | model_ckpt_dir: ??? 54 | 55 | optimizer_builder: 56 | _target_: torch.optim.AdamW 57 | _partial_: true 58 | lr: 1e-4 59 | betas: [0.8, 0.99] 60 | eps: 1e-5 61 | weight_decay: 0.08 62 | 63 | lr_scheduler_builder: 64 | _target_: torch.optim.lr_scheduler.LambdaLR 65 | _partial_: true 66 | lr_lambda: 67 | _target_: codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda 68 | _partial_: true 69 | num_warmup_steps: 10 70 | num_training_steps: 10000 71 | final_lr_ratio: 0.2 72 | 73 | callbacks: 74 | learning_rate_monitor: 75 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 76 | logging_interval: step 77 | 78 | rich_progress_bar: 79 | _target_: pytorch_lightning.callbacks.RichProgressBar 80 | 81 | model_summary: 82 | _target_: pytorch_lightning.callbacks.ModelSummary 83 | max_depth: 1 84 | 85 | model_checkpoint: 86 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 87 | monitor: val_loss 88 | dirpath: ${probe_ckpt_dir} 89 | every_n_epochs: 1 90 | mode: min 91 | save_top_k: 1 92 | save_last: False 93 | filename: ${codec_name}_${mode}_{epoch}-{val_loss:.4f} 94 | verbose: True 95 | 96 | tensorboard: 97 | _target_: pytorch_lightning.loggers.TensorBoardLogger 98 | save_dir: ??? 99 | name: ${codec_name}_${mode} 100 | log_graph: true -------------------------------------------------------------------------------- /codec_evaluation/probe/config/ESC50_dataset/encodec.yaml: -------------------------------------------------------------------------------- 1 | mode: quantized_emb 2 | sample_rate: 44100 3 | target_sec: 5 4 | num_outputs: 50 5 | probe_ckpt_dir: ??? 6 | seed: 666 7 | codec_name: encodec 8 | task: multiclass 9 | save_result: null 10 | 11 | 12 | trainer: 13 | _target_: pytorch_lightning.Trainer 14 | accelerator: gpu 15 | devices: ???
16 | precision: 32 17 | max_epochs: 100 18 | limit_val_batches: 10 19 | log_every_n_steps: 5 20 | val_check_interval: 1.0 21 | 22 | data: 23 | _target_: codec_evaluation.probe.dataset.ESC50_dataset.ESC50_dataset.ESC50dataModule 24 | dataset_args: 25 | sample_rate: ${sample_rate} 26 | target_sec: ${target_sec} 27 | train_audio_dir: ??? 28 | val_audio_dir: ??? 29 | test_audio_dir: ??? 30 | train_batch_size: 128 31 | val_batch_size: 2 32 | test_batch_size: 128 33 | train_num_workers: 32 34 | val_num_workers: 4 35 | test_num_workers: 32 36 | 37 | 38 | model: 39 | _target_: codec_evaluation.probe.model.lit_prober.Prober 40 | codec_name: ${codec_name} 41 | sample_rate: ${sample_rate} 42 | mode: ${mode} 43 | task: ${task} 44 | num_outputs: ${num_outputs} 45 | probe_model_builder: 46 | _target_: codec_evaluation.probe.model.multiclass_model.MulticlassProber 47 | _partial_: true 48 | num_outputs: ${num_outputs} 49 | drop_out: 0.1 50 | channel_reduction: 16 51 | padding: 1 52 | kernel_size: 3 53 | stride: 1 54 | target_sec: ${target_sec} 55 | model_ckpt_dir: ??? 56 | 57 | optimizer_builder: 58 | _target_: torch.optim.AdamW 59 | _partial_: true 60 | lr: 1e-4 61 | betas: [0.8, 0.99] 62 | eps: 1e-5 63 | weight_decay: 0.08 64 | 65 | lr_scheduler_builder: 66 | _target_: torch.optim.lr_scheduler.LambdaLR 67 | _partial_: true 68 | lr_lambda: 69 | _target_: codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda 70 | _partial_: true 71 | num_warmup_steps: 10 72 | num_training_steps: 10000 73 | final_lr_ratio: 0.2 74 | 75 | callbacks: 76 | learning_rate_monitor: 77 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 78 | logging_interval: step 79 | 80 | rich_progress_bar: 81 | _target_: pytorch_lightning.callbacks.RichProgressBar 82 | 83 | model_summary: 84 | _target_: pytorch_lightning.callbacks.ModelSummary 85 | max_depth: 1 86 | 87 | model_checkpoint: 88 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 89 | monitor: val_loss 90 | dirpath: ${probe_ckpt_dir} 91 | every_n_epochs: 1 92 | mode: min 93 | save_top_k: 1 94 | save_last: False 95 | filename: ${codec_name}_${mode}_{epoch}-{val_loss:.4f} 96 | verbose: True 97 | 98 | tensorboard: 99 | _target_: pytorch_lightning.loggers.TensorBoardLogger 100 | save_dir: ??? 101 | name: ${codec_name}_${mode} 102 | log_graph: true -------------------------------------------------------------------------------- /codec_evaluation/probe/config/ESC50_dataset/mimi.yaml: -------------------------------------------------------------------------------- 1 | mode: quantized_emb 2 | sample_rate: 16000 3 | target_sec: 5 4 | num_outputs: 50 5 | probe_ckpt_dir: ??? 6 | seed: 666 7 | codec_name: mimi # change as needed 8 | task: multiclass 9 | save_result: null 10 | 11 | trainer: 12 | _target_: pytorch_lightning.Trainer 13 | accelerator: gpu 14 | devices: ??? 15 | precision: 32 16 | max_epochs: 100 17 | limit_val_batches: 10 18 | log_every_n_steps: 5 19 | val_check_interval: 1.0 20 | 21 | data: 22 | _target_: codec_evaluation.probe.dataset.ESC50_dataset.ESC50_dataset.ESC50dataModule 23 | dataset_args: 24 | sample_rate: ${sample_rate} 25 | target_sec: ${target_sec} 26 | train_audio_dir: ??? 27 | val_audio_dir: ??? 28 | test_audio_dir: ???
29 | train_batch_size: 128 30 | val_batch_size: 2 31 | test_batch_size: 128 32 | train_num_workers: 32 33 | val_num_workers: 4 34 | test_num_workers: 32 35 | 36 | model: 37 | _target_: codec_evaluation.probe.model.lit_prober.Prober 38 | codec_name: ${codec_name} 39 | sample_rate: ${sample_rate} 40 | mode: ${mode} 41 | task: ${task} 42 | num_outputs: ${num_outputs} 43 | probe_model_builder: 44 | _target_: codec_evaluation.probe.model.multiclass_model.MulticlassProber 45 | _partial_: true 46 | num_outputs: ${num_outputs} 47 | drop_out: 0.1 48 | channel_reduction: 16 49 | padding: 1 50 | kernel_size: 3 51 | stride: 1 52 | target_sec: ${target_sec} 53 | model_ckpt_dir: ??? 54 | 55 | optimizer_builder: 56 | _target_: torch.optim.AdamW 57 | _partial_: true 58 | lr: 1e-4 59 | betas: [0.8, 0.99] 60 | eps: 1e-5 61 | weight_decay: 0.08 62 | 63 | lr_scheduler_builder: 64 | _target_: torch.optim.lr_scheduler.LambdaLR 65 | _partial_: true 66 | lr_lambda: 67 | _target_: codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda 68 | _partial_: true 69 | num_warmup_steps: 10 70 | num_training_steps: 10000 71 | final_lr_ratio: 0.2 72 | 73 | callbacks: 74 | learning_rate_monitor: 75 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 76 | logging_interval: step 77 | 78 | rich_progress_bar: 79 | _target_: pytorch_lightning.callbacks.RichProgressBar 80 | 81 | model_summary: 82 | _target_: pytorch_lightning.callbacks.ModelSummary 83 | max_depth: 1 84 | 85 | model_checkpoint: 86 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 87 | monitor: val_loss 88 | dirpath: ${probe_ckpt_dir} 89 | every_n_epochs: 1 90 | mode: min 91 | save_top_k: 1 92 | save_last: False 93 | filename: ${codec_name}_${mode}_{epoch}-{val_loss:.4f} 94 | verbose: True 95 | 96 | tensorboard: 97 | _target_: pytorch_lightning.loggers.TensorBoardLogger 98 | save_dir: ??? 99 | name: ${codec_name}_${mode} 100 | log_graph: true -------------------------------------------------------------------------------- /codec_evaluation/probe/config/ESC50_dataset/semanticodec.yaml: -------------------------------------------------------------------------------- 1 | mode: quantized_emb 2 | sample_rate: 44100 3 | target_sec: 5 4 | num_outputs: 50 5 | probe_ckpt_dir: ??? 6 | seed: 666 7 | codec_name: semanticodec 8 | task: multiclass 9 | save_result: null 10 | 11 | trainer: 12 | _target_: pytorch_lightning.Trainer 13 | accelerator: gpu 14 | devices: ??? 15 | precision: 32 16 | max_epochs: 100 17 | limit_val_batches: 10 18 | log_every_n_steps: 5 19 | val_check_interval: 1.0 20 | 21 | data: 22 | _target_: codec_evaluation.probe.dataset.ESC50_dataset.ESC50_dataset.ESC50dataModule 23 | dataset_args: 24 | sample_rate: ${sample_rate} 25 | target_sec: ${target_sec} 26 | train_audio_dir: ??? 27 | val_audio_dir: ??? 28 | test_audio_dir: ??? 29 | train_batch_size: 128 30 | val_batch_size: 2 31 | test_batch_size: 128 32 | train_num_workers: 32 33 | val_num_workers: 4 34 | test_num_workers: 32 35 | 36 | model: 37 | _target_: codec_evaluation.probe.model.lit_prober.Prober 38 | codec_name: ${codec_name} 39 | sample_rate: ${sample_rate} 40 | mode: ${mode} 41 | task: ${task} 42 | num_outputs: ${num_outputs} 43 | probe_model_builder: 44 | _target_: codec_evaluation.probe.model.multiclass_model.MulticlassProber 45 | _partial_: true 46 | num_outputs: ${num_outputs} 47 | drop_out: 0.1 48 | channel_reduction: 16 49 | padding: 1 50 | kernel_size: 3 51 | stride: 1 52 | target_sec: ${target_sec} 53 | model_ckpt_dir: ??? 
54 | 55 | optimizer_builder: 56 | _target_: torch.optim.AdamW 57 | _partial_: true 58 | lr: 1e-4 59 | betas: [0.8, 0.99] 60 | eps: 1e-5 61 | weight_decay: 0.08 62 | 63 | lr_scheduler_builder: 64 | _target_: torch.optim.lr_scheduler.LambdaLR 65 | _partial_: true 66 | lr_lambda: 67 | _target_: codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda 68 | _partial_: true 69 | num_warmup_steps: 10 70 | num_training_steps: 10000 71 | final_lr_ratio: 0.2 72 | 73 | callbacks: 74 | learning_rate_monitor: 75 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 76 | logging_interval: step 77 | 78 | rich_progress_bar: 79 | _target_: pytorch_lightning.callbacks.RichProgressBar 80 | 81 | model_summary: 82 | _target_: pytorch_lightning.callbacks.ModelSummary 83 | max_depth: 1 84 | 85 | model_checkpoint: 86 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 87 | monitor: val_loss 88 | dirpath: ${probe_ckpt_dir} 89 | every_n_epochs: 1 90 | mode: min 91 | save_top_k: 1 92 | save_last: False 93 | filename: ${codec_name}_${mode}_{epoch}-{val_loss:.4f} 94 | verbose: True 95 | 96 | tensorboard: 97 | _target_: pytorch_lightning.loggers.TensorBoardLogger 98 | save_dir: ??? 99 | name: ${codec_name}_${mode} 100 | log_graph: true -------------------------------------------------------------------------------- /codec_evaluation/probe/config/ESC50_dataset/wavtokenizer.yaml: -------------------------------------------------------------------------------- 1 | mode: quantized_emb 2 | sample_rate: 44100 3 | target_sec: 5 4 | num_outputs: 50 5 | probe_ckpt_dir: ??? 6 | seed: 666 7 | codec_name: wavtokenizer 8 | task: multiclass 9 | save_result: null 10 | 11 | trainer: 12 | _target_: pytorch_lightning.Trainer 13 | accelerator: gpu 14 | devices: ??? 15 | precision: 32 16 | max_epochs: 100 17 | limit_val_batches: 10 18 | log_every_n_steps: 5 19 | val_check_interval: 1.0 20 | 21 | data: 22 | _target_: codec_evaluation.probe.dataset.ESC50_dataset.ESC50_dataset.ESC50dataModule 23 | dataset_args: 24 | sample_rate: ${sample_rate} 25 | target_sec: ${target_sec} 26 | train_audio_dir: ??? 27 | val_audio_dir: ??? 28 | test_audio_dir: ??? 29 | train_batch_size: 128 30 | val_batch_size: 2 31 | test_batch_size: 128 32 | train_num_workers: 32 33 | val_num_workers: 4 34 | test_num_workers: 32 35 | 36 | model: 37 | _target_: codec_evaluation.probe.model.lit_prober.Prober 38 | codec_name: ${codec_name} 39 | sample_rate: ${sample_rate} 40 | mode: ${mode} 41 | task: ${task} 42 | num_outputs: ${num_outputs} 43 | probe_model_builder: 44 | _target_: codec_evaluation.probe.model.multiclass_model.MulticlassProber 45 | _partial_: true 46 | num_outputs: ${num_outputs} 47 | drop_out: 0.1 48 | channel_reduction: 16 49 | padding: 1 50 | kernel_size: 3 51 | stride: 1 52 | target_sec: ${target_sec} 53 | model_ckpt_dir: ??? 
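# In the model_checkpoint filename below, ${codec_name} and ${mode} are resolved by OmegaConf
# when the config is loaded, while {epoch} and {val_loss:.4f} are filled in by Lightning's
# ModelCheckpoint at save time, yielding names like
# wavtokenizer_quantized_emb_epoch=12-val_loss=0.1234.ckpt; with save_top_k: 1 and mode: min,
# only the best checkpoint by val_loss is kept.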
54 | 55 | optimizer_builder: 56 | _target_: torch.optim.AdamW 57 | _partial_: true 58 | lr: 1e-4 59 | betas: [0.8, 0.99] 60 | eps: 1e-5 61 | weight_decay: 0.08 62 | 63 | lr_scheduler_builder: 64 | _target_: torch.optim.lr_scheduler.LambdaLR 65 | _partial_: true 66 | lr_lambda: 67 | _target_: codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda 68 | _partial_: true 69 | num_warmup_steps: 10 70 | num_training_steps: 10000 71 | final_lr_ratio: 0.2 72 | 73 | callbacks: 74 | learning_rate_monitor: 75 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 76 | logging_interval: step 77 | 78 | rich_progress_bar: 79 | _target_: pytorch_lightning.callbacks.RichProgressBar 80 | 81 | model_summary: 82 | _target_: pytorch_lightning.callbacks.ModelSummary 83 | max_depth: 1 84 | 85 | model_checkpoint: 86 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 87 | monitor: val_loss 88 | dirpath: ${probe_ckpt_dir} 89 | every_n_epochs: 1 90 | mode: min 91 | save_top_k: 1 92 | save_last: False 93 | filename: ${codec_name}_${mode}_{epoch}-{val_loss:.4f} 94 | verbose: True 95 | 96 | tensorboard: 97 | _target_: pytorch_lightning.loggers.TensorBoardLogger 98 | save_dir: ??? 99 | name: ${codec_name}_${mode} 100 | log_graph: true -------------------------------------------------------------------------------- /codec_evaluation/probe/config/GS_dataset/semanticodec.yaml: -------------------------------------------------------------------------------- 1 | mode: quantized_emb 2 | sample_rate: 44100 3 | target_sec: 10 4 | num_outputs: 24 5 | probe_ckpt_dir: ??? 6 | seed: 666 7 | codec_name: semanticodec # change as needed 8 | task: multiclass 9 | save_result: null 10 | 11 | trainer: 12 | _target_: pytorch_lightning.Trainer 13 | accelerator: gpu 14 | devices: ??? 15 | precision: 32 16 | max_epochs: 100 17 | limit_val_batches: 10 18 | log_every_n_steps: 5 19 | val_check_interval: 1.0 20 | 21 | data: 22 | _target_: codec_evaluation.probe.dataset.GS_dataset.GS_dataset.GSdataModule 23 | dataset_args: 24 | sample_rate: ${sample_rate} 25 | target_sec: ${target_sec} 26 | train_audio_dir: ??? 27 | val_audio_dir: ??? 28 | test_audio_dir: ??? 29 | train_batch_size: 8 30 | val_batch_size: 1 31 | test_batch_size: 8 32 | train_num_workers: 4 33 | val_num_workers: 2 34 | test_num_workers: 2 35 | 36 | model: 37 | _target_: codec_evaluation.probe.model.lit_prober.Prober 38 | codec_name: ${codec_name} 39 | sample_rate: ${sample_rate} 40 | mode: ${mode} 41 | task: ${task} 42 | num_outputs: ${num_outputs} 43 | probe_model_builder: 44 | _target_: codec_evaluation.probe.model.multiclass_model.MulticlassProber 45 | _partial_: true 46 | num_outputs: ${num_outputs} 47 | drop_out: 0.1 48 | channel_reduction: 16 49 | padding: 1 50 | kernel_size: 3 51 | stride: 1 52 | target_sec: ${target_sec} 53 | model_ckpt_dir: ???
54 | 55 | optimizer_builder: 56 | _target_: torch.optim.AdamW 57 | _partial_: true 58 | lr: 1e-4 59 | betas: [0.8, 0.99] 60 | eps: 1e-5 61 | weight_decay: 0.08 62 | 63 | lr_scheduler_builder: 64 | _target_: torch.optim.lr_scheduler.LambdaLR 65 | _partial_: true 66 | lr_lambda: 67 | _target_: codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda 68 | _partial_: true 69 | num_warmup_steps: 10 70 | num_training_steps: 10000 71 | final_lr_ratio: 0.2 72 | 73 | callbacks: 74 | learning_rate_monitor: 75 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 76 | logging_interval: step 77 | 78 | rich_progress_bar: 79 | _target_: pytorch_lightning.callbacks.RichProgressBar 80 | 81 | model_summary: 82 | _target_: pytorch_lightning.callbacks.ModelSummary 83 | max_depth: 1 84 | 85 | model_checkpoint: 86 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 87 | monitor: val_loss 88 | dirpath: ${probe_ckpt_dir} 89 | every_n_epochs: 1 90 | mode: min 91 | save_top_k: 1 92 | save_last: False 93 | filename: ${codec_name}_${mode}_{epoch}-{val_loss:.4f} 94 | verbose: True 95 | 96 | tensorboard: 97 | _target_: pytorch_lightning.loggers.TensorBoardLogger 98 | save_dir: ??? 99 | name: ${codec_name}_${mode} 100 | log_graph: true -------------------------------------------------------------------------------- /codec_evaluation/probe/config/GS_dataset/wavtokenizer.yaml: -------------------------------------------------------------------------------- 1 | mode: quantized_emb 2 | sample_rate: 44100 3 | target_sec: 10 4 | num_outputs: 24 5 | probe_ckpt_dir: ??? 6 | seed: 666 7 | codec_name: wavtokenizer # needs to be changed 8 | task: multiclass 9 | save_result: null 10 | 11 | trainer: 12 | _target_: pytorch_lightning.Trainer 13 | accelerator: gpu 14 | devices: ??? 15 | precision: 32 16 | max_epochs: 100 17 | limit_val_batches: 10 18 | log_every_n_steps: 5 19 | val_check_interval: 1.0 20 | 21 | data: 22 | _target_: codec_evaluation.probe.dataset.GS_dataset.GS_dataset.GSdataModule 23 | dataset_args: 24 | sample_rate: ${sample_rate} 25 | target_sec: ${target_sec} 26 | train_audio_dir: ??? 27 | val_audio_dir: ??? 28 | test_audio_dir: ??? 29 | train_batch_size: 8 30 | val_batch_size: 1 31 | test_batch_size: 8 32 | train_num_workers: 4 33 | val_num_workers: 2 34 | test_num_workers: 2 35 | 36 | model: 37 | _target_: codec_evaluation.probe.model.lit_prober.Prober 38 | codec_name: ${codec_name} 39 | sample_rate: ${sample_rate} 40 | mode: ${mode} 41 | task: ${task} 42 | num_outputs: ${num_outputs} 43 | probe_model_builder: 44 | _target_: codec_evaluation.probe.model.multiclass_model.MulticlassProber 45 | _partial_: true 46 | num_outputs: ${num_outputs} 47 | drop_out: 0.1 48 | channel_reduction: 16 49 | padding: 1 50 | kernel_size: 3 51 | stride: 1 52 | target_sec: ${target_sec} 53 | model_ckpt_dir: ???
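# Annotation (not part of the original file): the lr_lambda node that follows
# points at codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda,
# which by its name and parameters is presumably a linear warmup over
# num_warmup_steps followed by a cosine decay that bottoms out at
# final_lr_ratio * lr (here 0.2 * 1e-4) by num_training_steps; the actual
# implementation lives in codec_evaluation/utils/schedule.py and is not shown
# in this dump. A Python sketch of the assumed schedule appears at the end of
# this section.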
54 | 55 | optimizer_builder: 56 | _target_: torch.optim.AdamW 57 | _partial_: true 58 | lr: 1e-4 59 | betas: [0.8, 0.99] 60 | eps: 1e-5 61 | weight_decay: 0.08 62 | 63 | lr_scheduler_builder: 64 | _target_: torch.optim.lr_scheduler.LambdaLR 65 | _partial_: true 66 | lr_lambda: 67 | _target_: codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda 68 | _partial_: true 69 | num_warmup_steps: 10 70 | num_training_steps: 10000 71 | final_lr_ratio: 0.2 72 | 73 | callbacks: 74 | learning_rate_monitor: 75 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 76 | logging_interval: step 77 | 78 | rich_progress_bar: 79 | _target_: pytorch_lightning.callbacks.RichProgressBar 80 | 81 | model_summary: 82 | _target_: pytorch_lightning.callbacks.ModelSummary 83 | max_depth: 1 84 | 85 | model_checkpoint: 86 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 87 | monitor: val_loss 88 | dirpath: ${probe_ckpt_dir} 89 | every_n_epochs: 1 90 | mode: min 91 | save_top_k: 1 92 | save_last: False 93 | filename: ${codec_name}_${mode}_{epoch}-{val_loss:.4f} 94 | verbose: True 95 | 96 | tensorboard: 97 | _target_: pytorch_lightning.loggers.TensorBoardLogger 98 | save_dir: ??? 99 | name: ${codec_name}_${mode} 100 | log_graph: true -------------------------------------------------------------------------------- /codec_evaluation/probe/config/GTZAN_dataset/encodec.yaml: -------------------------------------------------------------------------------- 1 | mode: quantized_emb 2 | sample_rate: 22050 3 | target_sec: 10 4 | num_outputs: 10 5 | probe_ckpt_dir: ??? 6 | seed: 666 7 | codec_name: encodec 8 | task: multiclass 9 | save_result: null 10 | 11 | 12 | trainer: 13 | _target_: pytorch_lightning.Trainer 14 | accelerator: gpu 15 | devices: ??? 16 | precision: 32 17 | max_epochs: 100 18 | limit_val_batches: 10 19 | log_every_n_steps: 5 20 | val_check_interval: 1.0 21 | 22 | data: 23 | _target_: codec_evaluation.probe.dataset.GTZAN_dataset.GTZAN_dataset.GTZANdataModule 24 | dataset_args: 25 | sample_rate: ${sample_rate} 26 | target_sec: ${target_sec} 27 | train_audio_dir: ??? 28 | val_audio_dir: ??? 29 | test_audio_dir: ??? 30 | train_batch_size: 8 31 | val_batch_size: 1 32 | test_batch_size: 8 33 | train_num_workers: 8 34 | val_num_workers: 4 35 | test_num_workers: 4 36 | 37 | model: 38 | _target_: codec_evaluation.probe.model.lit_prober.Prober 39 | codec_name: ${codec_name} 40 | sample_rate: ${sample_rate} 41 | mode: ${mode} 42 | task: ${task} 43 | num_outputs: ${num_outputs} 44 | probe_model_builder: 45 | _target_: codec_evaluation.probe.model.multiclass_model.MulticlassProber 46 | _partial_: true 47 | num_outputs: ${num_outputs} 48 | drop_out: 0.1 49 | channel_reduction: 16 50 | padding: 1 51 | kernel_size: 3 52 | stride: 1 53 | target_sec: ${target_sec} 54 | model_ckpt_dir: ??? 
55 | 56 | optimizer_builder: 57 | _target_: torch.optim.AdamW 58 | _partial_: true 59 | lr: 1e-4 60 | betas: [0.8, 0.99] 61 | eps: 1e-5 62 | weight_decay: 0.08 63 | 64 | lr_scheduler_builder: 65 | _target_: torch.optim.lr_scheduler.LambdaLR 66 | _partial_: true 67 | lr_lambda: 68 | _target_: codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda 69 | _partial_: true 70 | num_warmup_steps: 10 71 | num_training_steps: 10000 72 | final_lr_ratio: 0.2 73 | 74 | callbacks: 75 | learning_rate_monitor: 76 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 77 | logging_interval: step 78 | 79 | rich_progress_bar: 80 | _target_: pytorch_lightning.callbacks.RichProgressBar 81 | 82 | model_summary: 83 | _target_: pytorch_lightning.callbacks.ModelSummary 84 | max_depth: 1 85 | 86 | model_checkpoint: 87 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 88 | monitor: val_loss 89 | dirpath: ${probe_ckpt_dir} 90 | every_n_epochs: 1 91 | mode: min 92 | save_top_k: 1 93 | save_last: False 94 | filename: ${codec_name}_${mode}_{epoch}-{val_loss:.4f} 95 | verbose: True 96 | 97 | tensorboard: 98 | _target_: pytorch_lightning.loggers.TensorBoardLogger 99 | save_dir: ??? 100 | name: ${codec_name}_${mode} 101 | log_graph: true -------------------------------------------------------------------------------- /codec_evaluation/probe/config/MELD_dataset/encodec.yaml: -------------------------------------------------------------------------------- 1 | mode: quantized_emb 2 | sample_rate: 48000 3 | target_sec: 8 4 | num_outputs: 7 5 | probe_ckpt_dir: ??? 6 | seed: 666 7 | codec_name: encodec # needs to be changed 8 | task: multiclass 9 | save_result: null 10 | 11 | trainer: 12 | _target_: pytorch_lightning.Trainer 13 | accelerator: gpu 14 | devices: ??? 15 | precision: 32 16 | max_epochs: 100 17 | limit_val_batches: 10 18 | log_every_n_steps: 5 19 | val_check_interval: 1.0 20 | 21 | data: 22 | _target_: codec_evaluation.probe.dataset.MELD_dataset.MELD_dataset.MELDdataModule 23 | dataset_args: 24 | sample_rate: ${sample_rate} 25 | target_sec: ${target_sec} 26 | train_audio_dir: ??? 27 | val_audio_dir: ??? 28 | test_audio_dir: ??? 29 | train_batch_size: 64 30 | val_batch_size: 2 31 | test_batch_size: 32 32 | train_num_workers: 8 33 | val_num_workers: 4 34 | test_num_workers: 4 35 | 36 | model: 37 | _target_: codec_evaluation.probe.model.lit_prober.Prober 38 | codec_name: ${codec_name} 39 | sample_rate: ${sample_rate} 40 | mode: ${mode} 41 | task: ${task} 42 | num_outputs: ${num_outputs} 43 | probe_model_builder: 44 | _target_: codec_evaluation.probe.model.multiclass_model.MulticlassProber 45 | _partial_: true 46 | num_outputs: ${num_outputs} 47 | drop_out: 0.2 48 | channel_reduction: 16 49 | padding: 1 50 | kernel_size: 3 51 | stride: 1 52 | target_sec: ${target_sec} 53 | model_ckpt_dir: ???
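# Annotation (not part of the original file): the *_builder nodes below rely
# on Hydra's _partial_: true, so instantiate() yields functools.partial
# objects (e.g. partial(torch.optim.AdamW, lr=1e-4, ...)) rather than
# constructed instances; the Prober can then call them later with arguments
# that only exist at runtime, such as model.parameters() for the optimizer
# and the optimizer instance for LambdaLR.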
54 | 55 | optimizer_builder: 56 | _target_: torch.optim.AdamW 57 | _partial_: true 58 | lr: 1e-4 59 | betas: [0.8, 0.99] 60 | eps: 1e-5 61 | weight_decay: 0.08 62 | 63 | lr_scheduler_builder: 64 | _target_: torch.optim.lr_scheduler.LambdaLR 65 | _partial_: true 66 | lr_lambda: 67 | _target_: codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda 68 | _partial_: true 69 | num_warmup_steps: 10 70 | num_training_steps: 10000 71 | final_lr_ratio: 0.2 72 | 73 | callbacks: 74 | learning_rate_monitor: 75 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 76 | logging_interval: step 77 | 78 | rich_progress_bar: 79 | _target_: pytorch_lightning.callbacks.RichProgressBar 80 | 81 | model_summary: 82 | _target_: pytorch_lightning.callbacks.ModelSummary 83 | max_depth: 1 84 | 85 | model_checkpoint: 86 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 87 | monitor: val_loss 88 | dirpath: ${probe_ckpt_dir} 89 | every_n_epochs: 1 90 | mode: min 91 | save_top_k: 1 92 | save_last: False 93 | filename: ${codec_name}_${mode}_{epoch}-{val_loss:.4f} 94 | verbose: True 95 | 96 | tensorboard: 97 | _target_: pytorch_lightning.loggers.TensorBoardLogger 98 | save_dir: ??? 99 | name: ${codec_name}_${mode} 100 | log_graph: true -------------------------------------------------------------------------------- /codec_evaluation/probe/config/MTT_dataset/dac.yaml: -------------------------------------------------------------------------------- 1 | mode: quantized_emb 2 | sample_rate: 16000 3 | target_sec: 5 4 | num_outputs: 50 5 | probe_ckpt_dir: ??? 6 | seed: 666 7 | codec_name: dac 8 | task: multilabel 9 | save_result: null 10 | 11 | trainer: 12 | _target_: pytorch_lightning.Trainer 13 | accelerator: gpu 14 | devices: ??? 15 | precision: 32 16 | max_epochs: 50 17 | limit_val_batches: 10 18 | log_every_n_steps: 5 19 | val_check_interval: 1.0 20 | 21 | data: 22 | _target_: codec_evaluation.probe.dataset.MTT_dataset.MTT_dataset.MTTdataModule 23 | dataset_args: 24 | sample_rate: ${sample_rate} 25 | target_sec: ${target_sec} 26 | base_audio_dir: /root/path/for/audio 27 | train_audio_dir: ??? 28 | val_audio_dir: ??? 29 | test_audio_dir: ??? 30 | train_batch_size: 64 31 | val_batch_size: 8 32 | test_batch_size: 64 33 | train_num_workers: 16 34 | val_num_workers: 4 35 | test_num_workers: 16 36 | 37 | model: 38 | _target_: codec_evaluation.probe.model.lit_prober.Prober 39 | codec_name: ${codec_name} 40 | sample_rate: ${sample_rate} 41 | mode: ${mode} 42 | task: ${task} 43 | num_outputs: ${num_outputs} 44 | probe_model_builder: 45 | _target_: codec_evaluation.probe.model.multilabel_model.MultilabelProber 46 | _partial_: true 47 | num_outputs: ${num_outputs} 48 | drop_out: 0.2 49 | channel_reduction: 16 50 | padding: 1 51 | kernel_size: 3 52 | stride: 1 53 | target_sec: ${target_sec} 54 | model_ckpt_dir: ??? 
55 | 56 | optimizer_builder: 57 | _target_: torch.optim.AdamW 58 | _partial_: true 59 | lr: 1e-4 60 | betas: [0.8, 0.99] 61 | eps: 1e-5 62 | weight_decay: 0.08 63 | 64 | lr_scheduler_builder: 65 | _target_: torch.optim.lr_scheduler.LambdaLR 66 | _partial_: true 67 | lr_lambda: 68 | _target_: codec_evaluation.utils.schedule.get_cosine_schedule_with_warmup_lr_lambda 69 | _partial_: true 70 | num_warmup_steps: 10 71 | num_training_steps: 10000 72 | final_lr_ratio: 0.2 73 | 74 | callbacks: 75 | learning_rate_monitor: 76 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 77 | logging_interval: step 78 | 79 | rich_progress_bar: 80 | _target_: pytorch_lightning.callbacks.RichProgressBar 81 | 82 | model_summary: 83 | _target_: pytorch_lightning.callbacks.ModelSummary 84 | max_depth: 1 85 | 86 | model_checkpoint: 87 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 88 | monitor: val_loss 89 | dirpath: ${probe_ckpt_dir} 90 | every_n_epochs: 1 91 | mode: min 92 | save_top_k: 1 93 | filename: ${codec_name}_${mode}_{epoch}-{val_loss:.4f} 94 | verbose: True 95 | 96 | tensorboard: 97 | _target_: pytorch_lightning.loggers.TensorBoardLogger 98 | save_dir: ??? 99 | name: ${codec_name}_${mode} 100 | log_graph: true --------------------------------------------------------------------------------
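All of the probe configs above follow the same Hydra/OmegaConf recipe: plain scalars at the top, ${...} interpolations to reuse them, ??? for values that must be supplied at launch, and _target_ nodes with _partial_: true that instantiate() turns into functools.partial builders. The following is a minimal Python sketch of those mechanics, assuming only omegaconf, hydra-core, and torch are installed. It is not the repo's actual entry point: the schedule function is a guess at what codec_evaluation/utils/schedule.py implements (that file is not part of this dump), the override values are illustrative, the probe network is a stand-in, and the builders are assumed to sit under the model node as the file layout suggests.

import math
from functools import partial

import torch
from hydra.utils import instantiate
from omegaconf import OmegaConf


def cosine_warmup_lr_lambda(step, *, num_warmup_steps, num_training_steps, final_lr_ratio):
    # Assumed behavior of get_cosine_schedule_with_warmup_lr_lambda:
    # linear warmup to 1.0, then cosine decay down to final_lr_ratio.
    if step < num_warmup_steps:
        return step / max(1, num_warmup_steps)
    progress = min(1.0, (step - num_warmup_steps) / max(1, num_training_steps - num_warmup_steps))
    return final_lr_ratio + (1.0 - final_lr_ratio) * 0.5 * (1.0 + math.cos(math.pi * progress))


cfg = OmegaConf.load("codec_evaluation/probe/config/MTT_dataset/dac.yaml")
# ??? fields are mandatory; fill them before anything reads them
# (illustrative values, not the repo's defaults).
cfg.probe_ckpt_dir = "/tmp/probe_ckpt"
cfg.trainer.devices = 1

# _partial_: true => instantiate() returns partial(AdamW, lr=1e-4, ...)
# rather than an optimizer, so parameters can be bound later.
probe = torch.nn.Linear(8, 2)  # stand-in for the real probe network
optimizer = instantiate(cfg.model.optimizer_builder)(probe.parameters())

# Build the scheduler with the locally defined lambda instead of
# instantiating lr_scheduler_builder, which would import the real
# codec_evaluation.utils.schedule module.
sched_cfg = cfg.model.lr_scheduler_builder.lr_lambda
scheduler = torch.optim.lr_scheduler.LambdaLR(
    optimizer,
    lr_lambda=partial(
        cosine_warmup_lr_lambda,
        num_warmup_steps=sched_cfg.num_warmup_steps,
        num_training_steps=sched_cfg.num_training_steps,
        final_lr_ratio=sched_cfg.final_lr_ratio,
    ),
)

# With the values used throughout these configs (10 warmup steps, 10000
# training steps, final_lr_ratio 0.2), the multiplier warms 0.0 -> 1.0 over
# the first 10 steps, then decays to 0.2 by step 10000:
for step in (0, 5, 10, 5000, 10000):
    print(step, cosine_warmup_lr_lambda(step, num_warmup_steps=10, num_training_steps=10000, final_lr_ratio=0.2))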