├── SongGeneration
├── third_party
│ ├── hub
│ │ └── version.txt
│ ├── demucs
│ │ ├── __init__.py
│ │ ├── models
│ │ │ ├── __init__.py
│ │ │ ├── pretrained.py
│ │ │ └── spec.py
│ │ └── ckpt
│ │ │ └── htdemucs.yaml
│ ├── dac
│ │ ├── compare
│ │ │ ├── __init__.py
│ │ │ └── encodec.py
│ │ ├── nn
│ │ │ ├── __init__.py
│ │ │ └── layers.py
│ │ ├── model
│ │ │ └── __init__.py
│ │ ├── __init__.py
│ │ ├── __main__.py
│ │ └── utils
│ │ │ ├── decode.py
│ │ │ └── encode.py
│ ├── stable_audio_tools
│ │ ├── stable_audio_tools
│ │ │ ├── data
│ │ │ │ └── __init__.py
│ │ │ ├── inference
│ │ │ │ ├── __init__.py
│ │ │ │ └── utils.py
│ │ │ ├── interface
│ │ │ │ └── __init__.py
│ │ │ ├── training
│ │ │ │ ├── losses
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── losses.py
│ │ │ │ └── __init__.py
│ │ │ ├── models
│ │ │ │ ├── __init__.py
│ │ │ │ ├── pretrained.py
│ │ │ │ └── diffusion_prior.py
│ │ │ ├── __init__.py
│ │ │ └── configs
│ │ │ │ ├── dataset_configs
│ │ │ │ │ ├── custom_metadata
│ │ │ │ │ │ └── custom_md_example.py
│ │ │ │ │ ├── s3_wds_example.json
│ │ │ │ │ └── local_training_example.json
│ │ │ │ └── model_configs
│ │ │ │ │ ├── dance_diffusion
│ │ │ │ │ │ ├── dance_diffusion_base.json
│ │ │ │ │ │ ├── dance_diffusion_large.json
│ │ │ │ │ │ ├── dance_diffusion_base_16k.json
│ │ │ │ │ │ └── dance_diffusion_base_44k.json
│ │ │ │ │ └── autoencoders
│ │ │ │ │ │ ├── dac_2048_32_vae.json
│ │ │ │ │ │ └── encodec_musicgen_rvq.json
│ │ ├── pyproject.toml
│ │ ├── scripts
│ │ │ └── ds_zero_to_pl_ckpt.py
│ │ ├── LICENSE
│ │ ├── LICENSES
│ │ │ ├── LICENSE_ADP.txt
│ │ │ ├── LICENSE_XTRANSFORMERS.txt
│ │ │ ├── LICENSE_DESCRIPT.txt
│ │ │ ├── LICENSE_NVIDIA.txt
│ │ │ └── LICENSE_META.txt
│ │ ├── defaults.ini
│ │ ├── setup.py
│ │ ├── run_gradio.py
│ │ └── docs
│ │ │ └── pretransforms.md
│ └── Qwen2-7B
│ │ ├── generation_config.json
│ │ ├── config.json
│ │ └── tokenizer_config.json
├── codeclm
│ ├── tokenizer
│ │ ├── Flow1dVAE
│ │ │ ├── __init__.py
│ │ │ ├── models
│ │ │ │ ├── __init__.py
│ │ │ │ └── test_model.py
│ │ │ ├── tools
│ │ │ │ ├── __init__.py
│ │ │ │ ├── __pycache__
│ │ │ │ │ ├── mix.cpython-311.pyc
│ │ │ │ │ ├── __init__.cpython-311.pyc
│ │ │ │ │ ├── torch_tools.cpython-311.pyc
│ │ │ │ │ └── get_1dvae_large.cpython-311.pyc
│ │ │ │ ├── extract_rvq.py
│ │ │ │ ├── safetensor2torch.py
│ │ │ │ ├── get_1dvae.py
│ │ │ │ ├── get_1dvae_1920.py
│ │ │ │ ├── get_1dvae_large_melvae.py
│ │ │ │ ├── get_1dvae_large.py
│ │ │ │ ├── compare_2models.py
│ │ │ │ ├── get_whisper_encoder.py
│ │ │ │ ├── transmodelnorm.py
│ │ │ │ ├── mix.py
│ │ │ │ ├── check_stereo.py
│ │ │ │ ├── infer_encodec.py
│ │ │ │ ├── infer_encodec_speech.py
│ │ │ │ ├── infer_encodec_vocal.py
│ │ │ │ ├── creat_jsonl.py
│ │ │ │ ├── infer_bsrnnvae441k.py
│ │ │ │ ├── infer_bsrnnvae441k_vocal.py
│ │ │ │ ├── infer_hifigan48k_speech.py
│ │ │ │ ├── infer_hifigan48k_vocal.py
│ │ │ │ ├── infer_vaehifigan48k_speech.py
│ │ │ │ ├── infer_vaehifigan48k.py
│ │ │ │ ├── infer_vaehifigan48k_vocal.py
│ │ │ │ └── infer_vaehifigan48k_soundmusic.py
│ │ │ ├── our_MERT_BESTRQ
│ │ │ │ ├── __init__.py
│ │ │ │ ├── mert_fairseq
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── models
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── musicfm
│ │ │ │ │ │ │ ├── model
│ │ │ │ │ │ │ │ ├── rvq.py
│ │ │ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ │ │ └── w2v2_config.json
│ │ │ │ │ │ │ ├── modules
│ │ │ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ │ │ ├── features.py
│ │ │ │ │ │ │ │ ├── random_quantizer.py
│ │ │ │ │ │ │ │ └── conv.py
│ │ │ │ │ │ │ └── __init__.py
│ │ │ │ │ │ ├── mert
│ │ │ │ │ │ │ ├── README.md
│ │ │ │ │ │ │ └── __init__.py
│ │ │ │ │ │ └── eat
│ │ │ │ │ │ │ └── __init__.py
│ │ │ │ │ ├── config
│ │ │ │ │ │ └── pretrain
│ │ │ │ │ │ │ ├── MERT_RVQ-VAE_CQT_95M_dac.yaml
│ │ │ │ │ │ │ ├── run
│ │ │ │ │ │ │ │ └── submitit_reg.yaml
│ │ │ │ │ │ │ ├── MusicFM_95M_multinodes.yaml
│ │ │ │ │ │ │ ├── MusicFM_95M_speech_multinodes.yaml
│ │ │ │ │ │ │ ├── MusicFM_95M_bestrvq_multinodes.yaml
│ │ │ │ │ │ │ ├── EAT_pretraining_AS2M.yaml
│ │ │ │ │ │ │ ├── MERT_RVQ-VAE_CQT_95M.yaml
│ │ │ │ │ │ │ ├── EAT_pretraining_music_multinodes.yaml
│ │ │ │ │ │ │ ├── MERT_RVQ-VAE_CQT_95M_bestrvq_multinodes.yaml
│ │ │ │ │ │ │ ├── MERT_RVQ-VAE_CQT_95M_dac_multinodes.yaml
│ │ │ │ │ │ │ ├── MERT_RVQ-VAE_CQT_95M_mel_multinodes.yaml
│ │ │ │ │ │ │ ├── MERT_RVQ-VAE_CQT_95M_bestrq.yaml
│ │ │ │ │ │ │ ├── MERT_RVQ-VAE_CQT_95M_groupbestrq_multinodes.yaml
│ │ │ │ │ │ │ ├── MERT_RVQ-VAE_CQT_95M_bestrq_multinodes.yaml
│ │ │ │ │ │ │ ├── MERT_RVQ-VAE_CQT_95M_bestrq_chroma_multinodes.yaml
│ │ │ │ │ │ │ ├── MERT_RVQ-VAE_CQT_95M_bestrq_norm_multinodes.yaml
│ │ │ │ │ │ │ ├── MERT_RVQ-VAE_CQT_95M_bestrq_norm_speech_multinodes.yaml
│ │ │ │ │ │ │ ├── MERT_RVQ-VAE_CQT_330M_orig.yaml
│ │ │ │ │ │ │ ├── MERT_RVQ-VAE_CQT_330M_multinodes_debug1node.yaml
│ │ │ │ │ │ │ └── MERT_RVQ-VAE_CQT_330M.yaml
│ │ │ │ │ └── data
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ └── eat_data
│ │ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ │ └── add_class_target_dataset.py
│ │ │ │ ├── modify_env.md
│ │ │ │ ├── test.py
│ │ │ │ ├── run_training_eat.sh
│ │ │ │ └── run_training_orig.sh
│ │ │ ├── models_gpt
│ │ │ │ └── models
│ │ │ │ │ └── tokenizer
│ │ │ │ │ ├── structure.yaml
│ │ │ │ │ ├── pinyin
│ │ │ │ │ │ └── symbols.py
│ │ │ │ │ └── tokenizer1.py
│ │ │ ├── compare_model_weight.py
│ │ │ ├── configs
│ │ │ │ ├── scheduler
│ │ │ │ │ └── stable_diffusion_2.1_largenoise_sample.json
│ │ │ │ └── models
│ │ │ │ │ └── transformer2D_wocross_inch112_1x4_multi_large.json
│ │ │ ├── cal_token_stat.py
│ │ │ ├── extract_codes_stereo_7_1x4.py
│ │ │ ├── extract_codes_stereo_7_1x2.py
│ │ │ └── extract_codes_stereo_7_1x4_ds.py
│ │ └── __init__.py
│ ├── models
│ │ ├── __init__.py
│ │ └── llama
│ │ │ └── __init__.py
│ └── utils
│ │ └── autocast.py
├── sample
│ ├── description
│ │ ├── gender.txt
│ │ ├── timbre.txt
│ │ ├── emotion.txt
│ │ ├── genre.txt
│ │ └── instrument.txt
│ ├── sample_prompt_audio.wav
│ └── lyrics.jsonl
├── img
│ ├── logo.jpg
│ └── over.jpg
└── conf
│ ├── vocab.yaml
│ └── w2v2_config.json
├── example_workflows
└── SongGeneration.png
├── __init__.py
└── README.md

/SongGeneration/third_party/hub/version.txt:
--------------------------------------------------------------------------------
1
--------------------------------------------------------------------------------
/SongGeneration/third_party/demucs/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/SongGeneration/third_party/demucs/models/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/SongGeneration/codeclm/tokenizer/Flow1dVAE/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/SongGeneration/third_party/dac/compare/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/SongGeneration/codeclm/tokenizer/Flow1dVAE/models/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/SongGeneration/codeclm/tokenizer/Flow1dVAE/tools/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/SongGeneration/codeclm/tokenizer/__init__.py:
--------------------------------------------------------------------------------
# no need for training
--------------------------------------------------------------------------------
/SongGeneration/sample/description/gender.txt:
--------------------------------------------------------------------------------
female
male
--------------------------------------------------------------------------------
/SongGeneration/codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/SongGeneration/third_party/demucs/ckpt/htdemucs.yaml:
--------------------------------------------------------------------------------
models: ['htdemucs']
--------------------------------------------------------------------------------
/SongGeneration/codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/SongGeneration/third_party/stable_audio_tools/stable_audio_tools/data/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/SongGeneration/third_party/stable_audio_tools/stable_audio_tools/inference/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/SongGeneration/third_party/stable_audio_tools/stable_audio_tools/interface/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/SongGeneration/codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/models/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/SongGeneration/codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/models/musicfm/model/rvq.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/SongGeneration/sample/description/timbre.txt:
--------------------------------------------------------------------------------
dark
bright
warm
rock
varies
soft
vocal
--------------------------------------------------------------------------------
/SongGeneration/third_party/stable_audio_tools/stable_audio_tools/training/losses/__init__.py:
--------------------------------------------------------------------------------
from .losses import *
--------------------------------------------------------------------------------
/SongGeneration/codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/config/pretrain/MERT_RVQ-VAE_CQT_95M_dac.yaml:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/SongGeneration/codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/models/musicfm/model/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/SongGeneration/codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/models/musicfm/modules/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/SongGeneration/img/logo.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smthemex/ComfyUI_SongGeneration/HEAD/SongGeneration/img/logo.jpg
--------------------------------------------------------------------------------
/SongGeneration/img/over.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smthemex/ComfyUI_SongGeneration/HEAD/SongGeneration/img/over.jpg
--------------------------------------------------------------------------------
/SongGeneration/third_party/dac/nn/__init__.py:
--------------------------------------------------------------------------------
from . import layers
from . import loss
from . import quantize
--------------------------------------------------------------------------------
/SongGeneration/codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/models/musicfm/__init__.py:
--------------------------------------------------------------------------------
from .musicfm_model import *
--------------------------------------------------------------------------------
/SongGeneration/sample/description/emotion.txt:
--------------------------------------------------------------------------------
sad
emotional
angry
happy
uplifting
intense
romantic
melancholic
--------------------------------------------------------------------------------
/example_workflows/SongGeneration.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smthemex/ComfyUI_SongGeneration/HEAD/example_workflows/SongGeneration.png
--------------------------------------------------------------------------------
/SongGeneration/third_party/stable_audio_tools/pyproject.toml:
--------------------------------------------------------------------------------
[build-system]
requires = ["setuptools"]
build-backend = "setuptools.build_meta"
--------------------------------------------------------------------------------
/SongGeneration/codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/data/__init__.py:
--------------------------------------------------------------------------------
from .mert_dataset import MERTDataset
from .eat_data import *
--------------------------------------------------------------------------------
/SongGeneration/sample/sample_prompt_audio.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smthemex/ComfyUI_SongGeneration/HEAD/SongGeneration/sample/sample_prompt_audio.wav
--------------------------------------------------------------------------------
/SongGeneration/third_party/stable_audio_tools/stable_audio_tools/models/__init__.py:
--------------------------------------------------------------------------------
from .factory import create_model_from_config, create_model_from_config_path
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------

from .SongGeneration_node import NODE_CLASS_MAPPINGS, NODE_DISPLAY_NAME_MAPPINGS

__all__ = ['NODE_CLASS_MAPPINGS', 'NODE_DISPLAY_NAME_MAPPINGS']
--------------------------------------------------------------------------------
/SongGeneration/third_party/dac/model/__init__.py:
--------------------------------------------------------------------------------
from .base import CodecMixin
from .base import DACFile
from .dac import DAC
from .discriminator import Discriminator
--------------------------------------------------------------------------------
/SongGeneration/third_party/stable_audio_tools/stable_audio_tools/training/__init__.py:
--------------------------------------------------------------------------------
from .factory import create_training_wrapper_from_config, create_demo_callback_from_config
--------------------------------------------------------------------------------
/SongGeneration/third_party/stable_audio_tools/stable_audio_tools/__init__.py:
--------------------------------------------------------------------------------
from .models.factory import create_model_from_config, create_model_from_config_path
from .models.pretrained import get_pretrained_model
--------------------------------------------------------------------------------
/SongGeneration/third_party/Qwen2-7B/generation_config.json:
--------------------------------------------------------------------------------
{
  "bos_token_id": 151643,
  "do_sample": false,
  "eos_token_id": 151643,
  "max_new_tokens": 2048,
  "transformers_version": "4.37.0"
}
--------------------------------------------------------------------------------
/SongGeneration/codeclm/tokenizer/Flow1dVAE/tools/__pycache__/mix.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smthemex/ComfyUI_SongGeneration/HEAD/SongGeneration/codeclm/tokenizer/Flow1dVAE/tools/__pycache__/mix.cpython-311.pyc
--------------------------------------------------------------------------------
/SongGeneration/codeclm/tokenizer/Flow1dVAE/tools/__pycache__/__init__.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smthemex/ComfyUI_SongGeneration/HEAD/SongGeneration/codeclm/tokenizer/Flow1dVAE/tools/__pycache__/__init__.cpython-311.pyc
--------------------------------------------------------------------------------
/SongGeneration/codeclm/tokenizer/Flow1dVAE/tools/__pycache__/torch_tools.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smthemex/ComfyUI_SongGeneration/HEAD/SongGeneration/codeclm/tokenizer/Flow1dVAE/tools/__pycache__/torch_tools.cpython-311.pyc
--------------------------------------------------------------------------------
/SongGeneration/codeclm/tokenizer/Flow1dVAE/tools/__pycache__/get_1dvae_large.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smthemex/ComfyUI_SongGeneration/HEAD/SongGeneration/codeclm/tokenizer/Flow1dVAE/tools/__pycache__/get_1dvae_large.cpython-311.pyc
--------------------------------------------------------------------------------
/SongGeneration/third_party/stable_audio_tools/stable_audio_tools/configs/dataset_configs/custom_metadata/custom_md_example.py:
--------------------------------------------------------------------------------
def get_custom_metadata(info, audio):

    # Use relative path as the prompt
    return {"prompt": info["relpath"]}
--------------------------------------------------------------------------------
/SongGeneration/codeclm/tokenizer/Flow1dVAE/models_gpt/models/tokenizer/structure.yaml:
--------------------------------------------------------------------------------
- '[start]'
- '[verse]'
- '[chorus]'
- '[outro]'
- '[end]'
- '[intro]'
- '[solo]'
- '[inst]'
- '[bridge]'
- '[break]'
--------------------------------------------------------------------------------
/SongGeneration/codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/models/mert/README.md:
--------------------------------------------------------------------------------
add cauchy extension from https://github.com/HazyResearch/state-spaces
```shell
cd state-spaces/extensions/cauchy
python setup.py install
```
--------------------------------------------------------------------------------
/SongGeneration/codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/models/eat/__init__.py:
--------------------------------------------------------------------------------
# try:
#from .EAT_pretraining import *
# except:
#     import sys, os
#     sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), '.'))
#     from EAT_pretraining import *
--------------------------------------------------------------------------------
/SongGeneration/conf/vocab.yaml:
--------------------------------------------------------------------------------
- '[verse]'
- '[chorus]'
- '[bridge]'
- '[intro-short]'
- '[intro-medium]'
- '[intro-long]'
- '[outro-short]'
- '[outro-medium]'
- '[outro-long]'
- '[inst-short]'
- '[inst-medium]'
- '[inst-long]'
- '[silence]'
--------------------------------------------------------------------------------
/SongGeneration/codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/models/mert/__init__.py:
--------------------------------------------------------------------------------
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from .mert_model import *  # noqa
--------------------------------------------------------------------------------
/SongGeneration/third_party/stable_audio_tools/stable_audio_tools/configs/dataset_configs/s3_wds_example.json:
--------------------------------------------------------------------------------
{
    "dataset_type": "s3",
    "datasets": [
        {
            "id": "s3-test",
            "s3_path": "s3://my-bucket/datasets/webdataset/audio/"
        }
    ],
    "random_crop": true
}
--------------------------------------------------------------------------------
/SongGeneration/third_party/stable_audio_tools/stable_audio_tools/configs/dataset_configs/local_training_example.json:
--------------------------------------------------------------------------------
{
    "dataset_type": "audio_dir",
    "datasets": [
        {
            "id": "my_audio",
            "path": "train.jsonl",
            "custom_metadata_module": "custom_md_example.py"
        }
    ],
    "random_crop": true
}
--------------------------------------------------------------------------------
/SongGeneration/codeclm/tokenizer/Flow1dVAE/compare_model_weight.py:
--------------------------------------------------------------------------------
import torch
import sys
from safetensors.torch import load_file

if __name__ == "__main__":
    m0, m1 = sys.argv[1], sys.argv[2]
    m0 = load_file(m0)
    m1 = load_file(m1)

    ks = [k for k in m0.keys() if 'bestrq' in k]
    for k in ks:
        print(k, (m0[k] - m1[k]).abs().sum())
--------------------------------------------------------------------------------
/SongGeneration/third_party/dac/__init__.py:
--------------------------------------------------------------------------------
__version__ = "1.0.0"

# preserved here for legacy reasons
__model_version__ = "latest"

import audiotools

audiotools.ml.BaseModel.INTERN += ["dac.**"]
audiotools.ml.BaseModel.EXTERN += ["einops"]


from . import nn
from . import model
from . import utils
from .model import DAC
from .model import DACFile
--------------------------------------------------------------------------------
/SongGeneration/codeclm/tokenizer/Flow1dVAE/tools/extract_rvq.py:
--------------------------------------------------------------------------------
import torch
import sys

if __name__=="__main__":
    p = sys.argv[1]
    bd = '/'.join(p.split('/')[:-1])
    bn = p.split('/')[-1]

    d = {}
    m = torch.load(p, map_location='cpu')
    for k in m.keys():
        if('rvq' in k):
            d[k] = m[k]

    torch.save(d, '{}/rvq.bin'.format(bd))
--------------------------------------------------------------------------------
/SongGeneration/codeclm/models/__init__.py:
--------------------------------------------------------------------------------
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
"""
Models for EnCodec, AudioGen, MusicGen, as well as the generic LMModel.
"""
# flake8: noqa
from . import builders
from .codeclm import CodecLM
--------------------------------------------------------------------------------
/SongGeneration/sample/description/genre.txt:
--------------------------------------------------------------------------------
pop
electronic
hip hop
rock
jazz
blues
classical
rap
country
classic rock
hard rock
folk
soul
dance, electronic
rockabilly
dance, dancepop, house, pop
reggae
experimental
dance, pop
dance, deephouse, electronic
k-pop
experimental pop
pop punk
rock and roll
R&B
varies
pop rock
--------------------------------------------------------------------------------
/SongGeneration/codeclm/tokenizer/Flow1dVAE/tools/safetensor2torch.py:
--------------------------------------------------------------------------------
import sys
from safetensors import safe_open
import torch

if __name__=="__main__":
    inname = sys.argv[1]
    outname = sys.argv[2]

    main_weights = {}
    with safe_open(inname, framework="pt", device="cpu") as f:
        for key in f.keys():
            main_weights[key] = f.get_tensor(key)

    torch.save(main_weights, outname)
--------------------------------------------------------------------------------
/SongGeneration/codeclm/tokenizer/Flow1dVAE/configs/scheduler/stable_diffusion_2.1_largenoise_sample.json:
--------------------------------------------------------------------------------
{
  "_class_name": "DDIMScheduler",
  "_diffusers_version": "0.8.0",
  "beta_end": 0.02,
  "beta_schedule": "scaled_linear",
  "beta_start": 0.0015,
  "clip_sample": false,
  "num_train_timesteps": 1000,
  "prediction_type": "sample",
  "set_alpha_to_one": false,
  "skip_prk_steps": true,
  "steps_offset": 1,
  "trained_betas": null
}
--------------------------------------------------------------------------------
/SongGeneration/third_party/stable_audio_tools/stable_audio_tools/configs/model_configs/dance_diffusion/dance_diffusion_base.json:
--------------------------------------------------------------------------------
{
    "model_type": "diffusion_uncond",
    "sample_size": 65536,
    "sample_rate": 48000,
    "model": {
        "type": "DAU1d",
        "config": {
            "n_attn_layers": 5
        }
    },
    "training": {
        "learning_rate": 1e-4,
        "demo": {
            "demo_every": 2000,
            "demo_steps": 250
        }
    }
}
--------------------------------------------------------------------------------
/SongGeneration/third_party/stable_audio_tools/stable_audio_tools/configs/model_configs/dance_diffusion/dance_diffusion_large.json:
--------------------------------------------------------------------------------
{
    "model_type": "diffusion_uncond",
    "sample_size": 131072,
    "sample_rate": 48000,
    "model": {
        "type": "DAU1d",
        "config": {
            "n_attn_layers": 5
        }
    },
    "training": {
        "learning_rate": 1e-4,
        "demo": {
            "demo_every": 2000,
            "demo_steps": 250
        }
    }
}
--------------------------------------------------------------------------------
/SongGeneration/codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/modify_env.md:
--------------------------------------------------------------------------------
cp -r fairseq/fairseq/model_parallel/megatron /opt/conda/envs/map/lib/python3.8/site-packages/fairseq/model_parallel/
vi /opt/conda/envs/map/lib/python3.8/site-packages/apex/amp/_initialize.py # string_classes = str
vi /opt/conda/envs/map/lib/python3.8/site-packages/fairseq/modules/layer_norm.py
vi /opt/conda/envs/map/lib/python3.8/site-packages/fairseq/distributed/utils.py # import datetime; timeout=datetime.timedelta(seconds=51200); logger.info("add nccl time to 51200")
--------------------------------------------------------------------------------
/SongGeneration/third_party/stable_audio_tools/stable_audio_tools/configs/model_configs/dance_diffusion/dance_diffusion_base_16k.json:
--------------------------------------------------------------------------------
{
    "model_type": "diffusion_uncond",
    "sample_size": 65536,
    "sample_rate": 16000,
    "model": {
        "type": "DAU1d",
        "config": {
            "n_attn_layers": 5
        }
    },
    "training": {
        "learning_rate": 1e-4,
        "demo": {
            "demo_every": 2000,
            "demo_steps": 250
        }
    }
}
--------------------------------------------------------------------------------
/SongGeneration/third_party/stable_audio_tools/stable_audio_tools/configs/model_configs/dance_diffusion/dance_diffusion_base_44k.json:
--------------------------------------------------------------------------------
{
    "model_type": "diffusion_uncond",
    "sample_size": 65536,
    "sample_rate": 44100,
    "model": {
        "type": "DAU1d",
        "config": {
            "n_attn_layers": 5
        }
    },
    "training": {
        "learning_rate": 4e-5,
        "demo": {
            "demo_every": 2000,
            "demo_steps": 250
        }
    }
}
--------------------------------------------------------------------------------
/SongGeneration/codeclm/tokenizer/Flow1dVAE/tools/get_1dvae.py:
--------------------------------------------------------------------------------
import torch
from tqdm import tqdm
import torchaudio
from .....third_party.stable_audio_tools.stable_audio_tools.models.autoencoders import create_autoencoder_from_config
import numpy as np
import os
import json

def get_model(model_config, path):
    with open(model_config) as f:
        model_config = json.load(f)
    state_dict = torch.load(path)
    model = create_autoencoder_from_config(model_config)
    model.load_state_dict(state_dict['state_dict'])
    return model
--------------------------------------------------------------------------------
/SongGeneration/codeclm/tokenizer/Flow1dVAE/tools/get_1dvae_1920.py:
--------------------------------------------------------------------------------
import torch
from tqdm import tqdm
import torchaudio
from .....third_party.stable_audio_tools.stable_audio_tools.models.autoencoders import create_autoencoder_from_config
import numpy as np
import os
import json

def get_model(model_config, path):
    with open(model_config) as f:
        model_config = json.load(f)
    state_dict = torch.load(path)
    model = create_autoencoder_from_config(model_config)
    model.load_state_dict(state_dict['state_dict'])
    return model
--------------------------------------------------------------------------------
/SongGeneration/codeclm/tokenizer/Flow1dVAE/tools/get_1dvae_large_melvae.py:
--------------------------------------------------------------------------------
import torch
from tqdm import tqdm
import torchaudio
from .....third_party.stable_audio_tools.stable_audio_tools.models.autoencoders import create_autoencoder_from_config
import numpy as np
import os
import json

def get_model(model_config, path):
    with open(model_config) as f:
        model_config = json.load(f)
    state_dict = torch.load(path, map_location='cpu')
    model = create_autoencoder_from_config(model_config)
    model.load_state_dict(state_dict['state_dict'], strict=False)
    return model
--------------------------------------------------------------------------------
/SongGeneration/codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/config/pretrain/run/submitit_reg.yaml:
--------------------------------------------------------------------------------
# @package _global_

hydra:
  launcher:
    cpus_per_task: 8
    gpus_per_node: 8
    tasks_per_node: ${hydra.launcher.gpus_per_node}
    nodes: 4
    comment: null
    mem_gb: 384
    timeout_min: 4320
    max_num_timeout: 100
    constraint: volta32gb
    name: ${hydra.job.config_name}/${hydra.job.override_dirname}
    submitit_folder: ${hydra.sweep.dir}/submitit/%j

distributed_training:
  distributed_world_size: 32
  distributed_port: 29671
  nprocs_per_node: 8
--------------------------------------------------------------------------------
/SongGeneration/codeclm/tokenizer/Flow1dVAE/tools/get_1dvae_large.py:
--------------------------------------------------------------------------------
import torch
from tqdm import tqdm
import torchaudio
from .....third_party.stable_audio_tools.stable_audio_tools.models.autoencoders import create_autoencoder_from_config
import numpy as np
import os
import json

def get_model(model_config, path):
    with open(model_config) as f:
        model_config = json.load(f)
    state_dict = torch.load(path, map_location='cpu')
    model = create_autoencoder_from_config(model_config)
    model.load_state_dict(state_dict['state_dict'], strict=False)
    del state_dict
    return model
--------------------------------------------------------------------------------
/SongGeneration/codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/data/eat_data/__init__.py:
--------------------------------------------------------------------------------
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
try:
    from .mae_image_dataset import MaeImageDataset
    from .raw_audio_dataset import FileAudioDataset
except:
    import sys, os
    sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), '.'))
    from mae_image_dataset import MaeImageDataset
    from raw_audio_dataset import FileAudioDataset

__all__ = [
    "MaeImageDataset",
    "FileAudioDataset",
]
--------------------------------------------------------------------------------
/SongGeneration/third_party/stable_audio_tools/scripts/ds_zero_to_pl_ckpt.py:
--------------------------------------------------------------------------------
import argparse
# from lightning.pytorch.utilities.deepspeed import convert_zero_checkpoint_to_fp32_state_dict

if __name__ == "__main__":

    parser = argparse.ArgumentParser()
    parser.add_argument("--save_path", type=str, help="Path to the zero checkpoint")
    parser.add_argument("--output_path", type=str, help="Path to the output checkpoint", default="lightning_model.pt")
    args = parser.parse_args()

    # lightning deepspeed has saved a directory instead of a file
    save_path = args.save_path
    output_path = args.output_path
    convert_zero_checkpoint_to_fp32_state_dict(save_path, output_path)
--------------------------------------------------------------------------------
/SongGeneration/codeclm/tokenizer/Flow1dVAE/tools/compare_2models.py:
--------------------------------------------------------------------------------
import torch
import sys

if __name__=="__main__":
    m1, m2 = sys.argv[1:3]
    m1 = torch.load(m1, map_location = 'cpu')
    m2 = torch.load(m2, map_location = 'cpu')
    m1_keys = set(m1.keys())
    m2_keys = set(m2.keys())

    m1_uniq_keys = m1_keys - m2_keys
    m2_uniq_keys = m2_keys - m1_keys
    m12_shared_keys = m1_keys & m2_keys

    print("m1_uniq_keys: ", m1_uniq_keys)
    print("m2_uniq_keys: ", m2_uniq_keys)
    print("m12_shared_keys but different: ")
    for k in m12_shared_keys:
        if(m1[k].numel() != m2[k].numel()):
            print(k,m1[k].shape,m2[k].shape)
--------------------------------------------------------------------------------
/SongGeneration/codeclm/tokenizer/Flow1dVAE/cal_token_stat.py:
--------------------------------------------------------------------------------
import kaldiio
from tqdm import tqdm
import torch

if __name__ == "__main__":
    bar = torch.zeros(1, 16384)
    with open('token.scp', 'r') as f:
        for item_idx, line in tqdm(enumerate(f)):
            idx, pos = line.strip().split()
            codes = kaldiio.load_mat(pos)
            for i0 in range(codes.shape[-1]):
                bar[0, codes[0, 0, i0]] += 1
            if(item_idx % 1000 == 0):
                print("=========")
                print(1 - (bar[0]==0).sum() / bar.shape[-1])
                print("=========")
    print("=========")
    print(1 - (bar[0]==0).sum() / bar.shape[-1])
    print("=========")
--------------------------------------------------------------------------------
/SongGeneration/third_party/Qwen2-7B/config.json:
--------------------------------------------------------------------------------
{
  "architectures": [
    "Qwen2ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "eos_token_id": 151643,
  "hidden_act": "silu",
  "hidden_size": 3584,
  "initializer_range": 0.02,
  "intermediate_size": 18944,
  "max_position_embeddings": 131072,
  "max_window_layers": 28,
  "model_type": "qwen2",
  "num_attention_heads": 28,
| "num_hidden_layers": 28, 17 | "num_key_value_heads": 4, 18 | "rms_norm_eps": 1e-06, 19 | "rope_theta": 1000000.0, 20 | "sliding_window": 131072, 21 | "tie_word_embeddings": false, 22 | "torch_dtype": "bfloat16", 23 | "transformers_version": "4.37.2", 24 | "use_cache": true, 25 | "use_sliding_window": false, 26 | "vocab_size": 152064 27 | } 28 | -------------------------------------------------------------------------------- /SongGeneration/third_party/dac/__main__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import argbind 4 | 5 | from dac.utils import download 6 | from dac.utils.decode import decode 7 | from dac.utils.encode import encode 8 | 9 | STAGES = ["encode", "decode", "download"] 10 | 11 | 12 | def run(stage: str): 13 | """Run stages. 14 | 15 | Parameters 16 | ---------- 17 | stage : str 18 | Stage to run 19 | """ 20 | if stage not in STAGES: 21 | raise ValueError(f"Unknown command: {stage}. Allowed commands are {STAGES}") 22 | stage_fn = globals()[stage] 23 | 24 | if stage == "download": 25 | stage_fn() 26 | return 27 | 28 | stage_fn() 29 | 30 | 31 | if __name__ == "__main__": 32 | group = sys.argv.pop(1) 33 | args = argbind.parse_args(group=group) 34 | 35 | with argbind.scope(args): 36 | run(group) 37 | -------------------------------------------------------------------------------- /SongGeneration/codeclm/tokenizer/Flow1dVAE/configs/models/transformer2D_wocross_inch112_1x4_multi_large.json: -------------------------------------------------------------------------------- 1 | { 2 | "_class_name": "Transformer2DModel", 3 | "_diffusers_version": "0.22.0.dev0", 4 | "activation_fn": "gelu-approximate", 5 | "attention_bias": true, 6 | "attention_head_dim": 72, 7 | "attention_type": "default", 8 | "cross_attention_dim": null, 9 | "double_self_attention": false, 10 | "dropout": 0.0, 11 | "in_channels": 96, 12 | "norm_elementwise_affine": false, 13 | "norm_eps": 1e-06, 14 | "norm_num_groups": 32, 15 | "norm_type": "ada_norm_single", 16 | "num_attention_heads": 22, 17 | "num_embeds_ada_norm": 1000, 18 | "num_layers": 24, 19 | "num_vector_embeds": null, 20 | "only_cross_attention": false, 21 | "out_channels": 32, 22 | "patch_size": 2, 23 | "sample_size": 384, 24 | "upcast_attention": false, 25 | "use_linear_projection": false 26 | } -------------------------------------------------------------------------------- /SongGeneration/codeclm/tokenizer/Flow1dVAE/tools/get_whisper_encoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import WhisperProcessor, WhisperForConditionalGeneration 3 | 4 | def get_whisper_encoder(): 5 | processor = WhisperProcessor.from_pretrained("openai/whisper-large-v3") 6 | model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v3").model.encoder 7 | return processor, model.eval() 8 | 9 | if __name__=="__main__": 10 | import numpy as np 11 | processor, model = get_whisper_encoder() 12 | model = model.cuda() 13 | 14 | with torch.no_grad(): 15 | input_features = processor(np.random.rand(16000*30,), sampling_rate=16000, return_tensors="pt").input_features.cuda() 16 | print(input_features.shape) 17 | out = model(input_features.repeat(10,1,1)) 18 | import pdb;pdb.set_trace() 19 | print(list(out.values())[0].shape) 20 | -------------------------------------------------------------------------------- /SongGeneration/sample/description/instrument.txt: 
--------------------------------------------------------------------------------
synthesizer and piano
piano and drums
piano and synthesizer
synthesizer and drums
piano and strings
guitar and drums
guitar and piano
piano and double bass
piano and guitar
acoustic guitar and piano
acoustic guitar and synthesizer
synthesizer and guitar
piano and saxophone
saxophone and piano
piano and violin
electric guitar and drums
acoustic guitar and drums
synthesizer
guitar and fiddle
guitar and harmonica
synthesizer and acoustic guitar
beats
piano
acoustic guitar and fiddle
brass and piano
bass and drums
violin
acoustic guitar and harmonica
piano and cello
saxophone and trumpet
guitar and banjo
guitar and synthesizer
saxophone
violin and piano
synthesizer and bass
synthesizer and electric guitar
electric guitar and piano
beats and piano
synthesizer and
guitar
--------------------------------------------------------------------------------
/SongGeneration/third_party/dac/nn/layers.py:
--------------------------------------------------------------------------------
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange
from torch.nn.utils import weight_norm


def WNConv1d(*args, **kwargs):
    return weight_norm(nn.Conv1d(*args, **kwargs))


def WNConvTranspose1d(*args, **kwargs):
    return weight_norm(nn.ConvTranspose1d(*args, **kwargs))


# Scripting this brings model speed up 1.4x
@torch.jit.script
def snake(x, alpha):
    shape = x.shape
    x = x.reshape(shape[0], shape[1], -1)
    x = x + (alpha + 1e-9).reciprocal() * torch.sin(alpha * x).pow(2)
    x = x.reshape(shape)
    return x


class Snake1d(nn.Module):
    def __init__(self, channels):
        super().__init__()
        self.alpha = nn.Parameter(torch.ones(1, channels, 1))

    def forward(self, x):
        return snake(x, self.alpha)
--------------------------------------------------------------------------------
/SongGeneration/third_party/stable_audio_tools/stable_audio_tools/models/pretrained.py:
--------------------------------------------------------------------------------
import json

from .factory import create_model_from_config
from .utils import load_ckpt_state_dict

from huggingface_hub import hf_hub_download

def get_pretrained_model(name: str):

    model_config_path = hf_hub_download(name, filename="model_config.json", repo_type='model')

    with open(model_config_path) as f:
        model_config = json.load(f)

    model = create_model_from_config(model_config)

    # Try to download the model.safetensors file first, if it doesn't exist, download the model.ckpt file
    try:
        model_ckpt_path = hf_hub_download(name, filename="model.safetensors", repo_type='model')
    except Exception as e:
        model_ckpt_path = hf_hub_download(name, filename="model.ckpt", repo_type='model')

    model.load_state_dict(load_ckpt_state_dict(model_ckpt_path))

    return model, model_config
--------------------------------------------------------------------------------
/SongGeneration/codeclm/tokenizer/Flow1dVAE/tools/transmodelnorm.py:
--------------------------------------------------------------------------------
import torch

if __name__=="__main__":
    src_ckpt = 'saved/train_mulan_v3_48k_everything3/latest/pytorch_model_2.bin'
    tgt_ckpt = 'saved/train_mulan_v3_48k_everything3_sepnorm/src_pytorch_model_2.bin'
    # src_ckpt = 'saved/train_enhcodec2D_again/latest/pytorch_model_3.bin'
    # tgt_ckpt = 'saved/train_enhcodec2D_again_sepnorm/pytorch_model_3.bin'

    ckpt = torch.load(src_ckpt, map_location='cpu')

    ckpt['normfeat.sum_x'] = torch.ones(16, 32, dtype=ckpt['normfeat.sum_x'].dtype) * ckpt['normfeat.sum_x'] / ckpt['normfeat.counts']
    ckpt['normfeat.sum_x2'] = torch.ones(16, 32, dtype=ckpt['normfeat.sum_x2'].dtype) * ckpt['normfeat.sum_x2'] / ckpt['normfeat.counts']
    ckpt['normfeat.sum_target_x2'] = torch.ones(16, 32, dtype=ckpt['normfeat.sum_target_x2'].dtype) * ckpt['normfeat.sum_target_x2'] / ckpt['normfeat.counts']
    ckpt['normfeat.counts'] = torch.ones_like(ckpt['normfeat.counts'])
    torch.save(ckpt, tgt_ckpt)
--------------------------------------------------------------------------------
/SongGeneration/codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/test.py:
--------------------------------------------------------------------------------
import torch
from dataclasses import dataclass
from logging import getLogger
import torch.nn.functional as F
import fairseq.utils
from fairseq.checkpoint_utils import load_model_ensemble_and_task
import folder_paths
import os
logger = getLogger(__name__)

@dataclass
class UserDirModule:
    user_dir: str

def load_model(model_dir, checkpoint_dir):
    '''Load Fairseq SSL model'''

    # import the code module that contains the model
    model_dir=os.path.join(folder_paths.base_path,"custom_nodes/ComfyUI_SongGeneration/SongGeneration",model_dir)
    model_path = UserDirModule(model_dir)

    checkpoint_dir=os.path.join(folder_paths.models_dir,"SongGeneration/ckpt/encode-s12k.pt")
    fairseq.utils.import_user_module(model_path)
    #print(checkpoint_dir,model_dir)
    # load the model checkpoint
    model, cfg, task = load_model_ensemble_and_task([checkpoint_dir], strict=False)
    model = model[0]

    return model
--------------------------------------------------------------------------------
/SongGeneration/third_party/stable_audio_tools/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2023 Stability AI

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/SongGeneration/third_party/stable_audio_tools/stable_audio_tools/inference/utils.py:
--------------------------------------------------------------------------------
from ..data.utils import PadCrop

from torchaudio import transforms as T

def set_audio_channels(audio, target_channels):
    if target_channels == 1:
        # Convert to mono
        audio = audio.mean(1, keepdim=True)
    elif target_channels == 2:
        # Convert to stereo
        if audio.shape[1] == 1:
            audio = audio.repeat(1, 2, 1)
        elif audio.shape[1] > 2:
            audio = audio[:, :2, :]
    return audio

def prepare_audio(audio, in_sr, target_sr, target_length, target_channels, device):

    audio = audio.to(device)

    if in_sr != target_sr:
        resample_tf = T.Resample(in_sr, target_sr).to(device)
        audio = resample_tf(audio)

    audio = PadCrop(target_length, randomize=False)(audio)

    # Add batch dimension
    if audio.dim() == 1:
        audio = audio.unsqueeze(0).unsqueeze(0)
    elif audio.dim() == 2:
        audio = audio.unsqueeze(0)

    audio = set_audio_channels(audio, target_channels)

    return audio
--------------------------------------------------------------------------------
/SongGeneration/codeclm/tokenizer/Flow1dVAE/models_gpt/models/tokenizer/pinyin/symbols.py:
--------------------------------------------------------------------------------
_pause = ["sil", "eos", "sp", "#0", "#1", "#2", "#3"]

_initials = [
    "^",
    "b",
    "c",
    "ch",
    "d",
    "f",
    "g",
    "h",
    "j",
    "k",
    "l",
    "m",
    "n",
    "p",
    "q",
    "r",
    "s",
    "sh",
    "t",
    "x",
    "z",
    "zh",
]

_tones = ["1", "2", "3", "4", "5"]

_finals = [
    "a",
    "ai",
    "an",
    "ang",
    "ao",
    "e",
    "ei",
    "en",
    "eng",
    "er",
    "i",
    "ia",
    "ian",
    "iang",
    "iao",
    "ie",
    "ii",
    "iii",
    "in",
    "ing",
    "iong",
    "iou",
    "o",
    "ong",
    "ou",
    "u",
    "ua",
    "uai",
    "uan",
    "uang",
    "uei",
    "uen",
    "ueng",
    "uo",
    "v",
    "van",
    "ve",
    "vn",
]

symbols = _pause + _initials + [i + j for i in _finals for j in _tones]
--------------------------------------------------------------------------------
/SongGeneration/third_party/stable_audio_tools/LICENSES/LICENSE_ADP.txt:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2022 archinet.ai

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/SongGeneration/third_party/stable_audio_tools/LICENSES/LICENSE_XTRANSFORMERS.txt:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2020 Phil Wang

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/SongGeneration/third_party/stable_audio_tools/LICENSES/LICENSE_DESCRIPT.txt:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2023-present, Descript

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/SongGeneration/third_party/stable_audio_tools/LICENSES/LICENSE_NVIDIA.txt:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2022 NVIDIA CORPORATION.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/SongGeneration/third_party/stable_audio_tools/LICENSES/LICENSE_META.txt:
--------------------------------------------------------------------------------
MIT License

Copyright (c) Meta Platforms, Inc. and affiliates.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/SongGeneration/third_party/demucs/models/pretrained.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@File    : pretrained.py
@Time    : 2023/8/8 7:22 PM
@Author  : waytan
@Contact : waytan@tencent.com
@License : (C)Copyright 2023, Tencent
@Desc    : Loading pretrained models.
"""
from pathlib import Path

import yaml

from .apply import BagOfModels
from .htdemucs import HTDemucs
from .states import load_state_dict


def add_model_flags(parser):
    group = parser.add_mutually_exclusive_group(required=False)
    group.add_argument("-s", "--sig", help="Locally trained XP signature.")
    group.add_argument("-n", "--name", default=None,
                       help="Pretrained model name or signature. Default is htdemucs.")
Default is htdemucs.") 25 | parser.add_argument("--repo", type=Path, 26 | help="Folder containing all pre-trained models for use with -n.") 27 | 28 | 29 | def get_model_from_yaml(yaml_file, model_file): 30 | bag = yaml.safe_load(open(yaml_file)) 31 | model = load_state_dict(HTDemucs, model_file) 32 | weights = bag.get('weights') 33 | segment = bag.get('segment') 34 | return BagOfModels([model], weights, segment) 35 | -------------------------------------------------------------------------------- /SongGeneration/codeclm/utils/autocast.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class TorchAutocast: 5 | """TorchAutocast utility class. 6 | Allows you to enable and disable autocast. This is specially useful 7 | when dealing with different architectures and clusters with different 8 | levels of support. 9 | 10 | Args: 11 | enabled (bool): Whether to enable torch.autocast or not. 12 | args: Additional args for torch.autocast. 13 | kwargs: Additional kwargs for torch.autocast 14 | """ 15 | def __init__(self, enabled: bool, *args, **kwargs): 16 | self.autocast = torch.autocast(*args, **kwargs) if enabled else None 17 | 18 | def __enter__(self): 19 | if self.autocast is None: 20 | return 21 | try: 22 | self.autocast.__enter__() 23 | except RuntimeError: 24 | device = self.autocast.device 25 | dtype = self.autocast.fast_dtype 26 | raise RuntimeError( 27 | f"There was an error autocasting with dtype={dtype} device={device}\n" 28 | "If you are on the FAIR Cluster, you might need to use autocast_dtype=float16" 29 | ) 30 | 31 | def __exit__(self, *args, **kwargs): 32 | if self.autocast is None: 33 | return 34 | self.autocast.__exit__(*args, **kwargs) 35 | -------------------------------------------------------------------------------- /SongGeneration/third_party/stable_audio_tools/defaults.ini: -------------------------------------------------------------------------------- 1 | 2 | [DEFAULTS] 3 | 4 | #name of the run 5 | name = stable_audio_tools 6 | 7 | # the batch size 8 | batch_size = 8 9 | 10 | # number of GPUs to use for training 11 | num_gpus = 1 12 | 13 | # number of nodes to use for training 14 | num_nodes = 1 15 | 16 | # Multi-GPU strategy for PyTorch Lightning 17 | strategy = "" 18 | 19 | # Precision to use for training 20 | precision = "16-mixed" 21 | 22 | # number of CPU workers for the DataLoader 23 | num_workers = 8 24 | 25 | # the random seed 26 | seed = 42 27 | 28 | # Batches for gradient accumulation 29 | accum_batches = 1 30 | 31 | # Number of steps between checkpoints 32 | checkpoint_every = 10000 33 | 34 | # trainer checkpoint file to restart training from 35 | ckpt_path = '' 36 | 37 | # model checkpoint file to start a new training run from 38 | pretrained_ckpt_path = '' 39 | 40 | # Checkpoint path for the pretransform model if needed 41 | pretransform_ckpt_path = '' 42 | 43 | # configuration model specifying model hyperparameters 44 | model_config = '' 45 | 46 | # configuration for datasets 47 | dataset_config = '' 48 | 49 | # directory to save the checkpoints in 50 | save_dir = '' 51 | 52 | # gradient_clip_val passed into PyTorch Lightning Trainer 53 | gradient_clip_val = 0.0 54 | 55 | # remove the weight norm from the pretransform model 56 | remove_pretransform_weight_norm = '' -------------------------------------------------------------------------------- /SongGeneration/sample/lyrics.jsonl: -------------------------------------------------------------------------------- 1 | {"idx": "sample_01_autoprompt", 
"gt_lyric": "[intro-short] ; [verse] 雪花舞动在无尽的天际.情缘如同雪花般轻轻逝去.希望与真挚.永不磨灭.你的忧虑.随风而逝 ; [chorus] 我怀抱着守护这片梦境.在这世界中寻找爱与虚幻.苦辣酸甜.我们一起品尝.在雪的光芒中.紧紧相拥 ; [inst-short] ; [verse] 雪花再次在风中飘扬.情愿如同雪花般消失无踪.希望与真挚.永不消失.在痛苦与喧嚣中.你找到解脱 ; [chorus] 我环绕着守护这片梦境.在这世界中感受爱与虚假.苦辣酸甜.我们一起分享.在白银的光芒中.我们同在 ; [outro-short]", "auto_prompt_audio_type": "Auto"} 2 | {"idx": "sample_01_noprompt", "gt_lyric": "[intro-short] ; [verse] 雪花舞动在无尽的天际.情缘如同雪花般轻轻逝去.希望与真挚.永不磨灭.你的忧虑.随风而逝 ; [chorus] 我怀抱着守护这片梦境.在这世界中寻找爱与虚幻.苦辣酸甜.我们一起品尝.在雪的光芒中.紧紧相拥 ; [inst-short] ; [verse] 雪花再次在风中飘扬.情愿如同雪花般消失无踪.希望与真挚.永不消失.在痛苦与喧嚣中.你找到解脱 ; [chorus] 我环绕着守护这片梦境.在这世界中感受爱与虚假.苦辣酸甜.我们一起分享.在白银的光芒中.我们同在 ; [outro-short]"} 3 | {"idx": "sample_01_textprompt", "descriptions": "female, dark, pop, sad, piano and drums, the bpm is 125.", "gt_lyric": "[intro-short] ; [verse] 雪花舞动在无尽的天际.情缘如同雪花般轻轻逝去.希望与真挚.永不磨灭.你的忧虑.随风而逝 ; [chorus] 我怀抱着守护这片梦境.在这世界中寻找爱与虚幻.苦辣酸甜.我们一起品尝.在雪的光芒中.紧紧相拥 ; [inst-short] ; [verse] 雪花再次在风中飘扬.情愿如同雪花般消失无踪.希望与真挚.永不消失.在痛苦与喧嚣中.你找到解脱 ; [chorus] 我环绕着守护这片梦境.在这世界中感受爱与虚假.苦辣酸甜.我们一起分享.在白银的光芒中.我们同在 ; [outro-short]"} 4 | {"idx": "sample_01_audioprompt", "gt_lyric": "[intro-short] ; [verse] 雪花舞动在无尽的天际.情缘如同雪花般轻轻逝去.希望与真挚.永不磨灭.你的忧虑.随风而逝 ; [chorus] 我怀抱着守护这片梦境.在这世界中寻找爱与虚幻.苦辣酸甜.我们一起品尝.在雪的光芒中.紧紧相拥 ; [inst-short] ; [verse] 雪花再次在风中飘扬.情愿如同雪花般消失无踪.希望与真挚.永不消失.在痛苦与喧嚣中.你找到解脱 ; [chorus] 我环绕着守护这片梦境.在这世界中感受爱与虚假.苦辣酸甜.我们一起分享.在白银的光芒中.我们同在 ; [outro-short]", "prompt_audio_path": "input/sample_prompt_audio.wav"} 5 | -------------------------------------------------------------------------------- /SongGeneration/third_party/Qwen2-7B/tokenizer_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "add_prefix_space": false, 3 | "added_tokens_decoder": { 4 | "151643": { 5 | "content": "<|endoftext|>", 6 | "lstrip": false, 7 | "normalized": false, 8 | "rstrip": false, 9 | "single_word": false, 10 | "special": true 11 | }, 12 | "151644": { 13 | "content": "<|im_start|>", 14 | "lstrip": false, 15 | "normalized": false, 16 | "rstrip": false, 17 | "single_word": false, 18 | "special": true 19 | }, 20 | "151645": { 21 | "content": "<|im_end|>", 22 | "lstrip": false, 23 | "normalized": false, 24 | "rstrip": false, 25 | "single_word": false, 26 | "special": true 27 | } 28 | }, 29 | "additional_special_tokens": ["<|im_start|>", "<|im_end|>"], 30 | "bos_token": null, 31 | "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", 32 | "clean_up_tokenization_spaces": false, 33 | "eos_token": "<|endoftext|>", 34 | "errors": "replace", 35 | "model_max_length": 32768, 36 | "pad_token": "<|endoftext|>", 37 | "split_special_tokens": false, 38 | "tokenizer_class": "Qwen2Tokenizer", 39 | "unk_token": null 40 | } 41 | -------------------------------------------------------------------------------- /SongGeneration/third_party/stable_audio_tools/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='stable-audio-tools', 5 | version='0.0.16', 6 | url='https://github.com/Stability-AI/stable-audio-tools.git', 7 | author='Stability AI', 8 | description='Training and inference tools for generative audio models from Stability AI', 9 | packages=find_packages(), 10 | install_requires=[ 11 | 
'aeiou==0.0.20', 12 | 'alias-free-torch==0.0.6', 13 | 'auraloss==0.4.0', 14 | 'descript-audio-codec==1.0.0', 15 | 'einops==0.7.0', 16 | 'einops-exts==0.0.4', 17 | 'ema-pytorch==0.2.3', 18 | 'encodec==0.1.1', 19 | 'gradio>=3.42.0', 20 | 'huggingface_hub', 21 | 'importlib-resources==5.12.0', 22 | 'k-diffusion==0.1.1', 23 | 'laion-clap==1.1.4', 24 | 'local-attention==1.8.6', 25 | 'pandas==2.0.2', 26 | 'pedalboard==0.7.4', 27 | 'prefigure==0.0.9', 28 | 'pytorch_lightning==2.1.0', 29 | 'PyWavelets==1.4.1', 30 | 'safetensors', 31 | 'sentencepiece==0.1.99', 32 | 's3fs', 33 | 'torch>=2.0.1', 34 | 'torchaudio>=2.0.2', 35 | 'torchmetrics==0.11.4', 36 | 'tqdm', 37 | 'transformers', 38 | 'v-diffusion-pytorch==0.0.2', 39 | 'vector-quantize-pytorch==1.9.14', 40 | 'wandb==0.15.4', 41 | 'webdataset==0.2.48', 42 | 'x-transformers<1.27.0' 43 | ], 44 | ) -------------------------------------------------------------------------------- /SongGeneration/third_party/demucs/models/spec.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | @File : spec.py 5 | @Time : 2023/8/8 下午5:10 6 | @Author : waytan 7 | @Contact : waytan@tencent.com 8 | @License : (C)Copyright 2023, Tencent 9 | @Desc : Spec 10 | """ 11 | 12 | import torch as th 13 | 14 | 15 | def spectro(x, n_fft=512, hop_length=None, pad=0): 16 | *other, length = x.shape 17 | x = x.reshape(-1, length) 18 | is_mps = x.device.type == 'mps' 19 | if is_mps: 20 | x = x.cpu() 21 | z = th.stft(x, 22 | n_fft * (1 + pad), 23 | hop_length or n_fft // 4, 24 | window=th.hann_window(n_fft).to(x), 25 | win_length=n_fft, 26 | normalized=True, 27 | center=True, 28 | return_complex=True, 29 | pad_mode='reflect') 30 | _, freqs, frame = z.shape 31 | return z.view(*other, freqs, frame) 32 | 33 | 34 | def ispectro(z, hop_length=None, length=None, pad=0): 35 | *other, freqs, frames = z.shape 36 | n_fft = 2 * freqs - 2 37 | z = z.view(-1, freqs, frames) 38 | win_length = n_fft // (1 + pad) 39 | is_mps = z.device.type == 'mps' 40 | if is_mps: 41 | z = z.cpu() 42 | x = th.istft(z, 43 | n_fft, 44 | hop_length, 45 | window=th.hann_window(win_length).to(z.real), 46 | win_length=win_length, 47 | normalized=True, 48 | length=length, 49 | center=True) 50 | _, length = x.shape 51 | return x.view(*other, length) 52 | -------------------------------------------------------------------------------- /SongGeneration/third_party/stable_audio_tools/run_gradio.py: -------------------------------------------------------------------------------- 1 | from stable_audio_tools import get_pretrained_model 2 | from stable_audio_tools.interface.gradio import create_ui 3 | import json 4 | 5 | import torch 6 | 7 | def main(args): 8 | torch.manual_seed(42) 9 | 10 | interface = create_ui( 11 | model_config_path = args.model_config, 12 | ckpt_path=args.ckpt_path, 13 | pretrained_name=args.pretrained_name, 14 | pretransform_ckpt_path=args.pretransform_ckpt_path, 15 | model_half=args.model_half 16 | ) 17 | interface.queue() 18 | interface.launch(share=args.share, auth=(args.username, args.password) if args.username is not None else None) 19 | 20 | if __name__ == "__main__": 21 | import argparse 22 | parser = argparse.ArgumentParser(description='Run gradio interface') 23 | parser.add_argument('--pretrained-name', type=str, help='Name of pretrained model', required=False) 24 | parser.add_argument('--model-config', type=str, help='Path to model config', required=False) 25 | parser.add_argument('--ckpt-path', type=str, 
help='Path to model checkpoint', required=False) 26 | parser.add_argument('--pretransform-ckpt-path', type=str, help='Optional to model pretransform checkpoint', required=False) 27 | parser.add_argument('--share', action='store_true', help='Create a publicly shareable link', required=False) 28 | parser.add_argument('--username', type=str, help='Gradio username', required=False) 29 | parser.add_argument('--password', type=str, help='Gradio password', required=False) 30 | parser.add_argument('--model-half', action='store_true', help='Whether to use half precision', required=False) 31 | args = parser.parse_args() 32 | main(args) -------------------------------------------------------------------------------- /SongGeneration/third_party/dac/compare/encodec.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from audiotools import AudioSignal 3 | from audiotools.ml import BaseModel 4 | from encodec import EncodecModel 5 | 6 | 7 | class Encodec(BaseModel): 8 | def __init__(self, sample_rate: int = 24000, bandwidth: float = 24.0): 9 | super().__init__() 10 | 11 | if sample_rate == 24000: 12 | self.model = EncodecModel.encodec_model_24khz() 13 | else: 14 | self.model = EncodecModel.encodec_model_48khz() 15 | self.model.set_target_bandwidth(bandwidth) 16 | self.sample_rate = 44100 17 | 18 | def forward( 19 | self, 20 | audio_data: torch.Tensor, 21 | sample_rate: int = 44100, 22 | n_quantizers: int = None, 23 | ): 24 | signal = AudioSignal(audio_data, sample_rate) 25 | signal.resample(self.model.sample_rate) 26 | recons = self.model(signal.audio_data) 27 | recons = AudioSignal(recons, self.model.sample_rate) 28 | recons.resample(sample_rate) 29 | return {"audio": recons.audio_data} 30 | 31 | 32 | if __name__ == "__main__": 33 | import numpy as np 34 | from functools import partial 35 | 36 | model = Encodec() 37 | 38 | for n, m in model.named_modules(): 39 | o = m.extra_repr() 40 | p = sum([np.prod(p.size()) for p in m.parameters()]) 41 | fn = lambda o, p: o + f" {p/1e6:<.3f}M params." 
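# Illustrative note (not part of the original file): the setattr call below monkey-patches
# each submodule's extra_repr so that print(model) appends a parameter count in millions to
# every module line; partial(fn, o=o, p=p) freezes the values captured for that module.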
42 | setattr(m, "extra_repr", partial(fn, o=o, p=p)) 43 | print(model) 44 | print("Total # of params: ", sum([np.prod(p.size()) for p in model.parameters()])) 45 | 46 | length = 88200 * 2 47 | x = torch.randn(1, 1, length).to(model.device) 48 | x.requires_grad_(True) 49 | x.retain_grad() 50 | 51 | # Make a forward pass 52 | out = model(x)["audio"] 53 | 54 | print(x.shape, out.shape) 55 | -------------------------------------------------------------------------------- /SongGeneration/codeclm/tokenizer/Flow1dVAE/extract_codes_stereo_7_1x4.py: -------------------------------------------------------------------------------- 1 | import torch,torchaudio 2 | import os,sys,json 3 | from tqdm import tqdm 4 | 5 | #from codeclm_song_v1.codeclm.semantic_extractor.SpeechDecoder_v01.generate import Tango 6 | from .generate_4rvq import Tango 7 | import kaldiio 8 | from kaldiio import WriteHelper 9 | 10 | if __name__ == "__main__": 11 | # Define Model 12 | json_path = sys.argv[1] 13 | outdir = sys.argv[2] 14 | 15 | mus_infos = [] 16 | with open(json_path) as f: 17 | for line in f: 18 | item = json.loads(line) 19 | mus_infos.append(item) 20 | 21 | tango = Tango(model_path = './saved/model_4rvq/model_2_fixed.safetensors', rvq_num=4) 22 | 23 | 24 | # Feature extraction loop 25 | # for i in tqdm(range(2000)): 26 | with WriteHelper('ark,scp:{}/token.ark,{}/token.scp'.format(outdir, outdir), write_function="pickle") as writer: 27 | print('ark,scp:{}/token.ark,{}/token.scp'.format(outdir, outdir)) 28 | for item in tqdm(mus_infos): 29 | try: 30 | # if True: 31 | idx = item['idx'] 32 | # print(idx) 33 | with torch.autocast(device_type="cuda", dtype=torch.float16): 34 | if(os.path.exists(item['path'])): 35 | codes = tango.file2code(item['path']) 36 | else: 37 | codes = tango.file2code('/mnt/share/' + item['path']) 38 | writer(str(idx), codes.cpu()) 39 | except: 40 | print(item['path']) 41 | continue 42 | # idx = item['idx'] 43 | # # print(idx) 44 | # with torch.autocast(device_type="cuda", dtype=torch.float16): 45 | # codes = tango.file2code(item['path']) 46 | # writer(str(idx), codes.cpu()) -------------------------------------------------------------------------------- /SongGeneration/codeclm/tokenizer/Flow1dVAE/tools/mix.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def a_weight(fs, n_fft, min_db=-80.0): 5 | freq = np.linspace(0, fs // 2, n_fft // 2 + 1) 6 | freq_sq = np.power(freq, 2) 7 | freq_sq[0] = 1.0 8 | weight = 2.0 + 20.0 * (2 * np.log10(12194) + 2 * np.log10(freq_sq) 9 | - np.log10(freq_sq + 12194 ** 2) 10 | - np.log10(freq_sq + 20.6 ** 2) 11 | - 0.5 * np.log10(freq_sq + 107.7 ** 2) 12 | - 0.5 * np.log10(freq_sq + 737.9 ** 2)) 13 | weight = np.maximum(weight, min_db) 14 | 15 | return weight 16 | 17 | 18 | def compute_gain(sound, fs, min_db=-80.0, mode="A_weighting"): 19 | if fs == 16000: 20 | n_fft = 2048 21 | elif fs == 44100: 22 | n_fft = 4096 23 | else: 24 | raise Exception("Invalid fs {}".format(fs)) 25 | stride = n_fft // 2 26 | 27 | gain = [] 28 | for i in range(0, len(sound) - n_fft + 1, stride): 29 | if mode == "RMSE": 30 | g = np.mean(sound[i: i + n_fft] ** 2) 31 | elif mode == "A_weighting": 32 | spec = np.fft.rfft(np.hanning(n_fft + 1)[:-1] * sound[i: i + n_fft]) 33 | power_spec = np.abs(spec) ** 2 34 | a_weighted_spec = power_spec * np.power(10, a_weight(fs, n_fft) / 10) 35 | g = np.sum(a_weighted_spec) 36 | else: 37 | raise Exception("Invalid mode {}".format(mode)) 38 | gain.append(g) 39 | 40 | gain = np.array(gain) 
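# Illustrative note (not part of the original file): the clamp below keeps silent frames at the
# min_db floor, since 10 * log10(10 ** (min_db / 10)) equals min_db (-80 dB by default)
# rather than diverging to -inf for zero-energy frames.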
41 | gain = np.maximum(gain, np.power(10, min_db / 10)) 42 | gain_db = 10 * np.log10(gain) 43 | return gain_db 44 | 45 | 46 | def mix(sound1, sound2, r, fs): 47 | gain1 = np.max(compute_gain(sound1, fs)) # Decibel 48 | gain2 = np.max(compute_gain(sound2, fs)) 49 | t = 1.0 / (1 + np.power(10, (gain1 - gain2) / 20.) * (1 - r) / r) 50 | sound = ((sound1 * t + sound2 * (1 - t)) / np.sqrt(t ** 2 + (1 - t) ** 2)) 51 | return sound -------------------------------------------------------------------------------- /SongGeneration/codeclm/tokenizer/Flow1dVAE/tools/check_stereo.py: -------------------------------------------------------------------------------- 1 | ''' 2 | TEMPLATE = { 3 | "path": "" 4 | "duration": "" 5 | "sample_rate": "" 6 | "amplitude": null, 7 | "weight": null, 8 | "info_path": null 9 | } 10 | ''' 11 | import torchaudio 12 | import json 13 | from tqdm import tqdm 14 | 15 | import torchaudio 16 | import numpy as np 17 | import torch, torch.nn as nn, random 18 | from torchaudio import transforms 19 | import os 20 | import argparse 21 | from tqdm import tqdm 22 | import torchaudio 23 | from torchaudio.transforms import Resample 24 | from multiprocessing import Pool 25 | 26 | def preprocess(args, wav_json, thread_id): 27 | # f = open("pretrain_tme_20230927.scp").readlines() 28 | f = open("out.{}".format(thread_id), 'w') 29 | for line in tqdm(wav_json): 30 | try: 31 | # import pdb; pdb.set_trace() 32 | line = line.strip() 33 | wav_info = json.loads(line) 34 | meta = torchaudio.info(wav_info["path"]) 35 | 36 | wav_info["num_channels"] = meta.num_channels 37 | json_string = json.dumps(wav_info) 38 | # print(json_string) 39 | f.write("{}\n".format(json_string)) 40 | except: 41 | print(line) 42 | 43 | if __name__ == "__main__": 44 | 45 | parser = argparse.ArgumentParser(description='Check audio channel counts for a list of wav entries') 46 | parser.add_argument('--wav_json', type=str) 47 | parser.add_argument('--num_thread', default=10, type=int, help='number of worker processes') 48 | args = parser.parse_args() 49 | 50 | wav_json_total = open(args.wav_json).readlines() 51 | args.num_thread = min(len(wav_json_total), args.num_thread) 52 | wav_json_list = np.array_split(wav_json_total, args.num_thread) 53 | 54 | p = Pool(args.num_thread) 55 | for thread_id, wav_json in enumerate(wav_json_list): 56 | r = p.apply_async(preprocess, (args, wav_json, thread_id)) 57 | p.close() 58 | p.join() 59 | r.get() 60 | -------------------------------------------------------------------------------- /SongGeneration/codeclm/tokenizer/Flow1dVAE/tools/infer_encodec.py: -------------------------------------------------------------------------------- 1 | import json 2 | import torch 3 | from tqdm import tqdm 4 | from audiocraft.models.loaders import load_compression_model 5 | import torchaudio 6 | import librosa 7 | import os 8 | import math 9 | import numpy as np 10 | 11 | class Tango: 12 | def __init__(self, \ 13 | device="cuda:0"): 14 | 15 | self.sample_rate = 48000 16 | self.rsp48to32 = torchaudio.transforms.Resample(48000, 32000).to(device) 17 | self.rsp32to48 = torchaudio.transforms.Resample(32000, 48000).to(device) 18 | 19 | encodec = load_compression_model('compression_state_dict.bin', device='cpu').eval() 20 | encodec.set_num_codebooks(1) 21 | self.encodec = encodec.eval().to(device) 22 | self.device = torch.device(device) 23 | print ("Successfully loaded encodec model") 24 | 25 | @torch.no_grad() 26 | def remix(self, filename, start_step=1000, steps=999, disable_progress=False): 27 | """ Generate audio without condition. 
""" 28 | init_audio, _ = librosa.load(filename, sr=self.sample_rate, mono=False) 29 | if(len(init_audio.shape)>1):init_audio = init_audio[0] 30 | init_audio = torch.from_numpy(init_audio)[None,None,:].to(self.device) 31 | init_audio = init_audio[:,:,int(0*self.sample_rate):int(10.24*3*self.sample_rate)] 32 | if(init_audio.shape[-1]1):init_audio = init_audio[0] 33 | init_audio = torch.from_numpy(init_audio)[None,None,:].to(self.device) 34 | init_audio = init_audio[:,:,0:int(10.24*2*self.sample_rate)] 35 | if(init_audio.shape[-1] 25_000): 33 | print("GPU memory {}, run matrix cal".format(free_mem)) 34 | break 35 | else: 36 | print("GPU memory {}, sleep 1min".format(free_mem)) 37 | time.sleep(60) 38 | 39 | mus_infos = [] 40 | with open(json_path) as f: 41 | for line in f: 42 | item = json.loads(line) 43 | mus_infos.append(item) 44 | 45 | tango = Tango(model_path = './saved/model_2rvq/model_2_fixed.safetensors', rvq_num=2) 46 | 47 | 48 | # Feature extraction loop 49 | # for i in tqdm(range(2000)): 50 | with WriteHelper('ark,scp:{}/token.ark,{}/token.scp'.format(outdir, outdir), write_function="pickle") as writer: 51 | print('ark,scp:{}/token.ark,{}/token.scp'.format(outdir, outdir)) 52 | for item in tqdm(mus_infos): 53 | try: 54 | # if True: 55 | idx = item['idx'] 56 | # print(idx) 57 | with torch.autocast(device_type="cuda", dtype=torch.float16): 58 | if(os.path.exists(item['path'])): 59 | codes = tango.file2code(item['path']) 60 | else: 61 | codes = tango.file2code('/mnt/share/' + item['path']) 62 | writer(str(idx), codes.cpu()) 63 | except: 64 | print(item['path']) 65 | continue 66 | # idx = item['idx'] 67 | # # print(idx) 68 | # with torch.autocast(device_type="cuda", dtype=torch.float16): 69 | # codes = tango.file2code(item['path']) 70 | # writer(str(idx), codes.cpu()) -------------------------------------------------------------------------------- /SongGeneration/conf/w2v2_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "activation_dropout": 0.1, 3 | "adapter_kernel_size": 3, 4 | "adapter_stride": 2, 5 | "add_adapter": false, 6 | "apply_spec_augment": true, 7 | "architectures": [ 8 | "Wav2Vec2ConformerForCTC" 9 | ], 10 | "attention_dropout": 0.1, 11 | "bos_token_id": 1, 12 | "classifier_proj_size": 256, 13 | "codevector_dim": 768, 14 | "conformer_conv_dropout": 0.1, 15 | "contrastive_logits_temperature": 0.1, 16 | "conv_bias": true, 17 | "conv_depthwise_kernel_size": 31, 18 | "conv_dim": [ 19 | 512, 20 | 512, 21 | 512, 22 | 512, 23 | 512, 24 | 512, 25 | 512 26 | ], 27 | "conv_kernel": [ 28 | 10, 29 | 3, 30 | 3, 31 | 3, 32 | 3, 33 | 2, 34 | 2 35 | ], 36 | "conv_stride": [ 37 | 5, 38 | 2, 39 | 2, 40 | 2, 41 | 2, 42 | 2, 43 | 2 44 | ], 45 | "ctc_loss_reduction": "sum", 46 | "ctc_zero_infinity": false, 47 | "diversity_loss_weight": 0.1, 48 | "do_stable_layer_norm": true, 49 | "eos_token_id": 2, 50 | "feat_extract_activation": "gelu", 51 | "feat_extract_dropout": 0.0, 52 | "feat_extract_norm": "layer", 53 | "feat_proj_dropout": 0.1, 54 | "feat_quantizer_dropout": 0.0, 55 | "final_dropout": 0.1, 56 | "gradient_checkpointing": false, 57 | "hidden_act": "swish", 58 | "hidden_dropout": 0.1, 59 | "hidden_dropout_prob": 0.1, 60 | "hidden_size": 1024, 61 | "initializer_range": 0.02, 62 | "intermediate_size": 4096, 63 | "layer_norm_eps": 1e-05, 64 | "layerdrop": 0.0, 65 | "mask_feature_length": 10, 66 | "mask_feature_min_masks": 0, 67 | "mask_feature_prob": 0.0, 68 | "mask_time_length": 10, 69 | "mask_time_min_masks": 2, 70 | "mask_time_prob": 
0.05, 71 | "max_source_positions": 5000, 72 | "model_type": "wav2vec2-conformer", 73 | "num_adapter_layers": 3, 74 | "num_attention_heads": 16, 75 | "num_codevector_groups": 2, 76 | "num_codevectors_per_group": 320, 77 | "num_conv_pos_embedding_groups": 16, 78 | "num_conv_pos_embeddings": 128, 79 | "num_feat_extract_layers": 7, 80 | "num_hidden_layers": 24, 81 | "num_negatives": 100, 82 | "output_hidden_size": 1024, 83 | "pad_token_id": 0, 84 | "position_embeddings_type": "rotary", 85 | "proj_codevector_dim": 768, 86 | "rotary_embedding_base": 10000, 87 | "tdnn_dilation": [ 88 | 1, 89 | 2, 90 | 3, 91 | 1, 92 | 1 93 | ], 94 | "tdnn_dim": [ 95 | 512, 96 | 512, 97 | 512, 98 | 512, 99 | 1500 100 | ], 101 | "tdnn_kernel": [ 102 | 5, 103 | 3, 104 | 3, 105 | 1, 106 | 1 107 | ], 108 | "torch_dtype": "float32", 109 | "transformers_version": "4.19.0.dev0", 110 | "use_weighted_layer_sum": false, 111 | "vocab_size": 32, 112 | "xvector_output_dim": 512 113 | } 114 | -------------------------------------------------------------------------------- /SongGeneration/codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/run_training_eat.sh: -------------------------------------------------------------------------------- 1 | WORKER_RANK=${1:-$INDEX} 2 | PLATFORM=${2:-'shef'} 3 | YAML_NAME_WITHOUT_EXT=${3:-'MERT_RVQ-VAE_CQT_95M'} 4 | TRAINING_SETTING=${4:-'MERT_RVQ-VAE_CQT'} 5 | MASTER_PROC_ADD=${5:-$CHIEF_IP} 6 | DIST_PORT=${6:-'25520'} 7 | # echo $PATH 8 | # export PATH=$PATH:./ 9 | echo "worker rank ${WORKER_RANK}, master address ${MASTER_PROC_ADD}:${DIST_PORT}" 10 | 11 | MAP_PROJ_DIR=$(pwd) 12 | echo $MAP_PROJ_DIR 13 | 14 | NNODS=1 15 | BATCH_SIZE=12 16 | NUM_WOKERS=6 17 | 18 | run_command_prefix=' ' 19 | # Loading folders 20 | # 1. tsv files for audio paths 21 | # DATA_DIR=${MAP_PROJ_DIR}/data/audio_tsv 22 | DATA_DIR=${MAP_PROJ_DIR}/data/music4all_sh #audio_manifest 23 | # 2. working folder for saving checkpoints and loading config files 24 | CONFIG_DIR=/${MAP_PROJ_DIR}/mert_fairseq/config/pretrain 25 | # 3. clustering labels for training data 26 | LABEL_ROOT_DIR=${MAP_PROJ_DIR}/data/encodec_labels/custom_audio_dataset 27 | 28 | FAIRSEQ_PATH=${MAP_PROJ_DIR}/src/fairseq; 29 | SAVE_DIR=${MAP_PROJ_DIR}/data/fairseq_savedir/ 30 | 31 | case $YAML_NAME_WITHOUT_EXT in 32 | EAT_pretraining_music_multinodes) 33 | NNODS=4 34 | NPROCES_PER_NODE=8 35 | LABEL_RATE=25 36 | BATCH_SIZE=12 37 | ;; 38 | *) 39 | echo "Unknown running config: ${$YAML_NAME_WITHOUT_EXT}" 40 | exit 1 41 | ;; 42 | esac 43 | 44 | echo running $YAML_NAME_WITHOUT_EXT .. 
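# Illustrative note (not part of the original script): with the multinode setting above
# (NNODS=4, NPROCES_PER_NODE=8), the rank arithmetic computed below gives
# DISTRIBUTED_WORLD_SIZE = 4 * 8 = 32, and a worker with WORKER_RANK=2 starts at
# ACTUAL_WORKER_RANK = 2 * 8 = 16, so its local processes occupy global ranks 16-23.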
45 | 46 | mkdir -p ${SAVE_DIR} 47 | echo "checkpoint save at: ${SAVE_DIR}" 48 | cd ${SAVE_DIR} 49 | 50 | DISTRIBUTED_WORLD_SIZE=`expr ${NNODS} \* ${NPROCES_PER_NODE}` 51 | ACTUAL_WORKER_RANK=`expr ${WORKER_RANK} \* ${NPROCES_PER_NODE}` 52 | echo "worker rank ${WORKER_RANK}, master address ${MASTER_PROC_ADD}:${DIST_PORT}, actual rank ${ACTUAL_WORKER_RANK}" 53 | 54 | DATE_SUFFIX=`date +"%Y-%m-%d_%H-%M"` 55 | 56 | OMP_NUM_THREADS=6 ${run_command_prefix} \ 57 | python -u ${FAIRSEQ_PATH}/fairseq_cli/hydra_train.py \ 58 | --config-dir ${CONFIG_DIR} --config-name ${YAML_NAME_WITHOUT_EXT} \ 59 | common.user_dir=${MAP_PROJ_DIR}/mert_fairseq \ 60 | common.tensorboard_logdir=${MAP_PROJ_DIR}/logs/pretrain_tb_${TRAINING_SETTING}_${YAML_NAME_WITHOUT_EXT}_multinodes${NNODS} \ 61 | checkpoint.save_dir=${SAVE_DIR}/ckpt_${TRAINING_SETTING}_multinodes${NNODS}_${DATE_SUFFIX}/${YAML_NAME_WITHOUT_EXT} \ 62 | distributed_training.distributed_rank=${ACTUAL_WORKER_RANK} \ 63 | distributed_training.distributed_world_size=${DISTRIBUTED_WORLD_SIZE} \ 64 | distributed_training.distributed_num_procs=${DISTRIBUTED_WORLD_SIZE} \ 65 | distributed_training.nprocs_per_node=${NPROCES_PER_NODE} \ 66 | distributed_training.distributed_init_method="tcp://${CHIEF_IP}:${DIST_PORT}" \ 67 | task.data=${DATA_DIR} \ 68 | dataset.num_workers=${NUM_WOKERS} \ 69 | dataset.batch_size=${BATCH_SIZE} \ 70 | dataset.disable_validation=true \ 71 | 72 | # pip install h5py timm -i https://mirrors.tencent.com/pypi/simple/ -------------------------------------------------------------------------------- /SongGeneration/third_party/stable_audio_tools/stable_audio_tools/models/diffusion_prior.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | import typing as tp 3 | 4 | from .diffusion import ConditionedDiffusionModelWrapper 5 | from ..inference.generation import generate_diffusion_cond 6 | from ..inference.utils import prepare_audio 7 | 8 | import torch 9 | from torch.nn import functional as F 10 | from torchaudio import transforms as T 11 | 12 | # Define prior types enum 13 | class PriorType(Enum): 14 | MonoToStereo = 1 15 | 16 | class DiffusionPrior(ConditionedDiffusionModelWrapper): 17 | def __init__(self, *args, prior_type: PriorType=None, **kwargs): 18 | super().__init__(*args, **kwargs) 19 | self.prior_type = prior_type 20 | 21 | class MonoToStereoDiffusionPrior(DiffusionPrior): 22 | def __init__(self, *args, **kwargs): 23 | super().__init__(*args, prior_type=PriorType.MonoToStereo, **kwargs) 24 | 25 | def stereoize( 26 | self, 27 | audio: torch.Tensor, # (batch, channels, time) 28 | in_sr: int, 29 | steps: int, 30 | sampler_kwargs: dict = {}, 31 | ): 32 | """ 33 | Generate stereo audio from mono audio using a pre-trained diffusion prior 34 | 35 | Args: 36 | audio: The mono audio to convert to stereo 37 | in_sr: The sample rate of the input audio 38 | steps: The number of diffusion steps to run 39 | sampler_kwargs: Keyword arguments to pass to the diffusion sampler 40 | """ 41 | 42 | device = audio.device 43 | 44 | sample_rate = self.sample_rate 45 | 46 | # Resample input audio if necessary 47 | if in_sr != sample_rate: 48 | resample_tf = T.Resample(in_sr, sample_rate).to(audio.device) 49 | audio = resample_tf(audio) 50 | 51 | audio_length = audio.shape[-1] 52 | 53 | # Pad input audio to be compatible with the model 54 | min_length = self.min_input_length 55 | padded_input_length = audio_length + (min_length - (audio_length % min_length)) % min_length 56 | 57 | # Pad input audio 
to be compatible with the model 58 | if padded_input_length > audio_length: 59 | audio = F.pad(audio, (0, padded_input_length - audio_length)) 60 | 61 | # Make audio mono, duplicate to stereo 62 | dual_mono = audio.mean(1, keepdim=True).repeat(1, 2, 1) 63 | 64 | if self.pretransform is not None: 65 | dual_mono = self.pretransform.encode(dual_mono) 66 | 67 | conditioning = {"source": [dual_mono]} 68 | 69 | stereo_audio = generate_diffusion_cond( 70 | self, 71 | conditioning_tensors=conditioning, 72 | steps=steps, 73 | sample_size=padded_input_length, 74 | sample_rate=sample_rate, 75 | device=device, 76 | **sampler_kwargs, 77 | ) 78 | 79 | return stereo_audio -------------------------------------------------------------------------------- /SongGeneration/codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/models/musicfm/model/w2v2_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "activation_dropout": 0.1, 3 | "adapter_kernel_size": 3, 4 | "adapter_stride": 2, 5 | "add_adapter": false, 6 | "apply_spec_augment": true, 7 | "architectures": [ 8 | "Wav2Vec2ConformerForCTC" 9 | ], 10 | "attention_dropout": 0.1, 11 | "bos_token_id": 1, 12 | "classifier_proj_size": 256, 13 | "codevector_dim": 768, 14 | "conformer_conv_dropout": 0.1, 15 | "contrastive_logits_temperature": 0.1, 16 | "conv_bias": true, 17 | "conv_depthwise_kernel_size": 31, 18 | "conv_dim": [ 19 | 512, 20 | 512, 21 | 512, 22 | 512, 23 | 512, 24 | 512, 25 | 512 26 | ], 27 | "conv_kernel": [ 28 | 10, 29 | 3, 30 | 3, 31 | 3, 32 | 3, 33 | 2, 34 | 2 35 | ], 36 | "conv_stride": [ 37 | 5, 38 | 2, 39 | 2, 40 | 2, 41 | 2, 42 | 2, 43 | 2 44 | ], 45 | "ctc_loss_reduction": "sum", 46 | "ctc_zero_infinity": false, 47 | "diversity_loss_weight": 0.1, 48 | "do_stable_layer_norm": true, 49 | "eos_token_id": 2, 50 | "feat_extract_activation": "gelu", 51 | "feat_extract_dropout": 0.0, 52 | "feat_extract_norm": "layer", 53 | "feat_proj_dropout": 0.1, 54 | "feat_quantizer_dropout": 0.0, 55 | "final_dropout": 0.1, 56 | "gradient_checkpointing": false, 57 | "hidden_act": "swish", 58 | "hidden_dropout": 0.1, 59 | "hidden_dropout_prob": 0.1, 60 | "hidden_size": 1024, 61 | "initializer_range": 0.02, 62 | "intermediate_size": 4096, 63 | "layer_norm_eps": 1e-05, 64 | "layerdrop": 0.0, 65 | "mask_feature_length": 10, 66 | "mask_feature_min_masks": 0, 67 | "mask_feature_prob": 0.0, 68 | "mask_time_length": 10, 69 | "mask_time_min_masks": 2, 70 | "mask_time_prob": 0.05, 71 | "max_source_positions": 5000, 72 | "model_type": "wav2vec2-conformer", 73 | "num_adapter_layers": 3, 74 | "num_attention_heads": 16, 75 | "num_codevector_groups": 2, 76 | "num_codevectors_per_group": 320, 77 | "num_conv_pos_embedding_groups": 16, 78 | "num_conv_pos_embeddings": 128, 79 | "num_feat_extract_layers": 7, 80 | "num_hidden_layers": 24, 81 | "num_negatives": 100, 82 | "output_hidden_size": 1024, 83 | "pad_token_id": 0, 84 | "position_embeddings_type": "rotary", 85 | "proj_codevector_dim": 768, 86 | "rotary_embedding_base": 10000, 87 | "tdnn_dilation": [ 88 | 1, 89 | 2, 90 | 3, 91 | 1, 92 | 1 93 | ], 94 | "tdnn_dim": [ 95 | 512, 96 | 512, 97 | 512, 98 | 512, 99 | 1500 100 | ], 101 | "tdnn_kernel": [ 102 | 5, 103 | 3, 104 | 3, 105 | 1, 106 | 1 107 | ], 108 | "torch_dtype": "float32", 109 | "transformers_version": "4.19.0.dev0", 110 | "use_weighted_layer_sum": false, 111 | "vocab_size": 32, 112 | "xvector_output_dim": 512 113 | } 114 | -------------------------------------------------------------------------------- 
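The `w2v2_config.json` above is a standard Hugging Face Wav2Vec2-Conformer configuration. As a minimal, illustrative sketch (not part of the repository, and assuming a `transformers` release that ships the Wav2Vec2-Conformer classes), such a file can be loaded and inspected like this:

```
from transformers import Wav2Vec2ConformerConfig, Wav2Vec2ConformerModel

# Path is illustrative; the dump contains identical copies of this config under
# conf/ and under our_MERT_BESTRQ/mert_fairseq/models/musicfm/model/.
config = Wav2Vec2ConformerConfig.from_json_file("w2v2_config.json")
print(config.hidden_size, config.num_hidden_layers, config.position_embeddings_type)
# -> 1024 24 rotary

# Building a model from the config yields randomly initialised weights;
# pretrained checkpoints would have to be loaded separately.
model = Wav2Vec2ConformerModel(config)
```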
/SongGeneration/codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/config/pretrain/EAT_pretraining_AS2M.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | common: 4 | fp16: true 5 | log_format: json 6 | log_interval: 200 7 | tensorboard_logdir: tb 8 | min_loss_scale: 1e-6 9 | fp16_no_flatten_grads: true 10 | user_dir: ${env:PWD} 11 | seed: 1 12 | 13 | checkpoint: 14 | save_interval: 1 15 | save_interval_updates: 10000 16 | keep_interval_updates: 1 17 | no_epoch_checkpoints: true 18 | 19 | task: 20 | _name: mae_image_pretraining 21 | data: unbalanced_train 22 | rebuild_batches: true 23 | key: source 24 | precompute_mask_config: {} 25 | downsr_16hz: true 26 | audio_mae: true 27 | h5_format: false 28 | target_length: 1024 29 | flexible_mask: false 30 | 31 | dataset: 32 | num_workers: 10 33 | batch_size: 12 34 | skip_invalid_size_inputs_valid_test: true 35 | required_batch_size_multiple: 1 36 | disable_validation: true 37 | 38 | distributed_training: 39 | distributed_world_size: 4 40 | ddp_backend: c10d 41 | 42 | criterion: 43 | _name: model 44 | log_keys: 45 | - ema_decay 46 | - target_var 47 | - pred_var 48 | - model_norm 49 | - ema_norm 50 | - masked_pct 51 | 52 | optimization: 53 | max_update: 400000 54 | lr: [ 0.0005 ] 55 | debug_param_names: true 56 | clip_norm: 4 57 | 58 | optimizer: 59 | _name: composite 60 | dynamic_groups: true 61 | groups: 62 | default: 63 | lr_float: 0.0005 64 | optimizer: 65 | _name: adam 66 | adam_betas: [0.9,0.95] 67 | weight_decay: 0.05 68 | lr_scheduler: 69 | _name: cosine 70 | warmup_updates: 53333 71 | 72 | lr_scheduler: pass_through 73 | 74 | model: 75 | _name: data2vec_multi 76 | 77 | ema_decay: 0.9998 78 | ema_end_decay: 0.99999 79 | ema_anneal_end_step: 100000 80 | instance_norm_target_layer: true 81 | layer_norm_target_layer: false 82 | layer_norm_targets: true 83 | end_of_block_targets: false 84 | 85 | depth: 12 86 | average_top_k_layers: 12 87 | clone_batch: 16 88 | 89 | norm_eps: 1e-6 90 | 91 | min_target_var: 0 92 | min_pred_var: 0 93 | 94 | encoder_dropout: 0 95 | post_mlp_drop: 0 96 | attention_dropout: 0 97 | activation_dropout: 0 98 | 99 | supported_modality: IMAGE 100 | cls_loss: 1 101 | 102 | ema_encoder_only: false 103 | 104 | modalities: 105 | image: 106 | in_chans: 1 107 | inverse_mask: true 108 | mask_prob: 0.8 109 | mask_prob_adjust: 0.07 110 | mask_length: 5 111 | mask_noise_std: 0.01 112 | prenet_depth: 0 113 | ema_local_encoder: true 114 | num_extra_tokens: 1 115 | init_extra_token_zero: false 116 | use_alibi_encoder: false 117 | decoder: 118 | decoder_dim: 768 119 | decoder_groups: 16 120 | decoder_kernel: 3 121 | decoder_layers: 6 122 | input_dropout: 0 -------------------------------------------------------------------------------- /SongGeneration/codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/config/pretrain/MERT_RVQ-VAE_CQT_95M.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | common: 3 | fp16: false 4 | log_format: json 5 | log_interval: 200 6 | seed: 1337 7 | # tensorboard_logdir: tblog_proj_name 8 | # wandb_project: wandb_proj_name 9 | 10 | checkpoint: 11 | save_interval_updates: 25000 12 | keep_interval_updates: -1 13 | no_epoch_checkpoints: true 14 | 15 | 16 | distributed_training: 17 | ddp_backend: no_c10d 18 | distributed_backend: 'nccl' 19 | distributed_world_size: 64 20 | nprocs_per_node: 8 21 | find_unused_parameters: true 22 | 23 | task: 24 | _name: mert_pretraining 25 | 
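# Illustrative note (not part of the original config): the `???` values below are
# OmegaConf/Hydra mandatory placeholders. They must be overridden at launch time
# (e.g. task.data=/path/to/tsv task.label_dir=/path/to/labels), otherwise resolution
# fails with a MissingMandatoryValue error.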
data: ??? 26 | label_dir: ??? 27 | labels: ??? 28 | label_rate: ${model.label_rate} 29 | sample_rate: 24000 30 | # crop to 5s 31 | max_sample_size: 120000 32 | min_sample_size: 72000 33 | 34 | pad_audio: false 35 | random_crop: true 36 | normalize: false # must be consistent with extractor 37 | 38 | 39 | dataset: 40 | num_workers: 6 41 | max_tokens: 2000000 42 | skip_invalid_size_inputs_valid_test: true 43 | validate_interval: 1 44 | validate_interval_updates: 10000 45 | 46 | criterion: 47 | _name: hubert 48 | pred_masked_weight: 1.0 49 | pred_nomask_weight: 0.0 50 | loss_weights: [10, 1] 51 | 52 | optimization: 53 | max_update: 400000 54 | lr: [0.0005] 55 | clip_norm: 10.0 56 | 57 | optimizer: 58 | _name: adam 59 | adam_betas: (0.9,0.98) 60 | adam_eps: 1e-06 61 | weight_decay: 0.01 62 | 63 | lr_scheduler: 64 | _name: polynomial_decay 65 | warmup_updates: 32000 66 | 67 | model: 68 | _name: mert 69 | label_rate: ??? 70 | skip_masked: false 71 | skip_nomask: true 72 | mask_prob: 0.8 73 | mask_length: 5 74 | 75 | logit_temp: 0.1 76 | 77 | # ----- mixture ------ 78 | mixture_prob: 0.5 79 | inbatch_noise_augment_len_range: "[12000, 24000]" 80 | inbatch_noise_augment_number_range: "[1, 3]" 81 | inbatch_noise_augment_volume: 1.0 82 | # ------------------------ 83 | extractor_mode: default 84 | audio_extract_type: w2v_conv 85 | conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2' 86 | 87 | # ---- cqt reconstruction, need to add loss weight --- 88 | audio_cqt_loss_m: true 89 | audio_cqt_bins: 336 90 | # ----------- 91 | final_dim: 64 92 | encoder_layerdrop: 0.05 93 | dropout_input: 0.1 94 | dropout_features: 0.1 95 | dropout: 0.1 96 | attention_dropout: 0.1 97 | feature_grad_mult: 0.1 98 | untie_final_proj: true 99 | activation_dropout: 0.0 100 | 101 | 102 | hydra: 103 | job: 104 | config: 105 | override_dirname: 106 | kv_sep: '-' 107 | item_sep: '__' 108 | exclude_keys: 109 | - run 110 | - task.data 111 | - task.label_dir 112 | run: 113 | dir: ??? 114 | sweep: 115 | dir: ??? 
116 | subdir: ${hydra.job.config_name}__${hydra.job.override_dirname} 117 | -------------------------------------------------------------------------------- /SongGeneration/third_party/stable_audio_tools/stable_audio_tools/configs/model_configs/autoencoders/encodec_musicgen_rvq.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_type": "autoencoder", 3 | "sample_size": 32000, 4 | "sample_rate": 32000, 5 | "audio_channels": 1, 6 | "model": { 7 | "encoder": { 8 | "type": "seanet", 9 | "config": { 10 | "channels": 1, 11 | "dimension": 128, 12 | "n_filters": 64, 13 | "ratios": [4, 4, 5, 8], 14 | "n_residual_layers": 1, 15 | "dilation_base": 2, 16 | "lstm": 2, 17 | "norm": "weight_norm" 18 | } 19 | }, 20 | "decoder": { 21 | "type": "seanet", 22 | "config": { 23 | "channels": 1, 24 | "dimension": 128, 25 | "n_filters": 64, 26 | "ratios": [4, 4, 5, 8], 27 | "n_residual_layers": 1, 28 | "dilation_base": 2, 29 | "lstm": 2, 30 | "norm": "weight_norm" 31 | } 32 | }, 33 | "bottleneck": { 34 | "type": "rvq", 35 | "config": { 36 | "num_quantizers": 4, 37 | "codebook_size": 2048, 38 | "dim": 128, 39 | "decay": 0.99, 40 | "threshold_ema_dead_code": 2 41 | } 42 | }, 43 | "latent_dim": 128, 44 | "downsampling_ratio": 640, 45 | "io_channels": 1 46 | }, 47 | "training": { 48 | "learning_rate": 1e-4, 49 | "warmup_steps": 0, 50 | "use_ema": true, 51 | "loss_configs": { 52 | "discriminator": { 53 | "type": "encodec", 54 | "config": { 55 | "filters": 32, 56 | "n_ffts": [2048, 1024, 512, 256, 128], 57 | "hop_lengths": [512, 256, 128, 64, 32], 58 | "win_lengths": [2048, 1024, 512, 256, 128] 59 | }, 60 | "weights": { 61 | "adversarial": 0.1, 62 | "feature_matching": 5.0 63 | } 64 | }, 65 | "spectral": { 66 | "type": "mrstft", 67 | "config": { 68 | "fft_sizes": [2048, 1024, 512, 256, 128, 64, 32], 69 | "hop_sizes": [512, 256, 128, 64, 32, 16, 8], 70 | "win_lengths": [2048, 1024, 512, 256, 128, 64, 32], 71 | "perceptual_weighting": true 72 | }, 73 | "weights": { 74 | "mrstft": 1.0 75 | } 76 | }, 77 | "time": { 78 | "type": "l1", 79 | "weights": { 80 | "l1": 0.0 81 | } 82 | } 83 | }, 84 | "demo": { 85 | "demo_every": 2000 86 | } 87 | } 88 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ComfyUI_SongGeneration 2 | [SongGeneration](https://github.com/tencent-ailab/SongGeneration):High-Quality Song Generation with Multi-Preference Alignment (SOTA),you can try VRAM>12G 3 | 4 | # Update 5 | * 11/22 修复入参顺序颠倒的错误,修复一个找很久没找到的print,并修复其模块导入问题 6 | * 10/23 同步官方代码,删除fairseq库,已无安装难度; 7 | * 10/21同步官方代码,精简模型加载,删除hubert模型,优化lm模型加载顺序,避免转移到显存时峰值OOM; 8 | * 10/18 修改加载流程,支持最新的full ,new,large模型,large模型12GVram可能会OOM,修复高版本transformer 的函数错误/Modify the loading process to support the latest full, new, and large models, and fix function errors in higher versions of transformers 9 | * 07/29,支持bgm和人声(vocal,目前还是有bgm底噪)单独输出,选择mixed为合成全部,模型加载方式更合理,去掉诸多debug打印,新增save_separate按钮,开启则保存三个音频(bgm,vocal,mixed); 10 | * Test env(插件测试环境):window11,python3.11, torch2.6 ,cu124, VR12G,(transformers 4.45.1) 11 | 12 | 13 | # 1. Installation 14 | 15 | In the ./ComfyUI/custom_nodes directory, run the following: 16 | ``` 17 | git clone https://github.com/smthemex/ComfyUI_SongGeneration.git 18 | ``` 19 | 20 | # 2. 
Requirements 21 | 22 | * 如果缺失库,打开requirements_orgin.txt文件,看是少了哪个,手动安装; 23 | * If the library is missing, open the ’requirements_orgin.txt‘ file and see which one is missing, then manually install it; 24 | 25 | ``` 26 | pip install -r requirements.txt 27 | ``` 28 | 29 | # 3.Model 30 | * 3.1.1 download ckpt from [tencent/SongGeneration](https://huggingface.co/tencent/SongGeneration/tree/main) 国内建议魔搭[AI-ModelScope/SongGeneration](https://www.modelscope.cn/models/AI-ModelScope/SongGeneration/files) 31 | * 3.1.2 [new base](https://huggingface.co/lglg666/SongGeneration-base-new),[large ](https://huggingface.co/lglg666/SongGeneration-large),[full](https://huggingface.co/lglg666/SongGeneration-base-full) 32 | * 3.1.3 new prompt,[emb](https://github.com/tencent-ailab/SongGeneration/tree/main/tools) 33 | * 3.1.4 download htdemucs.pth [tencent/SongGeneration](https://huggingface.co/tencent/SongGeneration/tree/main/third_party/demucs/ckpt) 34 | * 文件结构如下,修改了加载流程,原来的结构也能用: 35 | ``` 36 | -- ComfyUI/models/SongGeneration/ # 24.4G all 整个文件夹的大小 37 | |-- htdemucs.pth #150M 38 | |--prompt.pt # 3M 39 | |--new_prompt.pt # 3M 40 | |--model_2.safetensors 41 | |--model_2_fixed.safetensors 42 | |--new_model.pt # rename from model.pt #可选 43 | |--large_model.pt # rename from model.pt #可选 44 | |-- ckpt/ 45 | |--encode-s12k.pt # 3.68G 46 | -- ComfyUI/models/vae/ 47 | |--autoencoder_music_1320k.ckpt 48 | ``` 49 | # 4 Example 50 | ![](https://github.com/smthemex/ComfyUI_SongGeneration/blob/main/example_workflows/SongGeneration.png) 51 | 52 | # 5 Citation 53 | ``` 54 | @article{lei2025levo, 55 | title={LeVo: High-Quality Song Generation with Multi-Preference Alignment}, 56 | author={Lei, Shun and Xu, Yaoxun and Lin, Zhiwei and Zhang, Huaicheng and Tan, Wei and Chen, Hangting and Yu, Jianwei and Zhang, Yixuan and Yang, Chenyu and Zhu, Haina and Wang, Shuai and Wu, Zhiyong and Yu, Dong}, 57 | journal={arXiv preprint arXiv:2506.07520}, 58 | year={2025} 59 | } 60 | ``` 61 | 62 | 63 | 64 | -------------------------------------------------------------------------------- /SongGeneration/codeclm/models/llama/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 EleutherAI and The HuggingFace Inc. team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
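# Illustrative note (not part of the original file): this module follows the Hugging Face
# lazy-import pattern seen below -- _import_structure maps submodules to the symbols they
# provide, and _LazyModule replaces the module in sys.modules so the heavy torch/tokenizer
# imports only run when an attribute such as LlamaForCausalLM is first accessed.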
14 | from typing import TYPE_CHECKING 15 | 16 | from transformers.utils import ( 17 | OptionalDependencyNotAvailable, 18 | _LazyModule, 19 | is_sentencepiece_available, 20 | is_tokenizers_available, 21 | is_torch_available, 22 | ) 23 | 24 | 25 | _import_structure = { 26 | "configuration_llama": ["LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP", "LlamaConfig"], 27 | } 28 | 29 | try: 30 | if not is_sentencepiece_available(): 31 | raise OptionalDependencyNotAvailable() 32 | except OptionalDependencyNotAvailable: 33 | pass 34 | else: 35 | _import_structure["tokenization_llama"] = ["LlamaTokenizer"] 36 | 37 | try: 38 | if not is_tokenizers_available(): 39 | raise OptionalDependencyNotAvailable() 40 | except OptionalDependencyNotAvailable: 41 | pass 42 | else: 43 | _import_structure["tokenization_llama_fast"] = ["LlamaTokenizerFast"] 44 | 45 | try: 46 | if not is_torch_available(): 47 | raise OptionalDependencyNotAvailable() 48 | except OptionalDependencyNotAvailable: 49 | pass 50 | else: 51 | _import_structure["modeling_llama"] = [ 52 | "LlamaForCausalLM", 53 | "LlamaModel", 54 | "LlamaPreTrainedModel", 55 | "LlamaForSequenceClassification", 56 | ] 57 | 58 | 59 | if TYPE_CHECKING: 60 | from .configuration_llama import LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP, LlamaConfig 61 | 62 | try: 63 | if not is_sentencepiece_available(): 64 | raise OptionalDependencyNotAvailable() 65 | except OptionalDependencyNotAvailable: 66 | pass 67 | else: 68 | from .tokenization_llama import LlamaTokenizer 69 | 70 | try: 71 | if not is_tokenizers_available(): 72 | raise OptionalDependencyNotAvailable() 73 | except OptionalDependencyNotAvailable: 74 | pass 75 | else: 76 | from .tokenization_llama_fast import LlamaTokenizerFast 77 | 78 | try: 79 | if not is_torch_available(): 80 | raise OptionalDependencyNotAvailable() 81 | except OptionalDependencyNotAvailable: 82 | pass 83 | else: 84 | from .modeling_llama import LlamaForCausalLM, LlamaForSequenceClassification, LlamaModel, LlamaPreTrainedModel 85 | 86 | 87 | else: 88 | import sys 89 | 90 | sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) 91 | -------------------------------------------------------------------------------- /SongGeneration/codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/config/pretrain/EAT_pretraining_music_multinodes.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | common: 4 | fp16: true 5 | log_format: json 6 | log_interval: 200 7 | tensorboard_logdir: tb 8 | min_loss_scale: 1e-6 9 | fp16_no_flatten_grads: true 10 | user_dir: ${env:PWD} 11 | seed: 1 12 | 13 | checkpoint: 14 | save_interval: 1 15 | save_interval_updates: 10000 16 | keep_interval_updates: 1000 17 | no_epoch_checkpoints: true 18 | 19 | task: 20 | _name: mae_image_pretraining 21 | data: music4all_sh/ 22 | rebuild_batches: true 23 | key: source 24 | precompute_mask_config: {} 25 | downsr_16hz: false 26 | audio_mae: true 27 | h5_format: false 28 | target_length: 752 29 | flexible_mask: false 30 | sample_rate: 24000 31 | fixed_duration: 30 32 | 33 | dataset: 34 | num_workers: 10 35 | batch_size: 12 36 | skip_invalid_size_inputs_valid_test: true 37 | required_batch_size_multiple: 1 38 | disable_validation: true 39 | 40 | distributed_training: 41 | distributed_world_size: 4 42 | ddp_backend: c10d 43 | 44 | criterion: 45 | _name: model 46 | log_keys: 47 | - ema_decay 48 | - target_var 49 | - pred_var 50 | - model_norm 51 | - ema_norm 52 | - masked_pct 53 | 54 | 
optimization: 55 | max_update: 400000 56 | lr: [ 0.0001 ] 57 | # debug_param_names: true 58 | clip_norm: 4 59 | 60 | optimizer: 61 | _name: composite 62 | # dynamic_groups: true 63 | groups: 64 | default: 65 | lr_float: 0.0005 66 | optimizer: 67 | _name: adam 68 | adam_betas: [0.9,0.95] 69 | weight_decay: 0.05 70 | lr_scheduler: 71 | _name: cosine 72 | warmup_updates: 10000 # 53333 73 | 74 | lr_scheduler: pass_through 75 | 76 | model: 77 | _name: data2vec_multi 78 | 79 | ema_decay: 0.9998 80 | ema_end_decay: 0.99999 81 | ema_anneal_end_step: 100000 82 | instance_norm_target_layer: true 83 | layer_norm_target_layer: false 84 | layer_norm_targets: true 85 | end_of_block_targets: false 86 | 87 | depth: 12 88 | average_top_k_layers: 12 89 | clone_batch: 16 90 | 91 | norm_eps: 1e-6 92 | 93 | min_target_var: 0 94 | min_pred_var: 0 95 | 96 | encoder_dropout: 0 97 | post_mlp_drop: 0 98 | attention_dropout: 0 99 | activation_dropout: 0 100 | 101 | supported_modality: IMAGE 102 | cls_loss: 1 103 | 104 | ema_encoder_only: false 105 | 106 | modalities: 107 | image: 108 | in_chans: 1 109 | inverse_mask: true 110 | mask_prob: 0.8 111 | mask_prob_adjust: 0.07 112 | mask_length: 5 113 | mask_noise_std: 0.01 114 | prenet_depth: 0 115 | ema_local_encoder: true 116 | num_extra_tokens: 1 117 | init_extra_token_zero: false 118 | use_alibi_encoder: false 119 | decoder: 120 | decoder_dim: 768 121 | decoder_groups: 16 122 | decoder_kernel: 3 123 | decoder_layers: 6 124 | input_dropout: 0 125 | target_length: 752 -------------------------------------------------------------------------------- /SongGeneration/codeclm/tokenizer/Flow1dVAE/tools/infer_hifigan48k_speech.py: -------------------------------------------------------------------------------- 1 | import json 2 | import torch 3 | from tqdm import tqdm 4 | import torchaudio 5 | import librosa 6 | import os 7 | import math 8 | import numpy as np 9 | from .get_melvaehifigan48k import build_pretrained_models 10 | from . import torch_tools as torch_tools 11 | 12 | class Tango: 13 | def __init__(self, \ 14 | device="cuda:0"): 15 | 16 | self.sample_rate = 48000 17 | self.device = device 18 | 19 | self.vae, self.stft = build_pretrained_models() 20 | self.vae, self.stft = self.vae.eval().to(device), self.stft.eval().to(device) 21 | 22 | def mel_spectrogram_to_waveform(self, mel_spectrogram): 23 | if mel_spectrogram.dim() == 4: 24 | mel_spectrogram = mel_spectrogram.squeeze(1) 25 | 26 | waveform = self.vocoder(mel_spectrogram) 27 | # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 28 | waveform = waveform.cpu().float() 29 | return waveform 30 | 31 | def sound2sound_generate_longterm(self, fname, batch_size=1, duration=10.24, steps=200, disable_progress=False): 32 | """ Genrate audio without condition. """ 33 | num_frames = math.ceil(duration * 100. 
/ 8) 34 | with torch.no_grad(): 35 | orig_samples, fs = torchaudio.load(fname) 36 | if(orig_samples.shape[-1] (b n) e") 56 | 57 | # L2 normalization 58 | normalized_x = nn.functional.normalize(x, dim=1, p=2) 59 | normalized_codebook = nn.functional.normalize(self.codebook, dim=1, p=2) 60 | 61 | # compute distances 62 | distances = torch.cdist(normalized_codebook, normalized_x) 63 | 64 | # get nearest 65 | nearest_indices = torch.argmin(distances, dim=0) 66 | 67 | # reshape 68 | xq = rearrange(nearest_indices, "(b n) -> b n", b=b) 69 | 70 | return xq 71 | 72 | @torch.no_grad() 73 | def forward(self, x): 74 | # always eval 75 | self.eval() 76 | 77 | # random projection [batch, length, input_dim] -> [batch, length, codebook_dim] 78 | x = einsum("b n d, d e -> b n e", x, self.random_projection) 79 | 80 | # codebook lookup 81 | xq = self.codebook_lookup(x) 82 | 83 | return xq 84 | -------------------------------------------------------------------------------- /SongGeneration/codeclm/tokenizer/Flow1dVAE/extract_codes_stereo_7_1x4_ds.py: -------------------------------------------------------------------------------- 1 | import torch,torchaudio 2 | import os,sys,json 3 | from tqdm import tqdm 4 | 5 | #from codeclm_song_v1.codeclm.semantic_extractor.SpeechDecoder_v01.generate import Tango 6 | from .generate_4rvq import Tango 7 | import kaldiio 8 | from kaldiio import WriteHelper 9 | import torch 10 | import subprocess 11 | import time 12 | import sys 13 | 14 | def get_gpu_memory(): 15 | _output_to_list = lambda x: x.decode('ascii').split('\n')[:-1] 16 | 17 | ACCEPTABLE_AVAILABLE_MEMORY = 1024 18 | COMMAND = "nvidia-smi --query-gpu=memory.free --format=csv" 19 | memory_free_info = _output_to_list(subprocess.check_output(COMMAND.split()))[1:] 20 | memory_free_values = [int(x.split()[0]) for i, x in enumerate(memory_free_info)] 21 | return memory_free_values 22 | 23 | if __name__ == "__main__": 24 | # Define Model 25 | json_path = sys.argv[1] 26 | outdir = sys.argv[2] 27 | ds = int(sys.argv[3]) 28 | 29 | gpu_idx = int(os.environ['CUDA_VISIBLE_DEVICES']) 30 | while True: 31 | free_mem = get_gpu_memory() 32 | free_mem = free_mem[gpu_idx] 33 | if(free_mem > 25_000): 34 | print("GPU memory {}, run matrix cal".format(free_mem)) 35 | break 36 | else: 37 | print("GPU memory {}, sleep 1min".format(free_mem)) 38 | time.sleep(60) 39 | 40 | mus_infos = [] 41 | with open(json_path) as f: 42 | for line in f: 43 | item = json.loads(line) 44 | mus_infos.append(item) 45 | 46 | tango = Tango(model_path = './saved/model_4rvq/model_2_fixed.safetensors', rvq_num=4) 47 | 48 | 49 | # Feature extraction loop 50 | # for i in tqdm(range(2000)): 51 | with WriteHelper('ark,scp:{}/token.ark,{}/token.scp'.format(outdir, outdir), write_function="pickle") as writer: 52 | print('ark,scp:{}/token.ark,{}/token.scp'.format(outdir, outdir)) 53 | bar = torch.zeros(4, 16384) 54 | for item_idx, item in tqdm(enumerate(mus_infos)): 55 | try: 56 | # if True: 57 | idx = item['idx'] 58 | # print(idx) 59 | with torch.autocast(device_type="cuda", dtype=torch.float16): 60 | if(os.path.exists(item['path'])): 61 | codes = tango.file2code_ds(item['path'], ds) 62 | else: 63 | codes = tango.file2code_ds('/mnt/share/' + item['path'], ds) 64 | codes = codes.cpu() 65 | writer(str(idx), codes) 66 | for i0 in range(codes.shape[-1]): 67 | bar[0, codes[0, 0, i0]] += 1 68 | bar[1, codes[0, 1, i0]] += 1 69 | bar[2, codes[0, 2, i0]] += 1 70 | bar[3, codes[0, 3, i0]] += 1 71 | except Exception as e: 72 | print(item['path']) 73 | # print(e.message, e.args) 74 | 
# exit(1) 75 | continue 76 | 77 | if(item_idx % 1000 == 0): 78 | print("=========") 79 | print(1 - (bar[0]==0).sum() / bar.shape[-1]) 80 | print("=========") 81 | 82 | # idx = item['idx'] 83 | # # print(idx) 84 | # with torch.autocast(device_type="cuda", dtype=torch.float16): 85 | # codes = tango.file2code(item['path']) 86 | # writer(str(idx), codes.cpu()) -------------------------------------------------------------------------------- /SongGeneration/codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/config/pretrain/MERT_RVQ-VAE_CQT_95M_bestrq_chroma_multinodes.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | common: 3 | fp16: false 4 | log_format: json 5 | log_interval: 200 6 | seed: 1337 7 | # tensorboard_logdir: tblog_proj_name 8 | # wandb_project: wandb_proj_name 9 | 10 | checkpoint: 11 | save_interval_updates: 12500 12 | keep_interval_updates: -1 13 | no_epoch_checkpoints: true 14 | 15 | 16 | distributed_training: 17 | ddp_backend: no_c10d 18 | distributed_backend: 'nccl' 19 | distributed_world_size: 64 20 | nprocs_per_node: 8 21 | find_unused_parameters: true 22 | 23 | task: 24 | _name: mert_pretraining 25 | data: ??? 26 | label_dir: ??? 27 | labels: ??? 28 | label_rate: ${model.label_rate} 29 | sample_rate: 24000 30 | # crop to 5s 31 | max_sample_size: 120000 32 | min_sample_size: 72000 33 | 34 | pad_audio: false 35 | random_crop: true 36 | normalize: false # must be consistent with extractor 37 | 38 | 39 | dataset: 40 | num_workers: 6 41 | max_tokens: 2000000 42 | skip_invalid_size_inputs_valid_test: true 43 | validate_interval: 1 44 | validate_interval_updates: 10000 45 | 46 | criterion: 47 | _name: hubert 48 | pred_masked_weight: 1.0 49 | pred_nomask_weight: 0.0 50 | loss_weights: [10, 1] 51 | 52 | optimization: 53 | max_update: 400000 54 | lr: [0.0005] 55 | clip_norm: 10.0 56 | update_freq: [4] 57 | 58 | optimizer: 59 | _name: adam 60 | adam_betas: (0.9,0.98) 61 | adam_eps: 1e-06 62 | weight_decay: 0.01 63 | 64 | lr_scheduler: 65 | _name: polynomial_decay 66 | warmup_updates: 32000 67 | 68 | model: 69 | _name: mert 70 | label_rate: ??? 
71 | skip_masked: false 72 | skip_nomask: true 73 | mask_prob: 0.8 74 | mask_length: 5 75 | 76 | logit_temp: 0.1 77 | 78 | # ----- mixture ------ 79 | mixture_prob: 0.5 80 | inbatch_noise_augment_len_range: "[12000, 24000]" 81 | inbatch_noise_augment_number_range: "[1, 3]" 82 | inbatch_noise_augment_volume: 1.0 83 | # ------------------------ 84 | extractor_mode: default 85 | audio_extract_type: melspec # use melspec (instead of `w2v_conv`) 86 | melspec_n_bins: 120 # for melspec we use 120, means 12 bins per octave 87 | conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2' 88 | 89 | # best-rq loss 90 | audio_rq_loss_m: true 91 | audio_rq_loss_embed_dim: 16 92 | audio_rq_loss_num_codebooks: 1 93 | audio_rq_loss_num_embeds: 8192 94 | audio_rq_loss_seed: 42 95 | audio_rq_loss_use_norm: true 96 | audio_rq_loss_use_chroma: true 97 | audio_rq_loss_seed_chroma: 123 98 | 99 | # ---- cqt reconstruction, need to add loss weight --- 100 | audio_cqt_loss_m: true 101 | audio_cqt_bins: 336 102 | # ----------- 103 | final_dim: 32 104 | encoder_layerdrop: 0.05 105 | dropout_input: 0.1 106 | dropout_features: 0.1 107 | dropout: 0.1 108 | attention_dropout: 0.1 109 | feature_grad_mult: 0.1 110 | untie_final_proj: true 111 | activation_dropout: 0.0 112 | 113 | 114 | hydra: 115 | job: 116 | config: 117 | override_dirname: 118 | kv_sep: '-' 119 | item_sep: '__' 120 | exclude_keys: 121 | - run 122 | - task.data 123 | - task.label_dir 124 | run: 125 | dir: ??? 126 | sweep: 127 | dir: ??? 128 | subdir: ${hydra.job.config_name}__${hydra.job.override_dirname} 129 | -------------------------------------------------------------------------------- /SongGeneration/codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/config/pretrain/MERT_RVQ-VAE_CQT_95M_bestrq_norm_multinodes.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | common: 3 | fp16: false 4 | log_format: json 5 | log_interval: 200 6 | seed: 1337 7 | # tensorboard_logdir: tblog_proj_name 8 | # wandb_project: wandb_proj_name 9 | 10 | checkpoint: 11 | save_interval_updates: 12500 12 | keep_interval_updates: -1 13 | no_epoch_checkpoints: true 14 | 15 | 16 | distributed_training: 17 | ddp_backend: no_c10d 18 | distributed_backend: 'nccl' 19 | distributed_world_size: 64 20 | nprocs_per_node: 8 21 | find_unused_parameters: true 22 | 23 | task: 24 | _name: mert_pretraining 25 | data: ??? 26 | label_dir: ??? 27 | labels: ??? 28 | label_rate: ${model.label_rate} 29 | sample_rate: 24000 30 | # crop to 5s 31 | max_sample_size: 120000 32 | min_sample_size: 72000 33 | 34 | pad_audio: false 35 | random_crop: true 36 | normalize: false # must be consistent with extractor 37 | 38 | 39 | dataset: 40 | num_workers: 6 41 | max_tokens: 2000000 42 | skip_invalid_size_inputs_valid_test: true 43 | validate_interval: 1 44 | validate_interval_updates: 10000 45 | 46 | criterion: 47 | _name: hubert 48 | pred_masked_weight: 1.0 49 | pred_nomask_weight: 0.0 50 | loss_weights: [10, 1] 51 | 52 | optimization: 53 | max_update: 400000 54 | lr: [0.0005] 55 | clip_norm: 10.0 56 | update_freq: [4] 57 | 58 | optimizer: 59 | _name: adam 60 | adam_betas: (0.9,0.98) 61 | adam_eps: 1e-06 62 | weight_decay: 0.01 63 | 64 | lr_scheduler: 65 | _name: polynomial_decay 66 | warmup_updates: 32000 67 | 68 | model: 69 | _name: mert 70 | label_rate: ??? 
71 | skip_masked: false 72 | skip_nomask: true 73 | mask_prob: 0.8 74 | mask_length: 5 75 | 76 | logit_temp: 0.1 77 | 78 | # ----- mixture ------ 79 | mixture_prob: 0.5 80 | inbatch_noise_augment_len_range: "[12000, 24000]" 81 | inbatch_noise_augment_number_range: "[1, 3]" 82 | inbatch_noise_augment_volume: 1.0 83 | # ------------------------ 84 | extractor_mode: default 85 | audio_extract_type: melspec # use melspec (instead of `w2v_conv`) 86 | melspec_n_bins: 120 # for melspec we use 120, means 12 bins per octave 87 | conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2' 88 | 89 | # best-rq loss 90 | audio_rq_loss_m: true 91 | audio_rq_loss_embed_dim: 16 92 | audio_rq_loss_num_codebooks: 1 93 | audio_rq_loss_num_embeds: 8192 94 | audio_rq_loss_seed: 42 95 | audio_rq_loss_use_norm: true 96 | audio_rq_loss_use_chroma: false 97 | audio_rq_loss_seed_chroma: 123 98 | 99 | # ---- cqt reconstruction, need to add loss weight --- 100 | audio_cqt_loss_m: true 101 | audio_cqt_bins: 336 102 | # ----------- 103 | final_dim: 64 104 | encoder_layerdrop: 0.05 105 | dropout_input: 0.1 106 | dropout_features: 0.1 107 | dropout: 0.1 108 | attention_dropout: 0.1 109 | feature_grad_mult: 0.1 110 | untie_final_proj: true 111 | activation_dropout: 0.0 112 | 113 | 114 | hydra: 115 | job: 116 | config: 117 | override_dirname: 118 | kv_sep: '-' 119 | item_sep: '__' 120 | exclude_keys: 121 | - run 122 | - task.data 123 | - task.label_dir 124 | run: 125 | dir: ??? 126 | sweep: 127 | dir: ??? 128 | subdir: ${hydra.job.config_name}__${hydra.job.override_dirname} 129 | -------------------------------------------------------------------------------- /SongGeneration/codeclm/tokenizer/Flow1dVAE/tools/infer_vaehifigan48k_soundmusic.py: -------------------------------------------------------------------------------- 1 | import json 2 | import torch 3 | from tqdm import tqdm 4 | import torchaudio 5 | import librosa 6 | import os 7 | import math 8 | import numpy as np 9 | from .get_melvaehifigan48k import build_pretrained_models 10 | from . import torch_tools as torch_tools 11 | 12 | class Tango: 13 | def __init__(self, \ 14 | device="cuda:0"): 15 | 16 | self.sample_rate = 48000 17 | self.device = device 18 | 19 | self.vae, self.stft = build_pretrained_models() 20 | self.vae, self.stft = self.vae.eval().to(device), self.stft.eval().to(device) 21 | 22 | # print(sum(p.numel() for p in self.vae.parameters()));exit() 23 | 24 | def mel_spectrogram_to_waveform(self, mel_spectrogram): 25 | if mel_spectrogram.dim() == 4: 26 | mel_spectrogram = mel_spectrogram.squeeze(1) 27 | 28 | waveform = self.vocoder(mel_spectrogram) 29 | # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 30 | waveform = waveform.cpu().float() 31 | return waveform 32 | 33 | def sound2sound_generate_longterm(self, fname, batch_size=1, duration=10.24, steps=200, disable_progress=False): 34 | """ Genrate audio without condition. """ 35 | num_frames = math.ceil(duration * 100. 
/ 8) 36 | with torch.no_grad(): 37 | orig_samples, fs = torchaudio.load(fname) 38 | if(orig_samples.shape[-1] 1): 32 | self.conv3 = nn.Conv2d(idim, odim, 3, padding=1, stride=stride) 33 | self.bn3 = nn.BatchNorm2d(odim) 34 | self.diff = True 35 | 36 | def forward(self, x): 37 | out = self.bn2(self.conv2(self.relu(self.bn1(self.conv1(x))))) 38 | if self.diff: 39 | x = self.bn3(self.conv3(x)) 40 | out = x + out 41 | out = self.relu(out) 42 | return out 43 | 44 | 45 | class Conv2dSubsampling(nn.Module): 46 | """Convolutional 2D subsampling (to 1/4 length). 47 | 48 | Args: 49 | idim (int): Input dimension. 50 | hdim (int): Hidden dimension. 51 | odim (int): Output dimension. 52 | strides (list): Sizes of strides. 53 | n_bands (int): Number of frequency bands. 54 | """ 55 | 56 | def __init__(self, idim, hdim, odim, strides=[2, 2], n_bands=64): 57 | """Construct an Conv2dSubsampling object.""" 58 | super(Conv2dSubsampling, self).__init__() 59 | 60 | self.conv = nn.Sequential( 61 | Res2dModule(idim, hdim, (2, strides[0])), 62 | Res2dModule(hdim, hdim, (2, strides[1])), 63 | ) 64 | self.linear = nn.Linear(hdim * n_bands // 2 // 2, odim) 65 | 66 | def forward(self, x): 67 | """Subsample x. 68 | 69 | Args: 70 | x (torch.Tensor): Input tensor (#batch, idim, time). 71 | 72 | Returns: 73 | torch.Tensor: Subsampled tensor (#batch, time', odim), 74 | where time' = time // 4. 75 | """ 76 | 77 | if x.dim() == 3: 78 | x = x.unsqueeze(1) # (b, c, f, t) 79 | x = self.conv(x) 80 | x = rearrange(x, "b c f t -> b t (c f)") 81 | x = self.linear(x) 82 | return x 83 | -------------------------------------------------------------------------------- /SongGeneration/codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/config/pretrain/MERT_RVQ-VAE_CQT_330M_orig.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | common: 3 | fp16: true 4 | log_format: json 5 | log_interval: 100 6 | seed: 1337 7 | # tensorboard_logdir: tblog_proj_name 8 | # wandb_project: wandb_proj_name 9 | 10 | checkpoint: 11 | save_interval_updates: 5000 12 | keep_interval_updates: -1 13 | no_epoch_checkpoints: true 14 | 15 | 16 | distributed_training: 17 | ddp_backend: no_c10d 18 | distributed_backend: 'nccl' 19 | distributed_world_size: 64 20 | nprocs_per_node: 8 21 | find_unused_parameters: true 22 | 23 | task: 24 | _name: mert_pretraining 25 | data: ??? 26 | label_dir: ??? 27 | labels: ??? 28 | label_rate: ${model.label_rate} 29 | sharding_data: 6 30 | load_random_data_shard: false 31 | sample_rate: 24000 32 | # crop to 5s 33 | # max_sample_size: 120000 34 | # crop to 5.12s, refers to 384 token per audio, which can be devided by 8. 
35 | max_sample_size: 122880 36 | min_sample_size: 72000 37 | 38 | pad_audio: false 39 | random_crop: true 40 | # normalize: true # must be consistent with extractor_mode: layer_norm 41 | normalize: false # must be consistent with extractor_mode: default (groupnorm) 42 | 43 | 44 | dataset: 45 | num_workers: 6 46 | max_tokens: 900000 47 | skip_invalid_size_inputs_valid_test: true 48 | validate_interval: 1 49 | validate_interval_updates: 10000 50 | 51 | criterion: 52 | _name: hubert 53 | pred_masked_weight: 1.0 54 | pred_nomask_weight: 0.0 55 | loss_weights: [10, 1] 56 | 57 | optimization: 58 | max_update: 400000 59 | lr: [0.0015] 60 | clip_norm: 1.0 61 | update_freq: [8] 62 | 63 | optimizer: 64 | _name: adam 65 | adam_betas: (0.9,0.98) 66 | adam_eps: 1e-06 67 | weight_decay: 0.01 68 | 69 | lr_scheduler: 70 | _name: polynomial_decay 71 | warmup_updates: 32000 72 | 73 | model: 74 | _name: mert 75 | label_rate: ??? 76 | skip_masked: false 77 | skip_nomask: true 78 | mask_prob: 0.8 79 | mask_length: 5 80 | 81 | logit_temp: 0.1 82 | 83 | 84 | # ----- mixture ------ 85 | mixture_prob: 0.5 86 | inbatch_noise_augment_len_range: "[12000, 36000]" 87 | inbatch_noise_augment_number_range: "[1, 3]" 88 | inbatch_noise_augment_volume: 1.0 89 | # ------------------------ 90 | 91 | # ---- cqt reconstruction, need to add loss weight --- 92 | audio_cqt_loss_m: true 93 | audio_cqt_bins: 336 94 | 95 | final_dim: 128 96 | encoder_layers: 24 97 | encoder_embed_dim: 1024 98 | encoder_ffn_embed_dim: 4096 99 | encoder_attention_heads: 16 100 | # default refers to group norm 101 | extractor_mode: default 102 | # extractor_mode: layer_norm 103 | conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2' 104 | encoder_layerdrop: 0.0 105 | dropout_input: 0.0 106 | dropout_features: 0.0 107 | dropout: 0.0 108 | attention_dropout: 0.0 109 | 110 | layer_norm_first: true 111 | feature_grad_mult: 1.0 112 | 113 | untie_final_proj: true 114 | activation_dropout: 0.0 115 | 116 | deepnorm: false 117 | attention_relax: 32.0 118 | 119 | 120 | 121 | hydra: 122 | job: 123 | config: 124 | override_dirname: 125 | kv_sep: '-' 126 | item_sep: '__' 127 | exclude_keys: 128 | - run 129 | - task.data 130 | - task.label_dir 131 | run: 132 | dir: ??? 133 | sweep: 134 | dir: ??? 
135 | subdir: ${hydra.job.config_name}__${hydra.job.override_dirname} -------------------------------------------------------------------------------- /SongGeneration/codeclm/tokenizer/Flow1dVAE/models_gpt/models/tokenizer/tokenizer1.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | from transformers import LlamaTokenizer 3 | import os 4 | import typing as tp 5 | import torch 6 | import sys, warnings 7 | from .pinyin.pinyin import G2P_PinYin 8 | 9 | 10 | ConditionType = tp.Tuple[torch.Tensor, torch.Tensor] # condition, mask 11 | 12 | def process_line(line): # drop the 2-char list prefix (e.g. "- ") and surrounding quotes from a vocab/structure yaml entry 13 | line = line.strip()[2:] 14 | if(line[0]=='\'' and line[-1]=='\''): 15 | line = line[1:-1] 16 | return line 17 | 18 | class LlamaTokenizerConditioner(nn.Module): 19 | def __init__(self, device: str = 'cpu', max_len = 3000, padding_idx='', tokenizer_type=None, 20 | pretrained="hfl/chinese-llama-2-13b"): #"hfl/chinese-llama-2-13b" 21 | super().__init__() 22 | print(f"text tokenizer from {pretrained}") 23 | self.text_tokenizer = LlamaTokenizer.from_pretrained(pretrained,cache_dir="huggingface_cache") 24 | print(f"tokenizer vocab size: {self.text_tokenizer.vocab_size}") 25 | self.g2p = G2P_PinYin() 26 | add_token_list = [] 27 | with open(os.path.dirname(os.path.abspath(__file__))+'/vocab.yaml', 'r') as f: 28 | for line in f: 29 | if(line): 30 | add_token_list.append(process_line(line)) 31 | type_tokens = [] 32 | with open(os.path.dirname(os.path.abspath(__file__))+'/structure.yaml', 'r') as f: 33 | for line in f: 34 | if(line): 35 | type_tokens.append(process_line(line)) 36 | if add_token_list != []: 37 | self.text_tokenizer.add_tokens(add_token_list, special_tokens=True) 38 | # voc_size = self.text_tokenizer.vocab_size 39 | voc_size = len(self.text_tokenizer.get_vocab()) # vocab_size does not seem to increase after add_tokens, so count get_vocab() instead -- cyy 40 | print(voc_size) 41 | # import pdb; pdb.set_trace() 42 | padding_idx = str(padding_idx) 43 | 44 | self.text_tokenizer.pad_token = padding_idx 45 | self.max_len = max_len 46 | self.padding_idx = padding_idx 47 | 48 | vocab = self.text_tokenizer.get_vocab() 49 | self.type_token_ids = [vocab[i] for i in type_tokens if i in vocab] 50 | struct_tokens = [padding_idx] + [i for i in add_token_list if i[0]=='[' and i[-1]==']'] 51 | self.struct_token_ids = [vocab[i] for i in struct_tokens] 52 | print("type tokens: ",{self.text_tokenizer.convert_ids_to_tokens(i):i for i in self.type_token_ids}, 53 | "\t all structure tokens: ", {self.text_tokenizer.convert_ids_to_tokens(i):i for i in self.struct_token_ids}) 54 | 55 | def tokenize(self, x: tp.List[tp.Optional[str]]) -> tp.Dict[str, torch.Tensor]: 56 | x = [self.g2p(xi) if xi is not None else "" for xi in x] 57 | inputs = self.text_tokenizer(x, return_tensors="pt", padding=True) 58 | # print(x, [self.text_tokenizer.convert_ids_to_tokens(i.tolist()) for i in inputs['input_ids']]) 59 | # import pdb; pdb.set_trace() 60 | if inputs['input_ids'].shape[-1] > self.max_len: 61 | warnings.warn(f"Max len limit ({self.max_len}) exceeded! 
{x}") 62 | 63 | # print(x, inputs['input_ids'].shape) 64 | return inputs 65 | 66 | 67 | if __name__ == "__main__": 68 | tokenizer = LlamaTokenizerConditioner() 69 | out = tokenizer.tokenize(["im ok today, and im happy now", "今天我很开心"]) 70 | print(out) 71 | print(tokenizer.text_tokenizer.decode(out['input_ids'][0][:4])) 72 | print(tokenizer.text_tokenizer.convert_ids_to_tokens(out['input_ids'][0])) -------------------------------------------------------------------------------- /SongGeneration/codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/config/pretrain/MERT_RVQ-VAE_CQT_330M_multinodes_debug1node.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | common: 3 | fp16: false 4 | log_format: json 5 | log_interval: 100 6 | seed: 1337 7 | # amp: true 8 | 9 | # tensorboard_logdir: tblog_proj_name 10 | # wandb_project: wandb_proj_name 11 | 12 | checkpoint: 13 | save_interval_updates: 5000 14 | keep_interval_updates: -1 15 | no_epoch_checkpoints: true 16 | 17 | 18 | distributed_training: 19 | ddp_backend: c10d 20 | distributed_backend: 'nccl' 21 | distributed_world_size: 64 22 | nprocs_per_node: 8 23 | find_unused_parameters: true 24 | # reset-dataloader: true 25 | 26 | task: 27 | _name: mert_pretraining 28 | data: ??? 29 | label_dir: ??? 30 | labels: ??? 31 | label_rate: ${model.label_rate} 32 | sharding_data: -1 # data sharding 33 | load_random_data_shard: false 34 | sample_rate: 24000 35 | # crop to 5s 36 | # max_sample_size: 120000 37 | # crop to 5.12s, i.e. 384 tokens per audio, which can be divided by 8. 38 | max_sample_size: 122880 39 | min_sample_size: 72000 40 | 41 | pad_audio: false 42 | random_crop: true 43 | # normalize: true # must be consistent with extractor_mode: layer_norm 44 | normalize: false # must be consistent with extractor_mode: default (groupnorm) 45 | 46 | 47 | dataset: 48 | num_workers: 6 49 | max_tokens: 900000 50 | skip_invalid_size_inputs_valid_test: true 51 | validate_interval: 1 52 | validate_interval_updates: 10000 53 | 54 | criterion: 55 | _name: hubert 56 | pred_masked_weight: 1.0 57 | pred_nomask_weight: 0.0 58 | loss_weights: [10, 1] 59 | 60 | optimization: 61 | max_update: 1000000 62 | lr: [0.0015] 63 | clip_norm: 1.0 64 | update_freq: [8] 65 | 66 | optimizer: 67 | _name: adam 68 | adam_betas: (0.9,0.98) 69 | adam_eps: 1e-06 70 | weight_decay: 0.01 71 | 72 | lr_scheduler: 73 | _name: polynomial_decay 74 | warmup_updates: 32000 75 | 76 | model: 77 | _name: mert 78 | label_rate: ???
79 | skip_masked: false 80 | skip_nomask: true 81 | mask_prob: 0.8 82 | mask_length: 5 83 | 84 | logit_temp: 0.1 85 | 86 | 87 | # ----- mixture ------ 88 | mixture_prob: 0.5 89 | inbatch_noise_augment_len_range: "[12000, 36000]" 90 | inbatch_noise_augment_number_range: "[1, 3]" 91 | inbatch_noise_augment_volume: 1.0 92 | # ------------------------ 93 | 94 | # ---- cqt reconstruction, need to add loss weight --- 95 | audio_cqt_loss_m: true 96 | audio_cqt_bins: 336 97 | 98 | final_dim: 128 99 | encoder_layers: 24 100 | encoder_embed_dim: 1024 101 | encoder_ffn_embed_dim: 4096 102 | encoder_attention_heads: 16 103 | # default refers to group norm 104 | extractor_mode: default 105 | # extractor_mode: layer_norm 106 | conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2' 107 | encoder_layerdrop: 0.0 108 | dropout_input: 0.0 109 | dropout_features: 0.0 110 | dropout: 0.0 111 | attention_dropout: 0.0 112 | 113 | layer_norm_first: true 114 | feature_grad_mult: 1.0 115 | 116 | untie_final_proj: true 117 | activation_dropout: 0.0 118 | 119 | deepnorm: false 120 | attention_relax: 32.0 121 | 122 | 123 | 124 | hydra: 125 | job: 126 | config: 127 | override_dirname: 128 | kv_sep: '-' 129 | item_sep: '__' 130 | exclude_keys: 131 | - run 132 | - task.data 133 | - task.label_dir 134 | run: 135 | dir: run 136 | sweep: 137 | dir: sweep 138 | subdir: subdir 139 | -------------------------------------------------------------------------------- /SongGeneration/codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/config/pretrain/MERT_RVQ-VAE_CQT_330M.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | common: 3 | fp16: false 4 | log_format: json 5 | log_interval: 100 6 | seed: 1337 7 | 8 | # tensorboard_logdir: tblog_proj_name 9 | # wandb_project: wandb_proj_name 10 | 11 | checkpoint: 12 | save_interval_updates: 5000 13 | keep_interval_updates: -1 14 | no_epoch_checkpoints: true 15 | 16 | 17 | distributed_training: 18 | ddp_backend: no_c10d 19 | distributed_backend: 'nccl' 20 | distributed_world_size: 64 21 | nprocs_per_node: 8 22 | find_unused_parameters: true 23 | # reset-dataloader: true 24 | 25 | task: 26 | _name: mert_pretraining 27 | data: ??? 28 | label_dir: ??? 29 | labels: ??? 30 | label_rate: ${model.label_rate} 31 | sharding_data: -1 # data sharding 32 | load_random_data_shard: false 33 | sample_rate: 24000 34 | # crop to 5s 35 | # max_sample_size: 120000 36 | # crop to 5.12s, i.e. 384 tokens per audio, which can be divided by 8. 37 | max_sample_size: 122880 38 | min_sample_size: 72000 39 | 40 | pad_audio: false 41 | random_crop: true 42 | # normalize: true # must be consistent with extractor_mode: layer_norm 43 | normalize: false # must be consistent with extractor_mode: default (groupnorm) 44 | 45 | 46 | dataset: 47 | num_workers: 6 48 | max_tokens: 900000 49 | skip_invalid_size_inputs_valid_test: true 50 | validate_interval: 1 51 | validate_interval_updates: 10000 52 | 53 | criterion: 54 | _name: hubert 55 | pred_masked_weight: 1.0 56 | pred_nomask_weight: 0.0 57 | loss_weights: [10, 1] 58 | 59 | optimization: 60 | max_update: 1000000 61 | lr: [0.0015] 62 | clip_norm: 1.0 63 | update_freq: [8] 64 | 65 | optimizer: 66 | _name: adam 67 | adam_betas: (0.9,0.98) 68 | adam_eps: 1e-06 69 | weight_decay: 0.01 70 | 71 | lr_scheduler: 72 | _name: polynomial_decay 73 | warmup_updates: 32000 74 | 75 | model: 76 | _name: mert 77 | label_rate: ???
78 | skip_masked: false 79 | skip_nomask: true 80 | mask_prob: 0.8 81 | mask_length: 5 82 | 83 | logit_temp: 0.1 84 | 85 | 86 | # ----- mixture ------ 87 | mixture_prob: 0.5 88 | inbatch_noise_augment_len_range: "[12000, 36000]" 89 | inbatch_noise_augment_number_range: "[1, 3]" 90 | inbatch_noise_augment_volume: 1.0 91 | # ------------------------ 92 | 93 | # ---- cqt reconstruction, need to add loss weight --- 94 | audio_cqt_loss_m: true 95 | audio_cqt_bins: 336 96 | 97 | final_dim: 128 98 | encoder_layers: 24 99 | encoder_embed_dim: 1024 100 | encoder_ffn_embed_dim: 4096 101 | encoder_attention_heads: 16 102 | # default refers to group norm 103 | extractor_mode: default 104 | # extractor_mode: layer_norm 105 | conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2' 106 | encoder_layerdrop: 0.0 107 | dropout_input: 0.0 108 | dropout_features: 0.0 109 | dropout: 0.0 110 | attention_dropout: 0.0 111 | 112 | layer_norm_first: true 113 | feature_grad_mult: 1.0 114 | 115 | untie_final_proj: true 116 | activation_dropout: 0.0 117 | 118 | deepnorm: false 119 | attention_relax: 32.0 120 | 121 | 122 | 123 | hydra: 124 | job: 125 | config: 126 | override_dirname: 127 | kv_sep: '-' 128 | item_sep: '__' 129 | exclude_keys: 130 | - run 131 | - task.data 132 | - task.label_dir 133 | run: 134 | dir: ??? 135 | sweep: 136 | dir: ??? 137 | subdir: ${hydra.job.config_name}__${hydra.job.override_dirname} 138 | --------------------------------------------------------------------------------
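Note on the random-projection quantizer shown at the start of this section (codebook_lookup / forward): the lookup reduces to a fixed random projection, L2 normalization of both the projected features and a fixed random codebook, and a nearest-neighbour search. Below is a minimal, self-contained sketch of that idea, not the repository's API: the class name, input_dim, and the way the buffers are seeded are illustrative assumptions, while codebook_dim=16, codebook_size=8192 and seed=42 echo the audio_rq_loss_* defaults in the pretraining configs above.

import torch
import torch.nn as nn
from einops import rearrange

class RandomCodebookLookup(nn.Module):
    """Illustrative BEST-RQ-style lookup; names and defaults are assumptions."""
    def __init__(self, input_dim=1024, codebook_dim=16, codebook_size=8192, seed=42):
        super().__init__()
        g = torch.Generator().manual_seed(seed)
        # fixed (never trained) random projection and codebook
        self.register_buffer("random_projection", torch.randn(input_dim, codebook_dim, generator=g))
        self.register_buffer("codebook", torch.randn(codebook_size, codebook_dim, generator=g))

    @torch.no_grad()
    def forward(self, x):  # x: [batch, length, input_dim]
        b = x.shape[0]
        x = torch.einsum("bnd,de->bne", x, self.random_projection)
        x = rearrange(x, "b n e -> (b n) e")
        x = nn.functional.normalize(x, dim=1, p=2)
        codebook = nn.functional.normalize(self.codebook, dim=1, p=2)
        distances = torch.cdist(codebook, x)      # [codebook_size, batch*length]
        nearest = torch.argmin(distances, dim=0)  # nearest codebook index per frame
        return rearrange(nearest, "(b n) -> b n", b=b)

codes = RandomCodebookLookup()(torch.randn(2, 75, 1024))
print(codes.shape)  # torch.Size([2, 75])

Because neither buffer receives gradients, the module acts purely as a target generator, which is why the original code wraps forward in @torch.no_grad() and forces eval mode.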
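The MERT_RVQ-VAE_CQT_330M* configs above crop to max_sample_size: 122880 samples at sample_rate: 24000, with the comment that this equals 5.12 s and 384 tokens per clip, divisible by 8. The short arithmetic check below verifies that reading; the 75 Hz token rate is inferred from those two numbers, since label_rate itself is left as ??? in the configs, so treat it as an assumption rather than a documented value.

# Sanity-check the "crop to 5.12s -> 384 tokens, divisible by 8" comment.
sample_rate = 24_000        # task.sample_rate
max_sample_size = 122_880   # task.max_sample_size
token_rate_hz = 75          # assumed label/frame rate implied by the comment

crop_seconds = max_sample_size / sample_rate
tokens_per_crop = crop_seconds * token_rate_hz

print(crop_seconds)              # 5.12
print(tokens_per_crop)           # 384.0
print(tokens_per_crop % 8 == 0)  # True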