├── SongGeneration
├── third_party
│ ├── hub
│ │ └── version.txt
│ ├── demucs
│ │ ├── __init__.py
│ │ ├── models
│ │ │ ├── __init__.py
│ │ │ ├── pretrained.py
│ │ │ └── spec.py
│ │ └── ckpt
│ │ │ └── htdemucs.yaml
│ ├── dac
│ │ ├── compare
│ │ │ ├── __init__.py
│ │ │ └── encodec.py
│ │ ├── nn
│ │ │ ├── __init__.py
│ │ │ └── layers.py
│ │ ├── model
│ │ │ └── __init__.py
│ │ ├── __init__.py
│ │ ├── __main__.py
│ │ └── utils
│ │ │ ├── decode.py
│ │ │ └── encode.py
│ ├── stable_audio_tools
│ │ ├── stable_audio_tools
│ │ │ ├── data
│ │ │ │ └── __init__.py
│ │ │ ├── inference
│ │ │ │ ├── __init__.py
│ │ │ │ └── utils.py
│ │ │ ├── interface
│ │ │ │ └── __init__.py
│ │ │ ├── training
│ │ │ │ ├── losses
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── losses.py
│ │ │ │ └── __init__.py
│ │ │ ├── models
│ │ │ │ ├── __init__.py
│ │ │ │ ├── pretrained.py
│ │ │ │ └── diffusion_prior.py
│ │ │ ├── __init__.py
│ │ │ └── configs
│ │ │ │ ├── dataset_configs
│ │ │ │ │ ├── custom_metadata
│ │ │ │ │ │ └── custom_md_example.py
│ │ │ │ │ ├── s3_wds_example.json
│ │ │ │ │ └── local_training_example.json
│ │ │ │ └── model_configs
│ │ │ │ │ ├── dance_diffusion
│ │ │ │ │ │ ├── dance_diffusion_base.json
│ │ │ │ │ │ ├── dance_diffusion_large.json
│ │ │ │ │ │ ├── dance_diffusion_base_16k.json
│ │ │ │ │ │ └── dance_diffusion_base_44k.json
│ │ │ │ │ └── autoencoders
│ │ │ │ │ │ ├── dac_2048_32_vae.json
│ │ │ │ │ │ └── encodec_musicgen_rvq.json
│ │ ├── pyproject.toml
│ │ ├── scripts
│ │ │ └── ds_zero_to_pl_ckpt.py
│ │ ├── LICENSE
│ │ ├── LICENSES
│ │ │ ├── LICENSE_ADP.txt
│ │ │ ├── LICENSE_XTRANSFORMERS.txt
│ │ │ ├── LICENSE_DESCRIPT.txt
│ │ │ ├── LICENSE_NVIDIA.txt
│ │ │ └── LICENSE_META.txt
│ │ ├── defaults.ini
│ │ ├── setup.py
│ │ ├── run_gradio.py
│ │ └── docs
│ │ │ └── pretransforms.md
│ └── Qwen2-7B
│ │ ├── generation_config.json
│ │ ├── config.json
│ │ └── tokenizer_config.json
├── codeclm
│ ├── tokenizer
│ │ ├── Flow1dVAE
│ │ │ ├── __init__.py
│ │ │ ├── models
│ │ │ │ ├── __init__.py
│ │ │ │ └── test_model.py
│ │ │ ├── tools
│ │ │ │ ├── __init__.py
│ │ │ │ ├── __pycache__
│ │ │ │ │ ├── mix.cpython-311.pyc
│ │ │ │ │ ├── __init__.cpython-311.pyc
│ │ │ │ │ ├── torch_tools.cpython-311.pyc
│ │ │ │ │ └── get_1dvae_large.cpython-311.pyc
│ │ │ │ ├── extract_rvq.py
│ │ │ │ ├── safetensor2torch.py
│ │ │ │ ├── get_1dvae.py
│ │ │ │ ├── get_1dvae_1920.py
│ │ │ │ ├── get_1dvae_large_melvae.py
│ │ │ │ ├── get_1dvae_large.py
│ │ │ │ ├── compare_2models.py
│ │ │ │ ├── get_whisper_encoder.py
│ │ │ │ ├── transmodelnorm.py
│ │ │ │ ├── mix.py
│ │ │ │ ├── check_stereo.py
│ │ │ │ ├── infer_encodec.py
│ │ │ │ ├── infer_encodec_speech.py
│ │ │ │ ├── infer_encodec_vocal.py
│ │ │ │ ├── creat_jsonl.py
│ │ │ │ ├── infer_bsrnnvae441k.py
│ │ │ │ ├── infer_bsrnnvae441k_vocal.py
│ │ │ │ ├── infer_hifigan48k_speech.py
│ │ │ │ ├── infer_hifigan48k_vocal.py
│ │ │ │ ├── infer_vaehifigan48k_speech.py
│ │ │ │ ├── infer_vaehifigan48k.py
│ │ │ │ ├── infer_vaehifigan48k_vocal.py
│ │ │ │ └── infer_vaehifigan48k_soundmusic.py
│ │ │ ├── our_MERT_BESTRQ
│ │ │ │ ├── __init__.py
│ │ │ │ ├── mert_fairseq
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── models
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── musicfm
│ │ │ │ │ │ │ ├── model
│ │ │ │ │ │ │ │ ├── rvq.py
│ │ │ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ │ │ └── w2v2_config.json
│ │ │ │ │ │ │ ├── modules
│ │ │ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ │ │ ├── features.py
│ │ │ │ │ │ │ │ ├── random_quantizer.py
│ │ │ │ │ │ │ │ └── conv.py
│ │ │ │ │ │ │ └── __init__.py
│ │ │ │ │ │ ├── mert
│ │ │ │ │ │ │ ├── README.md
│ │ │ │ │ │ │ └── __init__.py
│ │ │ │ │ │ └── eat
│ │ │ │ │ │ │ └── __init__.py
│ │ │ │ │ ├── config
│ │ │ │ │ │ └── pretrain
│ │ │ │ │ │ │ ├── MERT_RVQ-VAE_CQT_95M_dac.yaml
│ │ │ │ │ │ │ ├── run
│ │ │ │ │ │ │ │ └── submitit_reg.yaml
│ │ │ │ │ │ │ ├── MusicFM_95M_multinodes.yaml
│ │ │ │ │ │ │ ├── MusicFM_95M_speech_multinodes.yaml
│ │ │ │ │ │ │ ├── MusicFM_95M_bestrvq_multinodes.yaml
│ │ │ │ │ │ │ ├── EAT_pretraining_AS2M.yaml
│ │ │ │ │ │ │ ├── MERT_RVQ-VAE_CQT_95M.yaml
│ │ │ │ │ │ │ ├── EAT_pretraining_music_multinodes.yaml
│ │ │ │ │ │ │ ├── MERT_RVQ-VAE_CQT_95M_bestrvq_multinodes.yaml
│ │ │ │ │ │ │ ├── MERT_RVQ-VAE_CQT_95M_dac_multinodes.yaml
│ │ │ │ │ │ │ ├── MERT_RVQ-VAE_CQT_95M_mel_multinodes.yaml
│ │ │ │ │ │ │ ├── MERT_RVQ-VAE_CQT_95M_bestrq.yaml
│ │ │ │ │ │ │ ├── MERT_RVQ-VAE_CQT_95M_groupbestrq_multinodes.yaml
│ │ │ │ │ │ │ ├── MERT_RVQ-VAE_CQT_95M_bestrq_multinodes.yaml
│ │ │ │ │ │ │ ├── MERT_RVQ-VAE_CQT_95M_bestrq_chroma_multinodes.yaml
│ │ │ │ │ │ │ ├── MERT_RVQ-VAE_CQT_95M_bestrq_norm_multinodes.yaml
│ │ │ │ │ │ │ ├── MERT_RVQ-VAE_CQT_95M_bestrq_norm_speech_multinodes.yaml
│ │ │ │ │ │ │ ├── MERT_RVQ-VAE_CQT_330M_orig.yaml
│ │ │ │ │ │ │ ├── MERT_RVQ-VAE_CQT_330M_multinodes_debug1node.yaml
│ │ │ │ │ │ │ └── MERT_RVQ-VAE_CQT_330M.yaml
│ │ │ │ │ └── data
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ └── eat_data
│ │ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ │ └── add_class_target_dataset.py
│ │ │ │ ├── modify_env.md
│ │ │ │ ├── test.py
│ │ │ │ ├── run_training_eat.sh
│ │ │ │ └── run_training_orig.sh
│ │ │ ├── models_gpt
│ │ │ │ └── models
│ │ │ │ │ └── tokenizer
│ │ │ │ │ ├── structure.yaml
│ │ │ │ │ ├── pinyin
│ │ │ │ │ │ └── symbols.py
│ │ │ │ │ └── tokenizer1.py
│ │ │ ├── compare_model_weight.py
│ │ │ ├── configs
│ │ │ │ ├── scheduler
│ │ │ │ │ └── stable_diffusion_2.1_largenoise_sample.json
│ │ │ │ └── models
│ │ │ │ │ └── transformer2D_wocross_inch112_1x4_multi_large.json
│ │ │ ├── cal_token_stat.py
│ │ │ ├── extract_codes_stereo_7_1x4.py
│ │ │ ├── extract_codes_stereo_7_1x2.py
│ │ │ └── extract_codes_stereo_7_1x4_ds.py
│ │ └── __init__.py
│ ├── models
│ │ ├── __init__.py
│ │ └── llama
│ │ │ └── __init__.py
│ └── utils
│ │ └── autocast.py
├── sample
│ ├── description
│ │ ├── gender.txt
│ │ ├── timbre.txt
│ │ ├── emotion.txt
│ │ ├── genre.txt
│ │ └── instrument.txt
│ ├── sample_prompt_audio.wav
│ └── lyrics.jsonl
├── img
│ ├── logo.jpg
│ └── over.jpg
└── conf
│ ├── vocab.yaml
│ └── w2v2_config.json
├── example_workflows
└── SongGeneration.png
├── __init__.py
└── README.md

/SongGeneration/third_party/hub/version.txt:
--------------------------------------------------------------------------------
1
--------------------------------------------------------------------------------
/SongGeneration/third_party/demucs/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/SongGeneration/third_party/demucs/models/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/SongGeneration/codeclm/tokenizer/Flow1dVAE/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/SongGeneration/third_party/dac/compare/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/SongGeneration/codeclm/tokenizer/Flow1dVAE/models/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/SongGeneration/codeclm/tokenizer/Flow1dVAE/tools/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/SongGeneration/codeclm/tokenizer/__init__.py:
--------------------------------------------------------------------------------
# no need for training
--------------------------------------------------------------------------------
/SongGeneration/sample/description/gender.txt:
--------------------------------------------------------------------------------
female
male
--------------------------------------------------------------------------------
/SongGeneration/codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/SongGeneration/third_party/demucs/ckpt/htdemucs.yaml:
--------------------------------------------------------------------------------
models: ['htdemucs']
--------------------------------------------------------------------------------
/SongGeneration/codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/SongGeneration/third_party/stable_audio_tools/stable_audio_tools/data/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/SongGeneration/third_party/stable_audio_tools/stable_audio_tools/inference/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/SongGeneration/third_party/stable_audio_tools/stable_audio_tools/interface/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/SongGeneration/codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/models/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/SongGeneration/codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/models/musicfm/model/rvq.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/SongGeneration/sample/description/timbre.txt:
--------------------------------------------------------------------------------
dark
bright
warm
rock
varies
soft
vocal
--------------------------------------------------------------------------------
/SongGeneration/third_party/stable_audio_tools/stable_audio_tools/training/losses/__init__.py:
--------------------------------------------------------------------------------
from .losses import *
--------------------------------------------------------------------------------
/SongGeneration/codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/config/pretrain/MERT_RVQ-VAE_CQT_95M_dac.yaml:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/SongGeneration/codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/models/musicfm/model/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/SongGeneration/codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/models/musicfm/modules/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/SongGeneration/img/logo.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smthemex/ComfyUI_SongGeneration/HEAD/SongGeneration/img/logo.jpg
--------------------------------------------------------------------------------
/SongGeneration/img/over.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smthemex/ComfyUI_SongGeneration/HEAD/SongGeneration/img/over.jpg
--------------------------------------------------------------------------------
/SongGeneration/third_party/dac/nn/__init__.py:
--------------------------------------------------------------------------------
from . import layers
from . import loss
from . import quantize
--------------------------------------------------------------------------------
/SongGeneration/codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/models/musicfm/__init__.py:
--------------------------------------------------------------------------------
from .musicfm_model import *
--------------------------------------------------------------------------------
/SongGeneration/sample/description/emotion.txt:
--------------------------------------------------------------------------------
sad
emotional
angry
happy
uplifting
intense
romantic
melancholic
--------------------------------------------------------------------------------
/example_workflows/SongGeneration.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smthemex/ComfyUI_SongGeneration/HEAD/example_workflows/SongGeneration.png
--------------------------------------------------------------------------------
/SongGeneration/third_party/stable_audio_tools/pyproject.toml:
--------------------------------------------------------------------------------
[build-system]
requires = ["setuptools"]
build-backend = "setuptools.build_meta"
--------------------------------------------------------------------------------
/SongGeneration/codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/data/__init__.py:
--------------------------------------------------------------------------------
from .mert_dataset import MERTDataset
from .eat_data import *
--------------------------------------------------------------------------------
/SongGeneration/sample/sample_prompt_audio.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smthemex/ComfyUI_SongGeneration/HEAD/SongGeneration/sample/sample_prompt_audio.wav
--------------------------------------------------------------------------------
/SongGeneration/third_party/stable_audio_tools/stable_audio_tools/models/__init__.py:
--------------------------------------------------------------------------------
from .factory import create_model_from_config, create_model_from_config_path
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------

from .SongGeneration_node import NODE_CLASS_MAPPINGS, NODE_DISPLAY_NAME_MAPPINGS

__all__ = ['NODE_CLASS_MAPPINGS', 'NODE_DISPLAY_NAME_MAPPINGS']
--------------------------------------------------------------------------------
/SongGeneration/third_party/dac/model/__init__.py:
--------------------------------------------------------------------------------
from .base import CodecMixin
from .base import DACFile
from .dac import DAC
from .discriminator import Discriminator
--------------------------------------------------------------------------------
/SongGeneration/third_party/stable_audio_tools/stable_audio_tools/training/__init__.py:
--------------------------------------------------------------------------------
from .factory import create_training_wrapper_from_config, create_demo_callback_from_config
--------------------------------------------------------------------------------
/SongGeneration/third_party/stable_audio_tools/stable_audio_tools/__init__.py:
--------------------------------------------------------------------------------
from .models.factory import create_model_from_config, create_model_from_config_path
from .models.pretrained import get_pretrained_model
--------------------------------------------------------------------------------
/SongGeneration/third_party/Qwen2-7B/generation_config.json:
--------------------------------------------------------------------------------
{
  "bos_token_id": 151643,
  "do_sample": false,
  "eos_token_id": 151643,
  "max_new_tokens": 2048,
  "transformers_version": "4.37.0"
}
--------------------------------------------------------------------------------
/SongGeneration/codeclm/tokenizer/Flow1dVAE/tools/__pycache__/mix.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smthemex/ComfyUI_SongGeneration/HEAD/SongGeneration/codeclm/tokenizer/Flow1dVAE/tools/__pycache__/mix.cpython-311.pyc
--------------------------------------------------------------------------------
/SongGeneration/codeclm/tokenizer/Flow1dVAE/tools/__pycache__/__init__.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smthemex/ComfyUI_SongGeneration/HEAD/SongGeneration/codeclm/tokenizer/Flow1dVAE/tools/__pycache__/__init__.cpython-311.pyc
--------------------------------------------------------------------------------
/SongGeneration/codeclm/tokenizer/Flow1dVAE/tools/__pycache__/torch_tools.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smthemex/ComfyUI_SongGeneration/HEAD/SongGeneration/codeclm/tokenizer/Flow1dVAE/tools/__pycache__/torch_tools.cpython-311.pyc
--------------------------------------------------------------------------------
/SongGeneration/codeclm/tokenizer/Flow1dVAE/tools/__pycache__/get_1dvae_large.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smthemex/ComfyUI_SongGeneration/HEAD/SongGeneration/codeclm/tokenizer/Flow1dVAE/tools/__pycache__/get_1dvae_large.cpython-311.pyc
--------------------------------------------------------------------------------
/SongGeneration/third_party/stable_audio_tools/stable_audio_tools/configs/dataset_configs/custom_metadata/custom_md_example.py:
--------------------------------------------------------------------------------
def get_custom_metadata(info, audio):

    # Use relative path as the prompt
    return {"prompt": info["relpath"]}
--------------------------------------------------------------------------------
/SongGeneration/codeclm/tokenizer/Flow1dVAE/models_gpt/models/tokenizer/structure.yaml:
--------------------------------------------------------------------------------
- '[start]'
- '[verse]'
- '[chorus]'
- '[outro]'
- '[end]'
- '[intro]'
- '[solo]'
- '[inst]'
- '[bridge]'
- '[break]'
--------------------------------------------------------------------------------
/SongGeneration/codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/models/mert/README.md:
--------------------------------------------------------------------------------
add cauchy extension from https://github.com/HazyResearch/state-spaces
```shell
cd state-spaces/extensions/cauchy
python setup.py install
```
--------------------------------------------------------------------------------
/SongGeneration/codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/models/eat/__init__.py:
--------------------------------------------------------------------------------
# try:
#from .EAT_pretraining import *
# except:
#     import sys, os
#     sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), '.'))
#     from EAT_pretraining import *
--------------------------------------------------------------------------------
/SongGeneration/conf/vocab.yaml:
--------------------------------------------------------------------------------
- '[verse]'
- '[chorus]'
- '[bridge]'
- '[intro-short]'
- '[intro-medium]'
- '[intro-long]'
- '[outro-short]'
- '[outro-medium]'
- '[outro-long]'
- '[inst-short]'
- '[inst-medium]'
- '[inst-long]'
- '[silence]'
--------------------------------------------------------------------------------
/SongGeneration/codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/models/mert/__init__.py:
--------------------------------------------------------------------------------
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from .mert_model import *  # noqa
--------------------------------------------------------------------------------
/SongGeneration/third_party/stable_audio_tools/stable_audio_tools/configs/dataset_configs/s3_wds_example.json:
--------------------------------------------------------------------------------
{
    "dataset_type": "s3",
    "datasets": [
        {
            "id": "s3-test",
            "s3_path": "s3://my-bucket/datasets/webdataset/audio/"
        }
    ],
    "random_crop": true
}
--------------------------------------------------------------------------------
/SongGeneration/third_party/stable_audio_tools/stable_audio_tools/configs/dataset_configs/local_training_example.json:
--------------------------------------------------------------------------------
{
    "dataset_type": "audio_dir",
    "datasets": [
        {
            "id": "my_audio",
            "path": "train.jsonl",
            "custom_metadata_module": "custom_md_example.py"
        }
    ],
    "random_crop": true
}
--------------------------------------------------------------------------------
/SongGeneration/codeclm/tokenizer/Flow1dVAE/compare_model_weight.py:
--------------------------------------------------------------------------------
import torch
import sys
from safetensors.torch import load_file

if __name__ == "__main__":
    m0, m1 = sys.argv[1], sys.argv[2]
    m0 = load_file(m0)
    m1 = load_file(m1)

    ks = [k for k in m0.keys() if 'bestrq' in k]
    for k in ks:
        print(k, (m0[k] - m1[k]).abs().sum())
--------------------------------------------------------------------------------
/SongGeneration/third_party/dac/__init__.py:
--------------------------------------------------------------------------------
__version__ = "1.0.0"

# preserved here for legacy reasons
__model_version__ = "latest"

import audiotools

audiotools.ml.BaseModel.INTERN += ["dac.**"]
audiotools.ml.BaseModel.EXTERN += ["einops"]


from . import nn
from . import model
from . import utils
from .model import DAC
from .model import DACFile
--------------------------------------------------------------------------------
/SongGeneration/codeclm/tokenizer/Flow1dVAE/tools/extract_rvq.py:
--------------------------------------------------------------------------------
import torch
import sys

if __name__=="__main__":
    p = sys.argv[1]
    bd = '/'.join(p.split('/')[:-1])
    bn = p.split('/')[-1]

    d = {}
    m = torch.load(p, map_location='cpu')
    for k in m.keys():
        if('rvq' in k):
            d[k] = m[k]

    torch.save(d, '{}/rvq.bin'.format(bd))
--------------------------------------------------------------------------------
/SongGeneration/codeclm/models/__init__.py:
--------------------------------------------------------------------------------
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
"""
Models for EnCodec, AudioGen, MusicGen, as well as the generic LMModel.
"""
# flake8: noqa
from . import builders
from .codeclm import CodecLM
--------------------------------------------------------------------------------
/SongGeneration/sample/description/genre.txt:
--------------------------------------------------------------------------------
pop
electronic
hip hop
rock
jazz
blues
classical
rap
country
classic rock
hard rock
folk
soul
dance, electronic
rockabilly
dance, dancepop, house, pop
reggae
experimental
dance, pop
dance, deephouse, electronic
k-pop
experimental pop
pop punk
rock and roll
R&B
varies
pop rock
--------------------------------------------------------------------------------
/SongGeneration/codeclm/tokenizer/Flow1dVAE/tools/safetensor2torch.py:
--------------------------------------------------------------------------------
import sys
from safetensors import safe_open
import torch

if __name__=="__main__":
    inname = sys.argv[1]
    outname = sys.argv[2]

    main_weights = {}
    with safe_open(inname, framework="pt", device="cpu") as f:
        for key in f.keys():
            main_weights[key] = f.get_tensor(key)

    torch.save(main_weights, outname)
--------------------------------------------------------------------------------
/SongGeneration/codeclm/tokenizer/Flow1dVAE/configs/scheduler/stable_diffusion_2.1_largenoise_sample.json:
--------------------------------------------------------------------------------
{
  "_class_name": "DDIMScheduler",
  "_diffusers_version": "0.8.0",
  "beta_end": 0.02,
  "beta_schedule": "scaled_linear",
  "beta_start": 0.0015,
  "clip_sample": false,
  "num_train_timesteps": 1000,
  "prediction_type": "sample",
  "set_alpha_to_one": false,
  "skip_prk_steps": true,
  "steps_offset": 1,
  "trained_betas": null
}
--------------------------------------------------------------------------------
/SongGeneration/third_party/stable_audio_tools/stable_audio_tools/configs/model_configs/dance_diffusion/dance_diffusion_base.json:
--------------------------------------------------------------------------------
{
    "model_type": "diffusion_uncond",
    "sample_size": 65536,
    "sample_rate": 48000,
    "model": {
        "type": "DAU1d",
        "config": {
            "n_attn_layers": 5
        }
    },
    "training": {
        "learning_rate": 1e-4,
        "demo": {
            "demo_every": 2000,
            "demo_steps": 250
        }
    }
}
--------------------------------------------------------------------------------
/SongGeneration/third_party/stable_audio_tools/stable_audio_tools/configs/model_configs/dance_diffusion/dance_diffusion_large.json:
--------------------------------------------------------------------------------
{
    "model_type": "diffusion_uncond",
    "sample_size": 131072,
    "sample_rate": 48000,
    "model": {
        "type": "DAU1d",
        "config": {
            "n_attn_layers": 5
        }
    },
    "training": {
        "learning_rate": 1e-4,
        "demo": {
            "demo_every": 2000,
            "demo_steps": 250
        }
    }
}
--------------------------------------------------------------------------------
/SongGeneration/codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/modify_env.md:
--------------------------------------------------------------------------------
cp -r fairseq/fairseq/model_parallel/megatron /opt/conda/envs/map/lib/python3.8/site-packages/fairseq/model_parallel/
vi /opt/conda/envs/map/lib/python3.8/site-packages/apex/amp/_initialize.py # string_classes = str
vi /opt/conda/envs/map/lib/python3.8/site-packages/fairseq/modules/layer_norm.py
vi /opt/conda/envs/map/lib/python3.8/site-packages/fairseq/distributed/utils.py # import datetime; timeout=datetime.timedelta(seconds=51200); logger.info("add nccl time to 51200")
--------------------------------------------------------------------------------
/SongGeneration/third_party/stable_audio_tools/stable_audio_tools/configs/model_configs/dance_diffusion/dance_diffusion_base_16k.json:
--------------------------------------------------------------------------------
{
    "model_type": "diffusion_uncond",
    "sample_size": 65536,
    "sample_rate": 16000,
    "model": {
        "type": "DAU1d",
        "config": {
            "n_attn_layers": 5
        }
    },
    "training": {
        "learning_rate": 1e-4,
        "demo": {
            "demo_every": 2000,
            "demo_steps": 250
        }
    }
}
--------------------------------------------------------------------------------
/SongGeneration/third_party/stable_audio_tools/stable_audio_tools/configs/model_configs/dance_diffusion/dance_diffusion_base_44k.json:
--------------------------------------------------------------------------------
{
    "model_type": "diffusion_uncond",
    "sample_size": 65536,
    "sample_rate": 44100,
    "model": {
        "type": "DAU1d",
        "config": {
            "n_attn_layers": 5
        }
    },
    "training": {
        "learning_rate": 4e-5,
        "demo": {
            "demo_every": 2000,
            "demo_steps": 250
        }
    }
}
--------------------------------------------------------------------------------
/SongGeneration/codeclm/tokenizer/Flow1dVAE/tools/get_1dvae.py:
--------------------------------------------------------------------------------
import torch
from tqdm import tqdm
import torchaudio
from .....third_party.stable_audio_tools.stable_audio_tools.models.autoencoders import create_autoencoder_from_config
import numpy as np
import os
import json

def get_model(model_config, path):
    with open(model_config) as f:
        model_config = json.load(f)
    state_dict = torch.load(path)
    model = create_autoencoder_from_config(model_config)
    model.load_state_dict(state_dict['state_dict'])
    return model
--------------------------------------------------------------------------------
/SongGeneration/codeclm/tokenizer/Flow1dVAE/tools/get_1dvae_1920.py:
--------------------------------------------------------------------------------
import torch
from tqdm import tqdm
import torchaudio
from .....third_party.stable_audio_tools.stable_audio_tools.models.autoencoders import create_autoencoder_from_config
import numpy as np
import os
import json

def get_model(model_config, path):
    with open(model_config) as f:
        model_config = json.load(f)
    state_dict = torch.load(path)
    model = create_autoencoder_from_config(model_config)
    model.load_state_dict(state_dict['state_dict'])
    return model
--------------------------------------------------------------------------------
/SongGeneration/codeclm/tokenizer/Flow1dVAE/tools/get_1dvae_large_melvae.py:
--------------------------------------------------------------------------------
import torch
from tqdm import tqdm
import torchaudio
from .....third_party.stable_audio_tools.stable_audio_tools.models.autoencoders import create_autoencoder_from_config
import numpy as np
import os
import json

def get_model(model_config, path):
    with open(model_config) as f:
        model_config = json.load(f)
    state_dict = torch.load(path, map_location='cpu')
    model = create_autoencoder_from_config(model_config)
    model.load_state_dict(state_dict['state_dict'], strict=False)
    return model
--------------------------------------------------------------------------------
/SongGeneration/codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/config/pretrain/run/submitit_reg.yaml:
--------------------------------------------------------------------------------
# @package _global_

hydra:
  launcher:
    cpus_per_task: 8
    gpus_per_node: 8
    tasks_per_node: ${hydra.launcher.gpus_per_node}
    nodes: 4
    comment: null
    mem_gb: 384
    timeout_min: 4320
    max_num_timeout: 100
    constraint: volta32gb
    name: ${hydra.job.config_name}/${hydra.job.override_dirname}
    submitit_folder: ${hydra.sweep.dir}/submitit/%j

distributed_training:
  distributed_world_size: 32
  distributed_port: 29671
  nprocs_per_node: 8
--------------------------------------------------------------------------------
/SongGeneration/codeclm/tokenizer/Flow1dVAE/tools/get_1dvae_large.py:
--------------------------------------------------------------------------------
import torch
from tqdm import tqdm
import torchaudio
from .....third_party.stable_audio_tools.stable_audio_tools.models.autoencoders import create_autoencoder_from_config
import numpy as np
import os
import json

def get_model(model_config, path):
    with open(model_config) as f:
        model_config = json.load(f)
    state_dict = torch.load(path, map_location='cpu')
    model = create_autoencoder_from_config(model_config)
    model.load_state_dict(state_dict['state_dict'], strict=False)
    del state_dict
    return model
--------------------------------------------------------------------------------
/SongGeneration/codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/data/eat_data/__init__.py:
--------------------------------------------------------------------------------
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
try:
    from .mae_image_dataset import MaeImageDataset
    from .raw_audio_dataset import FileAudioDataset
except:
    import sys, os
    sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), '.'))
    from mae_image_dataset import MaeImageDataset
    from raw_audio_dataset import FileAudioDataset

__all__ = [
    "MaeImageDataset",
    "FileAudioDataset",
]
--------------------------------------------------------------------------------
/SongGeneration/third_party/stable_audio_tools/scripts/ds_zero_to_pl_ckpt.py:
--------------------------------------------------------------------------------
import argparse
# from lightning.pytorch.utilities.deepspeed import convert_zero_checkpoint_to_fp32_state_dict

if __name__ == "__main__":

    parser = argparse.ArgumentParser()
    parser.add_argument("--save_path", type=str, help="Path to the zero checkpoint")
    parser.add_argument("--output_path", type=str, help="Path to the output checkpoint", default="lightning_model.pt")
    args = parser.parse_args()

    # lightning deepspeed has saved a directory instead of a file
    save_path = args.save_path
    output_path = args.output_path
    convert_zero_checkpoint_to_fp32_state_dict(save_path, output_path)
--------------------------------------------------------------------------------
/SongGeneration/codeclm/tokenizer/Flow1dVAE/tools/compare_2models.py:
--------------------------------------------------------------------------------
import torch
import sys

if __name__=="__main__":
    m1, m2 = sys.argv[1:3]
    m1 = torch.load(m1, map_location = 'cpu')
    m2 = torch.load(m2, map_location = 'cpu')
    m1_keys = set(m1.keys())
    m2_keys = set(m2.keys())

    m1_uniq_keys = m1_keys - m2_keys
    m2_uniq_keys = m2_keys - m1_keys
    m12_shared_keys = m1_keys & m2_keys

    print("m1_uniq_keys: ", m1_uniq_keys)
    print("m2_uniq_keys: ", m2_uniq_keys)
    print("m12_shared_keys but different: ")
    for k in m12_shared_keys:
        if(m1[k].numel() != m2[k].numel()):
            print(k,m1[k].shape,m2[k].shape)
--------------------------------------------------------------------------------
/SongGeneration/codeclm/tokenizer/Flow1dVAE/cal_token_stat.py:
--------------------------------------------------------------------------------
import kaldiio
from tqdm import tqdm
import torch

if __name__ == "__main__":
    bar = torch.zeros(1, 16384)
    with open('token.scp', 'r') as f:
        for item_idx, line in tqdm(enumerate(f)):
            idx, pos = line.strip().split()
            codes = kaldiio.load_mat(pos)
            for i0 in range(codes.shape[-1]):
                bar[0, codes[0, 0, i0]] += 1
            if(item_idx % 1000 == 0):
                print("=========")
                print(1 - (bar[0]==0).sum() / bar.shape[-1])
                print("=========")
    print("=========")
    print(1 - (bar[0]==0).sum() / bar.shape[-1])
    print("=========")
--------------------------------------------------------------------------------
/SongGeneration/third_party/Qwen2-7B/config.json:
--------------------------------------------------------------------------------
{
  "architectures": [
    "Qwen2ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "eos_token_id": 151643,
  "hidden_act": "silu",
  "hidden_size": 3584,
  "initializer_range": 0.02,
  "intermediate_size": 18944,
  "max_position_embeddings": 131072,
  "max_window_layers": 28,
  "model_type": "qwen2",
  "num_attention_heads": 28,
| "num_hidden_layers": 28, 17 | "num_key_value_heads": 4, 18 | "rms_norm_eps": 1e-06, 19 | "rope_theta": 1000000.0, 20 | "sliding_window": 131072, 21 | "tie_word_embeddings": false, 22 | "torch_dtype": "bfloat16", 23 | "transformers_version": "4.37.2", 24 | "use_cache": true, 25 | "use_sliding_window": false, 26 | "vocab_size": 152064 27 | } 28 | -------------------------------------------------------------------------------- /SongGeneration/third_party/dac/__main__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import argbind 4 | 5 | from dac.utils import download 6 | from dac.utils.decode import decode 7 | from dac.utils.encode import encode 8 | 9 | STAGES = ["encode", "decode", "download"] 10 | 11 | 12 | def run(stage: str): 13 | """Run stages. 14 | 15 | Parameters 16 | ---------- 17 | stage : str 18 | Stage to run 19 | """ 20 | if stage not in STAGES: 21 | raise ValueError(f"Unknown command: {stage}. Allowed commands are {STAGES}") 22 | stage_fn = globals()[stage] 23 | 24 | if stage == "download": 25 | stage_fn() 26 | return 27 | 28 | stage_fn() 29 | 30 | 31 | if __name__ == "__main__": 32 | group = sys.argv.pop(1) 33 | args = argbind.parse_args(group=group) 34 | 35 | with argbind.scope(args): 36 | run(group) 37 | -------------------------------------------------------------------------------- /SongGeneration/codeclm/tokenizer/Flow1dVAE/configs/models/transformer2D_wocross_inch112_1x4_multi_large.json: -------------------------------------------------------------------------------- 1 | { 2 | "_class_name": "Transformer2DModel", 3 | "_diffusers_version": "0.22.0.dev0", 4 | "activation_fn": "gelu-approximate", 5 | "attention_bias": true, 6 | "attention_head_dim": 72, 7 | "attention_type": "default", 8 | "cross_attention_dim": null, 9 | "double_self_attention": false, 10 | "dropout": 0.0, 11 | "in_channels": 96, 12 | "norm_elementwise_affine": false, 13 | "norm_eps": 1e-06, 14 | "norm_num_groups": 32, 15 | "norm_type": "ada_norm_single", 16 | "num_attention_heads": 22, 17 | "num_embeds_ada_norm": 1000, 18 | "num_layers": 24, 19 | "num_vector_embeds": null, 20 | "only_cross_attention": false, 21 | "out_channels": 32, 22 | "patch_size": 2, 23 | "sample_size": 384, 24 | "upcast_attention": false, 25 | "use_linear_projection": false 26 | } -------------------------------------------------------------------------------- /SongGeneration/codeclm/tokenizer/Flow1dVAE/tools/get_whisper_encoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import WhisperProcessor, WhisperForConditionalGeneration 3 | 4 | def get_whisper_encoder(): 5 | processor = WhisperProcessor.from_pretrained("openai/whisper-large-v3") 6 | model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v3").model.encoder 7 | return processor, model.eval() 8 | 9 | if __name__=="__main__": 10 | import numpy as np 11 | processor, model = get_whisper_encoder() 12 | model = model.cuda() 13 | 14 | with torch.no_grad(): 15 | input_features = processor(np.random.rand(16000*30,), sampling_rate=16000, return_tensors="pt").input_features.cuda() 16 | print(input_features.shape) 17 | out = model(input_features.repeat(10,1,1)) 18 | import pdb;pdb.set_trace() 19 | print(list(out.values())[0].shape) 20 | -------------------------------------------------------------------------------- /SongGeneration/sample/description/instrument.txt: 
--------------------------------------------------------------------------------
synthesizer and piano
piano and drums
piano and synthesizer
synthesizer and drums
piano and strings
guitar and drums
guitar and piano
piano and double bass
piano and guitar
acoustic guitar and piano
acoustic guitar and synthesizer
synthesizer and guitar
piano and saxophone
saxophone and piano
piano and violin
electric guitar and drums
acoustic guitar and drums
synthesizer
guitar and fiddle
guitar and harmonica
synthesizer and acoustic guitar
beats
piano
acoustic guitar and fiddle
brass and piano
bass and drums
violin
acoustic guitar and harmonica
piano and cello
saxophone and trumpet
guitar and banjo
guitar and synthesizer
saxophone
violin and piano
synthesizer and bass
synthesizer and electric guitar
electric guitar and piano
beats and piano
synthesizer and
guitar
--------------------------------------------------------------------------------
/SongGeneration/third_party/dac/nn/layers.py:
--------------------------------------------------------------------------------
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange
from torch.nn.utils import weight_norm


def WNConv1d(*args, **kwargs):
    return weight_norm(nn.Conv1d(*args, **kwargs))


def WNConvTranspose1d(*args, **kwargs):
    return weight_norm(nn.ConvTranspose1d(*args, **kwargs))


# Scripting this brings model speed up 1.4x
@torch.jit.script
def snake(x, alpha):
    shape = x.shape
    x = x.reshape(shape[0], shape[1], -1)
    x = x + (alpha + 1e-9).reciprocal() * torch.sin(alpha * x).pow(2)
    x = x.reshape(shape)
    return x


class Snake1d(nn.Module):
    def __init__(self, channels):
        super().__init__()
        self.alpha = nn.Parameter(torch.ones(1, channels, 1))

    def forward(self, x):
        return snake(x, self.alpha)
--------------------------------------------------------------------------------
/SongGeneration/third_party/stable_audio_tools/stable_audio_tools/models/pretrained.py:
--------------------------------------------------------------------------------
import json

from .factory import create_model_from_config
from .utils import load_ckpt_state_dict

from huggingface_hub import hf_hub_download

def get_pretrained_model(name: str):

    model_config_path = hf_hub_download(name, filename="model_config.json", repo_type='model')

    with open(model_config_path) as f:
        model_config = json.load(f)

    model = create_model_from_config(model_config)

    # Try to download the model.safetensors file first, if it doesn't exist, download the model.ckpt file
    try:
        model_ckpt_path = hf_hub_download(name, filename="model.safetensors", repo_type='model')
    except Exception as e:
        model_ckpt_path = hf_hub_download(name, filename="model.ckpt", repo_type='model')

    model.load_state_dict(load_ckpt_state_dict(model_ckpt_path))

    return model, model_config
--------------------------------------------------------------------------------
/SongGeneration/codeclm/tokenizer/Flow1dVAE/tools/transmodelnorm.py:
--------------------------------------------------------------------------------
import torch

if __name__=="__main__":
    src_ckpt = 'saved/train_mulan_v3_48k_everything3/latest/pytorch_model_2.bin'
    tgt_ckpt = 'saved/train_mulan_v3_48k_everything3_sepnorm/src_pytorch_model_2.bin'
    # src_ckpt = 'saved/train_enhcodec2D_again/latest/pytorch_model_3.bin'
    # tgt_ckpt = 'saved/train_enhcodec2D_again_sepnorm/pytorch_model_3.bin'

    ckpt = torch.load(src_ckpt, map_location='cpu')

    ckpt['normfeat.sum_x'] = torch.ones(16, 32, dtype=ckpt['normfeat.sum_x'].dtype) * ckpt['normfeat.sum_x'] / ckpt['normfeat.counts']
    ckpt['normfeat.sum_x2'] = torch.ones(16, 32, dtype=ckpt['normfeat.sum_x2'].dtype) * ckpt['normfeat.sum_x2'] / ckpt['normfeat.counts']
    ckpt['normfeat.sum_target_x2'] = torch.ones(16, 32, dtype=ckpt['normfeat.sum_target_x2'].dtype) * ckpt['normfeat.sum_target_x2'] / ckpt['normfeat.counts']
    ckpt['normfeat.counts'] = torch.ones_like(ckpt['normfeat.counts'])
    torch.save(ckpt, tgt_ckpt)
--------------------------------------------------------------------------------
/SongGeneration/codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/test.py:
--------------------------------------------------------------------------------
import torch
from dataclasses import dataclass
from logging import getLogger
import torch.nn.functional as F
import fairseq.utils
from fairseq.checkpoint_utils import load_model_ensemble_and_task
import folder_paths
import os
logger = getLogger(__name__)

@dataclass
class UserDirModule:
    user_dir: str

def load_model(model_dir, checkpoint_dir):
    '''Load Fairseq SSL model'''

    # import the code module that contains the model
    model_dir=os.path.join(folder_paths.base_path,"custom_nodes/ComfyUI_SongGeneration/SongGeneration",model_dir)
    model_path = UserDirModule(model_dir)

    checkpoint_dir=os.path.join(folder_paths.models_dir,"SongGeneration/ckpt/encode-s12k.pt")
    fairseq.utils.import_user_module(model_path)
    #print(checkpoint_dir,model_dir)
    # load the model checkpoint
    model, cfg, task = load_model_ensemble_and_task([checkpoint_dir], strict=False)
    model = model[0]

    return model
--------------------------------------------------------------------------------
/SongGeneration/third_party/stable_audio_tools/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2023 Stability AI

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/SongGeneration/third_party/stable_audio_tools/stable_audio_tools/inference/utils.py:
--------------------------------------------------------------------------------
from ..data.utils import PadCrop

from torchaudio import transforms as T

def set_audio_channels(audio, target_channels):
    if target_channels == 1:
        # Convert to mono
        audio = audio.mean(1, keepdim=True)
    elif target_channels == 2:
        # Convert to stereo
        if audio.shape[1] == 1:
            audio = audio.repeat(1, 2, 1)
        elif audio.shape[1] > 2:
            audio = audio[:, :2, :]
    return audio

def prepare_audio(audio, in_sr, target_sr, target_length, target_channels, device):

    audio = audio.to(device)

    if in_sr != target_sr:
        resample_tf = T.Resample(in_sr, target_sr).to(device)
        audio = resample_tf(audio)

    audio = PadCrop(target_length, randomize=False)(audio)

    # Add batch dimension
    if audio.dim() == 1:
        audio = audio.unsqueeze(0).unsqueeze(0)
    elif audio.dim() == 2:
        audio = audio.unsqueeze(0)

    audio = set_audio_channels(audio, target_channels)

    return audio
--------------------------------------------------------------------------------
/SongGeneration/codeclm/tokenizer/Flow1dVAE/models_gpt/models/tokenizer/pinyin/symbols.py:
--------------------------------------------------------------------------------
_pause = ["sil", "eos", "sp", "#0", "#1", "#2", "#3"]

_initials = [
    "^",
    "b",
    "c",
    "ch",
    "d",
    "f",
    "g",
    "h",
    "j",
    "k",
    "l",
    "m",
    "n",
    "p",
    "q",
    "r",
    "s",
    "sh",
    "t",
    "x",
    "z",
    "zh",
]

_tones = ["1", "2", "3", "4", "5"]

_finals = [
    "a",
    "ai",
    "an",
    "ang",
    "ao",
    "e",
    "ei",
    "en",
    "eng",
    "er",
    "i",
    "ia",
    "ian",
    "iang",
    "iao",
    "ie",
    "ii",
    "iii",
    "in",
    "ing",
    "iong",
    "iou",
    "o",
    "ong",
    "ou",
    "u",
    "ua",
    "uai",
    "uan",
    "uang",
    "uei",
    "uen",
    "ueng",
    "uo",
    "v",
    "van",
    "ve",
    "vn",
]

symbols = _pause + _initials + [i + j for i in _finals for j in _tones]
--------------------------------------------------------------------------------
/SongGeneration/third_party/stable_audio_tools/LICENSES/LICENSE_ADP.txt:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2022 archinet.ai

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/SongGeneration/third_party/stable_audio_tools/LICENSES/LICENSE_XTRANSFORMERS.txt:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2020 Phil Wang

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/SongGeneration/third_party/stable_audio_tools/LICENSES/LICENSE_DESCRIPT.txt:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2023-present, Descript

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/SongGeneration/third_party/stable_audio_tools/LICENSES/LICENSE_NVIDIA.txt:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2022 NVIDIA CORPORATION.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/SongGeneration/third_party/stable_audio_tools/LICENSES/LICENSE_META.txt:
--------------------------------------------------------------------------------
MIT License

Copyright (c) Meta Platforms, Inc. and affiliates.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/SongGeneration/third_party/demucs/models/pretrained.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@File    : pretrained.py
@Time    : 2023/8/8 7:22 PM
@Author  : waytan
@Contact : waytan@tencent.com
@License : (C)Copyright 2023, Tencent
@Desc    : Loading pretrained models.
"""
from pathlib import Path

import yaml

from .apply import BagOfModels
from .htdemucs import HTDemucs
from .states import load_state_dict


def add_model_flags(parser):
    group = parser.add_mutually_exclusive_group(required=False)
    group.add_argument("-s", "--sig", help="Locally trained XP signature.")
    group.add_argument("-n", "--name", default=None,
                       help="Pretrained model name or signature. Default is htdemucs.")
Default is htdemucs.") 25 | parser.add_argument("--repo", type=Path, 26 | help="Folder containing all pre-trained models for use with -n.") 27 | 28 | 29 | def get_model_from_yaml(yaml_file, model_file): 30 | bag = yaml.safe_load(open(yaml_file)) 31 | model = load_state_dict(HTDemucs, model_file) 32 | weights = bag.get('weights') 33 | segment = bag.get('segment') 34 | return BagOfModels([model], weights, segment) 35 | -------------------------------------------------------------------------------- /SongGeneration/codeclm/utils/autocast.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class TorchAutocast: 5 | """TorchAutocast utility class. 6 | Allows you to enable and disable autocast. This is specially useful 7 | when dealing with different architectures and clusters with different 8 | levels of support. 9 | 10 | Args: 11 | enabled (bool): Whether to enable torch.autocast or not. 12 | args: Additional args for torch.autocast. 13 | kwargs: Additional kwargs for torch.autocast 14 | """ 15 | def __init__(self, enabled: bool, *args, **kwargs): 16 | self.autocast = torch.autocast(*args, **kwargs) if enabled else None 17 | 18 | def __enter__(self): 19 | if self.autocast is None: 20 | return 21 | try: 22 | self.autocast.__enter__() 23 | except RuntimeError: 24 | device = self.autocast.device 25 | dtype = self.autocast.fast_dtype 26 | raise RuntimeError( 27 | f"There was an error autocasting with dtype={dtype} device={device}\n" 28 | "If you are on the FAIR Cluster, you might need to use autocast_dtype=float16" 29 | ) 30 | 31 | def __exit__(self, *args, **kwargs): 32 | if self.autocast is None: 33 | return 34 | self.autocast.__exit__(*args, **kwargs) 35 | -------------------------------------------------------------------------------- /SongGeneration/third_party/stable_audio_tools/defaults.ini: -------------------------------------------------------------------------------- 1 | 2 | [DEFAULTS] 3 | 4 | #name of the run 5 | name = stable_audio_tools 6 | 7 | # the batch size 8 | batch_size = 8 9 | 10 | # number of GPUs to use for training 11 | num_gpus = 1 12 | 13 | # number of nodes to use for training 14 | num_nodes = 1 15 | 16 | # Multi-GPU strategy for PyTorch Lightning 17 | strategy = "" 18 | 19 | # Precision to use for training 20 | precision = "16-mixed" 21 | 22 | # number of CPU workers for the DataLoader 23 | num_workers = 8 24 | 25 | # the random seed 26 | seed = 42 27 | 28 | # Batches for gradient accumulation 29 | accum_batches = 1 30 | 31 | # Number of steps between checkpoints 32 | checkpoint_every = 10000 33 | 34 | # trainer checkpoint file to restart training from 35 | ckpt_path = '' 36 | 37 | # model checkpoint file to start a new training run from 38 | pretrained_ckpt_path = '' 39 | 40 | # Checkpoint path for the pretransform model if needed 41 | pretransform_ckpt_path = '' 42 | 43 | # configuration model specifying model hyperparameters 44 | model_config = '' 45 | 46 | # configuration for datasets 47 | dataset_config = '' 48 | 49 | # directory to save the checkpoints in 50 | save_dir = '' 51 | 52 | # gradient_clip_val passed into PyTorch Lightning Trainer 53 | gradient_clip_val = 0.0 54 | 55 | # remove the weight norm from the pretransform model 56 | remove_pretransform_weight_norm = '' -------------------------------------------------------------------------------- /SongGeneration/sample/lyrics.jsonl: -------------------------------------------------------------------------------- 1 | {"idx": "sample_01_autoprompt", 
"gt_lyric": "[intro-short] ; [verse] 雪花舞动在无尽的天际.情缘如同雪花般轻轻逝去.希望与真挚.永不磨灭.你的忧虑.随风而逝 ; [chorus] 我怀抱着守护这片梦境.在这世界中寻找爱与虚幻.苦辣酸甜.我们一起品尝.在雪的光芒中.紧紧相拥 ; [inst-short] ; [verse] 雪花再次在风中飘扬.情愿如同雪花般消失无踪.希望与真挚.永不消失.在痛苦与喧嚣中.你找到解脱 ; [chorus] 我环绕着守护这片梦境.在这世界中感受爱与虚假.苦辣酸甜.我们一起分享.在白银的光芒中.我们同在 ; [outro-short]", "auto_prompt_audio_type": "Auto"} 2 | {"idx": "sample_01_noprompt", "gt_lyric": "[intro-short] ; [verse] 雪花舞动在无尽的天际.情缘如同雪花般轻轻逝去.希望与真挚.永不磨灭.你的忧虑.随风而逝 ; [chorus] 我怀抱着守护这片梦境.在这世界中寻找爱与虚幻.苦辣酸甜.我们一起品尝.在雪的光芒中.紧紧相拥 ; [inst-short] ; [verse] 雪花再次在风中飘扬.情愿如同雪花般消失无踪.希望与真挚.永不消失.在痛苦与喧嚣中.你找到解脱 ; [chorus] 我环绕着守护这片梦境.在这世界中感受爱与虚假.苦辣酸甜.我们一起分享.在白银的光芒中.我们同在 ; [outro-short]"} 3 | {"idx": "sample_01_textprompt", "descriptions": "female, dark, pop, sad, piano and drums, the bpm is 125.", "gt_lyric": "[intro-short] ; [verse] 雪花舞动在无尽的天际.情缘如同雪花般轻轻逝去.希望与真挚.永不磨灭.你的忧虑.随风而逝 ; [chorus] 我怀抱着守护这片梦境.在这世界中寻找爱与虚幻.苦辣酸甜.我们一起品尝.在雪的光芒中.紧紧相拥 ; [inst-short] ; [verse] 雪花再次在风中飘扬.情愿如同雪花般消失无踪.希望与真挚.永不消失.在痛苦与喧嚣中.你找到解脱 ; [chorus] 我环绕着守护这片梦境.在这世界中感受爱与虚假.苦辣酸甜.我们一起分享.在白银的光芒中.我们同在 ; [outro-short]"} 4 | {"idx": "sample_01_audioprompt", "gt_lyric": "[intro-short] ; [verse] 雪花舞动在无尽的天际.情缘如同雪花般轻轻逝去.希望与真挚.永不磨灭.你的忧虑.随风而逝 ; [chorus] 我怀抱着守护这片梦境.在这世界中寻找爱与虚幻.苦辣酸甜.我们一起品尝.在雪的光芒中.紧紧相拥 ; [inst-short] ; [verse] 雪花再次在风中飘扬.情愿如同雪花般消失无踪.希望与真挚.永不消失.在痛苦与喧嚣中.你找到解脱 ; [chorus] 我环绕着守护这片梦境.在这世界中感受爱与虚假.苦辣酸甜.我们一起分享.在白银的光芒中.我们同在 ; [outro-short]", "prompt_audio_path": "input/sample_prompt_audio.wav"} 5 | -------------------------------------------------------------------------------- /SongGeneration/third_party/Qwen2-7B/tokenizer_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "add_prefix_space": false, 3 | "added_tokens_decoder": { 4 | "151643": { 5 | "content": "<|endoftext|>", 6 | "lstrip": false, 7 | "normalized": false, 8 | "rstrip": false, 9 | "single_word": false, 10 | "special": true 11 | }, 12 | "151644": { 13 | "content": "<|im_start|>", 14 | "lstrip": false, 15 | "normalized": false, 16 | "rstrip": false, 17 | "single_word": false, 18 | "special": true 19 | }, 20 | "151645": { 21 | "content": "<|im_end|>", 22 | "lstrip": false, 23 | "normalized": false, 24 | "rstrip": false, 25 | "single_word": false, 26 | "special": true 27 | } 28 | }, 29 | "additional_special_tokens": ["<|im_start|>", "<|im_end|>"], 30 | "bos_token": null, 31 | "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", 32 | "clean_up_tokenization_spaces": false, 33 | "eos_token": "<|endoftext|>", 34 | "errors": "replace", 35 | "model_max_length": 32768, 36 | "pad_token": "<|endoftext|>", 37 | "split_special_tokens": false, 38 | "tokenizer_class": "Qwen2Tokenizer", 39 | "unk_token": null 40 | } 41 | -------------------------------------------------------------------------------- /SongGeneration/third_party/stable_audio_tools/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='stable-audio-tools', 5 | version='0.0.16', 6 | url='https://github.com/Stability-AI/stable-audio-tools.git', 7 | author='Stability AI', 8 | description='Training and inference tools for generative audio models from Stability AI', 9 | packages=find_packages(), 10 | install_requires=[ 11 | 
'aeiou==0.0.20', 12 | 'alias-free-torch==0.0.6', 13 | 'auraloss==0.4.0', 14 | 'descript-audio-codec==1.0.0', 15 | 'einops==0.7.0', 16 | 'einops-exts==0.0.4', 17 | 'ema-pytorch==0.2.3', 18 | 'encodec==0.1.1', 19 | 'gradio>=3.42.0', 20 | 'huggingface_hub', 21 | 'importlib-resources==5.12.0', 22 | 'k-diffusion==0.1.1', 23 | 'laion-clap==1.1.4', 24 | 'local-attention==1.8.6', 25 | 'pandas==2.0.2', 26 | 'pedalboard==0.7.4', 27 | 'prefigure==0.0.9', 28 | 'pytorch_lightning==2.1.0', 29 | 'PyWavelets==1.4.1', 30 | 'safetensors', 31 | 'sentencepiece==0.1.99', 32 | 's3fs', 33 | 'torch>=2.0.1', 34 | 'torchaudio>=2.0.2', 35 | 'torchmetrics==0.11.4', 36 | 'tqdm', 37 | 'transformers', 38 | 'v-diffusion-pytorch==0.0.2', 39 | 'vector-quantize-pytorch==1.9.14', 40 | 'wandb==0.15.4', 41 | 'webdataset==0.2.48', 42 | 'x-transformers<1.27.0' 43 | ], 44 | ) -------------------------------------------------------------------------------- /SongGeneration/third_party/demucs/models/spec.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | @File : spec.py 5 | @Time : 2023/8/8 下午5:10 6 | @Author : waytan 7 | @Contact : waytan@tencent.com 8 | @License : (C)Copyright 2023, Tencent 9 | @Desc : Spec 10 | """ 11 | 12 | import torch as th 13 | 14 | 15 | def spectro(x, n_fft=512, hop_length=None, pad=0): 16 | *other, length = x.shape 17 | x = x.reshape(-1, length) 18 | is_mps = x.device.type == 'mps' 19 | if is_mps: 20 | x = x.cpu() 21 | z = th.stft(x, 22 | n_fft * (1 + pad), 23 | hop_length or n_fft // 4, 24 | window=th.hann_window(n_fft).to(x), 25 | win_length=n_fft, 26 | normalized=True, 27 | center=True, 28 | return_complex=True, 29 | pad_mode='reflect') 30 | _, freqs, frame = z.shape 31 | return z.view(*other, freqs, frame) 32 | 33 | 34 | def ispectro(z, hop_length=None, length=None, pad=0): 35 | *other, freqs, frames = z.shape 36 | n_fft = 2 * freqs - 2 37 | z = z.view(-1, freqs, frames) 38 | win_length = n_fft // (1 + pad) 39 | is_mps = z.device.type == 'mps' 40 | if is_mps: 41 | z = z.cpu() 42 | x = th.istft(z, 43 | n_fft, 44 | hop_length, 45 | window=th.hann_window(win_length).to(z.real), 46 | win_length=win_length, 47 | normalized=True, 48 | length=length, 49 | center=True) 50 | _, length = x.shape 51 | return x.view(*other, length) 52 | -------------------------------------------------------------------------------- /SongGeneration/third_party/stable_audio_tools/run_gradio.py: -------------------------------------------------------------------------------- 1 | from stable_audio_tools import get_pretrained_model 2 | from stable_audio_tools.interface.gradio import create_ui 3 | import json 4 | 5 | import torch 6 | 7 | def main(args): 8 | torch.manual_seed(42) 9 | 10 | interface = create_ui( 11 | model_config_path = args.model_config, 12 | ckpt_path=args.ckpt_path, 13 | pretrained_name=args.pretrained_name, 14 | pretransform_ckpt_path=args.pretransform_ckpt_path, 15 | model_half=args.model_half 16 | ) 17 | interface.queue() 18 | interface.launch(share=args.share, auth=(args.username, args.password) if args.username is not None else None) 19 | 20 | if __name__ == "__main__": 21 | import argparse 22 | parser = argparse.ArgumentParser(description='Run gradio interface') 23 | parser.add_argument('--pretrained-name', type=str, help='Name of pretrained model', required=False) 24 | parser.add_argument('--model-config', type=str, help='Path to model config', required=False) 25 | parser.add_argument('--ckpt-path', type=str, 
help='Path to model checkpoint', required=False) 26 | parser.add_argument('--pretransform-ckpt-path', type=str, help='Optional to model pretransform checkpoint', required=False) 27 | parser.add_argument('--share', action='store_true', help='Create a publicly shareable link', required=False) 28 | parser.add_argument('--username', type=str, help='Gradio username', required=False) 29 | parser.add_argument('--password', type=str, help='Gradio password', required=False) 30 | parser.add_argument('--model-half', action='store_true', help='Whether to use half precision', required=False) 31 | args = parser.parse_args() 32 | main(args) -------------------------------------------------------------------------------- /SongGeneration/third_party/dac/compare/encodec.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from audiotools import AudioSignal 3 | from audiotools.ml import BaseModel 4 | from encodec import EncodecModel 5 | 6 | 7 | class Encodec(BaseModel): 8 | def __init__(self, sample_rate: int = 24000, bandwidth: float = 24.0): 9 | super().__init__() 10 | 11 | if sample_rate == 24000: 12 | self.model = EncodecModel.encodec_model_24khz() 13 | else: 14 | self.model = EncodecModel.encodec_model_48khz() 15 | self.model.set_target_bandwidth(bandwidth) 16 | self.sample_rate = 44100 17 | 18 | def forward( 19 | self, 20 | audio_data: torch.Tensor, 21 | sample_rate: int = 44100, 22 | n_quantizers: int = None, 23 | ): 24 | signal = AudioSignal(audio_data, sample_rate) 25 | signal.resample(self.model.sample_rate) 26 | recons = self.model(signal.audio_data) 27 | recons = AudioSignal(recons, self.model.sample_rate) 28 | recons.resample(sample_rate) 29 | return {"audio": recons.audio_data} 30 | 31 | 32 | if __name__ == "__main__": 33 | import numpy as np 34 | from functools import partial 35 | 36 | model = Encodec() 37 | 38 | for n, m in model.named_modules(): 39 | o = m.extra_repr() 40 | p = sum([np.prod(p.size()) for p in m.parameters()]) 41 | fn = lambda o, p: o + f" {p/1e6:<.3f}M params." 
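# Illustrative note (not part of the original file): the setattr call below monkey-patches
# each submodule's extra_repr so that print(model) appends a parameter count in millions to
# every module line; partial(fn, o=o, p=p) freezes the values captured for that module.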
42 | setattr(m, "extra_repr", partial(fn, o=o, p=p)) 43 | print(model) 44 | print("Total # of params: ", sum([np.prod(p.size()) for p in model.parameters()])) 45 | 46 | length = 88200 * 2 47 | x = torch.randn(1, 1, length).to(model.device) 48 | x.requires_grad_(True) 49 | x.retain_grad() 50 | 51 | # Make a forward pass 52 | out = model(x)["audio"] 53 | 54 | print(x.shape, out.shape) 55 | -------------------------------------------------------------------------------- /SongGeneration/codeclm/tokenizer/Flow1dVAE/extract_codes_stereo_7_1x4.py: -------------------------------------------------------------------------------- 1 | import torch,torchaudio 2 | import os,sys,json 3 | from tqdm import tqdm 4 | 5 | #from codeclm_song_v1.codeclm.semantic_extractor.SpeechDecoder_v01.generate import Tango 6 | from .generate_4rvq import Tango 7 | import kaldiio 8 | from kaldiio import WriteHelper 9 | 10 | if __name__ == "__main__": 11 | # Define Model 12 | json_path = sys.argv[1] 13 | outdir = sys.argv[2] 14 | 15 | mus_infos = [] 16 | with open(json_path) as f: 17 | for line in f: 18 | item = json.loads(line) 19 | mus_infos.append(item) 20 | 21 | tango = Tango(model_path = './saved/model_4rvq/model_2_fixed.safetensors', rvq_num=4) 22 | 23 | 24 | # Feature extraction loop 25 | # for i in tqdm(range(2000)): 26 | with WriteHelper('ark,scp:{}/token.ark,{}/token.scp'.format(outdir, outdir), write_function="pickle") as writer: 27 | print('ark,scp:{}/token.ark,{}/token.scp'.format(outdir, outdir)) 28 | for item in tqdm(mus_infos): 29 | try: 30 | # if True: 31 | idx = item['idx'] 32 | # print(idx) 33 | with torch.autocast(device_type="cuda", dtype=torch.float16): 34 | if(os.path.exists(item['path'])): 35 | codes = tango.file2code(item['path']) 36 | else: 37 | codes = tango.file2code('/mnt/share/' + item['path']) 38 | writer(str(idx), codes.cpu()) 39 | except: 40 | print(item['path']) 41 | continue 42 | # idx = item['idx'] 43 | # # print(idx) 44 | # with torch.autocast(device_type="cuda", dtype=torch.float16): 45 | # codes = tango.file2code(item['path']) 46 | # writer(str(idx), codes.cpu()) -------------------------------------------------------------------------------- /SongGeneration/codeclm/tokenizer/Flow1dVAE/tools/mix.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def a_weight(fs, n_fft, min_db=-80.0): 5 | freq = np.linspace(0, fs // 2, n_fft // 2 + 1) 6 | freq_sq = np.power(freq, 2) 7 | freq_sq[0] = 1.0 8 | weight = 2.0 + 20.0 * (2 * np.log10(12194) + 2 * np.log10(freq_sq) 9 | - np.log10(freq_sq + 12194 ** 2) 10 | - np.log10(freq_sq + 20.6 ** 2) 11 | - 0.5 * np.log10(freq_sq + 107.7 ** 2) 12 | - 0.5 * np.log10(freq_sq + 737.9 ** 2)) 13 | weight = np.maximum(weight, min_db) 14 | 15 | return weight 16 | 17 | 18 | def compute_gain(sound, fs, min_db=-80.0, mode="A_weighting"): 19 | if fs == 16000: 20 | n_fft = 2048 21 | elif fs == 44100: 22 | n_fft = 4096 23 | else: 24 | raise Exception("Invalid fs {}".format(fs)) 25 | stride = n_fft // 2 26 | 27 | gain = [] 28 | for i in range(0, len(sound) - n_fft + 1, stride): 29 | if mode == "RMSE": 30 | g = np.mean(sound[i: i + n_fft] ** 2) 31 | elif mode == "A_weighting": 32 | spec = np.fft.rfft(np.hanning(n_fft + 1)[:-1] * sound[i: i + n_fft]) 33 | power_spec = np.abs(spec) ** 2 34 | a_weighted_spec = power_spec * np.power(10, a_weight(fs, n_fft) / 10) 35 | g = np.sum(a_weighted_spec) 36 | else: 37 | raise Exception("Invalid mode {}".format(mode)) 38 | gain.append(g) 39 | 40 | gain = np.array(gain) 
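# Illustrative note (not part of the original file): the clamp below keeps silent frames at the
# min_db floor, since 10 * log10(10 ** (min_db / 10)) equals min_db (-80 dB by default)
# rather than diverging to -inf for zero-energy frames.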
41 | gain = np.maximum(gain, np.power(10, min_db / 10)) 42 | gain_db = 10 * np.log10(gain) 43 | return gain_db 44 | 45 | 46 | def mix(sound1, sound2, r, fs): 47 | gain1 = np.max(compute_gain(sound1, fs)) # Decibel 48 | gain2 = np.max(compute_gain(sound2, fs)) 49 | t = 1.0 / (1 + np.power(10, (gain1 - gain2) / 20.) * (1 - r) / r) 50 | sound = ((sound1 * t + sound2 * (1 - t)) / np.sqrt(t ** 2 + (1 - t) ** 2)) 51 | return sound -------------------------------------------------------------------------------- /SongGeneration/codeclm/tokenizer/Flow1dVAE/tools/check_stereo.py: -------------------------------------------------------------------------------- 1 | ''' 2 | TEMPLATE = { 3 | "path": "" 4 | "duration": "" 5 | "sample_rate": "" 6 | "amplitude": null, 7 | "weight": null, 8 | "info_path": null 9 | } 10 | ''' 11 | import torchaudio 12 | import json 13 | from tqdm import tqdm 14 | 15 | import torchaudio 16 | import numpy as np 17 | import torch, torch.nn as nn, random 18 | from torchaudio import transforms 19 | import os 20 | import argparse 21 | from tqdm import tqdm 22 | import torchaudio 23 | from torchaudio.transforms import Resample 24 | from multiprocessing import Pool 25 | 26 | def preprocess(args, wav_json, thread_id): 27 | # f = open("pretrain_tme_20230927.scp").readlines() 28 | f = open("out.{}".format(thread_id), 'w') 29 | for line in tqdm(wav_json): 30 | try: 31 | # import pdb; pdb.set_trace() 32 | line = line.strip() 33 | wav_info = json.loads(line) 34 | meta = torchaudio.info(wav_info["path"]) 35 | 36 | wav_info["num_channels"] = meta.num_channels 37 | json_string = json.dumps(wav_info) 38 | # print(json_string) 39 | f.write("{}\n".format(json_string)) 40 | except: 41 | print(line) 42 | 43 | if __name__ == "__main__": 44 | 45 | parser = argparse.ArgumentParser(description='Check audio channel counts for a list of wav entries') 46 | parser.add_argument('--wav_json', type=str) 47 | parser.add_argument('--num_thread', default=10, type=int, help='number of worker processes') 48 | args = parser.parse_args() 49 | 50 | wav_json_total = open(args.wav_json).readlines() 51 | args.num_thread = min(len(wav_json_total), args.num_thread) 52 | wav_json_list = np.array_split(wav_json_total, args.num_thread) 53 | 54 | p = Pool(args.num_thread) 55 | for thread_id, wav_json in enumerate(wav_json_list): 56 | r = p.apply_async(preprocess, (args, wav_json, thread_id)) 57 | p.close() 58 | p.join() 59 | r.get() 60 | -------------------------------------------------------------------------------- /SongGeneration/codeclm/tokenizer/Flow1dVAE/tools/infer_encodec.py: -------------------------------------------------------------------------------- 1 | import json 2 | import torch 3 | from tqdm import tqdm 4 | from audiocraft.models.loaders import load_compression_model 5 | import torchaudio 6 | import librosa 7 | import os 8 | import math 9 | import numpy as np 10 | 11 | class Tango: 12 | def __init__(self, \ 13 | device="cuda:0"): 14 | 15 | self.sample_rate = 48000 16 | self.rsp48to32 = torchaudio.transforms.Resample(48000, 32000).to(device) 17 | self.rsp32to48 = torchaudio.transforms.Resample(32000, 48000).to(device) 18 | 19 | encodec = load_compression_model('compression_state_dict.bin', device='cpu').eval() 20 | encodec.set_num_codebooks(1) 21 | self.encodec = encodec.eval().to(device) 22 | self.device = torch.device(device) 23 | print ("Successfully loaded encodec model") 24 | 25 | @torch.no_grad() 26 | def remix(self, filename, start_step=1000, steps=999, disable_progress=False): 27 | """ Generate audio without condition. 
""" 28 | init_audio, _ = librosa.load(filename, sr=self.sample_rate, mono=False) 29 | if(len(init_audio.shape)>1):init_audio = init_audio[0] 30 | init_audio = torch.from_numpy(init_audio)[None,None,:].to(self.device) 31 | init_audio = init_audio[:,:,int(0*self.sample_rate):int(10.24*3*self.sample_rate)] 32 | if(init_audio.shape[-1]1):init_audio = init_audio[0] 33 | init_audio = torch.from_numpy(init_audio)[None,None,:].to(self.device) 34 | init_audio = init_audio[:,:,0:int(10.24*2*self.sample_rate)] 35 | if(init_audio.shape[-1] 25_000): 33 | print("GPU memory {}, run matrix cal".format(free_mem)) 34 | break 35 | else: 36 | print("GPU memory {}, sleep 1min".format(free_mem)) 37 | time.sleep(60) 38 | 39 | mus_infos = [] 40 | with open(json_path) as f: 41 | for line in f: 42 | item = json.loads(line) 43 | mus_infos.append(item) 44 | 45 | tango = Tango(model_path = './saved/model_2rvq/model_2_fixed.safetensors', rvq_num=2) 46 | 47 | 48 | # Feature extraction loop 49 | # for i in tqdm(range(2000)): 50 | with WriteHelper('ark,scp:{}/token.ark,{}/token.scp'.format(outdir, outdir), write_function="pickle") as writer: 51 | print('ark,scp:{}/token.ark,{}/token.scp'.format(outdir, outdir)) 52 | for item in tqdm(mus_infos): 53 | try: 54 | # if True: 55 | idx = item['idx'] 56 | # print(idx) 57 | with torch.autocast(device_type="cuda", dtype=torch.float16): 58 | if(os.path.exists(item['path'])): 59 | codes = tango.file2code(item['path']) 60 | else: 61 | codes = tango.file2code('/mnt/share/' + item['path']) 62 | writer(str(idx), codes.cpu()) 63 | except: 64 | print(item['path']) 65 | continue 66 | # idx = item['idx'] 67 | # # print(idx) 68 | # with torch.autocast(device_type="cuda", dtype=torch.float16): 69 | # codes = tango.file2code(item['path']) 70 | # writer(str(idx), codes.cpu()) -------------------------------------------------------------------------------- /SongGeneration/conf/w2v2_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "activation_dropout": 0.1, 3 | "adapter_kernel_size": 3, 4 | "adapter_stride": 2, 5 | "add_adapter": false, 6 | "apply_spec_augment": true, 7 | "architectures": [ 8 | "Wav2Vec2ConformerForCTC" 9 | ], 10 | "attention_dropout": 0.1, 11 | "bos_token_id": 1, 12 | "classifier_proj_size": 256, 13 | "codevector_dim": 768, 14 | "conformer_conv_dropout": 0.1, 15 | "contrastive_logits_temperature": 0.1, 16 | "conv_bias": true, 17 | "conv_depthwise_kernel_size": 31, 18 | "conv_dim": [ 19 | 512, 20 | 512, 21 | 512, 22 | 512, 23 | 512, 24 | 512, 25 | 512 26 | ], 27 | "conv_kernel": [ 28 | 10, 29 | 3, 30 | 3, 31 | 3, 32 | 3, 33 | 2, 34 | 2 35 | ], 36 | "conv_stride": [ 37 | 5, 38 | 2, 39 | 2, 40 | 2, 41 | 2, 42 | 2, 43 | 2 44 | ], 45 | "ctc_loss_reduction": "sum", 46 | "ctc_zero_infinity": false, 47 | "diversity_loss_weight": 0.1, 48 | "do_stable_layer_norm": true, 49 | "eos_token_id": 2, 50 | "feat_extract_activation": "gelu", 51 | "feat_extract_dropout": 0.0, 52 | "feat_extract_norm": "layer", 53 | "feat_proj_dropout": 0.1, 54 | "feat_quantizer_dropout": 0.0, 55 | "final_dropout": 0.1, 56 | "gradient_checkpointing": false, 57 | "hidden_act": "swish", 58 | "hidden_dropout": 0.1, 59 | "hidden_dropout_prob": 0.1, 60 | "hidden_size": 1024, 61 | "initializer_range": 0.02, 62 | "intermediate_size": 4096, 63 | "layer_norm_eps": 1e-05, 64 | "layerdrop": 0.0, 65 | "mask_feature_length": 10, 66 | "mask_feature_min_masks": 0, 67 | "mask_feature_prob": 0.0, 68 | "mask_time_length": 10, 69 | "mask_time_min_masks": 2, 70 | "mask_time_prob": 
0.05, 71 | "max_source_positions": 5000, 72 | "model_type": "wav2vec2-conformer", 73 | "num_adapter_layers": 3, 74 | "num_attention_heads": 16, 75 | "num_codevector_groups": 2, 76 | "num_codevectors_per_group": 320, 77 | "num_conv_pos_embedding_groups": 16, 78 | "num_conv_pos_embeddings": 128, 79 | "num_feat_extract_layers": 7, 80 | "num_hidden_layers": 24, 81 | "num_negatives": 100, 82 | "output_hidden_size": 1024, 83 | "pad_token_id": 0, 84 | "position_embeddings_type": "rotary", 85 | "proj_codevector_dim": 768, 86 | "rotary_embedding_base": 10000, 87 | "tdnn_dilation": [ 88 | 1, 89 | 2, 90 | 3, 91 | 1, 92 | 1 93 | ], 94 | "tdnn_dim": [ 95 | 512, 96 | 512, 97 | 512, 98 | 512, 99 | 1500 100 | ], 101 | "tdnn_kernel": [ 102 | 5, 103 | 3, 104 | 3, 105 | 1, 106 | 1 107 | ], 108 | "torch_dtype": "float32", 109 | "transformers_version": "4.19.0.dev0", 110 | "use_weighted_layer_sum": false, 111 | "vocab_size": 32, 112 | "xvector_output_dim": 512 113 | } 114 | -------------------------------------------------------------------------------- /SongGeneration/codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/run_training_eat.sh: -------------------------------------------------------------------------------- 1 | WORKER_RANK=${1:-$INDEX} 2 | PLATFORM=${2:-'shef'} 3 | YAML_NAME_WITHOUT_EXT=${3:-'MERT_RVQ-VAE_CQT_95M'} 4 | TRAINING_SETTING=${4:-'MERT_RVQ-VAE_CQT'} 5 | MASTER_PROC_ADD=${5:-$CHIEF_IP} 6 | DIST_PORT=${6:-'25520'} 7 | # echo $PATH 8 | # export PATH=$PATH:./ 9 | echo "worker rank ${WORKER_RANK}, master address ${MASTER_PROC_ADD}:${DIST_PORT}" 10 | 11 | MAP_PROJ_DIR=$(pwd) 12 | echo $MAP_PROJ_DIR 13 | 14 | NNODS=1 15 | BATCH_SIZE=12 16 | NUM_WOKERS=6 17 | 18 | run_command_prefix=' ' 19 | # Loading folders 20 | # 1. tsv files for audio paths 21 | # DATA_DIR=${MAP_PROJ_DIR}/data/audio_tsv 22 | DATA_DIR=${MAP_PROJ_DIR}/data/music4all_sh #audio_manifest 23 | # 2. working folder for saving checkpoints and loading config files 24 | CONFIG_DIR=/${MAP_PROJ_DIR}/mert_fairseq/config/pretrain 25 | # 3. clustering labels for training data 26 | LABEL_ROOT_DIR=${MAP_PROJ_DIR}/data/encodec_labels/custom_audio_dataset 27 | 28 | FAIRSEQ_PATH=${MAP_PROJ_DIR}/src/fairseq; 29 | SAVE_DIR=${MAP_PROJ_DIR}/data/fairseq_savedir/ 30 | 31 | case $YAML_NAME_WITHOUT_EXT in 32 | EAT_pretraining_music_multinodes) 33 | NNODS=4 34 | NPROCES_PER_NODE=8 35 | LABEL_RATE=25 36 | BATCH_SIZE=12 37 | ;; 38 | *) 39 | echo "Unknown running config: ${$YAML_NAME_WITHOUT_EXT}" 40 | exit 1 41 | ;; 42 | esac 43 | 44 | echo running $YAML_NAME_WITHOUT_EXT .. 
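# Illustrative note (not part of the original script): with the multinode setting above
# (NNODS=4, NPROCES_PER_NODE=8), the rank arithmetic computed below gives
# DISTRIBUTED_WORLD_SIZE = 4 * 8 = 32, and a worker with WORKER_RANK=2 starts at
# ACTUAL_WORKER_RANK = 2 * 8 = 16, so its local processes occupy global ranks 16-23.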
45 | 46 | mkdir -p ${SAVE_DIR} 47 | echo "checkpoint save at: ${SAVE_DIR}" 48 | cd ${SAVE_DIR} 49 | 50 | DISTRIBUTED_WORLD_SIZE=`expr ${NNODS} \* ${NPROCES_PER_NODE}` 51 | ACTUAL_WORKER_RANK=`expr ${WORKER_RANK} \* ${NPROCES_PER_NODE}` 52 | echo "worker rank ${WORKER_RANK}, master address ${MASTER_PROC_ADD}:${DIST_PORT}, actual rank ${ACTUAL_WORKER_RANK}" 53 | 54 | DATE_SUFFIX=`date +"%Y-%m-%d_%H-%M"` 55 | 56 | OMP_NUM_THREADS=6 ${run_command_prefix} \ 57 | python -u ${FAIRSEQ_PATH}/fairseq_cli/hydra_train.py \ 58 | --config-dir ${CONFIG_DIR} --config-name ${YAML_NAME_WITHOUT_EXT} \ 59 | common.user_dir=${MAP_PROJ_DIR}/mert_fairseq \ 60 | common.tensorboard_logdir=${MAP_PROJ_DIR}/logs/pretrain_tb_${TRAINING_SETTING}_${YAML_NAME_WITHOUT_EXT}_multinodes${NNODS} \ 61 | checkpoint.save_dir=${SAVE_DIR}/ckpt_${TRAINING_SETTING}_multinodes${NNODS}_${DATE_SUFFIX}/${YAML_NAME_WITHOUT_EXT} \ 62 | distributed_training.distributed_rank=${ACTUAL_WORKER_RANK} \ 63 | distributed_training.distributed_world_size=${DISTRIBUTED_WORLD_SIZE} \ 64 | distributed_training.distributed_num_procs=${DISTRIBUTED_WORLD_SIZE} \ 65 | distributed_training.nprocs_per_node=${NPROCES_PER_NODE} \ 66 | distributed_training.distributed_init_method="tcp://${CHIEF_IP}:${DIST_PORT}" \ 67 | task.data=${DATA_DIR} \ 68 | dataset.num_workers=${NUM_WOKERS} \ 69 | dataset.batch_size=${BATCH_SIZE} \ 70 | dataset.disable_validation=true \ 71 | 72 | # pip install h5py timm -i https://mirrors.tencent.com/pypi/simple/ -------------------------------------------------------------------------------- /SongGeneration/third_party/stable_audio_tools/stable_audio_tools/models/diffusion_prior.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | import typing as tp 3 | 4 | from .diffusion import ConditionedDiffusionModelWrapper 5 | from ..inference.generation import generate_diffusion_cond 6 | from ..inference.utils import prepare_audio 7 | 8 | import torch 9 | from torch.nn import functional as F 10 | from torchaudio import transforms as T 11 | 12 | # Define prior types enum 13 | class PriorType(Enum): 14 | MonoToStereo = 1 15 | 16 | class DiffusionPrior(ConditionedDiffusionModelWrapper): 17 | def __init__(self, *args, prior_type: PriorType=None, **kwargs): 18 | super().__init__(*args, **kwargs) 19 | self.prior_type = prior_type 20 | 21 | class MonoToStereoDiffusionPrior(DiffusionPrior): 22 | def __init__(self, *args, **kwargs): 23 | super().__init__(*args, prior_type=PriorType.MonoToStereo, **kwargs) 24 | 25 | def stereoize( 26 | self, 27 | audio: torch.Tensor, # (batch, channels, time) 28 | in_sr: int, 29 | steps: int, 30 | sampler_kwargs: dict = {}, 31 | ): 32 | """ 33 | Generate stereo audio from mono audio using a pre-trained diffusion prior 34 | 35 | Args: 36 | audio: The mono audio to convert to stereo 37 | in_sr: The sample rate of the input audio 38 | steps: The number of diffusion steps to run 39 | sampler_kwargs: Keyword arguments to pass to the diffusion sampler 40 | """ 41 | 42 | device = audio.device 43 | 44 | sample_rate = self.sample_rate 45 | 46 | # Resample input audio if necessary 47 | if in_sr != sample_rate: 48 | resample_tf = T.Resample(in_sr, sample_rate).to(audio.device) 49 | audio = resample_tf(audio) 50 | 51 | audio_length = audio.shape[-1] 52 | 53 | # Pad input audio to be compatible with the model 54 | min_length = self.min_input_length 55 | padded_input_length = audio_length + (min_length - (audio_length % min_length)) % min_length 56 | 57 | # Pad input audio 
to be compatible with the model 58 | if padded_input_length > audio_length: 59 | audio = F.pad(audio, (0, padded_input_length - audio_length)) 60 | 61 | # Make audio mono, duplicate to stereo 62 | dual_mono = audio.mean(1, keepdim=True).repeat(1, 2, 1) 63 | 64 | if self.pretransform is not None: 65 | dual_mono = self.pretransform.encode(dual_mono) 66 | 67 | conditioning = {"source": [dual_mono]} 68 | 69 | stereo_audio = generate_diffusion_cond( 70 | self, 71 | conditioning_tensors=conditioning, 72 | steps=steps, 73 | sample_size=padded_input_length, 74 | sample_rate=sample_rate, 75 | device=device, 76 | **sampler_kwargs, 77 | ) 78 | 79 | return stereo_audio -------------------------------------------------------------------------------- /SongGeneration/codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/models/musicfm/model/w2v2_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "activation_dropout": 0.1, 3 | "adapter_kernel_size": 3, 4 | "adapter_stride": 2, 5 | "add_adapter": false, 6 | "apply_spec_augment": true, 7 | "architectures": [ 8 | "Wav2Vec2ConformerForCTC" 9 | ], 10 | "attention_dropout": 0.1, 11 | "bos_token_id": 1, 12 | "classifier_proj_size": 256, 13 | "codevector_dim": 768, 14 | "conformer_conv_dropout": 0.1, 15 | "contrastive_logits_temperature": 0.1, 16 | "conv_bias": true, 17 | "conv_depthwise_kernel_size": 31, 18 | "conv_dim": [ 19 | 512, 20 | 512, 21 | 512, 22 | 512, 23 | 512, 24 | 512, 25 | 512 26 | ], 27 | "conv_kernel": [ 28 | 10, 29 | 3, 30 | 3, 31 | 3, 32 | 3, 33 | 2, 34 | 2 35 | ], 36 | "conv_stride": [ 37 | 5, 38 | 2, 39 | 2, 40 | 2, 41 | 2, 42 | 2, 43 | 2 44 | ], 45 | "ctc_loss_reduction": "sum", 46 | "ctc_zero_infinity": false, 47 | "diversity_loss_weight": 0.1, 48 | "do_stable_layer_norm": true, 49 | "eos_token_id": 2, 50 | "feat_extract_activation": "gelu", 51 | "feat_extract_dropout": 0.0, 52 | "feat_extract_norm": "layer", 53 | "feat_proj_dropout": 0.1, 54 | "feat_quantizer_dropout": 0.0, 55 | "final_dropout": 0.1, 56 | "gradient_checkpointing": false, 57 | "hidden_act": "swish", 58 | "hidden_dropout": 0.1, 59 | "hidden_dropout_prob": 0.1, 60 | "hidden_size": 1024, 61 | "initializer_range": 0.02, 62 | "intermediate_size": 4096, 63 | "layer_norm_eps": 1e-05, 64 | "layerdrop": 0.0, 65 | "mask_feature_length": 10, 66 | "mask_feature_min_masks": 0, 67 | "mask_feature_prob": 0.0, 68 | "mask_time_length": 10, 69 | "mask_time_min_masks": 2, 70 | "mask_time_prob": 0.05, 71 | "max_source_positions": 5000, 72 | "model_type": "wav2vec2-conformer", 73 | "num_adapter_layers": 3, 74 | "num_attention_heads": 16, 75 | "num_codevector_groups": 2, 76 | "num_codevectors_per_group": 320, 77 | "num_conv_pos_embedding_groups": 16, 78 | "num_conv_pos_embeddings": 128, 79 | "num_feat_extract_layers": 7, 80 | "num_hidden_layers": 24, 81 | "num_negatives": 100, 82 | "output_hidden_size": 1024, 83 | "pad_token_id": 0, 84 | "position_embeddings_type": "rotary", 85 | "proj_codevector_dim": 768, 86 | "rotary_embedding_base": 10000, 87 | "tdnn_dilation": [ 88 | 1, 89 | 2, 90 | 3, 91 | 1, 92 | 1 93 | ], 94 | "tdnn_dim": [ 95 | 512, 96 | 512, 97 | 512, 98 | 512, 99 | 1500 100 | ], 101 | "tdnn_kernel": [ 102 | 5, 103 | 3, 104 | 3, 105 | 1, 106 | 1 107 | ], 108 | "torch_dtype": "float32", 109 | "transformers_version": "4.19.0.dev0", 110 | "use_weighted_layer_sum": false, 111 | "vocab_size": 32, 112 | "xvector_output_dim": 512 113 | } 114 | -------------------------------------------------------------------------------- 
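The `w2v2_config.json` above is a standard Hugging Face Wav2Vec2-Conformer configuration. As a minimal, illustrative sketch (not part of the repository, and assuming a `transformers` release that ships the Wav2Vec2-Conformer classes), such a file can be loaded and inspected like this:

```
from transformers import Wav2Vec2ConformerConfig, Wav2Vec2ConformerModel

# Path is illustrative; the dump contains identical copies of this config under
# conf/ and under our_MERT_BESTRQ/mert_fairseq/models/musicfm/model/.
config = Wav2Vec2ConformerConfig.from_json_file("w2v2_config.json")
print(config.hidden_size, config.num_hidden_layers, config.position_embeddings_type)
# -> 1024 24 rotary

# Building a model from the config yields randomly initialised weights;
# pretrained checkpoints would have to be loaded separately.
model = Wav2Vec2ConformerModel(config)
```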
/SongGeneration/codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/config/pretrain/EAT_pretraining_AS2M.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | common: 4 | fp16: true 5 | log_format: json 6 | log_interval: 200 7 | tensorboard_logdir: tb 8 | min_loss_scale: 1e-6 9 | fp16_no_flatten_grads: true 10 | user_dir: ${env:PWD} 11 | seed: 1 12 | 13 | checkpoint: 14 | save_interval: 1 15 | save_interval_updates: 10000 16 | keep_interval_updates: 1 17 | no_epoch_checkpoints: true 18 | 19 | task: 20 | _name: mae_image_pretraining 21 | data: unbalanced_train 22 | rebuild_batches: true 23 | key: source 24 | precompute_mask_config: {} 25 | downsr_16hz: true 26 | audio_mae: true 27 | h5_format: false 28 | target_length: 1024 29 | flexible_mask: false 30 | 31 | dataset: 32 | num_workers: 10 33 | batch_size: 12 34 | skip_invalid_size_inputs_valid_test: true 35 | required_batch_size_multiple: 1 36 | disable_validation: true 37 | 38 | distributed_training: 39 | distributed_world_size: 4 40 | ddp_backend: c10d 41 | 42 | criterion: 43 | _name: model 44 | log_keys: 45 | - ema_decay 46 | - target_var 47 | - pred_var 48 | - model_norm 49 | - ema_norm 50 | - masked_pct 51 | 52 | optimization: 53 | max_update: 400000 54 | lr: [ 0.0005 ] 55 | debug_param_names: true 56 | clip_norm: 4 57 | 58 | optimizer: 59 | _name: composite 60 | dynamic_groups: true 61 | groups: 62 | default: 63 | lr_float: 0.0005 64 | optimizer: 65 | _name: adam 66 | adam_betas: [0.9,0.95] 67 | weight_decay: 0.05 68 | lr_scheduler: 69 | _name: cosine 70 | warmup_updates: 53333 71 | 72 | lr_scheduler: pass_through 73 | 74 | model: 75 | _name: data2vec_multi 76 | 77 | ema_decay: 0.9998 78 | ema_end_decay: 0.99999 79 | ema_anneal_end_step: 100000 80 | instance_norm_target_layer: true 81 | layer_norm_target_layer: false 82 | layer_norm_targets: true 83 | end_of_block_targets: false 84 | 85 | depth: 12 86 | average_top_k_layers: 12 87 | clone_batch: 16 88 | 89 | norm_eps: 1e-6 90 | 91 | min_target_var: 0 92 | min_pred_var: 0 93 | 94 | encoder_dropout: 0 95 | post_mlp_drop: 0 96 | attention_dropout: 0 97 | activation_dropout: 0 98 | 99 | supported_modality: IMAGE 100 | cls_loss: 1 101 | 102 | ema_encoder_only: false 103 | 104 | modalities: 105 | image: 106 | in_chans: 1 107 | inverse_mask: true 108 | mask_prob: 0.8 109 | mask_prob_adjust: 0.07 110 | mask_length: 5 111 | mask_noise_std: 0.01 112 | prenet_depth: 0 113 | ema_local_encoder: true 114 | num_extra_tokens: 1 115 | init_extra_token_zero: false 116 | use_alibi_encoder: false 117 | decoder: 118 | decoder_dim: 768 119 | decoder_groups: 16 120 | decoder_kernel: 3 121 | decoder_layers: 6 122 | input_dropout: 0 -------------------------------------------------------------------------------- /SongGeneration/codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/config/pretrain/MERT_RVQ-VAE_CQT_95M.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | common: 3 | fp16: false 4 | log_format: json 5 | log_interval: 200 6 | seed: 1337 7 | # tensorboard_logdir: tblog_proj_name 8 | # wandb_project: wandb_proj_name 9 | 10 | checkpoint: 11 | save_interval_updates: 25000 12 | keep_interval_updates: -1 13 | no_epoch_checkpoints: true 14 | 15 | 16 | distributed_training: 17 | ddp_backend: no_c10d 18 | distributed_backend: 'nccl' 19 | distributed_world_size: 64 20 | nprocs_per_node: 8 21 | find_unused_parameters: true 22 | 23 | task: 24 | _name: mert_pretraining 25 | 
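# Illustrative note (not part of the original config): the `???` values below are
# OmegaConf/Hydra mandatory placeholders. They must be overridden at launch time
# (e.g. task.data=/path/to/tsv task.label_dir=/path/to/labels), otherwise resolution
# fails with a MissingMandatoryValue error.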
data: ??? 26 | label_dir: ??? 27 | labels: ??? 28 | label_rate: ${model.label_rate} 29 | sample_rate: 24000 30 | # crop to 5s 31 | max_sample_size: 120000 32 | min_sample_size: 72000 33 | 34 | pad_audio: false 35 | random_crop: true 36 | normalize: false # must be consistent with extractor 37 | 38 | 39 | dataset: 40 | num_workers: 6 41 | max_tokens: 2000000 42 | skip_invalid_size_inputs_valid_test: true 43 | validate_interval: 1 44 | validate_interval_updates: 10000 45 | 46 | criterion: 47 | _name: hubert 48 | pred_masked_weight: 1.0 49 | pred_nomask_weight: 0.0 50 | loss_weights: [10, 1] 51 | 52 | optimization: 53 | max_update: 400000 54 | lr: [0.0005] 55 | clip_norm: 10.0 56 | 57 | optimizer: 58 | _name: adam 59 | adam_betas: (0.9,0.98) 60 | adam_eps: 1e-06 61 | weight_decay: 0.01 62 | 63 | lr_scheduler: 64 | _name: polynomial_decay 65 | warmup_updates: 32000 66 | 67 | model: 68 | _name: mert 69 | label_rate: ??? 70 | skip_masked: false 71 | skip_nomask: true 72 | mask_prob: 0.8 73 | mask_length: 5 74 | 75 | logit_temp: 0.1 76 | 77 | # ----- mixture ------ 78 | mixture_prob: 0.5 79 | inbatch_noise_augment_len_range: "[12000, 24000]" 80 | inbatch_noise_augment_number_range: "[1, 3]" 81 | inbatch_noise_augment_volume: 1.0 82 | # ------------------------ 83 | extractor_mode: default 84 | audio_extract_type: w2v_conv 85 | conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2' 86 | 87 | # ---- cqt reconstruction, need to add loss weight --- 88 | audio_cqt_loss_m: true 89 | audio_cqt_bins: 336 90 | # ----------- 91 | final_dim: 64 92 | encoder_layerdrop: 0.05 93 | dropout_input: 0.1 94 | dropout_features: 0.1 95 | dropout: 0.1 96 | attention_dropout: 0.1 97 | feature_grad_mult: 0.1 98 | untie_final_proj: true 99 | activation_dropout: 0.0 100 | 101 | 102 | hydra: 103 | job: 104 | config: 105 | override_dirname: 106 | kv_sep: '-' 107 | item_sep: '__' 108 | exclude_keys: 109 | - run 110 | - task.data 111 | - task.label_dir 112 | run: 113 | dir: ??? 114 | sweep: 115 | dir: ??? 
116 | subdir: ${hydra.job.config_name}__${hydra.job.override_dirname} 117 | -------------------------------------------------------------------------------- /SongGeneration/third_party/stable_audio_tools/stable_audio_tools/configs/model_configs/autoencoders/encodec_musicgen_rvq.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_type": "autoencoder", 3 | "sample_size": 32000, 4 | "sample_rate": 32000, 5 | "audio_channels": 1, 6 | "model": { 7 | "encoder": { 8 | "type": "seanet", 9 | "config": { 10 | "channels": 1, 11 | "dimension": 128, 12 | "n_filters": 64, 13 | "ratios": [4, 4, 5, 8], 14 | "n_residual_layers": 1, 15 | "dilation_base": 2, 16 | "lstm": 2, 17 | "norm": "weight_norm" 18 | } 19 | }, 20 | "decoder": { 21 | "type": "seanet", 22 | "config": { 23 | "channels": 1, 24 | "dimension": 128, 25 | "n_filters": 64, 26 | "ratios": [4, 4, 5, 8], 27 | "n_residual_layers": 1, 28 | "dilation_base": 2, 29 | "lstm": 2, 30 | "norm": "weight_norm" 31 | } 32 | }, 33 | "bottleneck": { 34 | "type": "rvq", 35 | "config": { 36 | "num_quantizers": 4, 37 | "codebook_size": 2048, 38 | "dim": 128, 39 | "decay": 0.99, 40 | "threshold_ema_dead_code": 2 41 | } 42 | }, 43 | "latent_dim": 128, 44 | "downsampling_ratio": 640, 45 | "io_channels": 1 46 | }, 47 | "training": { 48 | "learning_rate": 1e-4, 49 | "warmup_steps": 0, 50 | "use_ema": true, 51 | "loss_configs": { 52 | "discriminator": { 53 | "type": "encodec", 54 | "config": { 55 | "filters": 32, 56 | "n_ffts": [2048, 1024, 512, 256, 128], 57 | "hop_lengths": [512, 256, 128, 64, 32], 58 | "win_lengths": [2048, 1024, 512, 256, 128] 59 | }, 60 | "weights": { 61 | "adversarial": 0.1, 62 | "feature_matching": 5.0 63 | } 64 | }, 65 | "spectral": { 66 | "type": "mrstft", 67 | "config": { 68 | "fft_sizes": [2048, 1024, 512, 256, 128, 64, 32], 69 | "hop_sizes": [512, 256, 128, 64, 32, 16, 8], 70 | "win_lengths": [2048, 1024, 512, 256, 128, 64, 32], 71 | "perceptual_weighting": true 72 | }, 73 | "weights": { 74 | "mrstft": 1.0 75 | } 76 | }, 77 | "time": { 78 | "type": "l1", 79 | "weights": { 80 | "l1": 0.0 81 | } 82 | } 83 | }, 84 | "demo": { 85 | "demo_every": 2000 86 | } 87 | } 88 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ComfyUI_SongGeneration 2 | [SongGeneration](https://github.com/tencent-ailab/SongGeneration):High-Quality Song Generation with Multi-Preference Alignment (SOTA),you can try VRAM>12G 3 | 4 | # Update 5 | * 11/22 修复入参顺序颠倒的错误,修复一个找很久没找到的print,并修复其模块导入问题 6 | * 10/23 同步官方代码,删除fairseq库,已无安装难度; 7 | * 10/21同步官方代码,精简模型加载,删除hubert模型,优化lm模型加载顺序,避免转移到显存时峰值OOM; 8 | * 10/18 修改加载流程,支持最新的full ,new,large模型,large模型12GVram可能会OOM,修复高版本transformer 的函数错误/Modify the loading process to support the latest full, new, and large models, and fix function errors in higher versions of transformers 9 | * 07/29,支持bgm和人声(vocal,目前还是有bgm底噪)单独输出,选择mixed为合成全部,模型加载方式更合理,去掉诸多debug打印,新增save_separate按钮,开启则保存三个音频(bgm,vocal,mixed); 10 | * Test env(插件测试环境):window11,python3.11, torch2.6 ,cu124, VR12G,(transformers 4.45.1) 11 | 12 | 13 | # 1. Installation 14 | 15 | In the ./ComfyUI/custom_nodes directory, run the following: 16 | ``` 17 | git clone https://github.com/smthemex/ComfyUI_SongGeneration.git 18 | ``` 19 | 20 | # 2. 
Requirements 21 | 22 | * 如果缺失库,打开requirements_orgin.txt文件,看是少了哪个,手动安装; 23 | * If the library is missing, open the ’requirements_orgin.txt‘ file and see which one is missing, then manually install it; 24 | 25 | ``` 26 | pip install -r requirements.txt 27 | ``` 28 | 29 | # 3.Model 30 | * 3.1.1 download ckpt from [tencent/SongGeneration](https://huggingface.co/tencent/SongGeneration/tree/main) 国内建议魔搭[AI-ModelScope/SongGeneration](https://www.modelscope.cn/models/AI-ModelScope/SongGeneration/files) 31 | * 3.1.2 [new base](https://huggingface.co/lglg666/SongGeneration-base-new),[large ](https://huggingface.co/lglg666/SongGeneration-large),[full](https://huggingface.co/lglg666/SongGeneration-base-full) 32 | * 3.1.3 new prompt,[emb](https://github.com/tencent-ailab/SongGeneration/tree/main/tools) 33 | * 3.1.4 download htdemucs.pth [tencent/SongGeneration](https://huggingface.co/tencent/SongGeneration/tree/main/third_party/demucs/ckpt) 34 | * 文件结构如下,修改了加载流程,原来的结构也能用: 35 | ``` 36 | -- ComfyUI/models/SongGeneration/ # 24.4G all 整个文件夹的大小 37 | |-- htdemucs.pth #150M 38 | |--prompt.pt # 3M 39 | |--new_prompt.pt # 3M 40 | |--model_2.safetensors 41 | |--model_2_fixed.safetensors 42 | |--new_model.pt # rename from model.pt #可选 43 | |--large_model.pt # rename from model.pt #可选 44 | |-- ckpt/ 45 | |--encode-s12k.pt # 3.68G 46 | -- ComfyUI/models/vae/ 47 | |--autoencoder_music_1320k.ckpt 48 | ``` 49 | # 4 Example 50 | ![](https://github.com/smthemex/ComfyUI_SongGeneration/blob/main/example_workflows/SongGeneration.png) 51 | 52 | # 5 Citation 53 | ``` 54 | @article{lei2025levo, 55 | title={LeVo: High-Quality Song Generation with Multi-Preference Alignment}, 56 | author={Lei, Shun and Xu, Yaoxun and Lin, Zhiwei and Zhang, Huaicheng and Tan, Wei and Chen, Hangting and Yu, Jianwei and Zhang, Yixuan and Yang, Chenyu and Zhu, Haina and Wang, Shuai and Wu, Zhiyong and Yu, Dong}, 57 | journal={arXiv preprint arXiv:2506.07520}, 58 | year={2025} 59 | } 60 | ``` 61 | 62 | 63 | 64 | -------------------------------------------------------------------------------- /SongGeneration/codeclm/models/llama/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 EleutherAI and The HuggingFace Inc. team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
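# Illustrative note (not part of the original file): this module follows the Hugging Face
# lazy-import pattern seen below -- _import_structure maps submodules to the symbols they
# provide, and _LazyModule replaces the module in sys.modules so the heavy torch/tokenizer
# imports only run when an attribute such as LlamaForCausalLM is first accessed.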
14 | from typing import TYPE_CHECKING 15 | 16 | from transformers.utils import ( 17 | OptionalDependencyNotAvailable, 18 | _LazyModule, 19 | is_sentencepiece_available, 20 | is_tokenizers_available, 21 | is_torch_available, 22 | ) 23 | 24 | 25 | _import_structure = { 26 | "configuration_llama": ["LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP", "LlamaConfig"], 27 | } 28 | 29 | try: 30 | if not is_sentencepiece_available(): 31 | raise OptionalDependencyNotAvailable() 32 | except OptionalDependencyNotAvailable: 33 | pass 34 | else: 35 | _import_structure["tokenization_llama"] = ["LlamaTokenizer"] 36 | 37 | try: 38 | if not is_tokenizers_available(): 39 | raise OptionalDependencyNotAvailable() 40 | except OptionalDependencyNotAvailable: 41 | pass 42 | else: 43 | _import_structure["tokenization_llama_fast"] = ["LlamaTokenizerFast"] 44 | 45 | try: 46 | if not is_torch_available(): 47 | raise OptionalDependencyNotAvailable() 48 | except OptionalDependencyNotAvailable: 49 | pass 50 | else: 51 | _import_structure["modeling_llama"] = [ 52 | "LlamaForCausalLM", 53 | "LlamaModel", 54 | "LlamaPreTrainedModel", 55 | "LlamaForSequenceClassification", 56 | ] 57 | 58 | 59 | if TYPE_CHECKING: 60 | from .configuration_llama import LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP, LlamaConfig 61 | 62 | try: 63 | if not is_sentencepiece_available(): 64 | raise OptionalDependencyNotAvailable() 65 | except OptionalDependencyNotAvailable: 66 | pass 67 | else: 68 | from .tokenization_llama import LlamaTokenizer 69 | 70 | try: 71 | if not is_tokenizers_available(): 72 | raise OptionalDependencyNotAvailable() 73 | except OptionalDependencyNotAvailable: 74 | pass 75 | else: 76 | from .tokenization_llama_fast import LlamaTokenizerFast 77 | 78 | try: 79 | if not is_torch_available(): 80 | raise OptionalDependencyNotAvailable() 81 | except OptionalDependencyNotAvailable: 82 | pass 83 | else: 84 | from .modeling_llama import LlamaForCausalLM, LlamaForSequenceClassification, LlamaModel, LlamaPreTrainedModel 85 | 86 | 87 | else: 88 | import sys 89 | 90 | sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) 91 | -------------------------------------------------------------------------------- /SongGeneration/codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/config/pretrain/EAT_pretraining_music_multinodes.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | common: 4 | fp16: true 5 | log_format: json 6 | log_interval: 200 7 | tensorboard_logdir: tb 8 | min_loss_scale: 1e-6 9 | fp16_no_flatten_grads: true 10 | user_dir: ${env:PWD} 11 | seed: 1 12 | 13 | checkpoint: 14 | save_interval: 1 15 | save_interval_updates: 10000 16 | keep_interval_updates: 1000 17 | no_epoch_checkpoints: true 18 | 19 | task: 20 | _name: mae_image_pretraining 21 | data: music4all_sh/ 22 | rebuild_batches: true 23 | key: source 24 | precompute_mask_config: {} 25 | downsr_16hz: false 26 | audio_mae: true 27 | h5_format: false 28 | target_length: 752 29 | flexible_mask: false 30 | sample_rate: 24000 31 | fixed_duration: 30 32 | 33 | dataset: 34 | num_workers: 10 35 | batch_size: 12 36 | skip_invalid_size_inputs_valid_test: true 37 | required_batch_size_multiple: 1 38 | disable_validation: true 39 | 40 | distributed_training: 41 | distributed_world_size: 4 42 | ddp_backend: c10d 43 | 44 | criterion: 45 | _name: model 46 | log_keys: 47 | - ema_decay 48 | - target_var 49 | - pred_var 50 | - model_norm 51 | - ema_norm 52 | - masked_pct 53 | 54 | 
optimization: 55 | max_update: 400000 56 | lr: [ 0.0001 ] 57 | # debug_param_names: true 58 | clip_norm: 4 59 | 60 | optimizer: 61 | _name: composite 62 | # dynamic_groups: true 63 | groups: 64 | default: 65 | lr_float: 0.0005 66 | optimizer: 67 | _name: adam 68 | adam_betas: [0.9,0.95] 69 | weight_decay: 0.05 70 | lr_scheduler: 71 | _name: cosine 72 | warmup_updates: 10000 # 53333 73 | 74 | lr_scheduler: pass_through 75 | 76 | model: 77 | _name: data2vec_multi 78 | 79 | ema_decay: 0.9998 80 | ema_end_decay: 0.99999 81 | ema_anneal_end_step: 100000 82 | instance_norm_target_layer: true 83 | layer_norm_target_layer: false 84 | layer_norm_targets: true 85 | end_of_block_targets: false 86 | 87 | depth: 12 88 | average_top_k_layers: 12 89 | clone_batch: 16 90 | 91 | norm_eps: 1e-6 92 | 93 | min_target_var: 0 94 | min_pred_var: 0 95 | 96 | encoder_dropout: 0 97 | post_mlp_drop: 0 98 | attention_dropout: 0 99 | activation_dropout: 0 100 | 101 | supported_modality: IMAGE 102 | cls_loss: 1 103 | 104 | ema_encoder_only: false 105 | 106 | modalities: 107 | image: 108 | in_chans: 1 109 | inverse_mask: true 110 | mask_prob: 0.8 111 | mask_prob_adjust: 0.07 112 | mask_length: 5 113 | mask_noise_std: 0.01 114 | prenet_depth: 0 115 | ema_local_encoder: true 116 | num_extra_tokens: 1 117 | init_extra_token_zero: false 118 | use_alibi_encoder: false 119 | decoder: 120 | decoder_dim: 768 121 | decoder_groups: 16 122 | decoder_kernel: 3 123 | decoder_layers: 6 124 | input_dropout: 0 125 | target_length: 752 -------------------------------------------------------------------------------- /SongGeneration/codeclm/tokenizer/Flow1dVAE/tools/infer_hifigan48k_speech.py: -------------------------------------------------------------------------------- 1 | import json 2 | import torch 3 | from tqdm import tqdm 4 | import torchaudio 5 | import librosa 6 | import os 7 | import math 8 | import numpy as np 9 | from .get_melvaehifigan48k import build_pretrained_models 10 | from . import torch_tools as torch_tools 11 | 12 | class Tango: 13 | def __init__(self, \ 14 | device="cuda:0"): 15 | 16 | self.sample_rate = 48000 17 | self.device = device 18 | 19 | self.vae, self.stft = build_pretrained_models() 20 | self.vae, self.stft = self.vae.eval().to(device), self.stft.eval().to(device) 21 | 22 | def mel_spectrogram_to_waveform(self, mel_spectrogram): 23 | if mel_spectrogram.dim() == 4: 24 | mel_spectrogram = mel_spectrogram.squeeze(1) 25 | 26 | waveform = self.vocoder(mel_spectrogram) 27 | # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 28 | waveform = waveform.cpu().float() 29 | return waveform 30 | 31 | def sound2sound_generate_longterm(self, fname, batch_size=1, duration=10.24, steps=200, disable_progress=False): 32 | """ Genrate audio without condition. """ 33 | num_frames = math.ceil(duration * 100. 
/ 8) 34 | with torch.no_grad(): 35 | orig_samples, fs = torchaudio.load(fname) 36 | if(orig_samples.shape[-1] (b n) e") 56 | 57 | # L2 normalization 58 | normalized_x = nn.functional.normalize(x, dim=1, p=2) 59 | normalized_codebook = nn.functional.normalize(self.codebook, dim=1, p=2) 60 | 61 | # compute distances 62 | distances = torch.cdist(normalized_codebook, normalized_x) 63 | 64 | # get nearest 65 | nearest_indices = torch.argmin(distances, dim=0) 66 | 67 | # reshape 68 | xq = rearrange(nearest_indices, "(b n) -> b n", b=b) 69 | 70 | return xq 71 | 72 | @torch.no_grad() 73 | def forward(self, x): 74 | # always eval 75 | self.eval() 76 | 77 | # random projection [batch, length, input_dim] -> [batch, length, codebook_dim] 78 | x = einsum("b n d, d e -> b n e", x, self.random_projection) 79 | 80 | # codebook lookup 81 | xq = self.codebook_lookup(x) 82 | 83 | return xq 84 | -------------------------------------------------------------------------------- /SongGeneration/codeclm/tokenizer/Flow1dVAE/extract_codes_stereo_7_1x4_ds.py: -------------------------------------------------------------------------------- 1 | import torch,torchaudio 2 | import os,sys,json 3 | from tqdm import tqdm 4 | 5 | #from codeclm_song_v1.codeclm.semantic_extractor.SpeechDecoder_v01.generate import Tango 6 | from .generate_4rvq import Tango 7 | import kaldiio 8 | from kaldiio import WriteHelper 9 | import torch 10 | import subprocess 11 | import time 12 | import sys 13 | 14 | def get_gpu_memory(): 15 | _output_to_list = lambda x: x.decode('ascii').split('\n')[:-1] 16 | 17 | ACCEPTABLE_AVAILABLE_MEMORY = 1024 18 | COMMAND = "nvidia-smi --query-gpu=memory.free --format=csv" 19 | memory_free_info = _output_to_list(subprocess.check_output(COMMAND.split()))[1:] 20 | memory_free_values = [int(x.split()[0]) for i, x in enumerate(memory_free_info)] 21 | return memory_free_values 22 | 23 | if __name__ == "__main__": 24 | # Define Model 25 | json_path = sys.argv[1] 26 | outdir = sys.argv[2] 27 | ds = int(sys.argv[3]) 28 | 29 | gpu_idx = int(os.environ['CUDA_VISIBLE_DEVICES']) 30 | while True: 31 | free_mem = get_gpu_memory() 32 | free_mem = free_mem[gpu_idx] 33 | if(free_mem > 25_000): 34 | print("GPU memory {}, run matrix cal".format(free_mem)) 35 | break 36 | else: 37 | print("GPU memory {}, sleep 1min".format(free_mem)) 38 | time.sleep(60) 39 | 40 | mus_infos = [] 41 | with open(json_path) as f: 42 | for line in f: 43 | item = json.loads(line) 44 | mus_infos.append(item) 45 | 46 | tango = Tango(model_path = './saved/model_4rvq/model_2_fixed.safetensors', rvq_num=4) 47 | 48 | 49 | # Feature extraction loop 50 | # for i in tqdm(range(2000)): 51 | with WriteHelper('ark,scp:{}/token.ark,{}/token.scp'.format(outdir, outdir), write_function="pickle") as writer: 52 | print('ark,scp:{}/token.ark,{}/token.scp'.format(outdir, outdir)) 53 | bar = torch.zeros(4, 16384) 54 | for item_idx, item in tqdm(enumerate(mus_infos)): 55 | try: 56 | # if True: 57 | idx = item['idx'] 58 | # print(idx) 59 | with torch.autocast(device_type="cuda", dtype=torch.float16): 60 | if(os.path.exists(item['path'])): 61 | codes = tango.file2code_ds(item['path'], ds) 62 | else: 63 | codes = tango.file2code_ds('/mnt/share/' + item['path'], ds) 64 | codes = codes.cpu() 65 | writer(str(idx), codes) 66 | for i0 in range(codes.shape[-1]): 67 | bar[0, codes[0, 0, i0]] += 1 68 | bar[1, codes[0, 1, i0]] += 1 69 | bar[2, codes[0, 2, i0]] += 1 70 | bar[3, codes[0, 3, i0]] += 1 71 | except Exception as e: 72 | print(item['path']) 73 | # print(e.message, e.args) 74 | 
# exit(1) 75 | continue 76 | 77 | if(item_idx % 1000 == 0): 78 | print("=========") 79 | print(1 - (bar[0]==0).sum() / bar.shape[-1]) 80 | print("=========") 81 | 82 | # idx = item['idx'] 83 | # # print(idx) 84 | # with torch.autocast(device_type="cuda", dtype=torch.float16): 85 | # codes = tango.file2code(item['path']) 86 | # writer(str(idx), codes.cpu()) -------------------------------------------------------------------------------- /SongGeneration/codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/config/pretrain/MERT_RVQ-VAE_CQT_95M_bestrq_chroma_multinodes.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | common: 3 | fp16: false 4 | log_format: json 5 | log_interval: 200 6 | seed: 1337 7 | # tensorboard_logdir: tblog_proj_name 8 | # wandb_project: wandb_proj_name 9 | 10 | checkpoint: 11 | save_interval_updates: 12500 12 | keep_interval_updates: -1 13 | no_epoch_checkpoints: true 14 | 15 | 16 | distributed_training: 17 | ddp_backend: no_c10d 18 | distributed_backend: 'nccl' 19 | distributed_world_size: 64 20 | nprocs_per_node: 8 21 | find_unused_parameters: true 22 | 23 | task: 24 | _name: mert_pretraining 25 | data: ??? 26 | label_dir: ??? 27 | labels: ??? 28 | label_rate: ${model.label_rate} 29 | sample_rate: 24000 30 | # crop to 5s 31 | max_sample_size: 120000 32 | min_sample_size: 72000 33 | 34 | pad_audio: false 35 | random_crop: true 36 | normalize: false # must be consistent with extractor 37 | 38 | 39 | dataset: 40 | num_workers: 6 41 | max_tokens: 2000000 42 | skip_invalid_size_inputs_valid_test: true 43 | validate_interval: 1 44 | validate_interval_updates: 10000 45 | 46 | criterion: 47 | _name: hubert 48 | pred_masked_weight: 1.0 49 | pred_nomask_weight: 0.0 50 | loss_weights: [10, 1] 51 | 52 | optimization: 53 | max_update: 400000 54 | lr: [0.0005] 55 | clip_norm: 10.0 56 | update_freq: [4] 57 | 58 | optimizer: 59 | _name: adam 60 | adam_betas: (0.9,0.98) 61 | adam_eps: 1e-06 62 | weight_decay: 0.01 63 | 64 | lr_scheduler: 65 | _name: polynomial_decay 66 | warmup_updates: 32000 67 | 68 | model: 69 | _name: mert 70 | label_rate: ??? 
71 | skip_masked: false 72 | skip_nomask: true 73 | mask_prob: 0.8 74 | mask_length: 5 75 | 76 | logit_temp: 0.1 77 | 78 | # ----- mixture ------ 79 | mixture_prob: 0.5 80 | inbatch_noise_augment_len_range: "[12000, 24000]" 81 | inbatch_noise_augment_number_range: "[1, 3]" 82 | inbatch_noise_augment_volume: 1.0 83 | # ------------------------ 84 | extractor_mode: default 85 | audio_extract_type: melspec # use melspec (instead of `w2v_conv`) 86 | melspec_n_bins: 120 # for melspec we use 120, means 12 bins per octave 87 | conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2' 88 | 89 | # best-rq loss 90 | audio_rq_loss_m: true 91 | audio_rq_loss_embed_dim: 16 92 | audio_rq_loss_num_codebooks: 1 93 | audio_rq_loss_num_embeds: 8192 94 | audio_rq_loss_seed: 42 95 | audio_rq_loss_use_norm: true 96 | audio_rq_loss_use_chroma: true 97 | audio_rq_loss_seed_chroma: 123 98 | 99 | # ---- cqt reconstruction, need to add loss weight --- 100 | audio_cqt_loss_m: true 101 | audio_cqt_bins: 336 102 | # ----------- 103 | final_dim: 32 104 | encoder_layerdrop: 0.05 105 | dropout_input: 0.1 106 | dropout_features: 0.1 107 | dropout: 0.1 108 | attention_dropout: 0.1 109 | feature_grad_mult: 0.1 110 | untie_final_proj: true 111 | activation_dropout: 0.0 112 | 113 | 114 | hydra: 115 | job: 116 | config: 117 | override_dirname: 118 | kv_sep: '-' 119 | item_sep: '__' 120 | exclude_keys: 121 | - run 122 | - task.data 123 | - task.label_dir 124 | run: 125 | dir: ??? 126 | sweep: 127 | dir: ??? 128 | subdir: ${hydra.job.config_name}__${hydra.job.override_dirname} 129 | -------------------------------------------------------------------------------- /SongGeneration/codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/config/pretrain/MERT_RVQ-VAE_CQT_95M_bestrq_norm_multinodes.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | common: 3 | fp16: false 4 | log_format: json 5 | log_interval: 200 6 | seed: 1337 7 | # tensorboard_logdir: tblog_proj_name 8 | # wandb_project: wandb_proj_name 9 | 10 | checkpoint: 11 | save_interval_updates: 12500 12 | keep_interval_updates: -1 13 | no_epoch_checkpoints: true 14 | 15 | 16 | distributed_training: 17 | ddp_backend: no_c10d 18 | distributed_backend: 'nccl' 19 | distributed_world_size: 64 20 | nprocs_per_node: 8 21 | find_unused_parameters: true 22 | 23 | task: 24 | _name: mert_pretraining 25 | data: ??? 26 | label_dir: ??? 27 | labels: ??? 28 | label_rate: ${model.label_rate} 29 | sample_rate: 24000 30 | # crop to 5s 31 | max_sample_size: 120000 32 | min_sample_size: 72000 33 | 34 | pad_audio: false 35 | random_crop: true 36 | normalize: false # must be consistent with extractor 37 | 38 | 39 | dataset: 40 | num_workers: 6 41 | max_tokens: 2000000 42 | skip_invalid_size_inputs_valid_test: true 43 | validate_interval: 1 44 | validate_interval_updates: 10000 45 | 46 | criterion: 47 | _name: hubert 48 | pred_masked_weight: 1.0 49 | pred_nomask_weight: 0.0 50 | loss_weights: [10, 1] 51 | 52 | optimization: 53 | max_update: 400000 54 | lr: [0.0005] 55 | clip_norm: 10.0 56 | update_freq: [4] 57 | 58 | optimizer: 59 | _name: adam 60 | adam_betas: (0.9,0.98) 61 | adam_eps: 1e-06 62 | weight_decay: 0.01 63 | 64 | lr_scheduler: 65 | _name: polynomial_decay 66 | warmup_updates: 32000 67 | 68 | model: 69 | _name: mert 70 | label_rate: ??? 
71 | skip_masked: false 72 | skip_nomask: true 73 | mask_prob: 0.8 74 | mask_length: 5 75 | 76 | logit_temp: 0.1 77 | 78 | # ----- mixture ------ 79 | mixture_prob: 0.5 80 | inbatch_noise_augment_len_range: "[12000, 24000]" 81 | inbatch_noise_augment_number_range: "[1, 3]" 82 | inbatch_noise_augment_volume: 1.0 83 | # ------------------------ 84 | extractor_mode: default 85 | audio_extract_type: melspec # use melspec (instead of `w2v_conv`) 86 | melspec_n_bins: 120 # for melspec we use 120, means 12 bins per octave 87 | conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2' 88 | 89 | # best-rq loss 90 | audio_rq_loss_m: true 91 | audio_rq_loss_embed_dim: 16 92 | audio_rq_loss_num_codebooks: 1 93 | audio_rq_loss_num_embeds: 8192 94 | audio_rq_loss_seed: 42 95 | audio_rq_loss_use_norm: true 96 | audio_rq_loss_use_chroma: false 97 | audio_rq_loss_seed_chroma: 123 98 | 99 | # ---- cqt reconstruction, need to add loss weight --- 100 | audio_cqt_loss_m: true 101 | audio_cqt_bins: 336 102 | # ----------- 103 | final_dim: 64 104 | encoder_layerdrop: 0.05 105 | dropout_input: 0.1 106 | dropout_features: 0.1 107 | dropout: 0.1 108 | attention_dropout: 0.1 109 | feature_grad_mult: 0.1 110 | untie_final_proj: true 111 | activation_dropout: 0.0 112 | 113 | 114 | hydra: 115 | job: 116 | config: 117 | override_dirname: 118 | kv_sep: '-' 119 | item_sep: '__' 120 | exclude_keys: 121 | - run 122 | - task.data 123 | - task.label_dir 124 | run: 125 | dir: ??? 126 | sweep: 127 | dir: ??? 128 | subdir: ${hydra.job.config_name}__${hydra.job.override_dirname} 129 | -------------------------------------------------------------------------------- /SongGeneration/codeclm/tokenizer/Flow1dVAE/tools/infer_vaehifigan48k_soundmusic.py: -------------------------------------------------------------------------------- 1 | import json 2 | import torch 3 | from tqdm import tqdm 4 | import torchaudio 5 | import librosa 6 | import os 7 | import math 8 | import numpy as np 9 | from .get_melvaehifigan48k import build_pretrained_models 10 | from . import torch_tools as torch_tools 11 | 12 | class Tango: 13 | def __init__(self, \ 14 | device="cuda:0"): 15 | 16 | self.sample_rate = 48000 17 | self.device = device 18 | 19 | self.vae, self.stft = build_pretrained_models() 20 | self.vae, self.stft = self.vae.eval().to(device), self.stft.eval().to(device) 21 | 22 | # print(sum(p.numel() for p in self.vae.parameters()));exit() 23 | 24 | def mel_spectrogram_to_waveform(self, mel_spectrogram): 25 | if mel_spectrogram.dim() == 4: 26 | mel_spectrogram = mel_spectrogram.squeeze(1) 27 | 28 | waveform = self.vocoder(mel_spectrogram) 29 | # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 30 | waveform = waveform.cpu().float() 31 | return waveform 32 | 33 | def sound2sound_generate_longterm(self, fname, batch_size=1, duration=10.24, steps=200, disable_progress=False): 34 | """ Genrate audio without condition. """ 35 | num_frames = math.ceil(duration * 100. 
/ 8) 36 | with torch.no_grad(): 37 | orig_samples, fs = torchaudio.load(fname) 38 | if(orig_samples.shape[-1] 1): 32 | self.conv3 = nn.Conv2d(idim, odim, 3, padding=1, stride=stride) 33 | self.bn3 = nn.BatchNorm2d(odim) 34 | self.diff = True 35 | 36 | def forward(self, x): 37 | out = self.bn2(self.conv2(self.relu(self.bn1(self.conv1(x))))) 38 | if self.diff: 39 | x = self.bn3(self.conv3(x)) 40 | out = x + out 41 | out = self.relu(out) 42 | return out 43 | 44 | 45 | class Conv2dSubsampling(nn.Module): 46 | """Convolutional 2D subsampling (to 1/4 length). 47 | 48 | Args: 49 | idim (int): Input dimension. 50 | hdim (int): Hidden dimension. 51 | odim (int): Output dimension. 52 | strides (list): Sizes of strides. 53 | n_bands (int): Number of frequency bands. 54 | """ 55 | 56 | def __init__(self, idim, hdim, odim, strides=[2, 2], n_bands=64): 57 | """Construct an Conv2dSubsampling object.""" 58 | super(Conv2dSubsampling, self).__init__() 59 | 60 | self.conv = nn.Sequential( 61 | Res2dModule(idim, hdim, (2, strides[0])), 62 | Res2dModule(hdim, hdim, (2, strides[1])), 63 | ) 64 | self.linear = nn.Linear(hdim * n_bands // 2 // 2, odim) 65 | 66 | def forward(self, x): 67 | """Subsample x. 68 | 69 | Args: 70 | x (torch.Tensor): Input tensor (#batch, idim, time). 71 | 72 | Returns: 73 | torch.Tensor: Subsampled tensor (#batch, time', odim), 74 | where time' = time // 4. 75 | """ 76 | 77 | if x.dim() == 3: 78 | x = x.unsqueeze(1) # (b, c, f, t) 79 | x = self.conv(x) 80 | x = rearrange(x, "b c f t -> b t (c f)") 81 | x = self.linear(x) 82 | return x 83 | -------------------------------------------------------------------------------- /SongGeneration/codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/config/pretrain/MERT_RVQ-VAE_CQT_330M_orig.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | common: 3 | fp16: true 4 | log_format: json 5 | log_interval: 100 6 | seed: 1337 7 | # tensorboard_logdir: tblog_proj_name 8 | # wandb_project: wandb_proj_name 9 | 10 | checkpoint: 11 | save_interval_updates: 5000 12 | keep_interval_updates: -1 13 | no_epoch_checkpoints: true 14 | 15 | 16 | distributed_training: 17 | ddp_backend: no_c10d 18 | distributed_backend: 'nccl' 19 | distributed_world_size: 64 20 | nprocs_per_node: 8 21 | find_unused_parameters: true 22 | 23 | task: 24 | _name: mert_pretraining 25 | data: ??? 26 | label_dir: ??? 27 | labels: ??? 28 | label_rate: ${model.label_rate} 29 | sharding_data: 6 30 | load_random_data_shard: false 31 | sample_rate: 24000 32 | # crop to 5s 33 | # max_sample_size: 120000 34 | # crop to 5.12s, refers to 384 token per audio, which can be devided by 8. 
35 | max_sample_size: 122880 36 | min_sample_size: 72000 37 | 38 | pad_audio: false 39 | random_crop: true 40 | # normalize: true # must be consistent with extractor_mode: layer_norm 41 | normalize: false # must be consistent with extractor_mode: default (groupnorm) 42 | 43 | 44 | dataset: 45 | num_workers: 6 46 | max_tokens: 900000 47 | skip_invalid_size_inputs_valid_test: true 48 | validate_interval: 1 49 | validate_interval_updates: 10000 50 | 51 | criterion: 52 | _name: hubert 53 | pred_masked_weight: 1.0 54 | pred_nomask_weight: 0.0 55 | loss_weights: [10, 1] 56 | 57 | optimization: 58 | max_update: 400000 59 | lr: [0.0015] 60 | clip_norm: 1.0 61 | update_freq: [8] 62 | 63 | optimizer: 64 | _name: adam 65 | adam_betas: (0.9,0.98) 66 | adam_eps: 1e-06 67 | weight_decay: 0.01 68 | 69 | lr_scheduler: 70 | _name: polynomial_decay 71 | warmup_updates: 32000 72 | 73 | model: 74 | _name: mert 75 | label_rate: ??? 76 | skip_masked: false 77 | skip_nomask: true 78 | mask_prob: 0.8 79 | mask_length: 5 80 | 81 | logit_temp: 0.1 82 | 83 | 84 | # ----- mixture ------ 85 | mixture_prob: 0.5 86 | inbatch_noise_augment_len_range: "[12000, 36000]" 87 | inbatch_noise_augment_number_range: "[1, 3]" 88 | inbatch_noise_augment_volume: 1.0 89 | # ------------------------ 90 | 91 | # ---- cqt reconstruction, need to add loss weight --- 92 | audio_cqt_loss_m: true 93 | audio_cqt_bins: 336 94 | 95 | final_dim: 128 96 | encoder_layers: 24 97 | encoder_embed_dim: 1024 98 | encoder_ffn_embed_dim: 4096 99 | encoder_attention_heads: 16 100 | # default refers to group norm 101 | extractor_mode: default 102 | # extractor_mode: layer_norm 103 | conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2' 104 | encoder_layerdrop: 0.0 105 | dropout_input: 0.0 106 | dropout_features: 0.0 107 | dropout: 0.0 108 | attention_dropout: 0.0 109 | 110 | layer_norm_first: true 111 | feature_grad_mult: 1.0 112 | 113 | untie_final_proj: true 114 | activation_dropout: 0.0 115 | 116 | deepnorm: false 117 | attention_relax: 32.0 118 | 119 | 120 | 121 | hydra: 122 | job: 123 | config: 124 | override_dirname: 125 | kv_sep: '-' 126 | item_sep: '__' 127 | exclude_keys: 128 | - run 129 | - task.data 130 | - task.label_dir 131 | run: 132 | dir: ??? 133 | sweep: 134 | dir: ??? 
135 | subdir: ${hydra.job.config_name}__${hydra.job.override_dirname} -------------------------------------------------------------------------------- /SongGeneration/codeclm/tokenizer/Flow1dVAE/models_gpt/models/tokenizer/tokenizer1.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | from transformers import LlamaTokenizer 3 | import os 4 | import typing as tp 5 | import torch 6 | import sys, warnings 7 | from .pinyin.pinyin import G2P_PinYin 8 | 9 | 10 | ConditionType = tp.Tuple[torch.Tensor, torch.Tensor] # condition, mask 11 | 12 | def process_line(line): # drop the 2-char list prefix (e.g. "- ") and surrounding quotes from a vocab/structure yaml entry 13 | line = line.strip()[2:] 14 | if(line[0]=='\'' and line[-1]=='\''): 15 | line = line[1:-1] 16 | return line 17 | 18 | class LlamaTokenizerConditioner(nn.Module): 19 | def __init__(self, device: str = 'cpu', max_len = 3000, padding_idx='', tokenizer_type=None, 20 | pretrained="hfl/chinese-llama-2-13b"): #"hfl/chinese-llama-2-13b" 21 | super().__init__() 22 | print(f"text tokenizer from {pretrained}") 23 | self.text_tokenizer = LlamaTokenizer.from_pretrained(pretrained,cache_dir="huggingface_cache") 24 | print(f"tokenizer vocab size: {self.text_tokenizer.vocab_size}") 25 | self.g2p = G2P_PinYin() 26 | add_token_list = [] 27 | with open(os.path.dirname(os.path.abspath(__file__))+'/vocab.yaml', 'r') as f: 28 | for line in f: 29 | if(line): 30 | add_token_list.append(process_line(line)) 31 | type_tokens = [] 32 | with open(os.path.dirname(os.path.abspath(__file__))+'/structure.yaml', 'r') as f: 33 | for line in f: 34 | if(line): 35 | type_tokens.append(process_line(line)) 36 | if add_token_list != []: 37 | self.text_tokenizer.add_tokens(add_token_list, special_tokens=True) 38 | # voc_size = self.text_tokenizer.vocab_size 39 | voc_size = len(self.text_tokenizer.get_vocab()) # vocab_size does not seem to increase after add_tokens, so count get_vocab() instead -- cyy 40 | print(voc_size) 41 | # import pdb; pdb.set_trace() 42 | padding_idx = str(padding_idx) 43 | 44 | self.text_tokenizer.pad_token = padding_idx 45 | self.max_len = max_len 46 | self.padding_idx = padding_idx 47 | 48 | vocab = self.text_tokenizer.get_vocab() 49 | self.type_token_ids = [vocab[i] for i in type_tokens if i in vocab] 50 | struct_tokens = [padding_idx] + [i for i in add_token_list if i[0]=='[' and i[-1]==']'] 51 | self.struct_token_ids = [vocab[i] for i in struct_tokens] 52 | print("type tokens: ",{self.text_tokenizer.convert_ids_to_tokens(i):i for i in self.type_token_ids}, 53 | "\t all structure tokens: ", {self.text_tokenizer.convert_ids_to_tokens(i):i for i in self.struct_token_ids}) 54 | 55 | def tokenize(self, x: tp.List[tp.Optional[str]]) -> tp.Dict[str, torch.Tensor]: 56 | x = [self.g2p(xi) if xi is not None else "" for xi in x] 57 | inputs = self.text_tokenizer(x, return_tensors="pt", padding=True) 58 | # print(x, [self.text_tokenizer.convert_ids_to_tokens(i.tolist()) for i in inputs['input_ids']]) 59 | # import pdb; pdb.set_trace() 60 | if inputs['input_ids'].shape[-1] > self.max_len: 61 | warnings.warn(f"Max len limit ({self.max_len}) exceeded! 
{x}") 62 | 63 | # print(x, inputs['input_ids'].shape) 64 | return inputs 65 | 66 | 67 | if __name__ == "__main__": 68 | tokenizer = LlamaTokenizerConditioner() 69 | out = tokenizer.tokenize(["im ok today, and im happy now", "今天我很开心"]) 70 | print(out) 71 | print(tokenizer.text_tokenizer.decode(out['input_ids'][0][:4])) 72 | print(tokenizer.text_tokenizer.convert_ids_to_tokens(out['input_ids'][0])) -------------------------------------------------------------------------------- /SongGeneration/codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/config/pretrain/MERT_RVQ-VAE_CQT_330M_multinodes_debug1node.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | common: 3 | fp16: false 4 | log_format: json 5 | log_interval: 100 6 | seed: 1337 7 | # amp: true 8 | 9 | # tensorboard_logdir: tblog_proj_name 10 | # wandb_project: wandb_proj_name 11 | 12 | checkpoint: 13 | save_interval_updates: 5000 14 | keep_interval_updates: -1 15 | no_epoch_checkpoints: true 16 | 17 | 18 | distributed_training: 19 | ddp_backend: c10d 20 | distributed_backend: 'nccl' 21 | distributed_world_size: 64 22 | nprocs_per_node: 8 23 | find_unused_parameters: true 24 | # reset-dataloader: true 25 | 26 | task: 27 | _name: mert_pretraining 28 | data: ??? 29 | label_dir: ??? 30 | labels: ??? 31 | label_rate: ${model.label_rate} 32 | sharding_data: -1 # data sharding 33 | load_random_data_shard: false 34 | sample_rate: 24000 35 | # crop to 5s 36 | # max_sample_size: 120000 37 | # crop to 5.12s, i.e. 384 tokens per audio, which can be divided by 8. 38 | max_sample_size: 122880 39 | min_sample_size: 72000 40 | 41 | pad_audio: false 42 | random_crop: true 43 | # normalize: true # must be consistent with extractor_mode: layer_norm 44 | normalize: false # must be consistent with extractor_mode: default (groupnorm) 45 | 46 | 47 | dataset: 48 | num_workers: 6 49 | max_tokens: 900000 50 | skip_invalid_size_inputs_valid_test: true 51 | validate_interval: 1 52 | validate_interval_updates: 10000 53 | 54 | criterion: 55 | _name: hubert 56 | pred_masked_weight: 1.0 57 | pred_nomask_weight: 0.0 58 | loss_weights: [10, 1] 59 | 60 | optimization: 61 | max_update: 1000000 62 | lr: [0.0015] 63 | clip_norm: 1.0 64 | update_freq: [8] 65 | 66 | optimizer: 67 | _name: adam 68 | adam_betas: (0.9,0.98) 69 | adam_eps: 1e-06 70 | weight_decay: 0.01 71 | 72 | lr_scheduler: 73 | _name: polynomial_decay 74 | warmup_updates: 32000 75 | 76 | model: 77 | _name: mert 78 | label_rate: ???
79 | skip_masked: false 80 | skip_nomask: true 81 | mask_prob: 0.8 82 | mask_length: 5 83 | 84 | logit_temp: 0.1 85 | 86 | 87 | # ----- mixture ------ 88 | mixture_prob: 0.5 89 | inbatch_noise_augment_len_range: "[12000, 36000]" 90 | inbatch_noise_augment_number_range: "[1, 3]" 91 | inbatch_noise_augment_volume: 1.0 92 | # ------------------------ 93 | 94 | # ---- cqt reconstruction, need to add loss weight --- 95 | audio_cqt_loss_m: true 96 | audio_cqt_bins: 336 97 | 98 | final_dim: 128 99 | encoder_layers: 24 100 | encoder_embed_dim: 1024 101 | encoder_ffn_embed_dim: 4096 102 | encoder_attention_heads: 16 103 | # default refers to group norm 104 | extractor_mode: default 105 | # extractor_mode: layer_norm 106 | conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2' 107 | encoder_layerdrop: 0.0 108 | dropout_input: 0.0 109 | dropout_features: 0.0 110 | dropout: 0.0 111 | attention_dropout: 0.0 112 | 113 | layer_norm_first: true 114 | feature_grad_mult: 1.0 115 | 116 | untie_final_proj: true 117 | activation_dropout: 0.0 118 | 119 | deepnorm: false 120 | attention_relax: 32.0 121 | 122 | 123 | 124 | hydra: 125 | job: 126 | config: 127 | override_dirname: 128 | kv_sep: '-' 129 | item_sep: '__' 130 | exclude_keys: 131 | - run 132 | - task.data 133 | - task.label_dir 134 | run: 135 | dir: run 136 | sweep: 137 | dir: sweep 138 | subdir: subdir 139 | -------------------------------------------------------------------------------- /SongGeneration/codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/config/pretrain/MERT_RVQ-VAE_CQT_330M.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | common: 3 | fp16: false 4 | log_format: json 5 | log_interval: 100 6 | seed: 1337 7 | 8 | # tensorboard_logdir: tblog_proj_name 9 | # wandb_project: wandb_proj_name 10 | 11 | checkpoint: 12 | save_interval_updates: 5000 13 | keep_interval_updates: -1 14 | no_epoch_checkpoints: true 15 | 16 | 17 | distributed_training: 18 | ddp_backend: no_c10d 19 | distributed_backend: 'nccl' 20 | distributed_world_size: 64 21 | nprocs_per_node: 8 22 | find_unused_parameters: true 23 | # reset-dataloader: true 24 | 25 | task: 26 | _name: mert_pretraining 27 | data: ??? 28 | label_dir: ??? 29 | labels: ??? 30 | label_rate: ${model.label_rate} 31 | sharding_data: -1 # data sharding 32 | load_random_data_shard: false 33 | sample_rate: 24000 34 | # crop to 5s 35 | # max_sample_size: 120000 36 | # crop to 5.12s, i.e. 384 tokens per audio, which can be divided by 8. 37 | max_sample_size: 122880 38 | min_sample_size: 72000 39 | 40 | pad_audio: false 41 | random_crop: true 42 | # normalize: true # must be consistent with extractor_mode: layer_norm 43 | normalize: false # must be consistent with extractor_mode: default (groupnorm) 44 | 45 | 46 | dataset: 47 | num_workers: 6 48 | max_tokens: 900000 49 | skip_invalid_size_inputs_valid_test: true 50 | validate_interval: 1 51 | validate_interval_updates: 10000 52 | 53 | criterion: 54 | _name: hubert 55 | pred_masked_weight: 1.0 56 | pred_nomask_weight: 0.0 57 | loss_weights: [10, 1] 58 | 59 | optimization: 60 | max_update: 1000000 61 | lr: [0.0015] 62 | clip_norm: 1.0 63 | update_freq: [8] 64 | 65 | optimizer: 66 | _name: adam 67 | adam_betas: (0.9,0.98) 68 | adam_eps: 1e-06 69 | weight_decay: 0.01 70 | 71 | lr_scheduler: 72 | _name: polynomial_decay 73 | warmup_updates: 32000 74 | 75 | model: 76 | _name: mert 77 | label_rate: ???
78 | skip_masked: false 79 | skip_nomask: true 80 | mask_prob: 0.8 81 | mask_length: 5 82 | 83 | logit_temp: 0.1 84 | 85 | 86 | # ----- mixture ------ 87 | mixture_prob: 0.5 88 | inbatch_noise_augment_len_range: "[12000, 36000]" 89 | inbatch_noise_augment_number_range: "[1, 3]" 90 | inbatch_noise_augment_volume: 1.0 91 | # ------------------------ 92 | 93 | # ---- cqt reconstruction, need to add loss weight --- 94 | audio_cqt_loss_m: true 95 | audio_cqt_bins: 336 96 | 97 | final_dim: 128 98 | encoder_layers: 24 99 | encoder_embed_dim: 1024 100 | encoder_ffn_embed_dim: 4096 101 | encoder_attention_heads: 16 102 | # default refers to group norm 103 | extractor_mode: default 104 | # extractor_mode: layer_norm 105 | conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2' 106 | encoder_layerdrop: 0.0 107 | dropout_input: 0.0 108 | dropout_features: 0.0 109 | dropout: 0.0 110 | attention_dropout: 0.0 111 | 112 | layer_norm_first: true 113 | feature_grad_mult: 1.0 114 | 115 | untie_final_proj: true 116 | activation_dropout: 0.0 117 | 118 | deepnorm: false 119 | attention_relax: 32.0 120 | 121 | 122 | 123 | hydra: 124 | job: 125 | config: 126 | override_dirname: 127 | kv_sep: '-' 128 | item_sep: '__' 129 | exclude_keys: 130 | - run 131 | - task.data 132 | - task.label_dir 133 | run: 134 | dir: ??? 135 | sweep: 136 | dir: ??? 137 | subdir: ${hydra.job.config_name}__${hydra.job.override_dirname} 138 | --------------------------------------------------------------------------------
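Note on the random-projection quantizer shown at the start of this section (codebook_lookup / forward): the lookup reduces to a fixed random projection, L2 normalization of both the projected features and a fixed random codebook, and a nearest-neighbour search. Below is a minimal, self-contained sketch of that idea, not the repository's API: the class name, input_dim, and the way the buffers are seeded are illustrative assumptions, while codebook_dim=16, codebook_size=8192 and seed=42 echo the audio_rq_loss_* defaults in the pretraining configs above.

import torch
import torch.nn as nn
from einops import rearrange

class RandomCodebookLookup(nn.Module):
    """Illustrative BEST-RQ-style lookup; names and defaults are assumptions."""
    def __init__(self, input_dim=1024, codebook_dim=16, codebook_size=8192, seed=42):
        super().__init__()
        g = torch.Generator().manual_seed(seed)
        # fixed (never trained) random projection and codebook
        self.register_buffer("random_projection", torch.randn(input_dim, codebook_dim, generator=g))
        self.register_buffer("codebook", torch.randn(codebook_size, codebook_dim, generator=g))

    @torch.no_grad()
    def forward(self, x):  # x: [batch, length, input_dim]
        b = x.shape[0]
        x = torch.einsum("bnd,de->bne", x, self.random_projection)
        x = rearrange(x, "b n e -> (b n) e")
        x = nn.functional.normalize(x, dim=1, p=2)
        codebook = nn.functional.normalize(self.codebook, dim=1, p=2)
        distances = torch.cdist(codebook, x)      # [codebook_size, batch*length]
        nearest = torch.argmin(distances, dim=0)  # nearest codebook index per frame
        return rearrange(nearest, "(b n) -> b n", b=b)

codes = RandomCodebookLookup()(torch.randn(2, 75, 1024))
print(codes.shape)  # torch.Size([2, 75])

Because neither buffer receives gradients, the module acts purely as a target generator, which is why the original code wraps forward in @torch.no_grad() and forces eval mode.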
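The MERT_RVQ-VAE_CQT_330M* configs above crop to max_sample_size: 122880 samples at sample_rate: 24000, with the comment that this equals 5.12 s and 384 tokens per clip, divisible by 8. The short arithmetic check below verifies that reading; the 75 Hz token rate is inferred from those two numbers, since label_rate itself is left as ??? in the configs, so treat it as an assumption rather than a documented value.

# Sanity-check the "crop to 5.12s -> 384 tokens, divisible by 8" comment.
sample_rate = 24_000        # task.sample_rate
max_sample_size = 122_880   # task.max_sample_size
token_rate_hz = 75          # assumed label/frame rate implied by the comment

crop_seconds = max_sample_size / sample_rate
tokens_per_crop = crop_seconds * token_rate_hz

print(crop_seconds)              # 5.12
print(tokens_per_crop)           # 384.0
print(tokens_per_crop % 8 == 0)  # True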