├── .gitignore ├── LICENSE ├── NOTICE ├── README.md ├── audioldm ├── LICENSE ├── README.md ├── clap │ ├── LICENSE │ ├── __init__.py │ ├── open_clip │ │ ├── __init__.py │ │ ├── bert.py │ │ ├── bpe_simple_vocab_16e6.txt.gz │ │ ├── factory.py │ │ ├── feature_fusion.py │ │ ├── htsat.py │ │ ├── linear_probe.py │ │ ├── loss.py │ │ ├── model.py │ │ ├── model_configs │ │ │ ├── HTSAT-base.json │ │ │ ├── HTSAT-large.json │ │ │ ├── HTSAT-tiny-win-1536.json │ │ │ ├── HTSAT-tiny.json │ │ │ ├── PANN-10.json │ │ │ ├── PANN-14-fmax-18k.json │ │ │ ├── PANN-14-fmax-8k-20s.json │ │ │ ├── PANN-14-tiny-transformer.json │ │ │ ├── PANN-14-win-1536.json │ │ │ ├── PANN-14.json │ │ │ ├── PANN-6.json │ │ │ ├── RN101-quickgelu.json │ │ │ ├── RN101.json │ │ │ ├── RN50-quickgelu.json │ │ │ ├── RN50.json │ │ │ ├── RN50x16.json │ │ │ ├── RN50x4.json │ │ │ ├── ViT-B-16.json │ │ │ ├── ViT-B-32-quickgelu.json │ │ │ ├── ViT-B-32.json │ │ │ └── ViT-L-14.json │ │ ├── openai.py │ │ ├── pann_model.py │ │ ├── pretrained.py │ │ ├── timm_model.py │ │ ├── tokenizer.py │ │ ├── transform.py │ │ ├── utils.py │ │ └── version.py │ └── training │ │ ├── __init__.py │ │ ├── audioset_textmap.npy │ │ ├── data.py │ │ ├── distributed.py │ │ ├── imagenet_zeroshot_data.py │ │ ├── infer_demo.py │ │ ├── logger.py │ │ ├── lp_main.py │ │ ├── lp_train.py │ │ ├── main.py │ │ ├── params.py │ │ ├── scheduler.py │ │ ├── train.py │ │ └── zero_shot.py ├── conditional_models.py ├── diffusionmodules │ ├── __init__.py │ ├── attention.py │ ├── distributions.py │ ├── ema.py │ ├── model.py │ ├── nn.py │ ├── openaimodel.py │ └── x_transformer.py ├── hifigan │ ├── LICENSE │ ├── __init__.py │ └── models.py ├── latent_diffusion │ ├── __init__.py │ ├── ddim.py │ ├── ddpm.py │ ├── dpm_solver │ │ ├── __init__.py │ │ ├── dpm_solver.py │ │ └── sampler.py │ └── plms.py ├── latent_encoder │ ├── __init__.py │ └── autoencoder.py ├── losses │ ├── __init__.py │ └── contperceptual.py ├── pipeline.py ├── rewas.py └── utilities │ ├── __init__.py │ ├── audio │ ├── __init__.py │ ├── audio_processing.py │ ├── stft.py │ └── tools.py │ ├── data │ ├── __init__.py │ ├── dataset.py │ └── utils.py │ ├── diffusion_util.py │ ├── model_util.py │ ├── sampler_util.py │ └── tools.py ├── basketball_bounce.mp4 ├── configs ├── audioldm_m_rewas.yaml ├── cfg-24-01-04T16-39-21.yaml └── dataset_root.json ├── encoder ├── LICENSE ├── README.md ├── encoder_utils.py ├── model │ ├── .DS_Store │ ├── modules │ │ ├── .DS_Store │ │ ├── bridges.py │ │ ├── feat_extractors │ │ │ ├── .DS_Store │ │ │ ├── audio │ │ │ │ ├── ast.py │ │ │ │ ├── hf_src │ │ │ │ │ └── modeling_ast.py │ │ │ │ └── resnet.py │ │ │ ├── train_clip_src │ │ │ │ ├── __init__.py │ │ │ │ ├── open_clip │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── bpe_simple_vocab_16e6.txt.gz │ │ │ │ │ ├── coca_model.py │ │ │ │ │ ├── constants.py │ │ │ │ │ ├── factory.py │ │ │ │ │ ├── generation_utils.py │ │ │ │ │ ├── hf_configs.py │ │ │ │ │ ├── hf_model.py │ │ │ │ │ ├── loss.py │ │ │ │ │ ├── model.py │ │ │ │ │ ├── model_configs │ │ │ │ │ │ ├── RN101-quickgelu.json │ │ │ │ │ │ ├── RN101.json │ │ │ │ │ │ ├── RN50-quickgelu.json │ │ │ │ │ │ ├── RN50.json │ │ │ │ │ │ ├── RN50x16.json │ │ │ │ │ │ ├── RN50x4.json │ │ │ │ │ │ ├── RN50x64.json │ │ │ │ │ │ ├── ViT-B-16-plus-240.json │ │ │ │ │ │ ├── ViT-B-16-plus.json │ │ │ │ │ │ ├── ViT-B-16.json │ │ │ │ │ │ ├── ViT-B-32-plus-256.json │ │ │ │ │ │ ├── ViT-B-32-quickgelu.json │ │ │ │ │ │ ├── ViT-B-32.json │ │ │ │ │ │ ├── ViT-H-14.json │ │ │ │ │ │ ├── ViT-H-16.json │ │ │ │ │ │ ├── ViT-L-14-280.json │ │ │ │ │ │ ├── ViT-L-14-336.json │ 
│ │ │ │ │ ├── ViT-L-14.json │ │ │ │ │ │ ├── ViT-L-16-320.json │ │ │ │ │ │ ├── ViT-L-16.json │ │ │ │ │ │ ├── ViT-M-16-alt.json │ │ │ │ │ │ ├── ViT-M-16.json │ │ │ │ │ │ ├── ViT-M-32-alt.json │ │ │ │ │ │ ├── ViT-M-32.json │ │ │ │ │ │ ├── ViT-S-16-alt.json │ │ │ │ │ │ ├── ViT-S-16.json │ │ │ │ │ │ ├── ViT-S-32-alt.json │ │ │ │ │ │ ├── ViT-S-32.json │ │ │ │ │ │ ├── ViT-bigG-14.json │ │ │ │ │ │ ├── ViT-e-14.json │ │ │ │ │ │ ├── ViT-g-14.json │ │ │ │ │ │ ├── coca_ViT-B-32.json │ │ │ │ │ │ ├── coca_ViT-L-14.json │ │ │ │ │ │ ├── coca_base.json │ │ │ │ │ │ ├── coca_roberta-ViT-B-32.json │ │ │ │ │ │ ├── convnext_base.json │ │ │ │ │ │ ├── convnext_base_w.json │ │ │ │ │ │ ├── convnext_base_w_320.json │ │ │ │ │ │ ├── convnext_large.json │ │ │ │ │ │ ├── convnext_large_d.json │ │ │ │ │ │ ├── convnext_large_d_320.json │ │ │ │ │ │ ├── convnext_small.json │ │ │ │ │ │ ├── convnext_tiny.json │ │ │ │ │ │ ├── convnext_xlarge.json │ │ │ │ │ │ ├── convnext_xxlarge.json │ │ │ │ │ │ ├── convnext_xxlarge_320.json │ │ │ │ │ │ ├── mt5-base-ViT-B-32.json │ │ │ │ │ │ ├── mt5-xl-ViT-H-14.json │ │ │ │ │ │ ├── roberta-ViT-B-32.json │ │ │ │ │ │ ├── swin_base_patch4_window7_224.json │ │ │ │ │ │ ├── vit_medium_patch16_gap_256.json │ │ │ │ │ │ ├── vit_relpos_medium_patch16_cls_224.json │ │ │ │ │ │ ├── xlm-roberta-base-ViT-B-32.json │ │ │ │ │ │ └── xlm-roberta-large-ViT-H-14.json │ │ │ │ │ ├── modified_resnet.py │ │ │ │ │ ├── openai.py │ │ │ │ │ ├── pretrained.py │ │ │ │ │ ├── push_to_hf_hub.py │ │ │ │ │ ├── timm_model.py │ │ │ │ │ ├── tokenizer.py │ │ │ │ │ ├── transform.py │ │ │ │ │ ├── transformer.py │ │ │ │ │ ├── utils.py │ │ │ │ │ └── version.py │ │ │ │ └── training │ │ │ │ │ ├── .gitignore │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── data.py │ │ │ │ │ ├── distributed.py │ │ │ │ │ ├── file_utils.py │ │ │ │ │ ├── imagenet_zeroshot_data.py │ │ │ │ │ ├── logger.py │ │ │ │ │ ├── params.py │ │ │ │ │ ├── precision.py │ │ │ │ │ ├── profile.py │ │ │ │ │ ├── scheduler.py │ │ │ │ │ ├── train.py │ │ │ │ │ ├── train_clip.py │ │ │ │ │ └── zero_shot.py │ │ │ └── visual │ │ │ │ ├── __init__.py │ │ │ │ ├── motionformer.py │ │ │ │ ├── motionformer_src │ │ │ │ ├── divided_224_16x4.yaml │ │ │ │ ├── joint_224_16x4.yaml │ │ │ │ ├── motionformer_224_16x4.yaml │ │ │ │ ├── nystrom_helper.py │ │ │ │ ├── orthoformer_helper.py │ │ │ │ ├── performer_helper.py │ │ │ │ ├── video_model_builder.py │ │ │ │ └── vit_helper.py │ │ │ │ └── s3d.py │ │ └── transformer.py │ └── sync_model.py ├── phi.py ├── transformer.py └── transforms.py ├── eval_MAE.py ├── evaluation ├── av_align_score.py ├── clap │ ├── CLAPWrapper.py │ ├── LICENSE │ ├── __init__.py │ ├── audio.py │ ├── clap.py │ ├── clap_config.yml │ └── utils.py └── clap_score.py ├── requirements.txt ├── test.py ├── test_samples.json ├── tool_add_adapter.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | ckpts/ 3 | *.ckpt 4 | *.pt 5 | *.pth 6 | evaluation/clap/CLAP_weights_2022.pth 7 | output/ 8 | taming/ 9 | evaluation/tmp/ 10 | logs/ 11 | results/ 12 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## [AAAI'25] Read, Watch and Scream! 
Sound Generation from Text and Video

[![arXiv](https://img.shields.io/badge/arXiv%20papr-2407.05551-b31b1b.svg)](https://arxiv.org/abs/2407.05551)
[![Samples](https://img.shields.io/badge/Demo-Link-blue.svg)](https://naver-ai.github.io/rewas/)

[Yujin Jeong](https://eugene6923.github.io/)  [Yunji Kim](https://github.com/YunjiKim)  [Sanghyuk Chun](https://sanghyukchun.github.io/home/)  [Jiyoung Lee](https://lee-jiyoung.github.io/)

NAVER AI Lab

---
### Updates
- (12/2024) Our paper is accepted at AAAI 2025!
- (10/2024) We release the official code!

---

### Abstract

Multimodal generative models have shown impressive advances with the help of powerful diffusion models.
Despite the progress, generating sound solely from text poses challenges in ensuring comprehensive scene depiction and temporal alignment.
Meanwhile, video-to-sound generation limits the flexibility to prioritize sound synthesis for specific objects within the scene.
To tackle these challenges, we propose a novel video-and-text-to-sound generation method, called **ReWaS**, where video serves as a conditional control for a text-to-audio generation model.
Our method estimates the structural information of audio (namely, energy) from the video while receiving key content cues from a user prompt.
We employ a well-performing text-to-sound model to consolidate the video control, which is much more efficient for training multimodal diffusion models with massive triplet-paired (audio-video-text) data.
In addition, by separating the generative components of audio, it becomes a more flexible system that allows users to freely adjust the energy, surrounding environment, and primary sound source according to their preferences.
Experimental results demonstrate that our method shows superiority in terms of quality, controllability, and training efficiency.


## ReWaS

### Prepare Python running environment

```shell
git clone https://github.com/naver-ai/rewas.git
# Install running environment
sudo apt-get update
sudo apt-get install -y python3-tk
sudo apt-get install -y ffmpeg
pip install -r requirements.txt
```

If the code raises the error `No module named 'pytorch_lightning.utilities.rank_zero'`, please upgrade `pytorch-lightning`.

### Download checkpoints

1. Download the checkpoints from [this link](https://huggingface.co/lee-j/ReWaS/tree/main), which contains the parameters of ReWaS (AudioLDM-M) and phi.

2. Download the checkpoints of the pretrained Synchformer, VAE, CLAP, 16kHz HiFiGAN, and 48kHz HiFiGAN from [Synchformer](https://github.com/v-iashin/Synchformer?tab=readme-ov-file#audio-visual-synchronization-models) and [AudioLDM-training](https://github.com/haoheliu/AudioLDM-training-finetuning?tab=readme-ov-file#download-checkpoints-and-dataset).

```shell
ckpts/
    vae_mel_16k_64bins.ckpt
    hifigan_16k_64bins.ckpt
    clap_music_speech_audioset_epoch_15_esc_89.98.pt
    24-01-04T16-39-21.pt
    phi_vggsound.ckpt
    audioldm_m_rewas_vggsound.ckpt
```

### Test ReWaS
Insert the video path and the text prompt for which you want to generate audio into `test_samples.json`.
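The exact schema of `test_samples.json` is defined by `test.py` and the copy of the file shipped with the repository; the sketch below is purely illustrative, and the field names in it are hypothetical. Each entry pairs a video path with a prompt describing the sound to generate:

```json
[
    {
        "video_path": "basketball_bounce.mp4",
        "caption": "a basketball bouncing on a hardwood court"
    }
]
```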

Use the following syntax:

```shell
python test.py \
    -ckpt ckpts/rewas.ckpt \
    --config configs/audioldm_m_rewas.yaml \
    --control_type energy_video \
    --save_path outputs \
    --testlist 'test_samples.json'
```

### Evaluate model

We recommend the following evaluation metrics.

1. **Energy MAE**: `./eval_MAE.py`
2. [**Melception Audio Quality**](https://github.com/v-iashin/SpecVQGAN/blob/main/evaluate.py)
3. [**CLAP score**](https://github.com/Text-to-Audio/Make-An-Audio/tree/main/wav_evaluation)
    - Download the CLAP weights from [Hugging Face](https://huggingface.co/microsoft/msclap/blob/main/CLAP_weights_2022.pth) into `evaluation/clap/CLAP_weights_2022.pth`
    ```shell
    cd evaluation;
    python clap_score.py
    ```
    - Requirements: `transformers>=4.28.1`
4. [**Onset Accuracy**](https://github.com/XYPB/CondFoleyGen/blob/main/predict_onset.py)
5. [**AV-align**](https://github.com/guyyariv/TempoTokens/blob/master/av_align.py)
    ```shell
    cd evaluation;
    python av_align_score.py --input_video_dir='/path/to/vggsound_video' --input_wav_dir='results/' --cache_path='./video_cache.json'
    ```

### Customizing
If you want to build a new ReWaS or apply it to another text-to-audio model, you can use `tool_add_adapter.py`.


## BibTeX

```
@article{jeong2024read,
  author  = {Jeong, Yujin and Kim, Yunji and Chun, Sanghyuk and Lee, Jiyoung},
  title   = {Read, Watch and Scream! Sound Generation from Text and Video},
  journal = {arXiv preprint arXiv:2407.05551},
  year    = {2024},
}
```

## License
```
ReWaS
Copyright (c) 2024-present NAVER Cloud Corp.
CC BY-NC-SA 4.0 (https://creativecommons.org/licenses/by-nc-sa/4.0/)
```

## Reference
We greatly appreciate the open-sourcing of the following code bases. Open-source code is the real-world infinity stone 💎!
- https://github.com/haoheliu/AudioLDM-training-finetuning
- https://github.com/lllyasviel/ControlNet
- https://github.com/v-iashin/Synchformer

--------------------------------------------------------------------------------
/audioldm/README.md:
--------------------------------------------------------------------------------
### Reference

Part of the code is borrowed from the following repos.
4 | 5 | https://github.com/haoheliu/AudioLDM 6 | -------------------------------------------------------------------------------- /audioldm/clap/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/naver-ai/rewas/4e09954d386583a4e62e7a1e28c2145ca1cfc411/audioldm/clap/__init__.py -------------------------------------------------------------------------------- /audioldm/clap/open_clip/__init__.py: -------------------------------------------------------------------------------- 1 | from .factory import ( 2 | list_models, 3 | create_model, 4 | create_model_and_transforms, 5 | add_model_config, 6 | ) 7 | from .loss import ClipLoss, gather_features, LPLoss, lp_gather_features, LPMetrics 8 | from .model import ( 9 | CLAP, 10 | CLAPTextCfg, 11 | CLAPVisionCfg, 12 | CLAPAudioCfp, 13 | convert_weights_to_fp16, 14 | trace_model, 15 | ) 16 | from .openai import load_openai_model, list_openai_models 17 | from .pretrained import ( 18 | list_pretrained, 19 | list_pretrained_tag_models, 20 | list_pretrained_model_tags, 21 | get_pretrained_url, 22 | download_pretrained, 23 | ) 24 | from .tokenizer import SimpleTokenizer, tokenize 25 | from .transform import image_transform 26 | -------------------------------------------------------------------------------- /audioldm/clap/open_clip/bert.py: -------------------------------------------------------------------------------- 1 | from transformers import BertTokenizer, BertModel 2 | 3 | tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") 4 | model = BertModel.from_pretrained("bert-base-uncased") 5 | text = "Replace me by any text you'd like." 6 | 7 | 8 | def bert_embeddings(text): 9 | # text = "Replace me by any text you'd like." 10 | encoded_input = tokenizer(text, return_tensors="pt") 11 | output = model(**encoded_input) 12 | return output 13 | 14 | 15 | from transformers import RobertaTokenizer, RobertaModel 16 | 17 | tokenizer = RobertaTokenizer.from_pretrained("roberta-base") 18 | model = RobertaModel.from_pretrained("roberta-base") 19 | text = "Replace me by any text you'd like." 20 | 21 | 22 | def Roberta_embeddings(text): 23 | # text = "Replace me by any text you'd like." 24 | encoded_input = tokenizer(text, return_tensors="pt") 25 | output = model(**encoded_input) 26 | return output 27 | 28 | 29 | from transformers import BartTokenizer, BartModel 30 | 31 | tokenizer = BartTokenizer.from_pretrained("facebook/bart-base") 32 | model = BartModel.from_pretrained("facebook/bart-base") 33 | text = "Replace me by any text you'd like." 34 | 35 | 36 | def bart_embeddings(text): 37 | # text = "Replace me by any text you'd like." 
38 | encoded_input = tokenizer(text, return_tensors="pt") 39 | output = model(**encoded_input) 40 | return output 41 | -------------------------------------------------------------------------------- /audioldm/clap/open_clip/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/naver-ai/rewas/4e09954d386583a4e62e7a1e28c2145ca1cfc411/audioldm/clap/open_clip/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /audioldm/clap/open_clip/linear_probe.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch.nn.functional as F 3 | from torch import nn 4 | from .model import MLPLayers 5 | 6 | 7 | class LinearProbe(nn.Module): 8 | def __init__(self, model, mlp, freeze, in_ch, out_ch, act=None): 9 | """ 10 | Args: 11 | model: nn.Module 12 | mlp: bool, if True, then use the MLP layer as the linear probe module 13 | freeze: bool, if Ture, then freeze all the CLAP model's layers when training the linear probe 14 | in_ch: int, the output channel from CLAP model 15 | out_ch: int, the output channel from linear probe (class_num) 16 | act: torch.nn.functional, the activation function before the loss function 17 | """ 18 | super().__init__() 19 | in_ch = 512 20 | self.clap_model = model 21 | self.clap_model.text_branch = None # to save memory 22 | self.freeze = freeze 23 | if mlp: 24 | self.lp_layer = MLPLayers(units=[in_ch, in_ch * 2, out_ch]) 25 | else: 26 | self.lp_layer = nn.Linear(in_ch, out_ch) 27 | 28 | if self.freeze: 29 | for param in self.clap_model.parameters(): 30 | param.requires_grad = False 31 | 32 | if act == "None": 33 | self.act = None 34 | elif act == "relu": 35 | self.act = nn.ReLU() 36 | elif act == "elu": 37 | self.act = nn.ELU() 38 | elif act == "prelu": 39 | self.act = nn.PReLU(num_parameters=in_ch) 40 | elif act == "softmax": 41 | self.act = nn.Softmax(dim=-1) 42 | elif act == "sigmoid": 43 | self.act = nn.Sigmoid() 44 | 45 | def forward(self, x, mix_lambda=None, device=None): 46 | """ 47 | Args: 48 | x: waveform, torch.tensor [batch, t_samples] / batch of mel_spec and longer list 49 | mix_lambda: torch.tensor [batch], the mixup lambda 50 | Returns: 51 | class_prob: torch.tensor [batch, class_num] 52 | 53 | """ 54 | # batchnorm cancel grandient 55 | if self.freeze: 56 | self.clap_model.eval() 57 | 58 | x = self.clap_model.audio_projection( 59 | self.clap_model.audio_branch(x, mixup_lambda=mix_lambda, device=device)[ 60 | "embedding" 61 | ] 62 | ) 63 | out = self.lp_layer(x) 64 | if self.act is not None: 65 | out = self.act(out) 66 | return out 67 | -------------------------------------------------------------------------------- /audioldm/clap/open_clip/model_configs/HTSAT-base.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "audio_cfg": { 4 | "audio_length": 1024, 5 | "clip_samples": 480000, 6 | "mel_bins": 64, 7 | "sample_rate": 48000, 8 | "window_size": 1024, 9 | "hop_size": 480, 10 | "fmin": 50, 11 | "fmax": 14000, 12 | "class_num": 527, 13 | "model_type": "HTSAT", 14 | "model_name": "base" 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 512, 20 | "heads": 8, 21 | "layers": 12 22 | } 23 | } -------------------------------------------------------------------------------- /audioldm/clap/open_clip/model_configs/HTSAT-large.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 2048, 3 | "audio_cfg": { 4 | "audio_length": 1024, 5 | "clip_samples": 480000, 6 | "mel_bins": 64, 7 | "sample_rate": 48000, 8 | "window_size": 1024, 9 | "hop_size": 480, 10 | "fmin": 50, 11 | "fmax": 14000, 12 | "class_num": 527, 13 | "model_type": "HTSAT", 14 | "model_name": "large" 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 512, 20 | "heads": 8, 21 | "layers": 12 22 | } 23 | } -------------------------------------------------------------------------------- /audioldm/clap/open_clip/model_configs/HTSAT-tiny-win-1536.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "audio_cfg": { 4 | "audio_length": 1024, 5 | "clip_samples": 480000, 6 | "mel_bins": 64, 7 | "sample_rate": 48000, 8 | "window_size": 1536, 9 | "hop_size": 480, 10 | "fmin": 50, 11 | "fmax": 14000, 12 | "class_num": 527, 13 | "model_type": "HTSAT", 14 | "model_name": "tiny" 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 512, 20 | "heads": 8, 21 | "layers": 12 22 | } 23 | } -------------------------------------------------------------------------------- /audioldm/clap/open_clip/model_configs/HTSAT-tiny.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "audio_cfg": { 4 | "audio_length": 1024, 5 | "clip_samples": 480000, 6 | "mel_bins": 64, 7 | "sample_rate": 48000, 8 | "window_size": 1024, 9 | "hop_size": 480, 10 | "fmin": 50, 11 | "fmax": 14000, 12 | "class_num": 527, 13 | "model_type": "HTSAT", 14 | "model_name": "tiny" 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 512, 20 | "heads": 8, 21 | "layers": 12 22 | } 23 | } -------------------------------------------------------------------------------- /audioldm/clap/open_clip/model_configs/PANN-10.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "audio_cfg": { 4 | "audio_length": 1024, 5 | "clip_samples": 480000, 6 | "mel_bins": 64, 7 | "sample_rate": 48000, 8 | "window_size": 1024, 9 | "hop_size": 480, 10 | "fmin": 50, 11 | "fmax": 14000, 12 | "class_num": 527, 13 | "model_type": "PANN", 14 | "model_name": "Cnn10" 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 512, 20 | "heads": 8, 21 | "layers": 12 22 | } 23 | } -------------------------------------------------------------------------------- /audioldm/clap/open_clip/model_configs/PANN-14-fmax-18k.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 2048, 3 | "audio_cfg": { 4 | "audio_length": 1024, 5 | "clip_samples": 480000, 6 | "mel_bins": 64, 7 | "sample_rate": 48000, 8 | "window_size": 1024, 9 | "hop_size": 480, 10 | "fmin": 50, 11 | "fmax": 18000, 12 | "class_num": 527, 13 | "model_type": "PANN", 14 | "model_name": "Cnn14" 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 512, 20 | "heads": 8, 21 | "layers": 12 22 | } 23 | } -------------------------------------------------------------------------------- /audioldm/clap/open_clip/model_configs/PANN-14-fmax-8k-20s.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 2048, 3 | "audio_cfg": { 4 | "audio_length": 1024, 5 | 
"clip_samples": 960000, 6 | "mel_bins": 64, 7 | "sample_rate": 48000, 8 | "window_size": 1024, 9 | "hop_size": 360, 10 | "fmin": 50, 11 | "fmax": 8000, 12 | "class_num": 527, 13 | "model_type": "PANN", 14 | "model_name": "Cnn14" 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 512, 20 | "heads": 8, 21 | "layers": 12 22 | } 23 | } -------------------------------------------------------------------------------- /audioldm/clap/open_clip/model_configs/PANN-14-tiny-transformer.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 2048, 3 | "audio_cfg": { 4 | "audio_length": 1024, 5 | "clip_samples": 480000, 6 | "mel_bins": 64, 7 | "sample_rate": 48000, 8 | "window_size": 1024, 9 | "hop_size": 480, 10 | "fmin": 50, 11 | "fmax": 14000, 12 | "class_num": 527, 13 | "model_type": "PANN", 14 | "model_name": "Cnn14" 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 512, 20 | "heads": 8, 21 | "layers": 4 22 | } 23 | } -------------------------------------------------------------------------------- /audioldm/clap/open_clip/model_configs/PANN-14-win-1536.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 2048, 3 | "audio_cfg": { 4 | "audio_length": 1024, 5 | "clip_samples": 480000, 6 | "mel_bins": 64, 7 | "sample_rate": 48000, 8 | "window_size": 1536, 9 | "hop_size": 480, 10 | "fmin": 50, 11 | "fmax": 14000, 12 | "class_num": 527, 13 | "model_type": "PANN", 14 | "model_name": "Cnn14" 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 512, 20 | "heads": 8, 21 | "layers": 12 22 | } 23 | } -------------------------------------------------------------------------------- /audioldm/clap/open_clip/model_configs/PANN-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 2048, 3 | "audio_cfg": { 4 | "audio_length": 1024, 5 | "clip_samples": 480000, 6 | "mel_bins": 64, 7 | "sample_rate": 48000, 8 | "window_size": 1024, 9 | "hop_size": 480, 10 | "fmin": 50, 11 | "fmax": 14000, 12 | "class_num": 527, 13 | "model_type": "PANN", 14 | "model_name": "Cnn14" 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 512, 20 | "heads": 8, 21 | "layers": 12 22 | } 23 | } -------------------------------------------------------------------------------- /audioldm/clap/open_clip/model_configs/PANN-6.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "audio_cfg": { 4 | "audio_length": 1024, 5 | "clip_samples": 480000, 6 | "mel_bins": 64, 7 | "sample_rate": 48000, 8 | "window_size": 1024, 9 | "hop_size": 480, 10 | "fmin": 50, 11 | "fmax": 14000, 12 | "class_num": 527, 13 | "model_type": "PANN", 14 | "model_name": "Cnn6" 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 512, 20 | "heads": 8, 21 | "layers": 12 22 | } 23 | } -------------------------------------------------------------------------------- /audioldm/clap/open_clip/model_configs/RN101-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": [ 7 | 3, 8 | 4, 9 | 23, 10 | 3 11 | ], 12 | "width": 64, 13 | "patch_size": null 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | 
"vocab_size": 49408, 18 | "width": 512, 19 | "heads": 8, 20 | "layers": 12 21 | } 22 | } -------------------------------------------------------------------------------- /audioldm/clap/open_clip/model_configs/RN101.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": [ 6 | 3, 7 | 4, 8 | 23, 9 | 3 10 | ], 11 | "width": 64, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 512, 18 | "heads": 8, 19 | "layers": 12 20 | } 21 | } -------------------------------------------------------------------------------- /audioldm/clap/open_clip/model_configs/RN50-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": [ 7 | 3, 8 | 4, 9 | 6, 10 | 3 11 | ], 12 | "width": 64, 13 | "patch_size": null 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 512, 19 | "heads": 8, 20 | "layers": 12 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /audioldm/clap/open_clip/model_configs/RN50.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": [ 6 | 3, 7 | 4, 8 | 6, 9 | 3 10 | ], 11 | "width": 64, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 512, 18 | "heads": 8, 19 | "layers": 12 20 | } 21 | } -------------------------------------------------------------------------------- /audioldm/clap/open_clip/model_configs/RN50x16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 384, 5 | "layers": [ 6 | 6, 7 | 8, 8 | 18, 9 | 8 10 | ], 11 | "width": 96, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 768, 18 | "heads": 12, 19 | "layers": 12 20 | } 21 | } -------------------------------------------------------------------------------- /audioldm/clap/open_clip/model_configs/RN50x4.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 288, 5 | "layers": [ 6 | 4, 7 | 6, 8 | 10, 9 | 6 10 | ], 11 | "width": 80, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 640, 18 | "heads": 10, 19 | "layers": 12 20 | } 21 | } -------------------------------------------------------------------------------- /audioldm/clap/open_clip/model_configs/ViT-B-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /audioldm/clap/open_clip/model_configs/ViT-B-32-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | 
"image_size": 224, 6 | "layers": 12, 7 | "width": 768, 8 | "patch_size": 32 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } -------------------------------------------------------------------------------- /audioldm/clap/open_clip/model_configs/ViT-B-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /audioldm/clap/open_clip/model_configs/ViT-L-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /audioldm/clap/open_clip/openai.py: -------------------------------------------------------------------------------- 1 | """ OpenAI pretrained model functions 2 | 3 | Adapted from https://github.com/openai/CLIP. Originally MIT License, Copyright (c) 2021 OpenAI. 4 | """ 5 | 6 | import os 7 | import warnings 8 | from typing import Union, List 9 | 10 | import torch 11 | 12 | from .model import build_model_from_openai_state_dict 13 | from .pretrained import ( 14 | get_pretrained_url, 15 | list_pretrained_tag_models, 16 | download_pretrained, 17 | ) 18 | 19 | __all__ = ["list_openai_models", "load_openai_model"] 20 | 21 | 22 | def list_openai_models() -> List[str]: 23 | """Returns the names of available CLIP models""" 24 | return list_pretrained_tag_models("openai") 25 | 26 | 27 | def load_openai_model( 28 | name: str, 29 | model_cfg, 30 | device: Union[str, torch.device] = "cuda" if torch.cuda.is_available() else "cpu", 31 | jit=True, 32 | cache_dir=os.path.expanduser("~/.cache/clip"), 33 | enable_fusion: bool = False, 34 | fusion_type: str = "None", 35 | ): 36 | """Load a CLIP model, preserve its text pretrained part, and set in the CLAP model 37 | 38 | Parameters 39 | ---------- 40 | name : str 41 | A model name listed by `clip.available_models()`, or the path to a model checkpoint containing the state_dict 42 | device : Union[str, torch.device] 43 | The device to put the loaded model 44 | jit : bool 45 | Whether to load the optimized JIT model (default) or more hackable non-JIT model. 
46 | 47 | Returns 48 | ------- 49 | model : torch.nn.Module 50 | The CLAP model 51 | preprocess : Callable[[PIL.Image], torch.Tensor] 52 | A torchvision transform that converts a PIL image into a tensor that the returned model can take as its input 53 | """ 54 | if get_pretrained_url(name, "openai"): 55 | model_path = download_pretrained( 56 | get_pretrained_url(name, "openai"), root=cache_dir 57 | ) 58 | elif os.path.isfile(name): 59 | model_path = name 60 | else: 61 | raise RuntimeError( 62 | f"Model {name} not found; available models = {list_openai_models()}" 63 | ) 64 | 65 | try: 66 | # loading JIT archive 67 | model = torch.jit.load(model_path, map_location=device if jit else "cpu").eval() 68 | state_dict = None 69 | except RuntimeError: 70 | # loading saved state dict 71 | if jit: 72 | warnings.warn( 73 | f"File {model_path} is not a JIT archive. Loading as a state dict instead" 74 | ) 75 | jit = False 76 | state_dict = torch.load(model_path, map_location="cpu") 77 | 78 | if not jit: 79 | try: 80 | model = build_model_from_openai_state_dict( 81 | state_dict or model.state_dict(), model_cfg, enable_fusion, fusion_type 82 | ).to(device) 83 | except KeyError: 84 | sd = {k[7:]: v for k, v in state_dict["state_dict"].items()} 85 | model = build_model_from_openai_state_dict( 86 | sd, model_cfg, enable_fusion, fusion_type 87 | ).to(device) 88 | 89 | if str(device) == "cpu": 90 | model.float() 91 | return model 92 | 93 | # patch the device names 94 | device_holder = torch.jit.trace( 95 | lambda: torch.ones([]).to(torch.device(device)), example_inputs=[] 96 | ) 97 | device_node = [ 98 | n 99 | for n in device_holder.graph.findAllNodes("prim::Constant") 100 | if "Device" in repr(n) 101 | ][-1] 102 | 103 | def patch_device(module): 104 | try: 105 | graphs = [module.graph] if hasattr(module, "graph") else [] 106 | except RuntimeError: 107 | graphs = [] 108 | 109 | if hasattr(module, "forward1"): 110 | graphs.append(module.forward1.graph) 111 | 112 | for graph in graphs: 113 | for node in graph.findAllNodes("prim::Constant"): 114 | if "value" in node.attributeNames() and str(node["value"]).startswith( 115 | "cuda" 116 | ): 117 | node.copyAttributes(device_node) 118 | 119 | model.apply(patch_device) 120 | patch_device(model.encode_audio) 121 | patch_device(model.encode_text) 122 | 123 | # patch dtype to float32 on CPU 124 | if str(device) == "cpu": 125 | float_holder = torch.jit.trace( 126 | lambda: torch.ones([]).float(), example_inputs=[] 127 | ) 128 | float_input = list(float_holder.graph.findNode("aten::to").inputs())[1] 129 | float_node = float_input.node() 130 | 131 | def patch_float(module): 132 | try: 133 | graphs = [module.graph] if hasattr(module, "graph") else [] 134 | except RuntimeError: 135 | graphs = [] 136 | 137 | if hasattr(module, "forward1"): 138 | graphs.append(module.forward1.graph) 139 | 140 | for graph in graphs: 141 | for node in graph.findAllNodes("aten::to"): 142 | inputs = list(node.inputs()) 143 | for i in [ 144 | 1, 145 | 2, 146 | ]: # dtype can be the second or third argument to aten::to() 147 | if inputs[i].node()["value"] == 5: 148 | inputs[i].node().copyAttributes(float_node) 149 | 150 | model.apply(patch_float) 151 | patch_float(model.encode_audio) 152 | patch_float(model.encode_text) 153 | model.float() 154 | 155 | model.audio_branch.audio_length = model.audio_cfg.audio_length 156 | return model 157 | -------------------------------------------------------------------------------- /audioldm/clap/open_clip/timm_model.py: 
-------------------------------------------------------------------------------- 1 | """ timm model adapter 2 | 3 | Wraps timm (https://github.com/rwightman/pytorch-image-models) models for use as a vision tower in CLIP model. 4 | """ 5 | from collections import OrderedDict 6 | 7 | import torch.nn as nn 8 | 9 | try: 10 | import timm 11 | from timm.models.layers import Mlp, to_2tuple 12 | from timm.models.layers.attention_pool2d import RotAttentionPool2d 13 | from timm.models.layers.attention_pool2d import ( 14 | AttentionPool2d as AbsAttentionPool2d, 15 | ) 16 | except ImportError as e: 17 | timm = None 18 | 19 | from .utils import freeze_batch_norm_2d 20 | 21 | 22 | class TimmModel(nn.Module): 23 | """timm model adapter 24 | # FIXME this adapter is a work in progress, may change in ways that break weight compat 25 | """ 26 | 27 | def __init__( 28 | self, 29 | model_name, 30 | embed_dim, 31 | image_size=224, 32 | pool="avg", 33 | proj="linear", 34 | drop=0.0, 35 | pretrained=False, 36 | ): 37 | super().__init__() 38 | if timm is None: 39 | raise RuntimeError("Please `pip install timm` to use timm models.") 40 | 41 | self.image_size = to_2tuple(image_size) 42 | self.trunk = timm.create_model(model_name, pretrained=pretrained) 43 | feat_size = self.trunk.default_cfg.get("pool_size", None) 44 | feature_ndim = 1 if not feat_size else 2 45 | if pool in ("abs_attn", "rot_attn"): 46 | assert feature_ndim == 2 47 | # if attn pooling used, remove both classifier and default pool 48 | self.trunk.reset_classifier(0, global_pool="") 49 | else: 50 | # reset global pool if pool config set, otherwise leave as network default 51 | reset_kwargs = dict(global_pool=pool) if pool else {} 52 | self.trunk.reset_classifier(0, **reset_kwargs) 53 | prev_chs = self.trunk.num_features 54 | 55 | head_layers = OrderedDict() 56 | if pool == "abs_attn": 57 | head_layers["pool"] = AbsAttentionPool2d( 58 | prev_chs, feat_size=feat_size, out_features=embed_dim 59 | ) 60 | prev_chs = embed_dim 61 | elif pool == "rot_attn": 62 | head_layers["pool"] = RotAttentionPool2d(prev_chs, out_features=embed_dim) 63 | prev_chs = embed_dim 64 | else: 65 | assert proj, "projection layer needed if non-attention pooling is used." 
66 | 67 | # NOTE attention pool ends with a projection layer, so proj should usually be set to '' if such pooling is used 68 | if proj == "linear": 69 | head_layers["drop"] = nn.Dropout(drop) 70 | head_layers["proj"] = nn.Linear(prev_chs, embed_dim) 71 | elif proj == "mlp": 72 | head_layers["mlp"] = Mlp(prev_chs, 2 * embed_dim, embed_dim, drop=drop) 73 | 74 | self.head = nn.Sequential(head_layers) 75 | 76 | def lock(self, unlocked_groups=0, freeze_bn_stats=False): 77 | """lock modules 78 | Args: 79 | unlocked_groups (int): leave last n layer groups unlocked (default: 0) 80 | """ 81 | if not unlocked_groups: 82 | # lock full model 83 | for param in self.trunk.parameters(): 84 | param.requires_grad = False 85 | if freeze_bn_stats: 86 | freeze_batch_norm_2d(self.trunk) 87 | else: 88 | # NOTE: partial freeze requires latest timm (master) branch and is subject to change 89 | try: 90 | # FIXME import here until API stable and in an official release 91 | from timm.models.helpers import group_parameters, group_modules 92 | except ImportError: 93 | raise RuntimeError( 94 | "Please install latest timm `pip install git+https://github.com/rwightman/pytorch-image-models`" 95 | ) 96 | matcher = self.trunk.group_matcher() 97 | gparams = group_parameters(self.trunk, matcher) 98 | max_layer_id = max(gparams.keys()) 99 | max_layer_id = max_layer_id - unlocked_groups 100 | for group_idx in range(max_layer_id + 1): 101 | group = gparams[group_idx] 102 | for param in group: 103 | self.trunk.get_parameter(param).requires_grad = False 104 | if freeze_bn_stats: 105 | gmodules = group_modules(self.trunk, matcher, reverse=True) 106 | gmodules = {k for k, v in gmodules.items() if v <= max_layer_id} 107 | freeze_batch_norm_2d(self.trunk, gmodules) 108 | 109 | def forward(self, x): 110 | x = self.trunk(x) 111 | x = self.head(x) 112 | return x 113 | -------------------------------------------------------------------------------- /audioldm/clap/open_clip/transform.py: -------------------------------------------------------------------------------- 1 | from torchvision.transforms import ( 2 | Normalize, 3 | Compose, 4 | RandomResizedCrop, 5 | InterpolationMode, 6 | ToTensor, 7 | Resize, 8 | CenterCrop, 9 | ) 10 | 11 | 12 | def _convert_to_rgb(image): 13 | return image.convert("RGB") 14 | 15 | 16 | def image_transform( 17 | image_size: int, 18 | is_train: bool, 19 | mean=(0.48145466, 0.4578275, 0.40821073), 20 | std=(0.26862954, 0.26130258, 0.27577711), 21 | ): 22 | normalize = Normalize(mean=mean, std=std) 23 | if is_train: 24 | return Compose( 25 | [ 26 | RandomResizedCrop( 27 | image_size, 28 | scale=(0.9, 1.0), 29 | interpolation=InterpolationMode.BICUBIC, 30 | ), 31 | _convert_to_rgb, 32 | ToTensor(), 33 | normalize, 34 | ] 35 | ) 36 | else: 37 | return Compose( 38 | [ 39 | Resize(image_size, interpolation=InterpolationMode.BICUBIC), 40 | CenterCrop(image_size), 41 | _convert_to_rgb, 42 | ToTensor(), 43 | normalize, 44 | ] 45 | ) 46 | -------------------------------------------------------------------------------- /audioldm/clap/open_clip/version.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.2.1" 2 | -------------------------------------------------------------------------------- /audioldm/clap/training/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/naver-ai/rewas/4e09954d386583a4e62e7a1e28c2145ca1cfc411/audioldm/clap/training/__init__.py 
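For reference, the `image_transform` helper in `transform.py` above builds the standard CLIP-style image preprocessing pipeline (crop/resize, RGB conversion, tensor conversion, normalization). A minimal usage sketch, assuming the repository root is on `PYTHONPATH` and using a placeholder image path:

```python
from PIL import Image

# image_transform is re-exported by audioldm/clap/open_clip/__init__.py (see above)
from audioldm.clap.open_clip import image_transform

preprocess_train = image_transform(224, is_train=True)   # RandomResizedCrop + normalize
preprocess_val = image_transform(224, is_train=False)    # Resize + CenterCrop + normalize

img = Image.open("frame.png")   # placeholder path to any image file
x = preprocess_val(img)         # float tensor of shape (3, 224, 224), normalized with CLIP mean/std
```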
-------------------------------------------------------------------------------- /audioldm/clap/training/audioset_textmap.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/naver-ai/rewas/4e09954d386583a4e62e7a1e28c2145ca1cfc411/audioldm/clap/training/audioset_textmap.npy -------------------------------------------------------------------------------- /audioldm/clap/training/distributed.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch 4 | import socket 5 | 6 | try: 7 | import horovod.torch as hvd 8 | except ImportError: 9 | hvd = None 10 | 11 | 12 | def is_global_master(args): 13 | return args.rank == 0 14 | 15 | 16 | def is_local_master(args): 17 | return args.local_rank == 0 18 | 19 | 20 | def is_master(args, local=False): 21 | return is_local_master(args) if local else is_global_master(args) 22 | 23 | 24 | def is_using_horovod(): 25 | # NOTE w/ horovod run, OMPI vars should be set, but w/ SLURM PMI vars will be set 26 | # Differentiating between horovod and DDP use via SLURM may not be possible, so horovod arg still required... 27 | ompi_vars = ["OMPI_COMM_WORLD_RANK", "OMPI_COMM_WORLD_SIZE"] 28 | pmi_vars = ["PMI_RANK", "PMI_SIZE"] 29 | if all([var in os.environ for var in ompi_vars]) or all( 30 | [var in os.environ for var in pmi_vars] 31 | ): 32 | return True 33 | else: 34 | return False 35 | 36 | 37 | def is_using_distributed(): 38 | if "WORLD_SIZE" in os.environ: 39 | return int(os.environ["WORLD_SIZE"]) > 1 40 | if "SLURM_NTASKS" in os.environ: 41 | return int(os.environ["SLURM_NTASKS"]) > 1 42 | return False 43 | 44 | 45 | def world_info_from_env(): 46 | local_rank = 0 47 | for v in ( 48 | "SLURM_LOCALID", 49 | "MPI_LOCALRANKID", 50 | "OMPI_COMM_WORLD_LOCAL_RANK", 51 | "LOCAL_RANK", 52 | ): 53 | if v in os.environ: 54 | local_rank = int(os.environ[v]) 55 | break 56 | global_rank = 0 57 | for v in ("SLURM_PROCID", "PMI_RANK", "OMPI_COMM_WORLD_RANK", "RANK"): 58 | if v in os.environ: 59 | global_rank = int(os.environ[v]) 60 | break 61 | world_size = 1 62 | for v in ("SLURM_NTASKS", "PMI_SIZE", "OMPI_COMM_WORLD_SIZE", "WORLD_SIZE"): 63 | if v in os.environ: 64 | world_size = int(os.environ[v]) 65 | break 66 | 67 | return local_rank, global_rank, world_size 68 | 69 | 70 | def init_distributed_device(args): 71 | # Distributed training = training on more than one GPU. 72 | # Works in both single and multi-node scenarios. 
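    # Rank / world-size information is read from the environment by the helpers
    # above (SLURM, MPI / Open MPI, or torchrun-style variables, in that order).
    # One of three initialization paths is then taken below: Horovod,
    # torch.distributed initialized from SLURM or Open MPI environment variables,
    # or torch.distributed initialized by torchrun / torch.distributed.launch.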
73 | args.distributed = False 74 | args.world_size = 1 75 | args.rank = 0 # global rank 76 | args.local_rank = 0 77 | if args.horovod: 78 | assert hvd is not None, "Horovod is not installed" 79 | hvd.init() 80 | world_size = int(os.environ["OMPI_COMM_WORLD_SIZE"]) 81 | world_rank = int(os.environ["OMPI_COMM_WORLD_RANK"]) 82 | local_rank = int(os.environ["OMPI_COMM_WORLD_LOCAL_RANK"]) 83 | args.local_rank = local_rank 84 | args.rank = world_rank 85 | args.world_size = world_size 86 | # args.local_rank = int(hvd.local_rank()) 87 | # args.rank = hvd.rank() 88 | # args.world_size = hvd.size() 89 | args.distributed = True 90 | os.environ["LOCAL_RANK"] = str(args.local_rank) 91 | os.environ["RANK"] = str(args.rank) 92 | os.environ["WORLD_SIZE"] = str(args.world_size) 93 | print( 94 | f"Distributed training: local_rank={args.local_rank}, " 95 | f"rank={args.rank}, world_size={args.world_size}, " 96 | f"hostname={socket.gethostname()}, pid={os.getpid()}" 97 | ) 98 | elif is_using_distributed(): 99 | if "SLURM_PROCID" in os.environ: 100 | # DDP via SLURM 101 | args.local_rank, args.rank, args.world_size = world_info_from_env() 102 | # SLURM var -> torch.distributed vars in case needed 103 | os.environ["LOCAL_RANK"] = str(args.local_rank) 104 | os.environ["RANK"] = str(args.rank) 105 | os.environ["WORLD_SIZE"] = str(args.world_size) 106 | torch.distributed.init_process_group( 107 | backend=args.dist_backend, 108 | init_method=args.dist_url, 109 | world_size=args.world_size, 110 | rank=args.rank, 111 | ) 112 | elif "OMPI_COMM_WORLD_SIZE" in os.environ: # using Summit cluster 113 | world_size = int(os.environ["OMPI_COMM_WORLD_SIZE"]) 114 | world_rank = int(os.environ["OMPI_COMM_WORLD_RANK"]) 115 | local_rank = int(os.environ["OMPI_COMM_WORLD_LOCAL_RANK"]) 116 | args.local_rank = local_rank 117 | args.rank = world_rank 118 | args.world_size = world_size 119 | torch.distributed.init_process_group( 120 | backend=args.dist_backend, 121 | init_method=args.dist_url, 122 | world_size=args.world_size, 123 | rank=args.rank, 124 | ) 125 | else: 126 | # DDP via torchrun, torch.distributed.launch 127 | args.local_rank, _, _ = world_info_from_env() 128 | torch.distributed.init_process_group( 129 | backend=args.dist_backend, init_method=args.dist_url 130 | ) 131 | args.world_size = torch.distributed.get_world_size() 132 | args.rank = torch.distributed.get_rank() 133 | args.distributed = True 134 | print( 135 | f"Distributed training: local_rank={args.local_rank}, " 136 | f"rank={args.rank}, world_size={args.world_size}, " 137 | f"hostname={socket.gethostname()}, pid={os.getpid()}" 138 | ) 139 | 140 | if torch.cuda.is_available(): 141 | if args.distributed and not args.no_set_device_rank: 142 | device = "cuda:%d" % args.local_rank 143 | else: 144 | device = "cuda:0" 145 | torch.cuda.set_device(device) 146 | else: 147 | device = "cpu" 148 | args.device = device 149 | device = torch.device(device) 150 | return device 151 | -------------------------------------------------------------------------------- /audioldm/clap/training/infer_demo.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | sys.path.append("src/clap") 4 | 5 | import os 6 | import torch 7 | import librosa 8 | from open_clip import create_model 9 | from training.data import get_audio_features 10 | from training.data import int16_to_float32, float32_to_int16 11 | from transformers import RobertaTokenizer 12 | 13 | tokenize = RobertaTokenizer.from_pretrained("roberta-base") 14 | 15 | 16 | def 
tokenizer(text): 17 | result = tokenize( 18 | text, 19 | padding="max_length", 20 | truncation=True, 21 | max_length=77, 22 | return_tensors="pt", 23 | ) 24 | return {k: v.squeeze(0) for k, v in result.items()} 25 | 26 | 27 | PRETRAINED_PATH = "/mnt/fast/nobackup/users/hl01486/projects/contrastive_pretraining/CLAP/assets/checkpoints/epoch_top_0_audioset_no_fusion.pt" 28 | WAVE_48k_PATH = "/mnt/fast/nobackup/users/hl01486/projects/contrastive_pretraining/CLAP/assets/audio/machine.wav" 29 | 30 | 31 | def infer_text(): 32 | device = "cuda:0" if torch.cuda.is_available() else "cpu" 33 | precision = "fp32" 34 | amodel = "HTSAT-tiny" # or 'PANN-14' 35 | tmodel = "roberta" # the best text encoder in our training 36 | enable_fusion = False # False if you do not want to use the fusion model 37 | fusion_type = "aff_2d" 38 | pretrained = PRETRAINED_PATH 39 | 40 | model, model_cfg = create_model( 41 | amodel, 42 | tmodel, 43 | pretrained, 44 | precision=precision, 45 | device=device, 46 | enable_fusion=enable_fusion, 47 | fusion_type=fusion_type, 48 | ) 49 | # load the text, can be a list (i.e. batch size) 50 | text_data = ["I love the contrastive learning", "I love the pretrain model"] 51 | # tokenize for roberta, if you want to tokenize for another text encoder, please refer to data.py#L43-90 52 | text_data = tokenizer(text_data) 53 | 54 | text_embed = model.get_text_embedding(text_data) 55 | print(text_embed.size()) 56 | 57 | 58 | def infer_audio(): 59 | device = "cuda:0" if torch.cuda.is_available() else "cpu" 60 | precision = "fp32" 61 | amodel = "HTSAT-tiny" # or 'PANN-14' 62 | tmodel = "roberta" # the best text encoder in our training 63 | enable_fusion = False # False if you do not want to use the fusion model 64 | fusion_type = "aff_2d" 65 | pretrained = PRETRAINED_PATH 66 | 67 | model, model_cfg = create_model( 68 | amodel, 69 | tmodel, 70 | pretrained, 71 | precision=precision, 72 | device=device, 73 | enable_fusion=enable_fusion, 74 | fusion_type=fusion_type, 75 | ) 76 | 77 | # load the waveform of the shape (T,), should resample to 48000 78 | audio_waveform, sr = librosa.load(WAVE_48k_PATH, sr=48000) 79 | # quantize 80 | audio_waveform = int16_to_float32(float32_to_int16(audio_waveform)) 81 | audio_waveform = torch.from_numpy(audio_waveform).float() 82 | audio_dict = {} 83 | 84 | # the 'fusion' truncate mode can be changed to 'rand_trunc' if run in unfusion mode 85 | import ipdb 86 | 87 | ipdb.set_trace() 88 | audio_dict = get_audio_features( 89 | audio_dict, 90 | audio_waveform, 91 | 480000, 92 | data_truncating="fusion", 93 | data_filling="repeatpad", 94 | audio_cfg=model_cfg["audio_cfg"], 95 | ) 96 | # can send a list to the model, to process many audio tracks in one time (i.e. 
batch size) 97 | audio_embed = model.get_audio_embedding([audio_dict]) 98 | print(audio_embed.size()) 99 | import ipdb 100 | 101 | ipdb.set_trace() 102 | 103 | 104 | if __name__ == "__main__": 105 | infer_text() 106 | infer_audio() 107 | -------------------------------------------------------------------------------- /audioldm/clap/training/logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | 4 | def setup_logging(log_file, level, include_host=False): 5 | if include_host: 6 | import socket 7 | 8 | hostname = socket.gethostname() 9 | formatter = logging.Formatter( 10 | f"%(asctime)s | {hostname} | %(levelname)s | %(message)s", 11 | datefmt="%Y-%m-%d,%H:%M:%S", 12 | ) 13 | else: 14 | formatter = logging.Formatter( 15 | "%(asctime)s | %(levelname)s | %(message)s", datefmt="%Y-%m-%d,%H:%M:%S" 16 | ) 17 | 18 | logging.root.setLevel(level) 19 | loggers = [logging.getLogger(name) for name in logging.root.manager.loggerDict] 20 | for logger in loggers: 21 | logger.setLevel(level) 22 | 23 | stream_handler = logging.StreamHandler() 24 | stream_handler.setFormatter(formatter) 25 | logging.root.addHandler(stream_handler) 26 | 27 | if log_file: 28 | file_handler = logging.FileHandler(filename=log_file) 29 | file_handler.setFormatter(formatter) 30 | logging.root.addHandler(file_handler) 31 | -------------------------------------------------------------------------------- /audioldm/clap/training/scheduler.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def assign_learning_rate(optimizer, new_lr): 5 | for param_group in optimizer.param_groups: 6 | param_group["lr"] = new_lr 7 | 8 | 9 | def _warmup_lr(base_lr, warmup_length, step): 10 | return base_lr * (step + 1) / warmup_length 11 | 12 | 13 | def cosine_lr(optimizer, base_lr, warmup_length, steps): 14 | def _lr_adjuster(step): 15 | if step < warmup_length: 16 | lr = _warmup_lr(base_lr, warmup_length, step) 17 | else: 18 | e = step - warmup_length 19 | es = steps - warmup_length 20 | lr = 0.5 * (1 + np.cos(np.pi * e / es)) * base_lr 21 | assign_learning_rate(optimizer, lr) 22 | return lr 23 | 24 | return _lr_adjuster 25 | -------------------------------------------------------------------------------- /audioldm/clap/training/zero_shot.py: -------------------------------------------------------------------------------- 1 | # NOTE: This script is currently not supported for CLAP. 
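# It is kept as-is from the original open_clip training code: the zero-shot
# classifier below is built from the ImageNet class names and evaluated with
# `encode_image`, so applying it to CLAP would require audio-oriented class
# prompts and the audio branch instead.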
2 | import logging 3 | from contextlib import suppress 4 | 5 | import torch 6 | import torch.nn.functional as F 7 | from tqdm import tqdm 8 | 9 | from open_clip import tokenize 10 | from .imagenet_zeroshot_data import imagenet_classnames, openai_imagenet_template 11 | 12 | 13 | def zero_shot_classifier(model, classnames, templates, args): 14 | with torch.no_grad(): 15 | zeroshot_weights = [] 16 | for classname in tqdm(classnames): 17 | texts = [template(classname) for template in templates] # format with class 18 | texts = tokenize(texts).to(args.device) # tokenize 19 | if args.distributed and not args.horovod: 20 | class_embeddings = model.module.encode_text(texts) 21 | else: 22 | class_embeddings = model.encode_text(texts) 23 | class_embedding = F.normalize(class_embeddings, dim=-1).mean(dim=0) 24 | class_embedding /= class_embedding.norm() 25 | zeroshot_weights.append(class_embedding) 26 | zeroshot_weights = torch.stack(zeroshot_weights, dim=1).to(args.device) 27 | return zeroshot_weights 28 | 29 | 30 | def accuracy(output, target, topk=(1,)): 31 | pred = output.topk(max(topk), 1, True, True)[1].t() 32 | correct = pred.eq(target.view(1, -1).expand_as(pred)) 33 | return [ 34 | float(correct[:k].reshape(-1).float().sum(0, keepdim=True).cpu().numpy()) 35 | for k in topk 36 | ] 37 | 38 | 39 | def run(model, classifier, dataloader, args): 40 | autocast = torch.cuda.amp.autocast if args.precision == "amp" else suppress 41 | with torch.no_grad(): 42 | top1, top5, n = 0.0, 0.0, 0.0 43 | for images, target in tqdm(dataloader, unit_scale=args.batch_size): 44 | images = images.to(args.device) 45 | target = target.to(args.device) 46 | 47 | with autocast(): 48 | # predict 49 | if args.distributed and not args.horovod: 50 | image_features = model.module.encode_image(images) 51 | else: 52 | image_features = model.encode_image(images) 53 | image_features = F.normalize(image_features, dim=-1) 54 | logits = 100.0 * image_features @ classifier 55 | 56 | # measure accuracy 57 | acc1, acc5 = accuracy(logits, target, topk=(1, 5)) 58 | top1 += acc1 59 | top5 += acc5 60 | n += images.size(0) 61 | 62 | top1 = top1 / n 63 | top5 = top5 / n 64 | return top1, top5 65 | 66 | 67 | def zero_shot_eval(model, data, epoch, args): 68 | if "imagenet-val" not in data and "imagenet-v2" not in data: 69 | return {} 70 | if args.zeroshot_frequency == 0: 71 | return {} 72 | if (epoch % args.zeroshot_frequency) != 0 and epoch != args.epochs: 73 | return {} 74 | 75 | logging.info("Starting zero-shot imagenet.") 76 | 77 | logging.info("Building zero-shot classifier") 78 | classifier = zero_shot_classifier( 79 | model, imagenet_classnames, openai_imagenet_template, args 80 | ) 81 | 82 | logging.info("Using classifier") 83 | results = {} 84 | if "imagenet-val" in data: 85 | top1, top5 = run(model, classifier, data["imagenet-val"].dataloader, args) 86 | results["imagenet-zeroshot-val-top1"] = top1 87 | results["imagenet-zeroshot-val-top5"] = top5 88 | if "imagenet-v2" in data: 89 | top1, top5 = run(model, classifier, data["imagenet-v2"].dataloader, args) 90 | results["imagenetv2-zeroshot-val-top1"] = top1 91 | results["imagenetv2-zeroshot-val-top5"] = top5 92 | 93 | logging.info("Finished zero-shot imagenet.") 94 | 95 | return results 96 | -------------------------------------------------------------------------------- /audioldm/diffusionmodules/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/naver-ai/rewas/4e09954d386583a4e62e7a1e28c2145ca1cfc411/audioldm/diffusionmodules/__init__.py -------------------------------------------------------------------------------- /audioldm/diffusionmodules/distributions.py: -------------------------------------------------------------------------------- 1 | # reference: https://github.com/haoheliu/AudioLDM-training-finetuning 2 | 3 | import torch 4 | import numpy as np 5 | 6 | 7 | class AbstractDistribution: 8 | def sample(self): 9 | raise NotImplementedError() 10 | 11 | def mode(self): 12 | raise NotImplementedError() 13 | 14 | 15 | class DiracDistribution(AbstractDistribution): 16 | def __init__(self, value): 17 | self.value = value 18 | 19 | def sample(self): 20 | return self.value 21 | 22 | def mode(self): 23 | return self.value 24 | 25 | 26 | class DiagonalGaussianDistribution(object): 27 | def __init__(self, parameters, deterministic=False): 28 | self.parameters = parameters 29 | self.mean, self.logvar = torch.chunk(parameters, 2, dim=1) 30 | self.logvar = torch.clamp(self.logvar, -30.0, 20.0) 31 | self.deterministic = deterministic 32 | self.std = torch.exp(0.5 * self.logvar) 33 | self.var = torch.exp(self.logvar) 34 | if self.deterministic: 35 | self.var = self.std = torch.zeros_like(self.mean).to( 36 | device=self.parameters.device 37 | ) 38 | 39 | def sample(self): 40 | x = self.mean + self.std * torch.randn(self.mean.shape).to( 41 | device=self.parameters.device 42 | ) 43 | return x 44 | 45 | def kl(self, other=None): 46 | if self.deterministic: 47 | return torch.Tensor([0.0]) 48 | else: 49 | if other is None: 50 | return 0.5 * torch.mean( 51 | torch.pow(self.mean, 2) + self.var - 1.0 - self.logvar, 52 | dim=[1, 2, 3], 53 | ) 54 | else: 55 | return 0.5 * torch.mean( 56 | torch.pow(self.mean - other.mean, 2) / other.var 57 | + self.var / other.var 58 | - 1.0 59 | - self.logvar 60 | + other.logvar, 61 | dim=[1, 2, 3], 62 | ) 63 | 64 | def nll(self, sample, dims=[1, 2, 3]): 65 | if self.deterministic: 66 | return torch.Tensor([0.0]) 67 | logtwopi = np.log(2.0 * np.pi) 68 | return 0.5 * torch.sum( 69 | logtwopi + self.logvar + torch.pow(sample - self.mean, 2) / self.var, 70 | dim=dims, 71 | ) 72 | 73 | def mode(self): 74 | return self.mean 75 | 76 | 77 | def normal_kl(mean1, logvar1, mean2, logvar2): 78 | """ 79 | source: https://github.com/openai/guided-diffusion/blob/27c20a8fab9cb472df5d6bdd6c8d11c8f430b924/guided_diffusion/losses.py#L12 80 | Compute the KL divergence between two gaussians. 81 | Shapes are automatically broadcasted, so batches can be compared to 82 | scalars, among other use cases. 83 | """ 84 | tensor = None 85 | for obj in (mean1, logvar1, mean2, logvar2): 86 | if isinstance(obj, torch.Tensor): 87 | tensor = obj 88 | break 89 | assert tensor is not None, "at least one argument must be a Tensor" 90 | 91 | # Force variances to be Tensors. Broadcasting helps convert scalars to 92 | # Tensors, but it does not work for torch.exp(). 
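    # The closed-form KL between two diagonal Gaussians,
    #   KL(N(mean1, exp(logvar1)) || N(mean2, exp(logvar2)))
    #     = 0.5 * (logvar2 - logvar1 + exp(logvar1 - logvar2)
    #              + (mean1 - mean2)**2 * exp(-logvar2) - 1),
    # is what the return expression below evaluates elementwise.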
93 | logvar1, logvar2 = [ 94 | x if isinstance(x, torch.Tensor) else torch.tensor(x).to(tensor) 95 | for x in (logvar1, logvar2) 96 | ] 97 | 98 | return 0.5 * ( 99 | -1.0 100 | + logvar2 101 | - logvar1 102 | + torch.exp(logvar1 - logvar2) 103 | + ((mean1 - mean2) ** 2) * torch.exp(-logvar2) 104 | ) 105 | -------------------------------------------------------------------------------- /audioldm/diffusionmodules/ema.py: -------------------------------------------------------------------------------- 1 | # reference: https://github.com/haoheliu/AudioLDM-training-finetuning 2 | 3 | import torch 4 | from torch import nn 5 | 6 | 7 | class LitEma(nn.Module): 8 | def __init__(self, model, decay=0.9999, use_num_upates=True): 9 | super().__init__() 10 | if decay < 0.0 or decay > 1.0: 11 | raise ValueError("Decay must be between 0 and 1") 12 | 13 | self.m_name2s_name = {} 14 | self.register_buffer("decay", torch.tensor(decay, dtype=torch.float32)) 15 | self.register_buffer( 16 | "num_updates", 17 | torch.tensor(0, dtype=torch.int) 18 | if use_num_upates 19 | else torch.tensor(-1, dtype=torch.int), 20 | ) 21 | 22 | for name, p in model.named_parameters(): 23 | if p.requires_grad: 24 | # remove as '.'-character is not allowed in buffers 25 | s_name = name.replace(".", "") 26 | self.m_name2s_name.update({name: s_name}) 27 | self.register_buffer(s_name, p.clone().detach().data) 28 | 29 | self.collected_params = [] 30 | 31 | def forward(self, model): 32 | decay = self.decay 33 | 34 | if self.num_updates >= 0: 35 | self.num_updates += 1 36 | decay = min(self.decay, (1 + self.num_updates) / (10 + self.num_updates)) 37 | 38 | one_minus_decay = 1.0 - decay 39 | 40 | with torch.no_grad(): 41 | m_param = dict(model.named_parameters()) 42 | shadow_params = dict(self.named_buffers()) 43 | 44 | for key in m_param: 45 | if m_param[key].requires_grad: 46 | sname = self.m_name2s_name[key] 47 | shadow_params[sname] = shadow_params[sname].type_as(m_param[key]) 48 | shadow_params[sname].sub_( 49 | one_minus_decay * (shadow_params[sname] - m_param[key]) 50 | ) 51 | else: 52 | assert not key in self.m_name2s_name 53 | 54 | def copy_to(self, model): 55 | m_param = dict(model.named_parameters()) 56 | shadow_params = dict(self.named_buffers()) 57 | for key in m_param: 58 | if m_param[key].requires_grad: 59 | m_param[key].data.copy_(shadow_params[self.m_name2s_name[key]].data) 60 | else: 61 | assert not key in self.m_name2s_name 62 | 63 | def store(self, parameters): 64 | """ 65 | Save the current parameters for restoring later. 66 | Args: 67 | parameters: Iterable of `torch.nn.Parameter`; the parameters to be 68 | temporarily stored. 69 | """ 70 | self.collected_params = [param.clone() for param in parameters] 71 | 72 | def restore(self, parameters): 73 | """ 74 | Restore the parameters stored with the `store` method. 75 | Useful to validate the model with EMA parameters without affecting the 76 | original optimization process. Store the parameters before the 77 | `copy_to` method. After validation (or model saving), use this to 78 | restore the former parameters. 79 | Args: 80 | parameters: Iterable of `torch.nn.Parameter`; the parameters to be 81 | updated with the stored parameters. 
82 | """ 83 | for c_param, param in zip(self.collected_params, parameters): 84 | param.data.copy_(c_param.data) 85 | -------------------------------------------------------------------------------- /audioldm/hifigan/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Jungil Kong 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /audioldm/hifigan/__init__.py: -------------------------------------------------------------------------------- 1 | from .models import Generator 2 | 3 | 4 | class AttrDict(dict): 5 | def __init__(self, *args, **kwargs): 6 | super(AttrDict, self).__init__(*args, **kwargs) 7 | self.__dict__ = self 8 | -------------------------------------------------------------------------------- /audioldm/hifigan/models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.nn import Conv1d, ConvTranspose1d 5 | from torch.nn.utils import weight_norm, remove_weight_norm 6 | 7 | LRELU_SLOPE = 0.1 8 | 9 | 10 | def init_weights(m, mean=0.0, std=0.01): 11 | classname = m.__class__.__name__ 12 | if classname.find("Conv") != -1: 13 | m.weight.data.normal_(mean, std) 14 | 15 | 16 | def get_padding(kernel_size, dilation=1): 17 | return int((kernel_size * dilation - dilation) / 2) 18 | 19 | 20 | class ResBlock(torch.nn.Module): 21 | def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)): 22 | super(ResBlock, self).__init__() 23 | self.h = h 24 | self.convs1 = nn.ModuleList( 25 | [ 26 | weight_norm( 27 | Conv1d( 28 | channels, 29 | channels, 30 | kernel_size, 31 | 1, 32 | dilation=dilation[0], 33 | padding=get_padding(kernel_size, dilation[0]), 34 | ) 35 | ), 36 | weight_norm( 37 | Conv1d( 38 | channels, 39 | channels, 40 | kernel_size, 41 | 1, 42 | dilation=dilation[1], 43 | padding=get_padding(kernel_size, dilation[1]), 44 | ) 45 | ), 46 | weight_norm( 47 | Conv1d( 48 | channels, 49 | channels, 50 | kernel_size, 51 | 1, 52 | dilation=dilation[2], 53 | padding=get_padding(kernel_size, dilation[2]), 54 | ) 55 | ), 56 | ] 57 | ) 58 | self.convs1.apply(init_weights) 59 | 60 | self.convs2 = nn.ModuleList( 61 | [ 62 | weight_norm( 63 | Conv1d( 64 | channels, 65 | channels, 66 | kernel_size, 67 | 1, 68 | dilation=1, 69 | padding=get_padding(kernel_size, 1), 70 | ) 71 | 
), 72 | weight_norm( 73 | Conv1d( 74 | channels, 75 | channels, 76 | kernel_size, 77 | 1, 78 | dilation=1, 79 | padding=get_padding(kernel_size, 1), 80 | ) 81 | ), 82 | weight_norm( 83 | Conv1d( 84 | channels, 85 | channels, 86 | kernel_size, 87 | 1, 88 | dilation=1, 89 | padding=get_padding(kernel_size, 1), 90 | ) 91 | ), 92 | ] 93 | ) 94 | self.convs2.apply(init_weights) 95 | 96 | def forward(self, x): 97 | for c1, c2 in zip(self.convs1, self.convs2): 98 | xt = F.leaky_relu(x, LRELU_SLOPE) 99 | xt = c1(xt) 100 | xt = F.leaky_relu(xt, LRELU_SLOPE) 101 | xt = c2(xt) 102 | x = xt + x 103 | return x 104 | 105 | def remove_weight_norm(self): 106 | for l in self.convs1: 107 | remove_weight_norm(l) 108 | for l in self.convs2: 109 | remove_weight_norm(l) 110 | 111 | 112 | class Generator(torch.nn.Module): 113 | def __init__(self, h): 114 | super(Generator, self).__init__() 115 | self.h = h 116 | self.num_kernels = len(h.resblock_kernel_sizes) 117 | self.num_upsamples = len(h.upsample_rates) 118 | self.conv_pre = weight_norm( 119 | Conv1d(h.num_mels, h.upsample_initial_channel, 7, 1, padding=3) 120 | ) 121 | resblock = ResBlock 122 | 123 | self.ups = nn.ModuleList() 124 | for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)): 125 | self.ups.append( 126 | weight_norm( 127 | ConvTranspose1d( 128 | h.upsample_initial_channel // (2**i), 129 | h.upsample_initial_channel // (2 ** (i + 1)), 130 | k, 131 | u, 132 | padding=(k - u) // 2, 133 | ) 134 | ) 135 | ) 136 | 137 | self.resblocks = nn.ModuleList() 138 | for i in range(len(self.ups)): 139 | ch = h.upsample_initial_channel // (2 ** (i + 1)) 140 | for j, (k, d) in enumerate( 141 | zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes) 142 | ): 143 | self.resblocks.append(resblock(h, ch, k, d)) 144 | 145 | self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3)) 146 | self.ups.apply(init_weights) 147 | self.conv_post.apply(init_weights) 148 | 149 | def forward(self, x): 150 | x = self.conv_pre(x) 151 | for i in range(self.num_upsamples): 152 | x = F.leaky_relu(x, LRELU_SLOPE) 153 | x = self.ups[i](x) 154 | xs = None 155 | for j in range(self.num_kernels): 156 | if xs is None: 157 | xs = self.resblocks[i * self.num_kernels + j](x) 158 | else: 159 | xs += self.resblocks[i * self.num_kernels + j](x) 160 | x = xs / self.num_kernels 161 | x = F.leaky_relu(x) 162 | x = self.conv_post(x) 163 | x = torch.tanh(x) 164 | 165 | return x 166 | 167 | def remove_weight_norm(self): 168 | # print("Removing weight norm...") 169 | for l in self.ups: 170 | remove_weight_norm(l) 171 | for l in self.resblocks: 172 | l.remove_weight_norm() 173 | remove_weight_norm(self.conv_pre) 174 | remove_weight_norm(self.conv_post) 175 | -------------------------------------------------------------------------------- /audioldm/latent_diffusion/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/naver-ai/rewas/4e09954d386583a4e62e7a1e28c2145ca1cfc411/audioldm/latent_diffusion/__init__.py -------------------------------------------------------------------------------- /audioldm/latent_diffusion/dpm_solver/__init__.py: -------------------------------------------------------------------------------- 1 | from .sampler import DPMSolverSampler 2 | -------------------------------------------------------------------------------- /audioldm/latent_diffusion/dpm_solver/sampler.py: -------------------------------------------------------------------------------- 1 | # reference: 
https://github.com/haoheliu/AudioLDM-training-finetuning 2 | 3 | """SAMPLING ONLY.""" 4 | 5 | import torch 6 | 7 | from .dpm_solver import NoiseScheduleVP, model_wrapper, DPM_Solver 8 | 9 | 10 | class DPMSolverSampler(object): 11 | def __init__(self, model, **kwargs): 12 | super().__init__() 13 | self.model = model 14 | to_torch = lambda x: x.clone().detach().to(torch.float32).to(model.device) 15 | self.register_buffer("alphas_cumprod", to_torch(model.alphas_cumprod)) 16 | 17 | def register_buffer(self, name, attr): 18 | if type(attr) == torch.Tensor: 19 | if attr.device != torch.device("cuda"): 20 | attr = attr.to(torch.device("cuda")) 21 | setattr(self, name, attr) 22 | 23 | @torch.no_grad() 24 | def sample( 25 | self, 26 | S, 27 | batch_size, 28 | shape, 29 | conditioning=None, 30 | callback=None, 31 | normals_sequence=None, 32 | img_callback=None, 33 | quantize_x0=False, 34 | eta=0.0, 35 | mask=None, 36 | x0=None, 37 | temperature=1.0, 38 | noise_dropout=0.0, 39 | score_corrector=None, 40 | corrector_kwargs=None, 41 | verbose=True, 42 | x_T=None, 43 | log_every_t=100, 44 | unconditional_guidance_scale=1.0, 45 | unconditional_conditioning=None, 46 | # this has to come in the same format as the conditioning, # e.g. as encoded tokens, ... 47 | **kwargs, 48 | ): 49 | if conditioning is not None: 50 | if isinstance(conditioning, dict): 51 | cbs = conditioning[list(conditioning.keys())[0]].shape[0] 52 | if cbs != batch_size: 53 | print( 54 | f"Warning: Got {cbs} conditionings but batch-size is {batch_size}" 55 | ) 56 | else: 57 | if conditioning.shape[0] != batch_size: 58 | print( 59 | f"Warning: Got {conditioning.shape[0]} conditionings but batch-size is {batch_size}" 60 | ) 61 | 62 | # sampling 63 | C, H, W = shape 64 | size = (batch_size, C, H, W) 65 | 66 | # print(f'Data shape for DPM-Solver sampling is {size}, sampling steps {S}') 67 | 68 | device = self.model.betas.device 69 | if x_T is None: 70 | img = torch.randn(size, device=device) 71 | else: 72 | img = x_T 73 | 74 | ns = NoiseScheduleVP("discrete", alphas_cumprod=self.alphas_cumprod) 75 | 76 | model_fn = model_wrapper( 77 | lambda x, t, c: self.model.apply_model(x, t, c), 78 | ns, 79 | model_type="noise", 80 | guidance_type="classifier-free", 81 | condition=conditioning, 82 | unconditional_condition=unconditional_conditioning, 83 | guidance_scale=unconditional_guidance_scale, 84 | ) 85 | 86 | dpm_solver = DPM_Solver(model_fn, ns, predict_x0=True, thresholding=False) 87 | x = dpm_solver.sample( 88 | img, 89 | steps=S, 90 | skip_type="time_uniform", 91 | method="multistep", 92 | order=2, 93 | lower_order_final=True, 94 | ) 95 | 96 | return x.to(device), None 97 | -------------------------------------------------------------------------------- /audioldm/latent_encoder/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/naver-ai/rewas/4e09954d386583a4e62e7a1e28c2145ca1cfc411/audioldm/latent_encoder/__init__.py -------------------------------------------------------------------------------- /audioldm/losses/__init__.py: -------------------------------------------------------------------------------- 1 | from .contperceptual import LPIPSWithDiscriminator 2 | -------------------------------------------------------------------------------- /audioldm/utilities/__init__.py: -------------------------------------------------------------------------------- 1 | from .tools import * 2 | from .data import * 3 | from .model_util import * 4 | 
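Usage sketch for DPMSolverSampler defined above (editorial illustration, not part of the repository). The names are assumptions: `ldm` stands for a trained latent-diffusion wrapper exposing device, betas, alphas_cumprod and apply_model(), and `cond` / `uncond` are the conditional and null conditioning tensors used for classifier-free guidance; the latent shape follows the (C, H, W) convention unpacked inside sample().

sampler = DPMSolverSampler(ldm)
latents, _ = sampler.sample(
    S=25,                                # number of DPM-Solver steps (illustrative choice)
    batch_size=cond.shape[0],
    shape=(8, 128, 16),                  # (C, H, W), e.g. latent_embed_dim x latent_t_size x latent_f_size
    conditioning=cond,
    unconditional_conditioning=uncond,
    unconditional_guidance_scale=3.5,    # same guidance scale as used elsewhere in the configs
)
# Decoding the returned latents to a mel spectrogram (and then to a waveform) is handled
# by the surrounding pipeline, not by the sampler itself.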
-------------------------------------------------------------------------------- /audioldm/utilities/audio/__init__.py: -------------------------------------------------------------------------------- 1 | from .audio_processing import * 2 | from .stft import * 3 | from .tools import * 4 | -------------------------------------------------------------------------------- /audioldm/utilities/audio/audio_processing.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import librosa.util as librosa_util 4 | from scipy.signal import get_window 5 | 6 | 7 | def window_sumsquare( 8 | window, 9 | n_frames, 10 | hop_length, 11 | win_length, 12 | n_fft, 13 | dtype=np.float32, 14 | norm=None, 15 | ): 16 | """ 17 | # from librosa 0.6 18 | Compute the sum-square envelope of a window function at a given hop length. 19 | 20 | This is used to estimate modulation effects induced by windowing 21 | observations in short-time fourier transforms. 22 | 23 | Parameters 24 | ---------- 25 | window : string, tuple, number, callable, or list-like 26 | Window specification, as in `get_window` 27 | 28 | n_frames : int > 0 29 | The number of analysis frames 30 | 31 | hop_length : int > 0 32 | The number of samples to advance between frames 33 | 34 | win_length : [optional] 35 | The length of the window function. By default, this matches `n_fft`. 36 | 37 | n_fft : int > 0 38 | The length of each analysis frame. 39 | 40 | dtype : np.dtype 41 | The data type of the output 42 | 43 | Returns 44 | ------- 45 | wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))` 46 | The sum-squared envelope of the window function 47 | """ 48 | if win_length is None: 49 | win_length = n_fft 50 | 51 | n = n_fft + hop_length * (n_frames - 1) 52 | x = np.zeros(n, dtype=dtype) 53 | 54 | # Compute the squared window at the desired length 55 | win_sq = get_window(window, win_length, fftbins=True) 56 | win_sq = librosa_util.normalize(win_sq, norm=norm) ** 2 57 | win_sq = librosa_util.pad_center(win_sq, n_fft) 58 | 59 | # Fill the envelope 60 | for i in range(n_frames): 61 | sample = i * hop_length 62 | x[sample : min(n, sample + n_fft)] += win_sq[: max(0, min(n_fft, n - sample))] 63 | return x 64 | 65 | 66 | def griffin_lim(magnitudes, stft_fn, n_iters=30): 67 | """ 68 | PARAMS 69 | ------ 70 | magnitudes: spectrogram magnitudes 71 | stft_fn: STFT class with transform (STFT) and inverse (ISTFT) methods 72 | """ 73 | 74 | angles = np.angle(np.exp(2j * np.pi * np.random.rand(*magnitudes.size()))) 75 | angles = angles.astype(np.float32) 76 | angles = torch.autograd.Variable(torch.from_numpy(angles)) 77 | signal = stft_fn.inverse(magnitudes, angles).squeeze(1) 78 | 79 | for i in range(n_iters): 80 | _, angles = stft_fn.transform(signal) 81 | signal = stft_fn.inverse(magnitudes, angles).squeeze(1) 82 | return signal 83 | 84 | 85 | def dynamic_range_compression(x, normalize_fun=torch.log, C=1, clip_val=1e-5): 86 | """ 87 | PARAMS 88 | ------ 89 | C: compression factor 90 | """ 91 | return normalize_fun(torch.clamp(x, min=clip_val) * C) 92 | 93 | 94 | def dynamic_range_decompression(x, C=1): 95 | """ 96 | PARAMS 97 | ------ 98 | C: compression factor used to compress 99 | """ 100 | return torch.exp(x) / C 101 | -------------------------------------------------------------------------------- /audioldm/utilities/audio/tools.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from scipy.io.wavfile 
import write 4 | import torchaudio 5 | 6 | from audioldm.utilities.audio.audio_processing import griffin_lim 7 | 8 | 9 | def get_mel_from_wav(audio, _stft): 10 | audio = torch.clip(torch.FloatTensor(audio).unsqueeze(0), -1, 1) 11 | audio = torch.autograd.Variable(audio, requires_grad=False) 12 | melspec, magnitudes, phases, energy = _stft.mel_spectrogram(audio) 13 | melspec = torch.squeeze(melspec, 0).numpy().astype(np.float32) 14 | magnitudes = torch.squeeze(magnitudes, 0).numpy().astype(np.float32) 15 | energy = torch.squeeze(energy, 0).numpy().astype(np.float32) 16 | return melspec, magnitudes, energy 17 | 18 | 19 | def inv_mel_spec(mel, out_filename, _stft, griffin_iters=60): 20 | mel = torch.stack([mel]) 21 | mel_decompress = _stft.spectral_de_normalize(mel) 22 | mel_decompress = mel_decompress.transpose(1, 2).data.cpu() 23 | spec_from_mel_scaling = 1000 24 | spec_from_mel = torch.mm(mel_decompress[0], _stft.mel_basis) 25 | spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0) 26 | spec_from_mel = spec_from_mel * spec_from_mel_scaling 27 | 28 | audio = griffin_lim( 29 | torch.autograd.Variable(spec_from_mel[:, :, :-1]), _stft._stft_fn, griffin_iters 30 | ) 31 | 32 | audio = audio.squeeze() 33 | audio = audio.cpu().numpy() 34 | audio_path = out_filename 35 | write(audio_path, _stft.sampling_rate, audio) 36 | 37 | def read_wav_file(filename, segment_length): 38 | # waveform, sr = librosa.load(filename, sr=None, mono=True) # 4 times slower 39 | waveform, sr = torchaudio.load(filename) # Faster!!! 40 | waveform = torchaudio.functional.resample(waveform, orig_freq=sr, new_freq=16000) 41 | waveform = waveform.numpy()[0, ...] 42 | waveform = normalize_wav(waveform) 43 | waveform = waveform[None, ...] 44 | waveform = pad_wav(waveform, segment_length) 45 | 46 | waveform = waveform / np.max(np.abs(waveform)) 47 | waveform = 0.5 * waveform 48 | 49 | return waveform -------------------------------------------------------------------------------- /audioldm/utilities/data/__init__.py: -------------------------------------------------------------------------------- 1 | from .dataset import Dataset 2 | -------------------------------------------------------------------------------- /audioldm/utilities/data/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | import subprocess 4 | import random 5 | import numpy as np 6 | import soundfile as sf 7 | import torch 8 | import torchvision 9 | from moviepy.editor import VideoFileClip, AudioFileClip 10 | 11 | from encoder.encoder_utils import which_ffmpeg 12 | 13 | def get_video_and_audio(path, get_meta=False, duration=5, start_sec=0, end_sec=None, random_start = False): 14 | rgb, audio, meta = torchvision.io.read_video(str(path), start_sec, end_sec, 'sec', output_format='TCHW') 15 | assert meta['video_fps'], f'No video fps for {path}' 16 | 17 | vlen = int(duration * meta['video_fps']) 18 | meta_out = {'video': {'fps': [meta['video_fps']]}} 19 | 20 | 21 | if random_start: 22 | stx = random.randint(0, int(rgb.size(0)/meta['video_fps']) - duration) 23 | else: 24 | stx = 0 25 | 26 | rgb = rgb[int(stx*meta['video_fps']):int(stx*meta['video_fps']+vlen), :, :, :] 27 | 28 | if rgb.shape[0] < vlen: 29 | 30 | rgb = torch.cat([rgb, rgb, rgb], dim=0) 31 | rgb = rgb[int(stx*meta['video_fps']):int(stx*meta['video_fps']+vlen), :, :, :] 32 | 33 | 34 | if meta.get('audio_fps'): 35 | alen = int(duration * meta['audio_fps']) 36 | audio = audio.mean(dim=0) 37 | audio = 
audio[stx*meta['audio_fps']:(stx*meta['audio_fps']+alen)] 38 | meta_out['audio'] = {'framerate': [meta['audio_fps']]} 39 | else: 40 | meta_out['audio'] = {'framerate': [16000]} 41 | 42 | return rgb, audio, meta_out 43 | 44 | 45 | def save_wave(waveform, savepath, name="outwav"): 46 | if type(name) is not list: 47 | name = [name] * waveform.shape[0] 48 | 49 | paths = [] 50 | for i in range(waveform.shape[0]): 51 | path = os.path.join( 52 | savepath, 53 | "%s_%s.wav" 54 | % ( 55 | os.path.basename(name[i]) 56 | if (not ".wav" in name[i]) 57 | else os.path.basename(name[i]).split(".")[0], 58 | i, 59 | ), 60 | ) 61 | paths.append(path) 62 | print("Save audio to %s" % path) 63 | sf.write(path, waveform[i, 0], samplerate=16000) 64 | 65 | return paths 66 | 67 | def save_video(audio_path, video_path): 68 | 69 | video_clip = VideoFileClip(video_path) 70 | video_clip = video_clip.subclip(0, 5) # generated audio duration is 5 seconds. 71 | 72 | audio_clip = AudioFileClip(audio_path) 73 | video_clip = video_clip.set_audio(audio_clip) 74 | 75 | # Output file path for the final video with audio 76 | out_video_path = audio_path.replace('.wav', '.mp4') 77 | 78 | # Write the video clip with the audio to a new file 79 | video_clip.write_videofile(out_video_path, audio_codec='aac') 80 | 81 | # Close the clips 82 | video_clip.close() 83 | audio_clip.close() 84 | 85 | return 86 | 87 | def re_encode_video(new_path, path, vfps=25, afps=16000, in_size=256): 88 | assert which_ffmpeg() != '', 'Is ffmpeg installed? Check if the conda environment is activated.' 89 | 90 | os.makedirs(new_path, exist_ok=True) 91 | 92 | new_path += f"/{Path(path).stem}_{vfps}fps_{in_size}side_{afps}hz.mp4" 93 | new_path = str(new_path) 94 | cmd = f"{which_ffmpeg()}" 95 | # no info/error printing 96 | cmd += " -hide_banner -loglevel panic" 97 | cmd += f" -y -i {path}" 98 | # 1) change fps, 2) resize: min(H,W)=MIN_SIDE (vertical vids are supported), 3) change audio framerate 99 | cmd += f" -vf fps={vfps},scale=iw*{in_size}/'min(iw,ih)':ih*{in_size}/'min(iw,ih)',crop='trunc(iw/2)'*2:'trunc(ih/2)'*2" 100 | cmd += f" -ar {afps}" 101 | cmd += f" {new_path}" 102 | subprocess.call(cmd.split()) 103 | return new_path 104 | -------------------------------------------------------------------------------- /basketball_bounce.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/naver-ai/rewas/4e09954d386583a4e62e7a1e28c2145ca1cfc411/basketball_bounce.mp4 -------------------------------------------------------------------------------- /configs/audioldm_m_rewas.yaml: -------------------------------------------------------------------------------- 1 | metadata_root: "audioldm_train/config/2023_08_23_reproduce_audioldm/dataset_root.json" 2 | log_directory: ./logs/audioldm_vggsound 3 | project: "audioldm_vggsound" 4 | precision: "high" 5 | data_root: "/path/to/dataset" 6 | 7 | variables: 8 | sampling_rate: &sampling_rate 16000 9 | mel_bins: &mel_bins 64 10 | latent_embed_dim: &latent_embed_dim 8 11 | latent_t_size: &latent_t_size 128 12 | latent_f_size: &latent_f_size 16 13 | in_channels: &unet_in_channels 8 14 | optimize_ddpm_parameter: &optimize_ddpm_parameter true 15 | optimize_gpt: &optimize_gpt true 16 | warmup_steps: &warmup_steps 2000 17 | 18 | data: 19 | train: ["vggsound"] 20 | val: "vggsound" 21 | test: "vggsound" 22 | class_label_indices: "vggsound.tsv" 23 | dataloader_add_ons: [] 24 | dropout: 0.3 25 | train_mode: "energy" 26 | 27 | step: 28 | validation_every_n_epochs: 5 29 | 
save_checkpoint_every_n_steps: 5000 30 | max_steps: 800000 31 | save_top_k: 1 32 | 33 | preprocessing: 34 | audio: 35 | sampling_rate: *sampling_rate 36 | max_wav_value: 32768.0 37 | duration: 5.12 38 | stft: 39 | filter_length: 1024 40 | hop_length: 160 41 | win_length: 1024 42 | mel: 43 | n_mel_channels: *mel_bins 44 | mel_fmin: 0 45 | mel_fmax: 8000 46 | 47 | augmentation: 48 | mixup: 0.0 49 | 50 | model: 51 | target: audioldm.rewas.ReWaS # audioldm_train.modules.latent_diffusion.ddpm.LatentDiffusion 52 | params: 53 | # Autoencoder 54 | first_stage_config: 55 | base_learning_rate: 8.0e-06 56 | target: audioldm.latent_encoder.autoencoder.AutoencoderKL 57 | params: 58 | reload_from_ckpt: ./ckpts/vae_mel_16k_64bins.ckpt 59 | sampling_rate: *sampling_rate 60 | batchsize: 4 61 | monitor: val/rec_loss 62 | image_key: fbank 63 | subband: 1 64 | embed_dim: *latent_embed_dim 65 | time_shuffle: 1 66 | lossconfig: 67 | target: audioldm.losses.LPIPSWithDiscriminator 68 | params: 69 | disc_start: 50001 70 | kl_weight: 1000.0 71 | disc_weight: 0.5 72 | disc_in_channels: 1 73 | ddconfig: 74 | double_z: true 75 | mel_bins: *mel_bins # The frequency bins of mel spectrogram 76 | z_channels: 8 77 | resolution: 128 # 256 78 | downsample_time: false 79 | in_channels: 1 80 | out_ch: 1 81 | ch: 128 82 | ch_mult: 83 | - 1 84 | - 2 85 | - 4 86 | num_res_blocks: 2 87 | attn_resolutions: [] 88 | dropout: 0.0 89 | hifigan_ckpt: "./ckpts/hifigan_16k_64bins.ckpt" 90 | 91 | 92 | control_stage_config: 93 | target: audioldm.rewas.Adapter 94 | params: 95 | image_size: 64 96 | extra_film_condition_dim: 512 # If you use film as extra condition, set this parameter. For example if you have two conditioning vectors each have dimension 512, then this number would be 1024 97 | in_channels: *unet_in_channels # The input channel of the UNet model 98 | model_channels: 192 99 | hint_channels: 1 100 | attention_resolutions: 101 | - 8 102 | - 4 103 | - 2 104 | num_res_blocks: 2 105 | channel_mult: 106 | - 1 107 | - 2 108 | - 3 109 | - 5 110 | num_head_channels: 32 111 | use_spatial_transformer: true 112 | transformer_depth: 1 113 | extra_sa_layer: False 114 | use_checkpoint: true 115 | legacy: False 116 | 117 | # Other parameters 118 | base_learning_rate: 1.0e-5 119 | warmup_steps: *warmup_steps 120 | optimize_ddpm_parameter: *optimize_ddpm_parameter 121 | sampling_rate: *sampling_rate 122 | batchsize: 2 123 | linear_start: 0.0015 124 | linear_end: 0.0195 125 | num_timesteps_cond: 1 126 | log_every_t: 200 127 | timesteps: 1000 128 | unconditional_prob_cfg: 0.1 129 | parameterization: eps 130 | first_stage_key: fbank 131 | latent_t_size: *latent_t_size 132 | latent_f_size: *latent_f_size 133 | channels: *latent_embed_dim 134 | monitor: val/loss_simple_ema 135 | scale_by_std: true 136 | control_key: "hint" 137 | only_mid_control: False 138 | unet_config: 139 | target: audioldm.rewas.ControlledUnetModel 140 | params: 141 | image_size: 64 142 | extra_film_condition_dim: 512 # If you use film as extra condition, set this parameter. 
For example if you have two conditioning vectors each have dimension 512, then this number would be 1024 143 | in_channels: *unet_in_channels # The input channel of the UNet model 144 | out_channels: *latent_embed_dim 145 | model_channels: 192 146 | attention_resolutions: 147 | - 8 148 | - 4 149 | - 2 150 | num_res_blocks: 2 151 | channel_mult: 152 | - 1 153 | - 2 154 | - 3 155 | - 5 156 | num_head_channels: 32 157 | use_spatial_transformer: true 158 | transformer_depth: 1 159 | extra_sa_layer: False 160 | 161 | cond_stage_config: 162 | film_clap_cond1: 163 | cond_stage_key: text 164 | conditioning_key: film 165 | target: audioldm.conditional_models.CLAPAudioEmbeddingClassifierFreev2 166 | params: 167 | pretrained_path: ./ckpts/clap_music_speech_audioset_epoch_15_esc_89.98.pt 168 | sampling_rate: 16000 169 | embed_mode: text 170 | amodel: HTSAT-base 171 | 172 | evaluation_params: 173 | unconditional_guidance_scale: 3.5 174 | ddim_sampling_steps: 200 175 | n_candidates_per_samples: 3 176 | -------------------------------------------------------------------------------- /configs/dataset_root.json: -------------------------------------------------------------------------------- 1 | { 2 | "greatesthits": "/path/to/Greatest_Hits/data/", 3 | "vggsound": "/path/to/vggsound/audiodata/", 4 | "comments":{ 5 | }, 6 | 7 | "metadata":{ 8 | "path": { 9 | "greatesthits":{ 10 | "train": "/path/to/Greatest_Hits/metadata/greatesthits_train_label.json", 11 | "test": "/path/to/Greatest_Hits/metadata/greatesthits_test_label.json", 12 | "val": "/path/to/Greatest_Hits/metadata/greatesthits_val_label.json", 13 | "class_label_indices": "/path/to/Greatest_Hits/metadata/class_labels_indices.csv" 14 | }, 15 | "vggsound":{ 16 | "train": "/path/to/vggsound/metadata/vggsound_train_label.json", 17 | "test": "/path/to/vggsound/metadata/vggsound_test_label.json", 18 | "val": "/path/to/vggsound/metadata/vggsound_valid_label.json", 19 | "class_label_indices": "./vggsound.tsv" 20 | } 21 | } 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /encoder/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Vladimir Iashin 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
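Illustrative sketch (not part of the repository) of how the two configuration files above fit together: configs/audioldm_m_rewas.yaml is the nested training/model configuration, while configs/dataset_root.json maps each dataset name to its audio directory and its split-level metadata files. The loader below uses plain pyyaml/json and is an assumption about usage, not the repository's own loading code.

import json
import yaml

with open("configs/audioldm_m_rewas.yaml") as f:
    cfg = yaml.safe_load(f)                      # YAML anchors such as *sampling_rate are resolved here
sampling_rate = cfg["preprocessing"]["audio"]["sampling_rate"]            # 16000
guidance = cfg["evaluation_params"]["unconditional_guidance_scale"]       # 3.5

with open("configs/dataset_root.json") as f:
    roots = json.load(f)
vggsound_audio_dir = roots["vggsound"]                                    # "/path/to/vggsound/audiodata/"
vggsound_train_meta = roots["metadata"]["path"]["vggsound"]["train"]      # split-level metadata JSON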
-------------------------------------------------------------------------------- /encoder/README.md: -------------------------------------------------------------------------------- 1 | ### Reference 2 | 3 | Part of the code is borrowed from the following repos. 4 | 5 | https://github.com/v-iashin/Synchformer 6 | -------------------------------------------------------------------------------- /encoder/model/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/naver-ai/rewas/4e09954d386583a4e62e7a1e28c2145ca1cfc411/encoder/model/.DS_Store -------------------------------------------------------------------------------- /encoder/model/modules/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/naver-ai/rewas/4e09954d386583a4e62e7a1e28c2145ca1cfc411/encoder/model/modules/.DS_Store -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/naver-ai/rewas/4e09954d386583a4e62e7a1e28c2145ca1cfc411/encoder/model/modules/feat_extractors/.DS_Store -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/__init__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('model/modules/feat_extractors') 3 | sys.path.append('model/modules/feat_extractors/train_clip_src') 4 | -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/__init__.py: -------------------------------------------------------------------------------- 1 | from .coca_model import CoCa 2 | from .constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD 3 | from .factory import create_model, create_model_from_pretrained, get_tokenizer, create_loss 4 | from .factory import list_models, add_model_config, get_model_config, load_checkpoint 5 | from .loss import ClipLoss, DistillClipLoss, CoCaLoss 6 | from .model import CLIP, CustomTextCLIP, CLIPTextCfg, CLIPVisionCfg, \ 7 | convert_weights_to_lp, convert_weights_to_fp16, trace_model, get_cast_dtype 8 | from .openai import load_openai_model, list_openai_models 9 | from .pretrained import list_pretrained, list_pretrained_models_by_tag, list_pretrained_tags_by_model, \ 10 | get_pretrained_url, download_pretrained_from_url, is_pretrained_cfg, get_pretrained_cfg, download_pretrained 11 | from .push_to_hf_hub import push_pretrained_to_hf_hub, push_to_hf_hub 12 | from .tokenizer import SimpleTokenizer, tokenize, decode 13 | from .transform import image_transform, AugmentationCfg 14 | -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/naver-ai/rewas/4e09954d386583a4e62e7a1e28c2145ca1cfc411/encoder/model/modules/feat_extractors/train_clip_src/open_clip/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/constants.py: 
-------------------------------------------------------------------------------- 1 | OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073) 2 | OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711) 3 | -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/generation_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/naver-ai/rewas/4e09954d386583a4e62e7a1e28c2145ca1cfc411/encoder/model/modules/feat_extractors/train_clip_src/open_clip/generation_utils.py -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/hf_configs.py: -------------------------------------------------------------------------------- 1 | # HF architecture dict: 2 | arch_dict = { 3 | # https://huggingface.co/docs/transformers/model_doc/roberta#roberta 4 | "roberta": { 5 | "config_names": { 6 | "context_length": "max_position_embeddings", 7 | "vocab_size": "vocab_size", 8 | "width": "hidden_size", 9 | "heads": "num_attention_heads", 10 | "layers": "num_hidden_layers", 11 | "layer_attr": "layer", 12 | "token_embeddings_attr": "embeddings" 13 | }, 14 | "pooler": "mean_pooler", 15 | }, 16 | # https://huggingface.co/docs/transformers/model_doc/xlm-roberta#transformers.XLMRobertaConfig 17 | "xlm-roberta": { 18 | "config_names": { 19 | "context_length": "max_position_embeddings", 20 | "vocab_size": "vocab_size", 21 | "width": "hidden_size", 22 | "heads": "num_attention_heads", 23 | "layers": "num_hidden_layers", 24 | "layer_attr": "layer", 25 | "token_embeddings_attr": "embeddings" 26 | }, 27 | "pooler": "mean_pooler", 28 | }, 29 | # https://huggingface.co/docs/transformers/model_doc/mt5#mt5 30 | "mt5": { 31 | "config_names": { 32 | # unlimited seqlen 33 | # https://github.com/google-research/text-to-text-transfer-transformer/issues/273 34 | # https://github.com/huggingface/transformers/blob/v4.24.0/src/transformers/models/t5/modeling_t5.py#L374 35 | "context_length": "", 36 | "vocab_size": "vocab_size", 37 | "width": "d_model", 38 | "heads": "num_heads", 39 | "layers": "num_layers", 40 | "layer_attr": "block", 41 | "token_embeddings_attr": "embed_tokens" 42 | }, 43 | "pooler": "mean_pooler", 44 | }, 45 | } 46 | -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/RN101-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": [ 7 | 3, 8 | 4, 9 | 23, 10 | 3 11 | ], 12 | "width": 64, 13 | "patch_size": null 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 512, 19 | "heads": 8, 20 | "layers": 12 21 | } 22 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/RN101.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": [ 6 | 3, 7 | 4, 8 | 23, 9 | 3 10 | ], 11 | "width": 64, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 512, 18 | "heads": 8, 19 | "layers": 12 20 | } 21 | } 
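Illustrative sketch (not part of the repository) of what arch_dict in hf_configs.py above is for: it maps open_clip's generic field names onto the attribute names of each Hugging Face text-encoder config, so different architectures can be queried uniformly. The checkpoint name and AutoConfig usage below are assumptions made for the example only.

from transformers import AutoConfig

hf_config = AutoConfig.from_pretrained("roberta-base")   # assumed example checkpoint
names = arch_dict["roberta"]["config_names"]             # arch_dict from hf_configs.py above

width = getattr(hf_config, names["width"])    # -> hf_config.hidden_size
heads = getattr(hf_config, names["heads"])    # -> hf_config.num_attention_heads
layers = getattr(hf_config, names["layers"])  # -> hf_config.num_hidden_layers
pooler = arch_dict["roberta"]["pooler"]       # "mean_pooler"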
-------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/RN50-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": [ 7 | 3, 8 | 4, 9 | 6, 10 | 3 11 | ], 12 | "width": 64, 13 | "patch_size": null 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 512, 19 | "heads": 8, 20 | "layers": 12 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/RN50.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": [ 6 | 3, 7 | 4, 8 | 6, 9 | 3 10 | ], 11 | "width": 64, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 512, 18 | "heads": 8, 19 | "layers": 12 20 | } 21 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/RN50x16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 384, 5 | "layers": [ 6 | 6, 7 | 8, 8 | 18, 9 | 8 10 | ], 11 | "width": 96, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 768, 18 | "heads": 12, 19 | "layers": 12 20 | } 21 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/RN50x4.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 288, 5 | "layers": [ 6 | 4, 7 | 6, 8 | 10, 9 | 6 10 | ], 11 | "width": 80, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 640, 18 | "heads": 10, 19 | "layers": 12 20 | } 21 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/RN50x64.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 448, 5 | "layers": [ 6 | 3, 7 | 15, 8 | 36, 9 | 10 10 | ], 11 | "width": 128, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 1024, 18 | "heads": 16, 19 | "layers": 12 20 | } 21 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/ViT-B-16-plus-240.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 240, 5 | "layers": 12, 6 | "width": 896, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 640, 13 | "heads": 10, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- 
/encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/ViT-B-16-plus.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 896, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 640, 13 | "heads": 10, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/ViT-B-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/ViT-B-32-plus-256.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 256, 5 | "layers": 12, 6 | "width": 896, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 640, 13 | "heads": 10, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/ViT-B-32-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": 12, 7 | "width": 768, 8 | "patch_size": 32 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/ViT-B-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/ViT-H-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 14 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 1024, 14 | "heads": 16, 15 | "layers": 24 16 | } 17 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/ViT-H-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 
80, 8 | "patch_size": 16 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 1024, 14 | "heads": 16, 15 | "layers": 24 16 | } 17 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/ViT-L-14-280.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 280, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/ViT-L-14-336.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 336, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/ViT-L-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/ViT-L-16-320.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 320, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/ViT-L-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/ViT-M-16-alt.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 384, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 512, 7 | "patch_size": 16, 8 | "ls_init_value": 1e-4 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 384, 14 | "heads": 6, 15 | "layers": 12 16 | } 17 | } -------------------------------------------------------------------------------- 
/encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/ViT-M-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 512, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/ViT-M-32-alt.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 384, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 512, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 384, 13 | "heads": 6, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/ViT-M-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 512, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/ViT-S-16-alt.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 256, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 384, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 256, 13 | "heads": 4, 14 | "layers": 10 15 | } 16 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/ViT-S-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 384, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 384, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 384, 13 | "heads": 6, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/ViT-S-32-alt.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 256, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 384, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 256, 13 | "heads": 4, 14 | "layers": 10 15 | } 16 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/ViT-S-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 384, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 384, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | 
"vocab_size": 49408, 12 | "width": 384, 13 | "heads": 6, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/ViT-bigG-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1280, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 48, 6 | "width": 1664, 7 | "head_width": 104, 8 | "mlp_ratio": 4.9231, 9 | "patch_size": 14 10 | }, 11 | "text_cfg": { 12 | "context_length": 77, 13 | "vocab_size": 49408, 14 | "width": 1280, 15 | "heads": 20, 16 | "layers": 32 17 | } 18 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/ViT-e-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1280, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 56, 6 | "width": 1792, 7 | "head_width": 112, 8 | "mlp_ratio": 8.5715, 9 | "patch_size": 14 10 | }, 11 | "text_cfg": { 12 | "context_length": 77, 13 | "vocab_size": 49408, 14 | "width": 1280, 15 | "heads": 20, 16 | "layers": 36 17 | } 18 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/ViT-g-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 40, 6 | "width": 1408, 7 | "head_width": 88, 8 | "mlp_ratio": 4.3637, 9 | "patch_size": 14 10 | }, 11 | "text_cfg": { 12 | "context_length": 77, 13 | "vocab_size": 49408, 14 | "width": 1024, 15 | "heads": 16, 16 | "layers": 24 17 | } 18 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/coca_ViT-B-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 32, 8 | "attentional_pool": true, 9 | "attn_pooler_heads": 8, 10 | "output_tokens": true 11 | }, 12 | "text_cfg": { 13 | "context_length": 76, 14 | "vocab_size": 49408, 15 | "width": 512, 16 | "heads": 8, 17 | "layers": 12, 18 | "embed_cls": true, 19 | "output_tokens": true 20 | }, 21 | "multimodal_cfg": { 22 | "context_length": 76, 23 | "vocab_size": 49408, 24 | "width": 512, 25 | "heads": 8, 26 | "layers": 12, 27 | "attn_pooler_heads": 8 28 | }, 29 | "custom_text": true 30 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/coca_ViT-L-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14, 8 | "attentional_pool": true, 9 | "attn_pooler_heads": 8, 10 | "output_tokens": true 11 | }, 12 | "text_cfg": { 13 | "context_length": 76, 14 | "vocab_size": 49408, 15 | "width": 768, 16 | "heads": 12, 17 | "layers": 12, 18 | "embed_cls": true, 19 | "output_tokens": true 20 | }, 21 | "multimodal_cfg": { 22 | "context_length": 76, 23 | "vocab_size": 49408, 24 | "width": 768, 25 | "heads": 12, 26 | "layers": 12, 27 | "attn_pooler_heads": 12 28 | }, 29 | 
"custom_text": true 30 | } 31 | -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/coca_base.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "multimodal_cfg": { 4 | "width": 768, 5 | "context_length": 76, 6 | "vocab_size": 64000, 7 | "mlp_ratio": 4, 8 | "layers": 12, 9 | "dim_head": 64, 10 | "heads": 12, 11 | "n_queries": 256, 12 | "attn_pooler_heads": 8 13 | }, 14 | "vision_cfg": { 15 | "image_size": 288, 16 | "layers": 12, 17 | "width": 768, 18 | "patch_size": 18, 19 | "output_tokens": true 20 | }, 21 | "text_cfg": { 22 | "context_length": 76, 23 | "vocab_size": 64000, 24 | "layers": 12, 25 | "heads": 12, 26 | "width": 768, 27 | "embed_cls": true, 28 | "output_tokens": true 29 | }, 30 | "custom_text": true 31 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/coca_roberta-ViT-B-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 32, 8 | "output_tokens": true 9 | }, 10 | "text_cfg": { 11 | "hf_model_name": "roberta-base", 12 | "hf_tokenizer_name": "roberta-base", 13 | "proj": "linear", 14 | "width": 768, 15 | "output_tokens": true 16 | }, 17 | "multimodal_cfg": { 18 | "context_length": 76, 19 | "width": 768, 20 | "heads": 8, 21 | "layers": 12 22 | }, 23 | "custom_text": true 24 | } 25 | -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/convnext_base.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_base", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 224 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 512, 16 | "heads": 8, 17 | "layers": 12 18 | } 19 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/convnext_base_w.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_base", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 256 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 640, 16 | "heads": 10, 17 | "layers": 12 18 | } 19 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/convnext_base_w_320.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_base", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 320 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 
49408, 15 | "width": 640, 16 | "heads": 10, 17 | "layers": 12 18 | } 19 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/convnext_large.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_large", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 224 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 768, 16 | "heads": 12, 17 | "layers": 12 18 | } 19 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/convnext_large_d.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_large", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "mlp", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 256 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 768, 16 | "heads": 12, 17 | "layers": 16 18 | } 19 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/convnext_large_d_320.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_large", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "mlp", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 320 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 768, 16 | "heads": 12, 17 | "layers": 16 18 | } 19 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/convnext_small.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_small", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 224 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 512, 16 | "heads": 8, 17 | "layers": 12 18 | } 19 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/convnext_tiny.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_tiny", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 224 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 512, 16 | "heads": 8, 17 | "layers": 12 18 | } 19 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/convnext_xlarge.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_xlarge", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 256 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 1024, 16 | "heads": 16, 17 | "layers": 20 18 | } 19 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/convnext_xxlarge.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_xxlarge", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 256 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 1024, 16 | "heads": 16, 17 | "layers": 24 18 | } 19 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/convnext_xxlarge_320.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_xxlarge", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 320 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 1024, 16 | "heads": 16, 17 | "layers": 24 18 | } 19 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/mt5-base-ViT-B-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "hf_model_name": "google/mt5-base", 11 | "hf_tokenizer_name": "google/mt5-base", 12 | "proj": "mlp", 13 | "pooler_type": "mean_pooler" 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/mt5-xl-ViT-H-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 14 9 | }, 10 | "text_cfg": { 11 | "hf_model_name": "google/mt5-xl", 12 | "hf_tokenizer_name": "google/mt5-xl", 13 | "proj": "mlp", 14 | "pooler_type": "mean_pooler" 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/roberta-ViT-B-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": 12, 7 | "width": 768, 8 | "patch_size": 32 9 | }, 10 | "text_cfg": { 11 | "hf_model_name": "roberta-base", 12 | "hf_tokenizer_name": "roberta-base", 13 | "proj": "mlp", 14 | "pooler_type": "mean_pooler" 15 | 
} 16 | } 17 | -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/swin_base_patch4_window7_224.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "timm_model_name": "swin_base_patch4_window7_224", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 640, 14 | "heads": 10, 15 | "layers": 12 16 | } 17 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/vit_medium_patch16_gap_256.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "vit_medium_patch16_gap_256", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 256 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/vit_relpos_medium_patch16_cls_224.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "vit_relpos_medium_patch16_cls_224", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/xlm-roberta-base-ViT-B-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "hf_model_name": "xlm-roberta-base", 11 | "hf_tokenizer_name": "xlm-roberta-base", 12 | "proj": "mlp", 13 | "pooler_type": "mean_pooler" 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/xlm-roberta-large-ViT-H-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 14 9 | }, 10 | "text_cfg": { 11 | "hf_model_name": "xlm-roberta-large", 12 | "hf_tokenizer_name": "xlm-roberta-large", 13 | "proj": "mlp", 14 | "pooler_type": "mean_pooler" 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/openai.py: -------------------------------------------------------------------------------- 1 | """ OpenAI pretrained model functions 2 | 3 | Adapted from https://github.com/openai/CLIP. Originally MIT License, Copyright (c) 2021 OpenAI. 
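A hedged usage sketch (the model name, device, and flags below are illustrative examples, not values prescribed by this repository; `preprocessed_images` and `tokenized_texts` are placeholder tensors):

    model = load_openai_model("ViT-B-32", device="cpu", jit=False)
    image_features = model.encode_image(preprocessed_images)   # placeholder image batch
    text_features = model.encode_text(tokenized_texts)         # placeholder token batch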
4 | """ 5 | 6 | import os 7 | import warnings 8 | from typing import List, Optional, Union 9 | 10 | import torch 11 | 12 | from .model import build_model_from_openai_state_dict, convert_weights_to_lp, get_cast_dtype 13 | from .pretrained import get_pretrained_url, list_pretrained_models_by_tag, download_pretrained_from_url 14 | 15 | __all__ = ["list_openai_models", "load_openai_model"] 16 | 17 | 18 | def list_openai_models() -> List[str]: 19 | """Returns the names of available CLIP models""" 20 | return list_pretrained_models_by_tag('openai') 21 | 22 | 23 | def load_openai_model( 24 | name: str, 25 | precision: Optional[str] = None, 26 | device: Optional[Union[str, torch.device]] = None, 27 | jit: bool = True, 28 | cache_dir: Optional[str] = None, 29 | ): 30 | """Load a CLIP model 31 | 32 | Parameters 33 | ---------- 34 | name : str 35 | A model name listed by `clip.available_models()`, or the path to a model checkpoint containing the state_dict 36 | precision: str 37 | Model precision, if None defaults to 'fp32' if device == 'cpu' else 'fp16'. 38 | device : Union[str, torch.device] 39 | The device to put the loaded model 40 | jit : bool 41 | Whether to load the optimized JIT model (default) or more hackable non-JIT model. 42 | cache_dir : Optional[str] 43 | The directory to cache the downloaded model weights 44 | 45 | Returns 46 | ------- 47 | model : torch.nn.Module 48 | The CLIP model 49 | preprocess : Callable[[PIL.Image], torch.Tensor] 50 | A torchvision transform that converts a PIL image into a tensor that the returned model can take as its input 51 | """ 52 | if device is None: 53 | device = "cuda" if torch.cuda.is_available() else "cpu" 54 | if precision is None: 55 | precision = 'fp32' if device == 'cpu' else 'fp16' 56 | 57 | if get_pretrained_url(name, 'openai'): 58 | model_path = download_pretrained_from_url(get_pretrained_url(name, 'openai'), cache_dir=cache_dir) 59 | elif os.path.isfile(name): 60 | model_path = name 61 | else: 62 | raise RuntimeError(f"Model {name} not found; available models = {list_openai_models()}") 63 | 64 | try: 65 | # loading JIT archive 66 | model = torch.jit.load(model_path, map_location=device if jit else "cpu").eval() 67 | state_dict = None 68 | except RuntimeError: 69 | # loading saved state dict 70 | if jit: 71 | warnings.warn(f"File {model_path} is not a JIT archive. 
Loading as a state dict instead") 72 | jit = False 73 | state_dict = torch.load(model_path, map_location="cpu") 74 | 75 | if not jit: 76 | # Build a non-jit model from the OpenAI jitted model state dict 77 | cast_dtype = get_cast_dtype(precision) 78 | try: 79 | model = build_model_from_openai_state_dict(state_dict or model.state_dict(), cast_dtype=cast_dtype) 80 | except KeyError: 81 | sd = {k[7:]: v for k, v in state_dict["state_dict"].items()} 82 | model = build_model_from_openai_state_dict(sd, cast_dtype=cast_dtype) 83 | 84 | # model from OpenAI state dict is in manually cast fp16 mode, must be converted for AMP/fp32/bf16 use 85 | model = model.to(device) 86 | if precision.startswith('amp') or precision == 'fp32': 87 | model.float() 88 | elif precision == 'bf16': 89 | convert_weights_to_lp(model, dtype=torch.bfloat16) 90 | 91 | return model 92 | 93 | # patch the device names 94 | device_holder = torch.jit.trace(lambda: torch.ones([]).to(torch.device(device)), example_inputs=[]) 95 | device_node = [n for n in device_holder.graph.findAllNodes("prim::Constant") if "Device" in repr(n)][-1] 96 | 97 | def patch_device(module): 98 | try: 99 | graphs = [module.graph] if hasattr(module, "graph") else [] 100 | except RuntimeError: 101 | graphs = [] 102 | 103 | if hasattr(module, "forward1"): 104 | graphs.append(module.forward1.graph) 105 | 106 | for graph in graphs: 107 | for node in graph.findAllNodes("prim::Constant"): 108 | if "value" in node.attributeNames() and str(node["value"]).startswith("cuda"): 109 | node.copyAttributes(device_node) 110 | 111 | model.apply(patch_device) 112 | patch_device(model.encode_image) 113 | patch_device(model.encode_text) 114 | 115 | # patch dtype to float32 (typically for CPU) 116 | if precision == 'fp32': 117 | float_holder = torch.jit.trace(lambda: torch.ones([]).float(), example_inputs=[]) 118 | float_input = list(float_holder.graph.findNode("aten::to").inputs())[1] 119 | float_node = float_input.node() 120 | 121 | def patch_float(module): 122 | try: 123 | graphs = [module.graph] if hasattr(module, "graph") else [] 124 | except RuntimeError: 125 | graphs = [] 126 | 127 | if hasattr(module, "forward1"): 128 | graphs.append(module.forward1.graph) 129 | 130 | for graph in graphs: 131 | for node in graph.findAllNodes("aten::to"): 132 | inputs = list(node.inputs()) 133 | for i in [1, 2]: # dtype can be the second or third argument to aten::to() 134 | if inputs[i].node()["value"] == 5: 135 | inputs[i].node().copyAttributes(float_node) 136 | 137 | model.apply(patch_float) 138 | patch_float(model.encode_image) 139 | patch_float(model.encode_text) 140 | model.float() 141 | 142 | # ensure image_size attr available at consistent location for both jit and non-jit 143 | model.visual.image_size = model.input_resolution.item() 144 | return model 145 | -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/timm_model.py: -------------------------------------------------------------------------------- 1 | """ timm model adapter 2 | 3 | Wraps timm (https://github.com/rwightman/pytorch-image-models) models for use as a vision tower in CLIP model. 
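A hedged construction sketch (assumes timm is installed; the timm model name and embedding width are illustrative, not defaults of this adapter):

    import torch
    tower = TimmModel("convnext_base", embed_dim=512, image_size=224, proj="linear")
    feats = tower(torch.randn(2, 3, 224, 224))  # pooled trunk features projected to (2, 512)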
4 | """ 5 | import logging 6 | from collections import OrderedDict 7 | 8 | import torch 9 | import torch.nn as nn 10 | 11 | try: 12 | import timm 13 | from timm.models.layers import Mlp, to_2tuple 14 | try: 15 | # old timm imports < 0.8.1 16 | from timm.models.layers.attention_pool2d import RotAttentionPool2d 17 | from timm.models.layers.attention_pool2d import AttentionPool2d as AbsAttentionPool2d 18 | except ImportError: 19 | # new timm imports >= 0.8.1 20 | from timm.layers import RotAttentionPool2d 21 | from timm.layers import AttentionPool2d as AbsAttentionPool2d 22 | except ImportError: 23 | timm = None 24 | 25 | from .utils import freeze_batch_norm_2d 26 | 27 | 28 | class TimmModel(nn.Module): 29 | """ timm model adapter 30 | # FIXME this adapter is a work in progress, may change in ways that break weight compat 31 | """ 32 | 33 | def __init__( 34 | self, 35 | model_name, 36 | embed_dim, 37 | image_size=224, 38 | pool='avg', 39 | proj='linear', 40 | proj_bias=False, 41 | drop=0., 42 | drop_path=None, 43 | pretrained=False, 44 | ): 45 | super().__init__() 46 | if timm is None: 47 | raise RuntimeError("Please `pip install timm` to use timm models.") 48 | 49 | self.image_size = to_2tuple(image_size) 50 | timm_kwargs = {} 51 | if drop_path is not None: 52 | timm_kwargs['drop_path_rate'] = drop_path 53 | self.trunk = timm.create_model(model_name, pretrained=pretrained, **timm_kwargs) 54 | feat_size = self.trunk.default_cfg.get('pool_size', None) 55 | feature_ndim = 1 if not feat_size else 2 56 | if pool in ('abs_attn', 'rot_attn'): 57 | assert feature_ndim == 2 58 | # if attn pooling used, remove both classifier and default pool 59 | self.trunk.reset_classifier(0, global_pool='') 60 | else: 61 | # reset global pool if pool config set, otherwise leave as network default 62 | reset_kwargs = dict(global_pool=pool) if pool else {} 63 | self.trunk.reset_classifier(0, **reset_kwargs) 64 | prev_chs = self.trunk.num_features 65 | 66 | head_layers = OrderedDict() 67 | if pool == 'abs_attn': 68 | head_layers['pool'] = AbsAttentionPool2d(prev_chs, feat_size=feat_size, out_features=embed_dim) 69 | prev_chs = embed_dim 70 | elif pool == 'rot_attn': 71 | head_layers['pool'] = RotAttentionPool2d(prev_chs, out_features=embed_dim) 72 | prev_chs = embed_dim 73 | else: 74 | assert proj, 'projection layer needed if non-attention pooling is used.' 
75 | 76 | # NOTE attention pool ends with a projection layer, so proj should usually be set to '' if such pooling is used 77 | if proj == 'linear': 78 | head_layers['drop'] = nn.Dropout(drop) 79 | head_layers['proj'] = nn.Linear(prev_chs, embed_dim, bias=proj_bias) 80 | elif proj == 'mlp': 81 | head_layers['mlp'] = Mlp(prev_chs, 2 * embed_dim, embed_dim, drop=(drop, 0), bias=(True, proj_bias)) 82 | 83 | self.head = nn.Sequential(head_layers) 84 | 85 | def lock(self, unlocked_groups=0, freeze_bn_stats=False): 86 | """ lock modules 87 | Args: 88 | unlocked_groups (int): leave last n layer groups unlocked (default: 0) 89 | """ 90 | if not unlocked_groups: 91 | # lock full model 92 | for param in self.trunk.parameters(): 93 | param.requires_grad = False 94 | if freeze_bn_stats: 95 | freeze_batch_norm_2d(self.trunk) 96 | else: 97 | # NOTE: partial freeze requires latest timm (master) branch and is subject to change 98 | try: 99 | # FIXME import here until API stable and in an official release 100 | from timm.models.helpers import group_parameters, group_modules 101 | except ImportError: 102 | raise RuntimeError( 103 | 'Please install latest timm `pip install git+https://github.com/rwightman/pytorch-image-models`') 104 | matcher = self.trunk.group_matcher() 105 | gparams = group_parameters(self.trunk, matcher) 106 | max_layer_id = max(gparams.keys()) 107 | max_layer_id = max_layer_id - unlocked_groups 108 | for group_idx in range(max_layer_id + 1): 109 | group = gparams[group_idx] 110 | for param in group: 111 | self.trunk.get_parameter(param).requires_grad = False 112 | if freeze_bn_stats: 113 | gmodules = group_modules(self.trunk, matcher, reverse=True) 114 | gmodules = {k for k, v in gmodules.items() if v <= max_layer_id} 115 | freeze_batch_norm_2d(self.trunk, gmodules) 116 | 117 | @torch.jit.ignore 118 | def set_grad_checkpointing(self, enable=True): 119 | try: 120 | self.trunk.set_grad_checkpointing(enable) 121 | except Exception as e: 122 | logging.warning('grad checkpointing not supported for this timm image tower, continuing without...') 123 | 124 | def forward(self, x): 125 | x = self.trunk(x) 126 | x = self.head(x) 127 | return x 128 | -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/transform.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from dataclasses import dataclass, asdict 3 | from typing import Any, Dict, Optional, Sequence, Tuple, Union 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torchvision.transforms.functional as F 8 | 9 | from torchvision.transforms import Normalize, Compose, RandomResizedCrop, InterpolationMode, ToTensor, Resize, \ 10 | CenterCrop 11 | 12 | from .constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD 13 | 14 | 15 | @dataclass 16 | class AugmentationCfg: 17 | scale: Tuple[float, float] = (0.9, 1.0) 18 | ratio: Optional[Tuple[float, float]] = None 19 | color_jitter: Optional[Union[float, Tuple[float, float, float]]] = None 20 | interpolation: Optional[str] = None 21 | re_prob: Optional[float] = None 22 | re_count: Optional[int] = None 23 | use_timm: bool = False 24 | 25 | 26 | class ResizeMaxSize(nn.Module): 27 | 28 | def __init__(self, max_size, interpolation=InterpolationMode.BICUBIC, fn='max', fill=0): 29 | super().__init__() 30 | if not isinstance(max_size, int): 31 | raise TypeError(f"Size should be int. 
Got {type(max_size)}") 32 | self.max_size = max_size 33 | self.interpolation = interpolation 34 | self.fn = min if fn == 'min' else min 35 | self.fill = fill 36 | 37 | def forward(self, img): 38 | if isinstance(img, torch.Tensor): 39 | height, width = img.shape[:2] 40 | else: 41 | width, height = img.size 42 | scale = self.max_size / float(max(height, width)) 43 | if scale != 1.0: 44 | new_size = tuple(round(dim * scale) for dim in (height, width)) 45 | img = F.resize(img, new_size, self.interpolation) 46 | pad_h = self.max_size - new_size[0] 47 | pad_w = self.max_size - new_size[1] 48 | img = F.pad(img, padding=[pad_w//2, pad_h//2, pad_w - pad_w//2, pad_h - pad_h//2], fill=self.fill) 49 | return img 50 | 51 | 52 | def _convert_to_rgb(image): 53 | return image.convert('RGB') 54 | 55 | 56 | def image_transform( 57 | image_size: int, 58 | is_train: bool, 59 | mean: Optional[Tuple[float, ...]] = None, 60 | std: Optional[Tuple[float, ...]] = None, 61 | resize_longest_max: bool = False, 62 | fill_color: int = 0, 63 | aug_cfg: Optional[Union[Dict[str, Any], AugmentationCfg]] = None, 64 | ): 65 | mean = mean or OPENAI_DATASET_MEAN 66 | if not isinstance(mean, (list, tuple)): 67 | mean = (mean,) * 3 68 | 69 | std = std or OPENAI_DATASET_STD 70 | if not isinstance(std, (list, tuple)): 71 | std = (std,) * 3 72 | 73 | if isinstance(image_size, (list, tuple)) and image_size[0] == image_size[1]: 74 | # for square size, pass size as int so that Resize() uses aspect preserving shortest edge 75 | image_size = image_size[0] 76 | 77 | if isinstance(aug_cfg, dict): 78 | aug_cfg = AugmentationCfg(**aug_cfg) 79 | else: 80 | aug_cfg = aug_cfg or AugmentationCfg() 81 | normalize = Normalize(mean=mean, std=std) 82 | if is_train: 83 | aug_cfg_dict = {k: v for k, v in asdict(aug_cfg).items() if v is not None} 84 | use_timm = aug_cfg_dict.pop('use_timm', False) 85 | if use_timm: 86 | from timm.data import create_transform # timm can still be optional 87 | if isinstance(image_size, (tuple, list)): 88 | assert len(image_size) >= 2 89 | input_size = (3,) + image_size[-2:] 90 | else: 91 | input_size = (3, image_size, image_size) 92 | # by default, timm aug randomly alternates bicubic & bilinear for better robustness at inference time 93 | aug_cfg_dict.setdefault('interpolation', 'random') 94 | aug_cfg_dict.setdefault('color_jitter', None) # disable by default 95 | train_transform = create_transform( 96 | input_size=input_size, 97 | is_training=True, 98 | hflip=0., 99 | mean=mean, 100 | std=std, 101 | re_mode='pixel', 102 | **aug_cfg_dict, 103 | ) 104 | else: 105 | train_transform = Compose([ 106 | RandomResizedCrop( 107 | image_size, 108 | scale=aug_cfg_dict.pop('scale'), 109 | interpolation=InterpolationMode.BICUBIC, 110 | ), 111 | _convert_to_rgb, 112 | ToTensor(), 113 | normalize, 114 | ]) 115 | if aug_cfg_dict: 116 | warnings.warn(f'Unused augmentation cfg items, specify `use_timm` to use ({list(aug_cfg_dict.keys())}).') 117 | return train_transform 118 | else: 119 | if resize_longest_max: 120 | transforms = [ 121 | ResizeMaxSize(image_size, fill=fill_color) 122 | ] 123 | else: 124 | transforms = [ 125 | Resize(image_size, interpolation=InterpolationMode.BICUBIC), 126 | CenterCrop(image_size), 127 | ] 128 | transforms.extend([ 129 | _convert_to_rgb, 130 | ToTensor(), 131 | normalize, 132 | ]) 133 | return Compose(transforms) 134 | -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/utils.py: 
-------------------------------------------------------------------------------- 1 | from itertools import repeat 2 | import collections.abc 3 | 4 | from torch import nn as nn 5 | from torchvision.ops.misc import FrozenBatchNorm2d 6 | 7 | 8 | def freeze_batch_norm_2d(module, module_match={}, name=''): 9 | """ 10 | Converts all `BatchNorm2d` and `SyncBatchNorm` layers of provided module into `FrozenBatchNorm2d`. If `module` is 11 | itself an instance of either `BatchNorm2d` or `SyncBatchNorm`, it is converted into `FrozenBatchNorm2d` and 12 | returned. Otherwise, the module is walked recursively and submodules are converted in place. 13 | 14 | Args: 15 | module (torch.nn.Module): Any PyTorch module. 16 | module_match (dict): Dictionary of full module names to freeze (all if empty) 17 | name (str): Full module name (prefix) 18 | 19 | Returns: 20 | torch.nn.Module: Resulting module 21 | 22 | Inspired by https://github.com/pytorch/pytorch/blob/a5895f85be0f10212791145bfedc0261d364f103/torch/nn/modules/batchnorm.py#L762 23 | """ 24 | res = module 25 | is_match = True 26 | if module_match: 27 | is_match = name in module_match 28 | if is_match and isinstance(module, (nn.modules.batchnorm.BatchNorm2d, nn.modules.batchnorm.SyncBatchNorm)): 29 | res = FrozenBatchNorm2d(module.num_features) 30 | res.num_features = module.num_features 31 | res.affine = module.affine 32 | if module.affine: 33 | res.weight.data = module.weight.data.clone().detach() 34 | res.bias.data = module.bias.data.clone().detach() 35 | res.running_mean.data = module.running_mean.data 36 | res.running_var.data = module.running_var.data 37 | res.eps = module.eps 38 | else: 39 | for child_name, child in module.named_children(): 40 | full_child_name = '.'.join([name, child_name]) if name else child_name 41 | new_child = freeze_batch_norm_2d(child, module_match, full_child_name) 42 | if new_child is not child: 43 | res.add_module(child_name, new_child) 44 | return res 45 | 46 | 47 | # From PyTorch internals 48 | def _ntuple(n): 49 | def parse(x): 50 | if isinstance(x, collections.abc.Iterable): 51 | return x 52 | return tuple(repeat(x, n)) 53 | return parse 54 | 55 | 56 | to_1tuple = _ntuple(1) 57 | to_2tuple = _ntuple(2) 58 | to_3tuple = _ntuple(3) 59 | to_4tuple = _ntuple(4) 60 | to_ntuple = lambda n, x: _ntuple(n)(x) 61 | -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/version.py: -------------------------------------------------------------------------------- 1 | __version__ = '2.16.0' 2 | -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/training/.gitignore: -------------------------------------------------------------------------------- 1 | logs/ 2 | -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/training/__init__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('scripts') 3 | -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/training/distributed.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch 4 | import torch.distributed as dist 5 | 6 | 7 | def is_global_master(args): 8 | return args.rank == 0 9 | 10 | 11 | def is_local_master(args): 12 | 
return args.local_rank == 0 13 | 14 | 15 | def is_master(args, local=False): 16 | return is_local_master(args) if local else is_global_master(args) 17 | 18 | 19 | def is_using_distributed(): 20 | if 'WORLD_SIZE' in os.environ: 21 | return int(os.environ['WORLD_SIZE']) > 1 22 | if 'SLURM_NTASKS' in os.environ: 23 | return int(os.environ['SLURM_NTASKS']) > 1 24 | return False 25 | 26 | 27 | def world_info_from_env(): 28 | local_rank = 0 29 | for v in ('LOCAL_RANK', 'MPI_LOCALRANKID', 'SLURM_LOCALID', 'OMPI_COMM_WORLD_LOCAL_RANK'): 30 | if v in os.environ: 31 | local_rank = int(os.environ[v]) 32 | break 33 | global_rank = 0 34 | for v in ('RANK', 'PMI_RANK', 'SLURM_PROCID', 'OMPI_COMM_WORLD_RANK'): 35 | if v in os.environ: 36 | global_rank = int(os.environ[v]) 37 | break 38 | world_size = 1 39 | for v in ('WORLD_SIZE', 'PMI_SIZE', 'SLURM_NTASKS', 'OMPI_COMM_WORLD_SIZE'): 40 | if v in os.environ: 41 | world_size = int(os.environ[v]) 42 | break 43 | 44 | # print('local_rank=%d global_rank=%d world_size=%d' % (local_rank, global_rank, world_size)) 45 | 46 | # # get environemnt vars from os.environ, sort them by key and save to a file unique for each rank 47 | # sorted_env_vars = sorted(os.environ.items(), key=lambda x: x[0]) 48 | # env_vars_path = os.path.join(f"env_vars_{global_rank}.txt") 49 | # with open(env_vars_path, "w") as f: 50 | # for key, value in sorted_env_vars: 51 | # f.write(f"{key}={value}\n") 52 | 53 | return local_rank, global_rank, world_size 54 | 55 | 56 | def init_distributed_device(args): 57 | # Distributed training = training on more than one GPU. 58 | # Works in both single and multi-node scenarios. 59 | args.distributed = False 60 | args.world_size = 1 61 | args.rank = 0 # global rank 62 | args.local_rank = 0 63 | if is_using_distributed(): 64 | if 'SLURM_PROCID' in os.environ: 65 | # DDP via SLURM 66 | args.local_rank, args.rank, args.world_size = world_info_from_env() 67 | # SLURM var -> torch.distributed vars in case needed 68 | os.environ['LOCAL_RANK'] = str(args.local_rank) 69 | os.environ['RANK'] = str(args.rank) 70 | os.environ['WORLD_SIZE'] = str(args.world_size) 71 | torch.distributed.init_process_group( 72 | backend=args.training.dist_backend, 73 | init_method=args.training.dist_url, 74 | world_size=args.world_size, 75 | rank=args.rank, 76 | ) 77 | else: 78 | # DDP via torchrun, torch.distributed.launch 79 | args.local_rank, _, _ = world_info_from_env() 80 | torch.distributed.init_process_group( 81 | backend=args.training.dist_backend, 82 | init_method=args.training.dist_url) 83 | args.world_size = torch.distributed.get_world_size() 84 | args.rank = torch.distributed.get_rank() 85 | args.distributed = True 86 | 87 | if torch.cuda.is_available(): 88 | if args.distributed and not args.training.no_set_device_rank: 89 | device = 'cuda:%d' % args.local_rank 90 | else: 91 | device = 'cuda:0' 92 | torch.cuda.set_device(device) 93 | else: 94 | device = 'cpu' 95 | args.device = device 96 | device = torch.device(device) 97 | return device 98 | 99 | 100 | def broadcast_object(args, obj, src=0): 101 | # broadcast a pickle-able python object from rank-0 to all ranks 102 | if args.rank == src: 103 | objects = [obj] 104 | else: 105 | objects = [None] 106 | dist.broadcast_object_list(objects, src=src) 107 | return objects[0] 108 | 109 | 110 | def all_gather_object(args, obj, dst=0): 111 | # gather a pickle-able python object across all ranks 112 | objects = [None for _ in range(args.world_size)] 113 | dist.all_gather_object(objects, obj) 114 | return objects 115 | 
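The distributed helpers above resolve local rank, global rank, and world size from whichever launcher populated the environment (torchrun, SLURM, or MPI). Below is a minimal, hedged sketch of exercising them in a single-process setting; the hard-coded environment values and the `args` namespace are illustrative stand-ins, and the import path assumes this module is importable as `training.distributed`.

    import os
    from types import SimpleNamespace

    from training.distributed import is_master, is_using_distributed, world_info_from_env

    # Values that torchrun/SLURM would normally export; hard-coded here for illustration only.
    os.environ.setdefault("RANK", "0")
    os.environ.setdefault("LOCAL_RANK", "0")
    os.environ.setdefault("WORLD_SIZE", "1")

    local_rank, global_rank, world_size = world_info_from_env()
    args = SimpleNamespace(rank=global_rank, local_rank=local_rank, world_size=world_size)

    print(is_using_distributed())  # False: WORLD_SIZE=1 means single-process
    print(is_master(args))         # True on the global rank-0 process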
-------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/training/file_utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import multiprocessing 4 | import subprocess 5 | import time 6 | import fsspec 7 | import torch 8 | from tqdm import tqdm 9 | 10 | def remote_sync_s3(local_dir, remote_dir): 11 | # skip epoch_latest which can change during sync. 12 | result = subprocess.run(["aws", "s3", "sync", local_dir, remote_dir, '--exclude', '*epoch_latest.pt'], stdout=subprocess.PIPE, stderr=subprocess.PIPE) 13 | if result.returncode != 0: 14 | logging.error(f"Error: Failed to sync with S3 bucket {result.stderr.decode('utf-8')}") 15 | return False 16 | 17 | logging.info(f"Successfully synced with S3 bucket") 18 | return True 19 | 20 | def remote_sync_fsspec(local_dir, remote_dir): 21 | # FIXME currently this is slow and not recommended. Look into speeding up. 22 | a = fsspec.get_mapper(local_dir) 23 | b = fsspec.get_mapper(remote_dir) 24 | 25 | for k in a: 26 | # skip epoch_latest which can change during sync. 27 | if 'epoch_latest.pt' in k: 28 | continue 29 | 30 | logging.info(f'Attempting to sync {k}') 31 | if k in b and len(a[k]) == len(b[k]): 32 | logging.debug(f'Skipping remote sync for {k}.') 33 | continue 34 | 35 | try: 36 | logging.info(f'Successful sync for {k}.') 37 | b[k] = a[k] 38 | except Exception as e: 39 | logging.info(f'Error during remote sync for {k}: {e}') 40 | return False 41 | 42 | return True 43 | 44 | def remote_sync(local_dir, remote_dir, protocol): 45 | logging.info('Starting remote sync.') 46 | if protocol == 's3': 47 | return remote_sync_s3(local_dir, remote_dir) 48 | elif protocol == 'fsspec': 49 | return remote_sync_fsspec(local_dir, remote_dir) 50 | else: 51 | logging.error('Remote protocol not known') 52 | return False 53 | 54 | def keep_running_remote_sync(sync_every, local_dir, remote_dir, protocol): 55 | while True: 56 | time.sleep(sync_every) 57 | remote_sync(local_dir, remote_dir, protocol) 58 | 59 | def start_sync_process(sync_every, local_dir, remote_dir, protocol): 60 | p = multiprocessing.Process(target=keep_running_remote_sync, args=(sync_every, local_dir, remote_dir, protocol)) 61 | return p 62 | 63 | # Note: we are not currently using this save function. 
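# A hedged usage sketch for the checkpoint helpers defined below; the s3 paths are
# illustrative placeholders, not locations used by this repository:
#
#     if check_exists("s3://bucket/exp/checkpoints/epoch_latest.pt"):
#         ckpt = pt_load("s3://bucket/exp/checkpoints/epoch_latest.pt", map_location="cpu")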
64 | def pt_save(pt_obj, file_path): 65 | of = fsspec.open(file_path, "wb") 66 | with of as f: 67 | torch.save(pt_obj, file_path) 68 | 69 | def pt_load(file_path, map_location=None): 70 | if file_path.startswith('s3'): 71 | logging.info('Loading remote checkpoint, which may take a bit.') 72 | of = fsspec.open(file_path, "rb") 73 | with of as f: 74 | out = torch.load(f, map_location=map_location) 75 | return out 76 | 77 | def check_exists(file_path): 78 | try: 79 | with fsspec.open(file_path): 80 | pass 81 | except FileNotFoundError: 82 | return False 83 | return True 84 | -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/training/logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | 4 | def setup_logging(log_file, level, include_host=False): 5 | if include_host: 6 | import socket 7 | hostname = socket.gethostname() 8 | formatter = logging.Formatter( 9 | f'%(asctime)s | {hostname} | %(levelname)s | %(message)s', datefmt='%Y-%m-%d,%H:%M:%S') 10 | else: 11 | formatter = logging.Formatter('%(asctime)s | %(levelname)s | %(message)s', datefmt='%Y-%m-%d,%H:%M:%S') 12 | 13 | logging.root.setLevel(level) 14 | loggers = [logging.getLogger(name) for name in logging.root.manager.loggerDict] 15 | for logger in loggers: 16 | if logger.name.startswith(('transformers', )): # these guys are too verbose at INFO 17 | logger.setLevel(logging.WARNING) 18 | else: 19 | logger.setLevel(level) 20 | 21 | stream_handler = logging.StreamHandler() 22 | stream_handler.setFormatter(formatter) 23 | logging.root.addHandler(stream_handler) 24 | 25 | if log_file: 26 | file_handler = logging.FileHandler(filename=log_file) 27 | file_handler.setFormatter(formatter) 28 | logging.root.addHandler(file_handler) 29 | -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/training/precision.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from contextlib import suppress 3 | 4 | 5 | def get_autocast(precision): 6 | if precision == 'amp': 7 | return torch.cuda.amp.autocast 8 | elif precision == 'amp_bfloat16' or precision == 'amp_bf16': 9 | # amp_bfloat16 is more stable than amp float16 for clip training 10 | return lambda: torch.cuda.amp.autocast(dtype=torch.bfloat16) 11 | else: 12 | return suppress 13 | -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/training/profile.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import torch 4 | import open_clip 5 | import pandas as pd 6 | from fvcore.nn import FlopCountAnalysis, flop_count_str, ActivationCountAnalysis 7 | 8 | 9 | parser = argparse.ArgumentParser(description='OpenCLIP Profiler') 10 | 11 | # benchmark specific args 12 | parser.add_argument('--model', metavar='NAME', default='', 13 | help='model(s) to profile') 14 | parser.add_argument('--results-file', default='', type=str, metavar='FILENAME', 15 | help='Output csv file for results') 16 | 17 | 18 | def profile_fvcore( 19 | model, 20 | image_input_size=(3, 224, 224), 21 | text_input_size=(77,), 22 | batch_size=1, 23 | detailed=False, 24 | force_cpu=False 25 | ): 26 | if force_cpu: 27 | model = model.to('cpu') 28 | device, dtype = next(model.parameters()).device, next(model.parameters()).dtype 29 | 
example_image_input = torch.ones((batch_size,) + image_input_size, device=device, dtype=dtype) 30 | example_text_input = torch.ones((batch_size,) + text_input_size, device=device, dtype=torch.int64) 31 | fca = FlopCountAnalysis(model, (example_image_input, example_text_input)) 32 | aca = ActivationCountAnalysis(model, (example_image_input, example_text_input)) 33 | if detailed: 34 | fcs = flop_count_str(fca) 35 | print(fcs) 36 | return fca.total(), aca.total() 37 | 38 | 39 | def profile_fvcore_text( 40 | model, 41 | text_input_size=(77,), 42 | batch_size=1, 43 | detailed=False, 44 | force_cpu=False 45 | ): 46 | if force_cpu: 47 | model = model.to('cpu') 48 | device = next(model.parameters()).device 49 | example_input = torch.ones((batch_size,) + text_input_size, device=device, dtype=torch.int64) 50 | fca = FlopCountAnalysis(model, example_input) 51 | aca = ActivationCountAnalysis(model, example_input) 52 | if detailed: 53 | fcs = flop_count_str(fca) 54 | print(fcs) 55 | return fca.total(), aca.total() 56 | 57 | 58 | def profile_fvcore_image( 59 | model, 60 | image_input_size=(3, 224, 224), 61 | batch_size=1, 62 | detailed=False, 63 | force_cpu=False 64 | ): 65 | if force_cpu: 66 | model = model.to('cpu') 67 | device, dtype = next(model.parameters()).device, next(model.parameters()).dtype 68 | example_input = torch.ones((batch_size,) + image_input_size, device=device, dtype=dtype) 69 | fca = FlopCountAnalysis(model, example_input) 70 | aca = ActivationCountAnalysis(model, example_input) 71 | if detailed: 72 | fcs = flop_count_str(fca) 73 | print(fcs) 74 | return fca.total(), aca.total() 75 | 76 | 77 | def count_params(model): 78 | return sum([m.numel() for m in model.parameters()]) 79 | 80 | 81 | def profile_model(model_name): 82 | model = open_clip.create_model(model_name, force_custom_text=True, pretrained_hf=False) 83 | model.eval() 84 | if torch.cuda.is_available(): 85 | model = model.cuda() 86 | 87 | if isinstance(model.visual.image_size, (tuple, list)): 88 | image_input_size = (3,) + tuple(model.visual.image_size[-2:]) 89 | else: 90 | image_input_size = (3, model.visual.image_size, model.visual.image_size) 91 | text_input_size = (77,) 92 | 93 | results = {} 94 | results['model'] = model_name 95 | results['image_size'] = image_input_size[1] 96 | 97 | model_cfg = open_clip.get_model_config(model_name) 98 | if model_cfg: 99 | vision_cfg = open_clip.CLIPVisionCfg(**model_cfg['vision_cfg']) 100 | text_cfg = open_clip.CLIPTextCfg(**model_cfg['text_cfg']) 101 | results['image_width'] = int(vision_cfg.width) 102 | results['text_width'] = int(text_cfg.width) 103 | results['embed_dim'] = int(model_cfg['embed_dim']) 104 | else: 105 | results['image_width'] = 0 106 | results['text_width'] = 0 107 | results['embed_dim'] = 0 108 | 109 | retries = 2 110 | while retries: 111 | retries -= 1 112 | try: 113 | macs, acts = profile_fvcore( 114 | model, image_input_size=image_input_size, text_input_size=text_input_size, force_cpu=not retries) 115 | 116 | image_macs, image_acts = profile_fvcore_image( 117 | model.visual, image_input_size=image_input_size, force_cpu=not retries) 118 | 119 | text_macs, text_acts = profile_fvcore_text( 120 | model.text, text_input_size=text_input_size, force_cpu=not retries) 121 | 122 | results['gmacs'] = round(macs / 1e9, 2) 123 | results['macts'] = round(acts / 1e6, 2) 124 | results['mparams'] = round(count_params(model) / 1e6, 2) 125 | results['image_gmacs'] = round(image_macs / 1e9, 2) 126 | results['image_macts'] = round(image_acts / 1e6, 2) 127 | results['image_mparams'] 
= round(count_params(model.visual) / 1e6, 2) 128 | results['text_gmacs'] = round(text_macs / 1e9, 2) 129 | results['text_macts'] = round(text_acts / 1e6, 2) 130 | results['text_mparams'] = round(count_params(model.text) / 1e6, 2) 131 | except RuntimeError as e: 132 | pass 133 | return results 134 | 135 | 136 | def main(): 137 | args = parser.parse_args() 138 | 139 | # FIXME accept a text file name to allow lists of models in txt/csv 140 | if args.model == 'all': 141 | parsed_model = open_clip.list_models() 142 | else: 143 | parsed_model = args.model.split(',') 144 | 145 | results = [] 146 | for m in parsed_model: 147 | row = profile_model(m) 148 | results.append(row) 149 | 150 | df = pd.DataFrame(results, columns=results[0].keys()) 151 | df = df.sort_values('gmacs') 152 | print(df) 153 | if args.results_file: 154 | df.to_csv(args.results_file, index=False) 155 | 156 | 157 | if __name__ == '__main__': 158 | main() 159 | -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/training/scheduler.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def assign_learning_rate(optimizer, new_lr): 5 | for param_group in optimizer.param_groups: 6 | param_group["lr"] = new_lr 7 | 8 | 9 | def _warmup_lr(base_lr, warmup_length, step): 10 | return base_lr * (step + 1) / warmup_length 11 | 12 | 13 | def const_lr(optimizer, base_lr, warmup_length, steps): 14 | def _lr_adjuster(step): 15 | if step < warmup_length: 16 | lr = _warmup_lr(base_lr, warmup_length, step) 17 | else: 18 | lr = base_lr 19 | assign_learning_rate(optimizer, lr) 20 | return lr 21 | return _lr_adjuster 22 | 23 | 24 | def const_lr_cooldown(optimizer, base_lr, warmup_length, steps, cooldown_steps, cooldown_power=1.0, cooldown_end_lr=0.): 25 | def _lr_adjuster(step): 26 | start_cooldown_step = steps - cooldown_steps 27 | if step < warmup_length: 28 | lr = _warmup_lr(base_lr, warmup_length, step) 29 | else: 30 | if step < start_cooldown_step: 31 | lr = base_lr 32 | else: 33 | e = step - start_cooldown_step 34 | es = steps - start_cooldown_step 35 | # linear decay if power == 1; polynomial decay otherwise; 36 | decay = (1 - (e/es)) ** cooldown_power 37 | lr = decay * (base_lr - cooldown_end_lr) + cooldown_end_lr 38 | assign_learning_rate(optimizer, lr) 39 | return lr 40 | return _lr_adjuster 41 | 42 | 43 | def cosine_lr(optimizer, base_lr, warmup_length, steps): 44 | def _lr_adjuster(step): 45 | if step < warmup_length: 46 | lr = _warmup_lr(base_lr, warmup_length, step) 47 | else: 48 | e = step - warmup_length 49 | es = steps - warmup_length 50 | lr = 0.5 * (1 + np.cos(np.pi * e / es)) * base_lr 51 | assign_learning_rate(optimizer, lr) 52 | return lr 53 | return _lr_adjuster 54 | -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/training/zero_shot.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import torch 4 | import torch.nn.functional as F 5 | from tqdm import tqdm 6 | 7 | from open_clip import get_cast_dtype, get_tokenizer 8 | from .precision import get_autocast 9 | from .imagenet_zeroshot_data import imagenet_classnames, openai_imagenet_template 10 | 11 | 12 | def zero_shot_classifier(model, classnames, templates, args): 13 | tokenizer = get_tokenizer(args.model) 14 | with torch.no_grad(): 15 | zeroshot_weights = [] 16 | for classname in 
tqdm(classnames): 17 | texts = [template(classname) for template in templates] # format with class 18 | texts = tokenizer(texts).to(args.device) # tokenize 19 | if args.distributed: 20 | class_embeddings = model.module.encode_text(texts) 21 | else: 22 | class_embeddings = model.encode_text(texts) 23 | class_embedding = F.normalize(class_embeddings, dim=-1).mean(dim=0) 24 | class_embedding /= class_embedding.norm() 25 | zeroshot_weights.append(class_embedding) 26 | zeroshot_weights = torch.stack(zeroshot_weights, dim=1).to(args.device) 27 | return zeroshot_weights 28 | 29 | 30 | def accuracy(output, target, topk=(1,)): 31 | pred = output.topk(max(topk), 1, True, True)[1].t() 32 | correct = pred.eq(target.view(1, -1).expand_as(pred)) 33 | return [float(correct[:k].reshape(-1).float().sum(0, keepdim=True).cpu().numpy()) for k in topk] 34 | 35 | 36 | def run(model, classifier, dataloader, args): 37 | autocast = get_autocast(args.precision) 38 | cast_dtype = get_cast_dtype(args.precision) 39 | with torch.no_grad(): 40 | top1, top5, n = 0., 0., 0. 41 | for images, target in tqdm(dataloader, unit_scale=args.batch_size): 42 | images = images.to(args.device) 43 | if cast_dtype is not None: 44 | images = images.to(dtype=cast_dtype) 45 | target = target.to(args.device) 46 | 47 | with autocast(): 48 | # predict 49 | if args.distributed: 50 | image_features = model.module.encode_image(images) 51 | else: 52 | image_features = model.encode_image(images) 53 | image_features = F.normalize(image_features, dim=-1) 54 | logits = 100. * image_features @ classifier 55 | 56 | # measure accuracy 57 | acc1, acc5 = accuracy(logits, target, topk=(1, 5)) 58 | top1 += acc1 59 | top5 += acc5 60 | n += images.size(0) 61 | 62 | top1 = (top1 / n) 63 | top5 = (top5 / n) 64 | return top1, top5 65 | 66 | 67 | def zero_shot_eval(model, data, epoch, args): 68 | if 'imagenet-val' not in data and 'imagenet-v2' not in data: 69 | return {} 70 | if args.zeroshot_frequency == 0: 71 | return {} 72 | if (epoch % args.zeroshot_frequency) != 0 and epoch != args.epochs: 73 | return {} 74 | 75 | logging.info('Starting zero-shot imagenet.') 76 | 77 | logging.info('Building zero-shot classifier') 78 | classifier = zero_shot_classifier(model, imagenet_classnames, openai_imagenet_template, args) 79 | 80 | logging.info('Using classifier') 81 | results = {} 82 | if 'imagenet-val' in data: 83 | top1, top5 = run(model, classifier, data['imagenet-val'].dataloader, args) 84 | results['imagenet-zeroshot-val-top1'] = top1 85 | results['imagenet-zeroshot-val-top5'] = top5 86 | if 'imagenet-v2' in data: 87 | top1, top5 = run(model, classifier, data['imagenet-v2'].dataloader, args) 88 | results['imagenetv2-zeroshot-val-top1'] = top1 89 | results['imagenetv2-zeroshot-val-top5'] = top5 90 | 91 | logging.info('Finished zero-shot imagenet.') 92 | 93 | return results 94 | -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/visual/__init__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('model/modules/feat_extractors/visual') # nopep8 3 | -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/visual/motionformer_src/divided_224_16x4.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: Ssv2 4 | BATCH_SIZE: 32 5 | EVAL_PERIOD: 5 6 | CHECKPOINT_PERIOD: 5 7 | AUTO_RESUME: True 8 | 
CHECKPOINT_EPOCH_RESET: True 9 | CHECKPOINT_FILE_PATH: /checkpoint/fmetze/neurips_sota/40944587/checkpoints/checkpoint_epoch_00035.pyth 10 | DATA: 11 | NUM_FRAMES: 16 12 | SAMPLING_RATE: 4 13 | TRAIN_JITTER_SCALES: [256, 320] 14 | TRAIN_CROP_SIZE: 224 15 | TEST_CROP_SIZE: 224 16 | INPUT_CHANNEL_NUM: [3] 17 | MEAN: [0.5, 0.5, 0.5] 18 | STD: [0.5, 0.5, 0.5] 19 | PATH_TO_DATA_DIR: /private/home/mandelapatrick/slowfast/data/ssv2 20 | PATH_PREFIX: /datasets01/SomethingV2/092720/20bn-something-something-v2-frames 21 | INV_UNIFORM_SAMPLE: True 22 | RANDOM_FLIP: False 23 | REVERSE_INPUT_CHANNEL: True 24 | USE_RAND_AUGMENT: True 25 | RE_PROB: 0.0 26 | USE_REPEATED_AUG: False 27 | USE_RANDOM_RESIZE_CROPS: False 28 | COLORJITTER: False 29 | GRAYSCALE: False 30 | GAUSSIAN: False 31 | SOLVER: 32 | BASE_LR: 1e-4 33 | LR_POLICY: steps_with_relative_lrs 34 | LRS: [1, 0.1, 0.01] 35 | STEPS: [0, 20, 30] 36 | MAX_EPOCH: 35 37 | MOMENTUM: 0.9 38 | WEIGHT_DECAY: 5e-2 39 | WARMUP_EPOCHS: 0.0 40 | OPTIMIZING_METHOD: adamw 41 | USE_MIXED_PRECISION: True 42 | SMOOTHING: 0.2 43 | SLOWFAST: 44 | ALPHA: 8 45 | VIT: 46 | PATCH_SIZE: 16 47 | PATCH_SIZE_TEMP: 2 48 | CHANNELS: 3 49 | EMBED_DIM: 768 50 | DEPTH: 12 51 | NUM_HEADS: 12 52 | MLP_RATIO: 4 53 | QKV_BIAS: True 54 | VIDEO_INPUT: True 55 | TEMPORAL_RESOLUTION: 8 56 | USE_MLP: True 57 | DROP: 0.0 58 | POS_DROPOUT: 0.0 59 | DROP_PATH: 0.2 60 | IM_PRETRAINED: True 61 | HEAD_DROPOUT: 0.0 62 | HEAD_ACT: tanh 63 | PRETRAINED_WEIGHTS: vit_1k 64 | ATTN_LAYER: divided 65 | MODEL: 66 | NUM_CLASSES: 174 67 | ARCH: slow 68 | MODEL_NAME: VisionTransformer 69 | LOSS_FUNC: cross_entropy 70 | TEST: 71 | ENABLE: True 72 | DATASET: Ssv2 73 | BATCH_SIZE: 64 74 | NUM_ENSEMBLE_VIEWS: 1 75 | NUM_SPATIAL_CROPS: 3 76 | DATA_LOADER: 77 | NUM_WORKERS: 4 78 | PIN_MEMORY: True 79 | NUM_GPUS: 8 80 | NUM_SHARDS: 4 81 | RNG_SEED: 0 82 | OUTPUT_DIR: . 
83 | TENSORBOARD: 84 | ENABLE: True 85 | -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/visual/motionformer_src/joint_224_16x4.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: Ssv2 4 | BATCH_SIZE: 32 5 | EVAL_PERIOD: 5 6 | CHECKPOINT_PERIOD: 5 7 | AUTO_RESUME: True 8 | CHECKPOINT_EPOCH_RESET: True 9 | CHECKPOINT_FILE_PATH: /checkpoint/fmetze/neurips_sota/40944587/checkpoints/checkpoint_epoch_00035.pyth 10 | DATA: 11 | NUM_FRAMES: 16 12 | SAMPLING_RATE: 4 13 | TRAIN_JITTER_SCALES: [256, 320] 14 | TRAIN_CROP_SIZE: 224 15 | TEST_CROP_SIZE: 224 16 | INPUT_CHANNEL_NUM: [3] 17 | MEAN: [0.5, 0.5, 0.5] 18 | STD: [0.5, 0.5, 0.5] 19 | PATH_TO_DATA_DIR: /private/home/mandelapatrick/slowfast/data/ssv2 20 | PATH_PREFIX: /datasets01/SomethingV2/092720/20bn-something-something-v2-frames 21 | INV_UNIFORM_SAMPLE: True 22 | RANDOM_FLIP: False 23 | REVERSE_INPUT_CHANNEL: True 24 | USE_RAND_AUGMENT: False 25 | RE_PROB: 0.0 26 | USE_REPEATED_AUG: False 27 | USE_RANDOM_RESIZE_CROPS: False 28 | COLORJITTER: False 29 | GRAYSCALE: False 30 | GAUSSIAN: False 31 | SOLVER: 32 | BASE_LR: 1e-4 33 | LR_POLICY: steps_with_relative_lrs 34 | LRS: [1, 0.1, 0.01] 35 | STEPS: [0, 20, 30] 36 | MAX_EPOCH: 35 37 | MOMENTUM: 0.9 38 | WEIGHT_DECAY: 5e-2 39 | WARMUP_EPOCHS: 0.0 40 | OPTIMIZING_METHOD: adamw 41 | USE_MIXED_PRECISION: True 42 | SMOOTHING: 0.2 43 | SLOWFAST: 44 | ALPHA: 8 45 | VIT: 46 | PATCH_SIZE: 16 47 | PATCH_SIZE_TEMP: 2 48 | CHANNELS: 3 49 | EMBED_DIM: 768 50 | DEPTH: 12 51 | NUM_HEADS: 12 52 | MLP_RATIO: 4 53 | QKV_BIAS: True 54 | VIDEO_INPUT: True 55 | TEMPORAL_RESOLUTION: 8 56 | USE_MLP: True 57 | DROP: 0.0 58 | POS_DROPOUT: 0.0 59 | DROP_PATH: 0.2 60 | IM_PRETRAINED: True 61 | HEAD_DROPOUT: 0.0 62 | HEAD_ACT: tanh 63 | PRETRAINED_WEIGHTS: vit_1k 64 | ATTN_LAYER: joint 65 | MODEL: 66 | NUM_CLASSES: 174 67 | ARCH: slow 68 | MODEL_NAME: VisionTransformer 69 | LOSS_FUNC: cross_entropy 70 | TEST: 71 | ENABLE: True 72 | DATASET: Ssv2 73 | BATCH_SIZE: 64 74 | NUM_ENSEMBLE_VIEWS: 1 75 | NUM_SPATIAL_CROPS: 3 76 | DATA_LOADER: 77 | NUM_WORKERS: 4 78 | PIN_MEMORY: True 79 | NUM_GPUS: 8 80 | NUM_SHARDS: 4 81 | RNG_SEED: 0 82 | OUTPUT_DIR: . 
83 | TENSORBOARD: 84 | ENABLE: True 85 | -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/visual/motionformer_src/motionformer_224_16x4.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: Ssv2 4 | BATCH_SIZE: 32 5 | EVAL_PERIOD: 5 6 | CHECKPOINT_PERIOD: 5 7 | AUTO_RESUME: True 8 | CHECKPOINT_EPOCH_RESET: True 9 | CHECKPOINT_FILE_PATH: /checkpoint/fmetze/neurips_sota/40944587/checkpoints/checkpoint_epoch_00035.pyth 10 | DATA: 11 | NUM_FRAMES: 16 12 | SAMPLING_RATE: 4 13 | TRAIN_JITTER_SCALES: [256, 320] 14 | TRAIN_CROP_SIZE: 224 15 | TEST_CROP_SIZE: 224 16 | INPUT_CHANNEL_NUM: [3] 17 | MEAN: [0.5, 0.5, 0.5] 18 | STD: [0.5, 0.5, 0.5] 19 | PATH_TO_DATA_DIR: /private/home/mandelapatrick/slowfast/data/ssv2 20 | PATH_PREFIX: /datasets01/SomethingV2/092720/20bn-something-something-v2-frames 21 | INV_UNIFORM_SAMPLE: True 22 | RANDOM_FLIP: False 23 | REVERSE_INPUT_CHANNEL: True 24 | USE_RAND_AUGMENT: True 25 | RE_PROB: 0.0 26 | USE_REPEATED_AUG: False 27 | USE_RANDOM_RESIZE_CROPS: False 28 | COLORJITTER: False 29 | GRAYSCALE: False 30 | GAUSSIAN: False 31 | SOLVER: 32 | BASE_LR: 1e-4 33 | LR_POLICY: steps_with_relative_lrs 34 | LRS: [1, 0.1, 0.01] 35 | STEPS: [0, 20, 30] 36 | MAX_EPOCH: 35 37 | MOMENTUM: 0.9 38 | WEIGHT_DECAY: 5e-2 39 | WARMUP_EPOCHS: 0.0 40 | OPTIMIZING_METHOD: adamw 41 | USE_MIXED_PRECISION: True 42 | SMOOTHING: 0.2 43 | SLOWFAST: 44 | ALPHA: 8 45 | VIT: 46 | PATCH_SIZE: 16 47 | PATCH_SIZE_TEMP: 2 48 | CHANNELS: 3 49 | EMBED_DIM: 768 50 | DEPTH: 12 51 | NUM_HEADS: 12 52 | MLP_RATIO: 4 53 | QKV_BIAS: True 54 | VIDEO_INPUT: True 55 | TEMPORAL_RESOLUTION: 8 56 | USE_MLP: True 57 | DROP: 0.0 58 | POS_DROPOUT: 0.0 59 | DROP_PATH: 0.2 60 | IM_PRETRAINED: True 61 | HEAD_DROPOUT: 0.0 62 | HEAD_ACT: tanh 63 | PRETRAINED_WEIGHTS: vit_1k 64 | ATTN_LAYER: trajectory 65 | MODEL: 66 | NUM_CLASSES: 174 67 | ARCH: slow 68 | MODEL_NAME: VisionTransformer 69 | LOSS_FUNC: cross_entropy 70 | TEST: 71 | ENABLE: True 72 | DATASET: Ssv2 73 | BATCH_SIZE: 64 74 | NUM_ENSEMBLE_VIEWS: 1 75 | NUM_SPATIAL_CROPS: 3 76 | DATA_LOADER: 77 | NUM_WORKERS: 4 78 | PIN_MEMORY: True 79 | NUM_GPUS: 8 80 | NUM_SHARDS: 4 81 | RNG_SEED: 0 82 | OUTPUT_DIR: . 83 | TENSORBOARD: 84 | ENABLE: True 85 | -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/visual/motionformer_src/nystrom_helper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | from einops import rearrange, repeat 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as Fn 8 | import math 9 | 10 | 11 | def iterative_inv(mat, n_iter = 6, init_option="exact"): 12 | I = torch.eye(mat.size(-2), device = mat.device) 13 | K = mat 14 | 15 | if init_option == "original": 16 | # This original implementation is more conservative to compute coefficient of Z_0. 17 | V = 1. / torch.max(torch.sum(K, dim = -2)) * K.transpose(-1, -2) 18 | elif init_option == "arbitrary_input": 19 | # sum = 1 for softmax input but not for exp 20 | a1 = torch.max(torch.sum(torch.abs(K), dim = -2, keepdim=True), dim=-1, keepdim=True).values 21 | a2 = torch.max(torch.sum(torch.abs(K), dim = -1, keepdim=True), dim=-2, keepdim=True).values 22 | V = 1. 
/ (a1 * a2) * K.transpose(-1, -2) 23 | else: # The entries of K are positive and ||K||_{\infty} = 1 due to softmax 24 | # This is the exact coefficient computation, 25 | # 1 / ||K||_1, of initialization of Z_0, leading to faster convergence. 26 | V = 1. / torch.max( 27 | torch.sum(K, dim = -2), dim = -1).values.unsqueeze(-1).unsqueeze(-1) * K.transpose(-1, -2) 28 | 29 | for _ in range(n_iter): 30 | KV = torch.matmul(K, V) 31 | V = torch.matmul(0.25 * V, 13 * I - torch.matmul(KV, 15 * I - torch.matmul(KV, 7 * I - KV))) 32 | return V 33 | 34 | 35 | def nystrom_spatial_attn( 36 | q, k, v, landmarks=64, num_frames=None, inv_iters=6, 37 | use_full_matrix=False, use_spatial_landmarks=False, return_attn=False 38 | ): 39 | 40 | """ 41 | Compute full space-time attention but only softmax over spatial dimension 42 | """ 43 | B, N, D = k.shape 44 | F = num_frames 45 | scale = D ** -0.5 46 | q = q * scale 47 | if use_full_matrix: 48 | queries_landmarks = q.clone() 49 | keys_landmarks = k.clone() 50 | else: 51 | segs = N // landmarks 52 | with torch.no_grad(): 53 | if use_spatial_landmarks: 54 | # transpose spatial and temporal dimensions 55 | q2 = rearrange(q, 'b (f p) d -> b (p f) d', f=F) 56 | k2 = rearrange(k, 'b (f p) d -> b (p f) d', f=F) 57 | if (N % landmarks == 0): 58 | keys_landmarks = k2.reshape(B, landmarks, N // landmarks, D).mean(dim = -2) 59 | queries_landmarks = q2.reshape(B, landmarks, N // landmarks, D).mean(dim = -2) 60 | else: 61 | num_k = (segs + 1) * landmarks - N 62 | keys_landmarks_f = k2[:, :num_k * segs, :].reshape( 63 | B, num_k, segs, D).mean(dim = -2) 64 | keys_landmarks_l = k2[:, num_k * segs:, :].reshape( 65 | B, landmarks - num_k, segs + 1, D).mean(dim = -2) 66 | keys_landmarks = torch.cat((keys_landmarks_f, keys_landmarks_l), dim = -2) 67 | 68 | queries_landmarks_f = q2[:, :num_k * segs, :].reshape( 69 | B, num_k, segs, D).mean(dim = -2) 70 | queries_landmarks_l = q2[:, num_k * segs:, :].reshape( 71 | B, landmarks - num_k, segs + 1, D).mean(dim = -2) 72 | queries_landmarks = torch.cat((queries_landmarks_f, queries_landmarks_l), dim = -2) 73 | else: 74 | if (N % landmarks == 0): 75 | keys_landmarks = k.reshape( 76 | B, landmarks, N // landmarks, D).mean(dim = -2) 77 | queries_landmarks = q.reshape( 78 | B, landmarks, N // landmarks, D).mean(dim = -2) 79 | else: 80 | num_k = (segs + 1) * landmarks - N 81 | keys_landmarks_f = k[:, :num_k * segs, :].reshape( 82 | B, num_k, segs, D).mean(dim = -2) 83 | keys_landmarks_l = k[:, num_k * segs:, :].reshape( 84 | B, landmarks - num_k, segs + 1, D).mean(dim = -2) 85 | keys_landmarks = torch.cat((keys_landmarks_f, keys_landmarks_l), dim = -2) 86 | 87 | queries_landmarks_f = q[:, :num_k * segs, :].reshape( 88 | B, num_k, segs, D).mean(dim = -2) 89 | queries_landmarks_l = q[:, num_k * segs:, :].reshape( 90 | B, landmarks - num_k, segs + 1, D).mean(dim = -2) 91 | queries_landmarks = torch.cat((queries_landmarks_f, queries_landmarks_l), dim = -2) 92 | 93 | kernel_1 = Fn.softmax( 94 | torch.matmul(q, keys_landmarks.transpose(-1, -2)), dim = -1) 95 | kernel_2 = Fn.softmax( 96 | torch.matmul(queries_landmarks, keys_landmarks.transpose(-1, -2)), dim = -1) 97 | kernel_3 = Fn.softmax( 98 | rearrange(torch.matmul( 99 | queries_landmarks, k.transpose(-1, -2)), 'b l (f p) -> b l f p', f=F), dim = -1) 100 | attn = torch.matmul(kernel_1, iterative_inv(kernel_2, n_iter=inv_iters)) 101 | 102 | v = rearrange(v, 'b (f p) d -> b f p d', f=F) 103 | x = torch.einsum( 104 | 'b n l, b l f d -> b n f d', 105 | attn, torch.einsum('b l f p, b f p d -> b l f 
d', kernel_3, v) 106 | ) 107 | 108 | if return_attn: 109 | attn = torch.einsum('b m l, b l f p -> b m f p', attn, kernel_3) 110 | return x, attn 111 | 112 | return x -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/visual/motionformer_src/orthoformer_helper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | from einops import rearrange, repeat 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as Fn 8 | import math 9 | 10 | 11 | def orthogonal_landmarks(q, k, num_landmarks=64, subsample_fraction=1.0): 12 | """ 13 | Construct set of landmarks by recursively selecting new landmarks 14 | that are maximally orthogonal to the existing set. 15 | Returns near orthogonal landmarks with shape (B, M, D). 16 | """ 17 | if subsample_fraction < 1.0: 18 | # Need at least M/2 samples of queries and keys 19 | num_samples = max(int(subsample_fraction * q.size(-2)), num_landmarks) 20 | q_unnormalised = q[:, torch.randint(q.size(-2), (num_samples,), device=q.device), :] # (B, N, D) 21 | else: 22 | # (B, N, D) 23 | q_unnormalised = q 24 | 25 | # may need to change default eps to eps=1e-8 for mixed precision compatibility 26 | qk = Fn.normalize(q_unnormalised, p=2, dim=-1) 27 | B, N, D = qk.shape 28 | 29 | selected_mask = torch.zeros((B, N, 1), device=qk.device) 30 | landmark_mask = torch.ones((B, 1, 1), dtype=selected_mask.dtype, device=qk.device) 31 | 32 | # Get initial random landmark 33 | random_idx = torch.randint(qk.size(-2), (B, 1, 1), device=qk.device) 34 | selected_landmark = qk[torch.arange(qk.size(0)), random_idx.view(-1), :].view(B, D) 35 | selected_mask.scatter_(-2, random_idx, landmark_mask) 36 | 37 | # Selected landmarks 38 | selected_landmarks = torch.empty((B, num_landmarks, D), device=qk.device, dtype=qk.dtype) 39 | selected_landmarks[:, 0, :] = selected_landmark 40 | 41 | # Store computed cosine similarities 42 | cos_sims = torch.empty((B, N, num_landmarks), device=qk.device, dtype=qk.dtype) 43 | 44 | for M in range(1, num_landmarks): 45 | # Calculate absolute cosine similarity between selected and unselected landmarks 46 | # (B, N, D) * (B, D) -> (B, N) 47 | cos_sim = torch.einsum('b n d, b d -> b n', qk, selected_landmark).abs() 48 | cos_sims[:, :, M - 1] = cos_sim 49 | # (B, N, M) cosine similarities of current set of landmarks wrt all queries and keys 50 | cos_sim_set = cos_sims[:, :, :M] 51 | 52 | # Get orthogonal landmark: landmark with smallest absolute cosine similarity: 53 | # set cosine similarity for already selected landmarks to > 1 54 | cos_sim_set.view(-1, M)[selected_mask.flatten().bool(), :] = 10 55 | # (B,) - want max for non 56 | selected_landmark_idx = cos_sim_set.amax(-1).argmin(-1) 57 | selected_landmark = qk[torch.arange(qk.size(0)), selected_landmark_idx, :].view(B, D) 58 | 59 | # Add most orthogonal landmark to selected landmarks: 60 | selected_landmarks[:, M, :] = selected_landmark 61 | 62 | # Removed selected indices from non-selected mask: 63 | selected_mask.scatter_(-2, selected_landmark_idx.unsqueeze(-1).unsqueeze(-1), landmark_mask) 64 | landmarks = torch.masked_select( 65 | q_unnormalised, selected_mask.bool()).reshape(B, -1, D) # (B, M, D) 66 | return landmarks # (B, M, D) 67 | 68 | 69 | def orthoformer( 70 | q, k, v, num_landmarks=64, subsample_fraction=1.0, 71 | num_frames=None, shared_landmarks=True, return_attn=False 72 | 
): 73 | """ 74 | Computes spatial attention for all pairs of frames. 75 | The attention matrix is approximated using 76 | intermediate landmarks taken from the queries and keys. 77 | The landmarks can be unique (to each frame) or 78 | shared (a common set of landmarks across frames). 79 | """ 80 | B, N, D = k.shape 81 | F = num_frames 82 | L = num_landmarks 83 | P = N // F 84 | 85 | scale = D ** -0.25 86 | q = q * scale 87 | k = k * scale 88 | 89 | if shared_landmarks: 90 | with torch.no_grad(): 91 | landmarks = orthogonal_landmarks(q, k, num_landmarks, subsample_fraction) 92 | kernel_1 = Fn.softmax(torch.matmul(q, landmarks.transpose(-1, -2)), dim=-1) 93 | kernel_2 = Fn.softmax( 94 | rearrange(torch.matmul( 95 | landmarks, k.transpose(-1, -2)), 'b l (f p) -> b l f p', f=F), dim=-1) 96 | v = rearrange(v, 'b (f p) d -> b f p d', f=F) 97 | x = torch.einsum('b l f p, b f p d -> b l f d', kernel_2, v) 98 | x = torch.einsum('b n l, b l f d -> b n f d', kernel_1, x) 99 | if return_attn: 100 | attn = torch.einsum('b m l, b l f p -> b m f p', kernel_1, kernel_2) 101 | return x, attn 102 | else: 103 | q = rearrange(q, 'b (f p) d -> (b f) p d', f=F) 104 | k = rearrange(k, 'b (g q) d -> (b g) q d', g=F) 105 | with torch.no_grad(): 106 | landmarks = orthogonal_landmarks(q, k, num_landmarks, subsample_fraction) 107 | landmarks = rearrange(landmarks, '(b f) l d -> b f l d', f=F) 108 | q = rearrange(q, '(b f) p d -> b f 1 p d', f=F) 109 | k = rearrange(k, '(b g) q d -> b 1 g q d', g=F) 110 | v = rearrange(v, 'b (g q) d -> b 1 g q d', g=F) 111 | kernel_1 = Fn.softmax( 112 | torch.matmul(q, landmarks.unsqueeze(-4).transpose(-1, -2)), dim=-1) 113 | kernel_2 = Fn.softmax( 114 | torch.matmul(landmarks.unsqueeze(-3), k.transpose(-1, -2)), dim=-1) 115 | x = torch.matmul(kernel_1, torch.matmul(kernel_2, v)) 116 | x = rearrange(x, 'b f g p d -> b (f p) g d') 117 | if return_attn: 118 | attn = torch.matmul(kernel_1, kernel_2) 119 | attn = rearrange(attn, 'b f g p q -> b (f p) g q') 120 | return x, attn 121 | 122 | return x -------------------------------------------------------------------------------- /encoder/phi.py: -------------------------------------------------------------------------------- 1 | # ReWaS 2 | # Copyright (c) 2024-present NAVER Cloud Corp. 
3 | # CC BY-NC-SA 4.0 (https://creativecommons.org/licenses/by-nc-sa/4.0/) 4 | 5 | import torch.nn as nn 6 | from einops.layers.torch import Rearrange, Reduce 7 | import einops 8 | from functools import partial 9 | from encoder.transformer import MultiheadAttention, SimpleTransformer 10 | import torch 11 | 12 | class Phi(nn.Module): 13 | def __init__(self, input_dim=768, out_dim=1, proj_dims=[768, 128, 64, 16, 1]): 14 | super().__init__() 15 | 16 | self.projection1 = nn.Sequential( 17 | nn.Linear(input_dim, input_dim), 18 | ) 19 | 20 | self.hint_blocks = SimpleTransformer( 21 | attn_target = partial( 22 | MultiheadAttention, 23 | embed_dim=input_dim, 24 | num_heads=8, 25 | bias=True, 26 | add_bias_kv=True, 27 | ), 28 | embed_dim = input_dim, 29 | num_blocks = 3, 30 | weight_init_style = "pytorch", # possible values jax or pytorch 31 | ) 32 | 33 | self.projection2 = nn.Sequential( 34 | nn.Linear(768,128), 35 | nn.ReLU(), 36 | nn.Linear(128,64), 37 | nn.ReLU(), 38 | nn.Linear(64,16), 39 | nn.ReLU(), 40 | nn.Linear(16,1), 41 | ) 42 | 43 | def forward(self, x): 44 | x = self.projection1(x) 45 | x = self.hint_blocks(x) 46 | x = self.projection2(x) 47 | return x -------------------------------------------------------------------------------- /eval_MAE.py: -------------------------------------------------------------------------------- 1 | # ReWaS 2 | # Copyright (c) 2024-present NAVER Cloud Corp. 3 | # CC BY-NC-SA 4.0 (https://creativecommons.org/licenses/by-nc-sa/4.0/) 4 | 5 | import argparse 6 | import time 7 | import numpy as np 8 | import einops 9 | import os 10 | import torch 11 | from torch.utils.data import DataLoader 12 | from audioldm.utilities.data.dataset import AudioDataset 13 | import yaml 14 | from tqdm import tqdm 15 | 16 | 17 | def get_audio(audio): 18 | audio = torch.mean(audio, axis=1) 19 | return audio 20 | 21 | def collate_fn(batch): 22 | batch = list(filter(lambda x: x is not None, batch)) 23 | return torch.utils.data.dataloader.default_collate(batch) 24 | 25 | def filter_common_keys(dict_a, dict_b): 26 | # Find the common keys between dict_a and dict_b 27 | common_keys = dict_a.keys() & dict_b.keys() 28 | 29 | # Create new dictionaries with only the common keys 30 | filtered_dict_a = {key: dict_a[key] for key in common_keys} 31 | filtered_dict_b = {key: dict_b[key] for key in common_keys} 32 | sorted_dict_a = dict(sorted(filtered_dict_a.items())) 33 | sorted_dict_b = dict(sorted(filtered_dict_b.items())) 34 | return sorted_dict_a, sorted_dict_b 35 | 36 | 37 | 38 | def main(args): 39 | batch_size = args.batch_size 40 | configs = yaml.load(open(args.config, "r"), Loader=yaml.FullLoader) 41 | dataloader_add_ons = configs["data"]["dataloader_add_ons"] 42 | 43 | generated_dataset = AudioDataset(configs, split="video_control", add_ons=dataloader_add_ons) 44 | generated_dataloader = DataLoader(generated_dataset, num_workers=8, batch_size=batch_size, shuffle=True,drop_last =True) 45 | 46 | gtdataset = AudioDataset(configs, split="gt", add_ons=dataloader_add_ons) 47 | gt_dataloader = DataLoader(gtdataset, num_workers=args.num_workers, batch_size=batch_size, shuffle=True,drop_last =True) 48 | 49 | gt_energy = {} 50 | test_energy = {} 51 | 52 | for idx, item in tqdm(enumerate(gt_dataloader)): 53 | name = str(item['fname'][0]).split("/")[-1] 54 | gt_mel = item['log_mel_spec'] 55 | energy = torch.mean(gt_mel, dim=2) 56 | gt_energy[f'{name}'] = energy 57 | 58 | for idx, item in tqdm(enumerate(generated_dataloader)): 59 | name = str(item['fname'][0]).split("/")[-1] 60 | pred_mel = 
item['log_mel_spec'] 61 | energy = torch.mean(pred_mel, dim=2) 62 | test_energy[f'{name}'] = energy 63 | 64 | gt_energy, test_energy = filter_common_keys(gt_energy, test_energy) 65 | print(gt_energy.keys()) 66 | print(test_energy.keys()) 67 | gt_energy = torch.cat(list(gt_energy.values()),dim=0) 68 | test_energy = torch.cat(list(test_energy.values()),dim=0) 69 | loss = torch.nn.L1Loss() 70 | MAE = loss(gt_energy, test_energy) 71 | print(f"###### number of samples: {gt_energy.shape[0]}") 72 | print(f"###### MAE: {MAE}") 73 | 74 | 75 | if __name__ == '__main__': 76 | parser = argparse.ArgumentParser() 77 | parser.add_argument('--device', default='cuda', type=str) 78 | parser.add_argument('--config', default='configs/audioldm_m_rewas.yaml', type=str) 79 | parser.add_argument('--batch_size', default=128, type=int) 80 | parser.add_argument('--save_path', default="outputs", type=str) 81 | parser.add_argument('--num_workers', default=16, type=int) 82 | args = parser.parse_args() 83 | main(args) 84 | -------------------------------------------------------------------------------- /evaluation/clap/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE -------------------------------------------------------------------------------- /evaluation/clap/__init__.py: -------------------------------------------------------------------------------- 1 | from . import clap 2 | from . import audio 3 | from .
import utils -------------------------------------------------------------------------------- /evaluation/clap/clap.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn.functional as F 4 | from torch import nn 5 | from transformers import AutoModel 6 | from .audio import get_audio_encoder 7 | 8 | class Projection(nn.Module): 9 | def __init__(self, d_in: int, d_out: int, p: float=0.5) -> None: 10 | super().__init__() 11 | self.linear1 = nn.Linear(d_in, d_out, bias=False) 12 | self.linear2 = nn.Linear(d_out, d_out, bias=False) 13 | self.layer_norm = nn.LayerNorm(d_out) 14 | self.drop = nn.Dropout(p) 15 | 16 | def forward(self, x: torch.Tensor) -> torch.Tensor: 17 | embed1 = self.linear1(x) 18 | embed2 = self.drop(self.linear2(F.gelu(embed1))) 19 | embeds = self.layer_norm(embed1 + embed2) 20 | return embeds 21 | 22 | class AudioEncoder(nn.Module): 23 | def __init__(self, audioenc_name:str, d_in: int, d_out: int, sample_rate: int, window_size: int, 24 | hop_size: int, mel_bins: int, fmin: int, fmax: int, classes_num: int) -> None: 25 | super().__init__() 26 | 27 | audio_encoder = get_audio_encoder(audioenc_name) 28 | 29 | self.base = audio_encoder( 30 | sample_rate, window_size, 31 | hop_size, mel_bins, fmin, fmax, 32 | classes_num, d_in) 33 | 34 | self.projection = Projection(d_in, d_out) 35 | 36 | def forward(self, x): 37 | out_dict = self.base(x) 38 | audio_features, audio_classification_output = out_dict['embedding'], out_dict['clipwise_output'] 39 | projected_vec = self.projection(audio_features) 40 | return projected_vec, audio_classification_output 41 | 42 | class TextEncoder(nn.Module): 43 | def __init__(self, d_out: int, text_model: str, transformer_embed_dim: int) -> None: 44 | super().__init__() 45 | self.base = AutoModel.from_pretrained(text_model) 46 | 47 | self.projection = Projection(transformer_embed_dim, d_out) 48 | 49 | def forward(self, x): 50 | out = self.base(**x)[0] 51 | out = out[:, 0, :] # get CLS token output 52 | projected_vec = self.projection(out) 53 | return projected_vec 54 | 55 | class CLAP(nn.Module): 56 | def __init__(self, 57 | # audio 58 | audioenc_name: str, 59 | sample_rate: int, 60 | window_size: int, 61 | hop_size: int, 62 | mel_bins: int, 63 | fmin: int, 64 | fmax: int, 65 | classes_num: int, 66 | out_emb: int, 67 | # text 68 | text_model: str, 69 | transformer_embed_dim: int, 70 | # common 71 | d_proj: int, 72 | ): 73 | super().__init__() 74 | 75 | 76 | self.audio_encoder = AudioEncoder( 77 | audioenc_name, out_emb, d_proj, 78 | sample_rate, window_size, hop_size, mel_bins, fmin, fmax, classes_num) 79 | 80 | self.caption_encoder = TextEncoder( 81 | d_proj, text_model, transformer_embed_dim 82 | ) 83 | 84 | self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07)) 85 | 86 | def forward(self, audio, text): 87 | audio_embed, _ = self.audio_encoder(audio) 88 | caption_embed = self.caption_encoder(text) 89 | 90 | return caption_embed, audio_embed, self.logit_scale.exp() -------------------------------------------------------------------------------- /evaluation/clap/clap_config.yml: -------------------------------------------------------------------------------- 1 | # TEXT ENCODER CONFIG 2 | text_model: 'bert-base-uncased' 3 | text_len: 100 4 | transformer_embed_dim: 768 5 | freeze_text_encoder_weights: True 6 | 7 | # AUDIO ENCODER CONFIG 8 | audioenc_name: 'Cnn14' 9 | out_emb: 2048 10 | sampling_rate: 44100 11 | duration: 9 12 | fmin: 50 13 | fmax: 14000 14 
| n_fft: 1028 15 | hop_size: 320 16 | mel_bins: 64 17 | window_size: 1024 18 | 19 | # PROJECTION SPACE CONFIG 20 | d_proj: 1024 21 | temperature: 0.003 22 | 23 | # TRAINING AND EVALUATION CONFIG 24 | num_classes: 527 25 | batch_size: 1024 26 | demo: False -------------------------------------------------------------------------------- /evaluation/clap/utils.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import yaml 3 | import sys 4 | 5 | def read_config_as_args(config_path,args=None,is_config_str=False): 6 | return_dict = {} 7 | 8 | if config_path is not None: 9 | if is_config_str: 10 | yml_config = yaml.load(config_path, Loader=yaml.FullLoader) 11 | else: 12 | with open(config_path, "r") as f: 13 | yml_config = yaml.load(f, Loader=yaml.FullLoader) 14 | 15 | if args != None: 16 | for k, v in yml_config.items(): 17 | if k in args.__dict__: 18 | args.__dict__[k] = v 19 | else: 20 | sys.stderr.write("Ignored unknown parameter {} in yaml.\n".format(k)) 21 | else: 22 | for k, v in yml_config.items(): 23 | return_dict[k] = v 24 | 25 | args = args if args != None else return_dict 26 | return argparse.Namespace(**args) 27 | -------------------------------------------------------------------------------- /evaluation/clap_score.py: -------------------------------------------------------------------------------- 1 | # reference: https://github.com/Text-to-Audio/Make-An-Audio/blob/main/wav_evaluation/cal_clap_score.py 2 | 3 | import pathlib 4 | import sys 5 | import os 6 | directory = pathlib.Path(os.getcwd()) 7 | sys.path.append(str(directory)) 8 | import torch 9 | import numpy as np 10 | from clap.CLAPWrapper import CLAPWrapper 11 | import argparse 12 | from tqdm import tqdm 13 | import pandas as pd 14 | import json 15 | 16 | def add_audio_path(df): 17 | df['audio_path'] = df.apply(lambda x:x['mel_path'].replace('.npy','.wav'),axis=1) 18 | return df 19 | 20 | def build_tsv_from_wavs(root_dir, dataset): 21 | 22 | wavfiles = os.listdir(root_dir) 23 | # wavfiles = list(filter(lambda x:x.endswith('.wav') and x[-6:-4]!='gt',wavfiles)) 24 | print(f'###### number of samples: {len(wavfiles)}') 25 | 26 | dict_list = [] 27 | for wavfile in wavfiles: 28 | tmpd = {'audio_path':os.path.join(root_dir, wavfile)} 29 | if dataset == 'vggsound': 30 | caption = ' '.join(wavfile.split('_')[:-1]) 31 | tmpd['caption'] = caption 32 | dict_list.append(tmpd) 33 | 34 | df = pd.DataFrame.from_dict(dict_list) 35 | tsv_path = f'{os.path.basename(root_dir)}.tsv' 36 | tsv_path = os.path.join('./tmp/', tsv_path) 37 | df.to_csv(tsv_path, sep='\t', index=False) 38 | 39 | return tsv_path 40 | 41 | def cal_score_by_tsv(tsv_path, clap_model, cutoff=5): 42 | df = pd.read_csv(tsv_path, sep='\t') 43 | clap_scores = [] 44 | if not ('audio_path' in df.columns): 45 | df = add_audio_path(df) 46 | caption_list,audio_list = [],[] 47 | with torch.no_grad(): 48 | for idx,t in enumerate(tqdm(df.itertuples()), start=1): 49 | caption_list.append(getattr(t,'caption')) 50 | audio_list.append(getattr(t,'audio_path')) 51 | if idx % 20 == 0: 52 | text_embeddings = clap_model.get_text_embeddings(caption_list) 53 | audio_embeddings = clap_model.get_audio_embeddings(audio_list, resample=True, cutoff=5) 54 | score_mat = clap_model.compute_similarity(audio_embeddings, text_embeddings,use_logit_scale=False) 55 | score = score_mat.diagonal() 56 | clap_scores.append(score.cpu().numpy()) 57 | audio_list = [] 58 | caption_list = [] 59 | return np.mean(np.array(clap_scores).flatten()) 60 | 61 | def 
add_clap_score_to_tsv(tsv_path, clap_model): 62 | df = pd.read_csv(tsv_path,sep='\t') 63 | clap_scores_dict = {} 64 | with torch.no_grad(): 65 | for idx,t in enumerate(tqdm(df.itertuples()),start=1): 66 | text_embeddings = clap_model.get_text_embeddings([getattr(t,'caption')]) # embeddings are already normalized 67 | audio_embeddings = clap_model.get_audio_embeddings([getattr(t,'audio_path')], resample=True) 68 | score = clap_model.compute_similarity(audio_embeddings, text_embeddings,use_logit_scale=False) 69 | clap_scores_dict[idx] = score.cpu().numpy() 70 | df['clap_score'] = clap_scores_dict 71 | df.to_csv(tsv_path[:-4]+'_clap.tsv',sep='\t',index=False) 72 | 73 | 74 | if __name__ == '__main__': 75 | parser = argparse.ArgumentParser() 76 | parser.add_argument('--dataset', type=str, default='vggsound') 77 | parser.add_argument('--tsv_path', type=str, default='') 78 | parser.add_argument('--wav_dir', type=str) 79 | parser.add_argument('--mean', type=bool, default=True) 80 | parser.add_argument('--ckpt_path', default="clap") 81 | args = parser.parse_args() 82 | 83 | if args.tsv_path: 84 | tsv_path = args.tsv_path 85 | else: 86 | tsv_path = os.path.join('./tmp/', f'{os.path.basename(args.wav_dir)}.tsv') 87 | 88 | if not os.path.exists(tsv_path): 89 | print("Result tsv does not exist, building it") 90 | tsv_path = build_tsv_from_wavs(args.wav_dir, args.dataset) 91 | 92 | clap_model = CLAPWrapper( 93 | os.path.join(args.ckpt_path, 'CLAP_weights_2022.pth'), 94 | os.path.join(args.ckpt_path, 'clap_config.yml'), 95 | use_cuda=True) 96 | 97 | clap_score = cal_score_by_tsv(tsv_path, clap_model, cutoff=5) 98 | out = args.wav_dir if args.wav_dir else args.tsv_path 99 | 100 | print(f"Clap score for {out} is: {clap_score}") -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pytorch-lightning 2 | scipy 3 | share==1.0.4 4 | taming-transformers==0.0.1 5 | torch 6 | torchaudio 7 | torchlibrosa 8 | torchmetrics 9 | tqdm 10 | transformers 11 | omegaconf 12 | h5py 13 | braceexpand 14 | webdataset 15 | progressbar 16 | timm 17 | moviepy 18 | wget 19 | numpy -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | # ReWaS 2 | # Copyright (c) 2024-present NAVER Cloud Corp. 3 | # CC BY-NC-SA 4.0 (https://creativecommons.org/licenses/by-nc-sa/4.0/) 4 | 5 | import os 6 | 7 | from audioldm.pipeline import rewas_generation, build_control_model 8 | 9 | import json 10 | import argparse 11 | import pandas as pd 12 | from random import shuffle 13 | from omegaconf import OmegaConf 14 | from tqdm import tqdm 15 | 16 | import warnings 17 | warnings.filterwarnings("ignore") 18 | 19 | import torch 20 | import torch.nn as nn 21 | from torch.utils.data.distributed import DistributedSampler 22 | 23 | from utils import seed_everything 24 | from encoder.encoder_utils import patch_config, get_pretrained 25 | from encoder.phi import Phi 26 | 27 | 28 | def main(args): 29 | 30 | seed = args.seed 31 | seed_everything(seed) 32 | 33 | assert os.path.isfile(args.ckpt_path), "check checkpoints in ckpt_path!"
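    # The calls below set up the ReWaS inference stack: build_control_model loads the
    # ControlNet-style AudioLDM from the ReWaS checkpoint, the Synchformer config
    # (configs/cfg-<exp>.yaml) and pretrained weights (get_pretrained) provide the video
    # feature extractor, and Phi (encoder/phi.py) projects the video features (768-d by
    # default) to the 1-d energy control signal that rewas_generation consumes alongside
    # the text prompt.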
34 | 35 | control_type = args.control_type 36 | save_path = args.save_path 37 | os.makedirs(save_path, exist_ok=True) 38 | 39 | audioldm_control = build_control_model( 40 | ckpt_path = args.ckpt_path, 41 | control_type = args.control_type, 42 | config = args.config, 43 | model_name = args.model_name) 44 | 45 | cfg_path = f'./configs/cfg-{args.synchformer_exp}.yaml' 46 | synchformer_cfg = OmegaConf.load(cfg_path) 47 | synchformer_cfg = patch_config(synchformer_cfg) 48 | 49 | 50 | video_encoder = get_pretrained(args.synchformer_exp, 0) 51 | 52 | phi = Phi() 53 | resume_params = torch.load(args.phi_ckpt_path) 54 | resume_new = {k.replace("module.",""): v for k, v in resume_params.items()} 55 | phi.load_state_dict(resume_new) 56 | 57 | phi.eval() 58 | phi = nn.DataParallel(phi, device_ids=[i for i in range(torch.cuda.device_count())]) 59 | 60 | print(f'Generate data list: {args.testlist}') 61 | 62 | with open(args.testlist, 'rb') as f: 63 | datalist = list(map(json.loads, f)) 64 | 65 | 66 | for x in tqdm(datalist): 67 | prompt = x['prompt'] 68 | videopath = x['video_name'] 69 | 70 | waveform = rewas_generation( 71 | audioldm_control, 72 | prompt, 73 | videopath, 74 | args.control_type, 75 | args.synchformer_exp, 76 | synchformer_cfg, 77 | video_encoder, 78 | phi, 79 | args.file_path, 80 | seed, 81 | duration=args.duration, 82 | guidance_scale=args.guidance_scale, 83 | ddim_steps=args.ddim_steps, 84 | n_candidate_gen_per_text=args.n_candidate_gen_per_cond, 85 | batchsize=args.batchsize, 86 | save_path=save_path, 87 | re_encode=args.re_encode, 88 | local_rank=0 89 | ) 90 | 91 | if args.re_encode: 92 | os.rmdir('.cache/') 93 | 94 | if __name__ == '__main__': 95 | 96 | parser = argparse.ArgumentParser() 97 | 98 | 99 | parser.add_argument( 100 | "--testlist", 101 | type=str, 102 | default="test_samples.json", 103 | ) 104 | 105 | parser.add_argument( 106 | "--datadir", 107 | type=str, 108 | default="/path/to/video", 109 | ) 110 | 111 | parser.add_argument( 112 | "-f", 113 | "--file_path", 114 | type=str, 115 | default=None, 116 | ) 117 | 118 | parser.add_argument( 119 | "-s", 120 | "--save_path", 121 | type=str, 122 | help="The path to save model output", 123 | default="./results", 124 | ) 125 | 126 | parser.add_argument( 127 | "--model_name", 128 | type=str, 129 | help="The checkpoint you are going to use", 130 | default="audioldm-m-full", 131 | ) 132 | 133 | parser.add_argument( 134 | "-ckpt", 135 | "--ckpt_path", 136 | type=str, 137 | help="The path to the pretrained .ckpt model", 138 | default="ckpts/audioldm_m_rewas_vggsound.ckpt", 139 | ) 140 | 141 | parser.add_argument( 142 | "--phi_ckpt_path", 143 | type=str, 144 | help="The path to the pretrained Phi projector .ckpt", 145 | default="ckpts/phi_vggsound.ckpt", 146 | ) 147 | 148 | parser.add_argument( 149 | "--synchformer_exp", 150 | type=str, 151 | help="The name of the Synchformer experiment", 152 | default="24-01-04T16-39-21", 153 | ) 154 | 155 | parser.add_argument( 156 | "-b", 157 | "--batchsize", 158 | type=int, 159 | default=1, 160 | help="How many samples to generate at the same time", 161 | ) 162 | 163 | parser.add_argument( 164 | "--ddim_steps", 165 | type=int, 166 | default=200, 167 | help="The number of DDIM sampling steps", 168 | ) 169 | 170 | parser.add_argument( 171 | "-gs", 172 | "--guidance_scale", 173 | type=float, 174 | default=3, 175 | help="Guidance scale (Large => better quality and relevance to text; Small => better diversity)", 176 | ) 177 | 178 | parser.add_argument( 179 | "-dur", 180 | "--duration", 181 | type=float, 182 | default=5.0,
183 | help="The duration of the samples", 184 | ) 185 | 186 | parser.add_argument( 187 | "-n", 188 | "--n_candidate_gen_per_cond", 189 | type=int, 190 | default=1, 191 | help="The number of generated samples per condition. A larger value usually leads to better quality with heavier computation", 192 | ) 193 | 194 | parser.add_argument( 195 | "--seed", 196 | type=int, 197 | default=42, 198 | help="Changing this value (any integer) will lead to a different generation result.", 199 | ) 200 | 201 | parser.add_argument( 202 | "--config", 203 | type=str, 204 | default="configs/audioldm_m_rewas.yaml", 205 | ) 206 | 207 | 208 | parser.add_argument( 209 | "--control_type", 210 | type=str, 211 | default="energy_video", 212 | choices=["energy_audio", "energy_video"] 213 | ) 214 | 215 | parser.add_argument('--re_encode', action='store_true') 216 | 217 | 218 | args = parser.parse_args() 219 | 220 | main(args) 221 | 222 | 223 | -------------------------------------------------------------------------------- /test_samples.json: -------------------------------------------------------------------------------- 1 | {"video_name": "./basketball_bounce.mp4", "prompt": "basketball bounce"} 2 | -------------------------------------------------------------------------------- /tool_add_adapter.py: -------------------------------------------------------------------------------- 1 | # reference: https://github.com/lllyasviel/ControlNet/blob/main/tool_add_control.py 2 | 3 | import sys 4 | import os 5 | import argparse 6 | import torch 7 | from omegaconf import OmegaConf 8 | from encoder.encoder_utils import instantiate_from_config 9 | 10 | 11 | def get_node_name(name, parent_name): 12 | if len(name) <= len(parent_name): 13 | return False, '' 14 | p = name[:len(parent_name)] 15 | if p != parent_name: 16 | return False, '' 17 | return True, name[len(parent_name):] 18 | 19 | def main(): 20 | parser = argparse.ArgumentParser(description="Adding adapter architecture to audioldm.") 21 | parser.add_argument("--input_path", type=str, required=True, help="path to pretrained model weights") 22 | parser.add_argument("--output_path", type=str, required=True, help="path to save output model weights") 23 | parser.add_argument("--config_path", type=str, default="configs/audioldm_m_rewas.yaml") 24 | args = parser.parse_args() 25 | 26 | assert os.path.exists(args.input_path), 'Input model does not exist.' 27 | assert not os.path.exists(args.output_path), 'Output filename already exists.'
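    # The remainder of main() instantiates the ReWaS model from the YAML config and
    # initializes it from the pretrained AudioLDM weights: keys starting with
    # 'control_' are filled from the matching 'model.diffusion_' entries
    # (a ControlNet-style copy of the UNet weights), every other key is copied by
    # name, and keys absent from the checkpoint keep their scratch initialization.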
28 | 29 | config = OmegaConf.load(args.config_path) 30 | model = instantiate_from_config(config.model).cpu() 31 | print(f'Loaded model config from [{args.config_path}]') 32 | 33 | pretrained_weights = torch.load(args.input_path) 34 | if 'state_dict' in pretrained_weights: 35 | pretrained_weights = pretrained_weights['state_dict'] 36 | 37 | scratch_dict = model.state_dict() 38 | 39 | target_dict = {} 40 | for k in scratch_dict.keys(): 41 | is_control, name = get_node_name(k, 'control_') 42 | if is_control: 43 | copy_k = 'model.diffusion_' + name 44 | print(f'control add: {copy_k}') 45 | else: 46 | copy_k = k 47 | if copy_k in pretrained_weights: 48 | target_dict[k] = pretrained_weights[copy_k].clone() 49 | else: 50 | target_dict[k] = scratch_dict[k].clone() 51 | 52 | model.load_state_dict(target_dict, strict=True) 53 | torch.save(model.state_dict(), args.output_path) 54 | 55 | print(f'Model saved in {args.output_path}') 56 | 57 | if __name__ == "__main__": 58 | main() -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | # reference: https://github.com/haoheliu/AudioLDM/blob/main/audioldm/utils.py 2 | 3 | import subprocess 4 | import json 5 | import os 6 | import soundfile as sf 7 | 8 | 9 | import torch 10 | import torchvision 11 | import torchaudio 12 | 13 | def default_audioldm_config(model_name="audioldm-s-full"): 14 | basic_config = { 15 | "wave_file_save_path": "./output", 16 | "id": { 17 | "version": "v1", 18 | "name": "default", 19 | "root": "/mnt/fast/nobackup/users/hl01486/projects/general_audio_generation/AudioLDM-python/config/default/latent_diffusion.yaml", 20 | }, 21 | "preprocessing": { 22 | "audio": {"sampling_rate": 16000, "max_wav_value": 32768}, 23 | "stft": {"filter_length": 1024, "hop_length": 160, "win_length": 1024}, 24 | "mel": { 25 | "n_mel_channels": 64, 26 | "mel_fmin": 0, 27 | "mel_fmax": 8000, 28 | "freqm": 0, 29 | "timem": 0, 30 | "blur": False, 31 | "mean": -4.63, 32 | "std": 2.74, 33 | "target_length": 1024, 34 | }, 35 | }, 36 | "model": { 37 | "device": "cuda", 38 | "target": "audioldm.pipline.LatentDiffusion", 39 | "params": { 40 | "base_learning_rate": 5e-06, 41 | "linear_start": 0.0015, 42 | "linear_end": 0.0195, 43 | "num_timesteps_cond": 1, 44 | "log_every_t": 200, 45 | "timesteps": 1000, 46 | "first_stage_key": "fbank", 47 | "cond_stage_key": "waveform", 48 | "latent_t_size": 256, 49 | "latent_f_size": 16, 50 | "channels": 8, 51 | "cond_stage_trainable": True, 52 | "conditioning_key": "film", 53 | "monitor": "val/loss_simple_ema", 54 | "scale_by_std": True, 55 | "unet_config": { 56 | "target": "audioldm.latent_diffusion.openaimodel.UNetModel", 57 | "params": { 58 | "image_size": 64, 59 | "extra_film_condition_dim": 512, 60 | "extra_film_use_concat": True, 61 | "in_channels": 8, 62 | "out_channels": 8, 63 | "model_channels": 128, 64 | "attention_resolutions": [8, 4, 2], 65 | "num_res_blocks": 2, 66 | "channel_mult": [1, 2, 3, 5], 67 | "num_head_channels": 32, 68 | "use_spatial_transformer": True, 69 | }, 70 | }, 71 | "first_stage_config": { 72 | "base_learning_rate": 4.5e-05, 73 | "target": "audioldm.variational_autoencoder.autoencoder.AutoencoderKL", 74 | "params": { 75 | "monitor": "val/rec_loss", 76 | "image_key": "fbank", 77 | "subband": 1, 78 | "embed_dim": 8, 79 | "time_shuffle": 1, 80 | "ddconfig": { 81 | "double_z": True, 82 | "z_channels": 8, 83 | "resolution": 256, 84 | "downsample_time": False, 85 | "in_channels": 1, 86 | 
"out_ch": 1, 87 | "ch": 128, 88 | "ch_mult": [1, 2, 4], 89 | "num_res_blocks": 2, 90 | "attn_resolutions": [], 91 | "dropout": 0.0, 92 | }, 93 | }, 94 | }, 95 | "cond_stage_config": { 96 | "target": "audioldm.clap.encoders.CLAPAudioEmbeddingClassifierFreev2", 97 | "params": { 98 | "key": "waveform", 99 | "sampling_rate": 16000, 100 | "embed_mode": "audio", 101 | "unconditional_prob": 0.1, 102 | }, 103 | }, 104 | }, 105 | }, 106 | } 107 | 108 | if("-l-" in model_name): 109 | basic_config["model"]["params"]["unet_config"]["params"]["model_channels"] = 256 110 | basic_config["model"]["params"]["unet_config"]["params"]["num_head_channels"] = 64 111 | elif("-m-" in model_name): 112 | basic_config["model"]["params"]["unet_config"]["params"]["model_channels"] = 192 113 | basic_config["model"]["params"]["cond_stage_config"]["params"]["amodel"] = "HTSAT-base" # This model use a larger HTAST 114 | 115 | return basic_config 116 | 117 | 118 | def load_json(fname): 119 | with open(fname, "r") as f: 120 | data = json.load(f) 121 | return data 122 | 123 | 124 | def read_json(dataset_json_file): 125 | with open(dataset_json_file, "r") as fp: 126 | data_json = json.load(fp) 127 | return data_json["data"] 128 | 129 | 130 | def seed_everything(seed): 131 | import random, os 132 | import numpy as np 133 | import torch 134 | 135 | random.seed(seed) 136 | os.environ["PYTHONHASHSEED"] = str(seed) 137 | np.random.seed(seed) 138 | torch.manual_seed(seed) 139 | torch.cuda.manual_seed(seed) 140 | torch.backends.cudnn.deterministic = True 141 | torch.backends.cudnn.benchmark = True 142 | --------------------------------------------------------------------------------