├── .gitignore ├── LICENSE ├── NOTICE ├── README.md ├── audioldm ├── LICENSE ├── README.md ├── clap │ ├── LICENSE │ ├── __init__.py │ ├── open_clip │ │ ├── __init__.py │ │ ├── bert.py │ │ ├── bpe_simple_vocab_16e6.txt.gz │ │ ├── factory.py │ │ ├── feature_fusion.py │ │ ├── htsat.py │ │ ├── linear_probe.py │ │ ├── loss.py │ │ ├── model.py │ │ ├── model_configs │ │ │ ├── HTSAT-base.json │ │ │ ├── HTSAT-large.json │ │ │ ├── HTSAT-tiny-win-1536.json │ │ │ ├── HTSAT-tiny.json │ │ │ ├── PANN-10.json │ │ │ ├── PANN-14-fmax-18k.json │ │ │ ├── PANN-14-fmax-8k-20s.json │ │ │ ├── PANN-14-tiny-transformer.json │ │ │ ├── PANN-14-win-1536.json │ │ │ ├── PANN-14.json │ │ │ ├── PANN-6.json │ │ │ ├── RN101-quickgelu.json │ │ │ ├── RN101.json │ │ │ ├── RN50-quickgelu.json │ │ │ ├── RN50.json │ │ │ ├── RN50x16.json │ │ │ ├── RN50x4.json │ │ │ ├── ViT-B-16.json │ │ │ ├── ViT-B-32-quickgelu.json │ │ │ ├── ViT-B-32.json │ │ │ └── ViT-L-14.json │ │ ├── openai.py │ │ ├── pann_model.py │ │ ├── pretrained.py │ │ ├── timm_model.py │ │ ├── tokenizer.py │ │ ├── transform.py │ │ ├── utils.py │ │ └── version.py │ └── training │ │ ├── __init__.py │ │ ├── audioset_textmap.npy │ │ ├── data.py │ │ ├── distributed.py │ │ ├── imagenet_zeroshot_data.py │ │ ├── infer_demo.py │ │ ├── logger.py │ │ ├── lp_main.py │ │ ├── lp_train.py │ │ ├── main.py │ │ ├── params.py │ │ ├── scheduler.py │ │ ├── train.py │ │ └── zero_shot.py ├── conditional_models.py ├── diffusionmodules │ ├── __init__.py │ ├── attention.py │ ├── distributions.py │ ├── ema.py │ ├── model.py │ ├── nn.py │ ├── openaimodel.py │ └── x_transformer.py ├── hifigan │ ├── LICENSE │ ├── __init__.py │ └── models.py ├── latent_diffusion │ ├── __init__.py │ ├── ddim.py │ ├── ddpm.py │ ├── dpm_solver │ │ ├── __init__.py │ │ ├── dpm_solver.py │ │ └── sampler.py │ └── plms.py ├── latent_encoder │ ├── __init__.py │ └── autoencoder.py ├── losses │ ├── __init__.py │ └── contperceptual.py ├── pipeline.py ├── rewas.py └── utilities │ ├── __init__.py │ ├── audio │ ├── __init__.py │ ├── audio_processing.py │ ├── stft.py │ └── tools.py │ ├── data │ ├── __init__.py │ ├── dataset.py │ └── utils.py │ ├── diffusion_util.py │ ├── model_util.py │ ├── sampler_util.py │ └── tools.py ├── basketball_bounce.mp4 ├── configs ├── audioldm_m_rewas.yaml ├── cfg-24-01-04T16-39-21.yaml └── dataset_root.json ├── encoder ├── LICENSE ├── README.md ├── encoder_utils.py ├── model │ ├── .DS_Store │ ├── modules │ │ ├── .DS_Store │ │ ├── bridges.py │ │ ├── feat_extractors │ │ │ ├── .DS_Store │ │ │ ├── audio │ │ │ │ ├── ast.py │ │ │ │ ├── hf_src │ │ │ │ │ └── modeling_ast.py │ │ │ │ └── resnet.py │ │ │ ├── train_clip_src │ │ │ │ ├── __init__.py │ │ │ │ ├── open_clip │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── bpe_simple_vocab_16e6.txt.gz │ │ │ │ │ ├── coca_model.py │ │ │ │ │ ├── constants.py │ │ │ │ │ ├── factory.py │ │ │ │ │ ├── generation_utils.py │ │ │ │ │ ├── hf_configs.py │ │ │ │ │ ├── hf_model.py │ │ │ │ │ ├── loss.py │ │ │ │ │ ├── model.py │ │ │ │ │ ├── model_configs │ │ │ │ │ │ ├── RN101-quickgelu.json │ │ │ │ │ │ ├── RN101.json │ │ │ │ │ │ ├── RN50-quickgelu.json │ │ │ │ │ │ ├── RN50.json │ │ │ │ │ │ ├── RN50x16.json │ │ │ │ │ │ ├── RN50x4.json │ │ │ │ │ │ ├── RN50x64.json │ │ │ │ │ │ ├── ViT-B-16-plus-240.json │ │ │ │ │ │ ├── ViT-B-16-plus.json │ │ │ │ │ │ ├── ViT-B-16.json │ │ │ │ │ │ ├── ViT-B-32-plus-256.json │ │ │ │ │ │ ├── ViT-B-32-quickgelu.json │ │ │ │ │ │ ├── ViT-B-32.json │ │ │ │ │ │ ├── ViT-H-14.json │ │ │ │ │ │ ├── ViT-H-16.json │ │ │ │ │ │ ├── ViT-L-14-280.json │ │ │ │ │ │ ├── ViT-L-14-336.json │ 
│ │ │ │ │ ├── ViT-L-14.json │ │ │ │ │ │ ├── ViT-L-16-320.json │ │ │ │ │ │ ├── ViT-L-16.json │ │ │ │ │ │ ├── ViT-M-16-alt.json │ │ │ │ │ │ ├── ViT-M-16.json │ │ │ │ │ │ ├── ViT-M-32-alt.json │ │ │ │ │ │ ├── ViT-M-32.json │ │ │ │ │ │ ├── ViT-S-16-alt.json │ │ │ │ │ │ ├── ViT-S-16.json │ │ │ │ │ │ ├── ViT-S-32-alt.json │ │ │ │ │ │ ├── ViT-S-32.json │ │ │ │ │ │ ├── ViT-bigG-14.json │ │ │ │ │ │ ├── ViT-e-14.json │ │ │ │ │ │ ├── ViT-g-14.json │ │ │ │ │ │ ├── coca_ViT-B-32.json │ │ │ │ │ │ ├── coca_ViT-L-14.json │ │ │ │ │ │ ├── coca_base.json │ │ │ │ │ │ ├── coca_roberta-ViT-B-32.json │ │ │ │ │ │ ├── convnext_base.json │ │ │ │ │ │ ├── convnext_base_w.json │ │ │ │ │ │ ├── convnext_base_w_320.json │ │ │ │ │ │ ├── convnext_large.json │ │ │ │ │ │ ├── convnext_large_d.json │ │ │ │ │ │ ├── convnext_large_d_320.json │ │ │ │ │ │ ├── convnext_small.json │ │ │ │ │ │ ├── convnext_tiny.json │ │ │ │ │ │ ├── convnext_xlarge.json │ │ │ │ │ │ ├── convnext_xxlarge.json │ │ │ │ │ │ ├── convnext_xxlarge_320.json │ │ │ │ │ │ ├── mt5-base-ViT-B-32.json │ │ │ │ │ │ ├── mt5-xl-ViT-H-14.json │ │ │ │ │ │ ├── roberta-ViT-B-32.json │ │ │ │ │ │ ├── swin_base_patch4_window7_224.json │ │ │ │ │ │ ├── vit_medium_patch16_gap_256.json │ │ │ │ │ │ ├── vit_relpos_medium_patch16_cls_224.json │ │ │ │ │ │ ├── xlm-roberta-base-ViT-B-32.json │ │ │ │ │ │ └── xlm-roberta-large-ViT-H-14.json │ │ │ │ │ ├── modified_resnet.py │ │ │ │ │ ├── openai.py │ │ │ │ │ ├── pretrained.py │ │ │ │ │ ├── push_to_hf_hub.py │ │ │ │ │ ├── timm_model.py │ │ │ │ │ ├── tokenizer.py │ │ │ │ │ ├── transform.py │ │ │ │ │ ├── transformer.py │ │ │ │ │ ├── utils.py │ │ │ │ │ └── version.py │ │ │ │ └── training │ │ │ │ │ ├── .gitignore │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── data.py │ │ │ │ │ ├── distributed.py │ │ │ │ │ ├── file_utils.py │ │ │ │ │ ├── imagenet_zeroshot_data.py │ │ │ │ │ ├── logger.py │ │ │ │ │ ├── params.py │ │ │ │ │ ├── precision.py │ │ │ │ │ ├── profile.py │ │ │ │ │ ├── scheduler.py │ │ │ │ │ ├── train.py │ │ │ │ │ ├── train_clip.py │ │ │ │ │ └── zero_shot.py │ │ │ └── visual │ │ │ │ ├── __init__.py │ │ │ │ ├── motionformer.py │ │ │ │ ├── motionformer_src │ │ │ │ ├── divided_224_16x4.yaml │ │ │ │ ├── joint_224_16x4.yaml │ │ │ │ ├── motionformer_224_16x4.yaml │ │ │ │ ├── nystrom_helper.py │ │ │ │ ├── orthoformer_helper.py │ │ │ │ ├── performer_helper.py │ │ │ │ ├── video_model_builder.py │ │ │ │ └── vit_helper.py │ │ │ │ └── s3d.py │ │ └── transformer.py │ └── sync_model.py ├── phi.py ├── transformer.py └── transforms.py ├── eval_MAE.py ├── evaluation ├── av_align_score.py ├── clap │ ├── CLAPWrapper.py │ ├── LICENSE │ ├── __init__.py │ ├── audio.py │ ├── clap.py │ ├── clap_config.yml │ └── utils.py └── clap_score.py ├── requirements.txt ├── test.py ├── test_samples.json ├── tool_add_adapter.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | ckpts/ 3 | *.ckpt 4 | *.pt 5 | *.pth 6 | evaluation/clap/CLAP_weights_2022.pth 7 | output/ 8 | taming/ 9 | evaluation/tmp/ 10 | logs/ 11 | results/ 12 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## [AAAI'25] Read, Watch and Scream! 
Sound Generation from Text and Video

[![arXiv](https://img.shields.io/badge/arXiv%20papr-2407.05551-b31b1b.svg)](https://arxiv.org/abs/2407.05551)
[![Samples](https://img.shields.io/badge/Demo-Link-blue.svg)](https://naver-ai.github.io/rewas/)

[Yujin Jeong](https://eugene6923.github.io/)  [Yunji Kim](https://github.com/YunjiKim)  [Sanghyuk Chun](https://sanghyukchun.github.io/home/)  [Jiyoung Lee](https://lee-jiyoung.github.io/)

NAVER AI Lab

---
### Updates
- (12/2024) Our paper is accepted at AAAI 2025!
- (10/2024) We release the official code!

---

### Abstract

Multimodal generative models have shown impressive advances with the help of powerful diffusion models.
Despite the progress, generating sound solely from text poses challenges in ensuring comprehensive scene depiction and temporal alignment.
Meanwhile, video-to-sound generation limits the flexibility to prioritize sound synthesis for specific objects within the scene.
To tackle these challenges, we propose a novel video-and-text-to-sound generation method, called **ReWaS**, where video serves as a conditional control for a text-to-audio generation model.
Our method estimates the structural information of audio (namely, energy) from the video while receiving key content cues from a user prompt.
We employ a well-performing text-to-sound model to consolidate the video control, which is much more efficient for training multimodal diffusion models with massive triplet-paired (audio-video-text) data.
In addition, by separating the generative components of audio, it becomes a more flexible system that allows users to freely adjust the energy, surrounding environment, and primary sound source according to their preferences.
Experimental results demonstrate that our method shows superiority in terms of quality, controllability, and training efficiency.


## ReWaS

### Prepare Python running environment

```shell
git clone https://github.com/naver-ai/rewas.git
# Install running environment
sudo apt-get update
sudo apt-get install -y python3-tk
sudo apt-get install -y ffmpeg
pip install -r requirements.txt
```

If the code raises the error `No module named 'pytorch_lightning.utilities.rank_zero'`, please upgrade `pytorch-lightning`.

### Download checkpoints

1. Download the checkpoints from [this link](https://huggingface.co/lee-j/ReWaS/tree/main), which contains the parameters of ReWaS (AudioLDM-M) and phi.

2. Download the checkpoints of the pretrained Synchformer, VAE, CLAP, 16kHz HiFiGAN, and 48kHz HiFiGAN from [Synchformer](https://github.com/v-iashin/Synchformer?tab=readme-ov-file#audio-visual-synchronization-models) and [AudioLDM-training](https://github.com/haoheliu/AudioLDM-training-finetuning?tab=readme-ov-file#download-checkpoints-and-dataset).

```shell
ckpts/
    vae_mel_16k_64bins.ckpt
    hifigan_16k_64bins.ckpt
    clap_music_speech_audioset_epoch_15_esc_89.98.pt
    24-01-04T16-39-21.pt
    phi_vggsound.ckpt
    audioldm_m_rewas_vggsound.ckpt
```

### Test ReWaS
Insert the video path and the text prompt for which you want to generate audio into `test_samples.json`.
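The exact schema of `test_samples.json` is defined by `test.py` and the copy of the file shipped with the repository; the sketch below is purely illustrative, and the field names in it are hypothetical. Each entry pairs a video path with a prompt describing the sound to generate:

```json
[
    {
        "video_path": "basketball_bounce.mp4",
        "caption": "a basketball bouncing on a hardwood court"
    }
]
```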

Use the following syntax:

```shell
python test.py \
    -ckpt ckpts/rewas.ckpt \
    --config configs/audioldm_m_rewas.yaml \
    --control_type energy_video \
    --save_path outputs \
    --testlist 'test_samples.json'
```

### Evaluate model

We recommend the following evaluation metrics.

1. **Energy MAE**: `./eval_MAE.py`
2. [**Melception Audio Quality**](https://github.com/v-iashin/SpecVQGAN/blob/main/evaluate.py)
3. [**CLAP score**](https://github.com/Text-to-Audio/Make-An-Audio/tree/main/wav_evaluation)
    - Download the CLAP weights from [Hugging Face](https://huggingface.co/microsoft/msclap/blob/main/CLAP_weights_2022.pth) into `evaluation/clap/CLAP_weights_2022.pth`
    ```shell
    cd evaluation;
    python clap_score.py
    ```
    - Requirements: `transformers>=4.28.1`
4. [**Onset Accuracy**](https://github.com/XYPB/CondFoleyGen/blob/main/predict_onset.py)
5. [**AV-align**](https://github.com/guyyariv/TempoTokens/blob/master/av_align.py)
    ```shell
    cd evaluation;
    python av_align_score.py --input_video_dir='/path/to/vggsound_video' --input_wav_dir='results/' --cache_path='./video_cache.json'
    ```

### Customizing
If you want to build a new ReWaS or apply it to another text-to-audio model, you can use `tool_add_adapter.py`.


## BibTeX

```
@article{jeong2024read,
  author  = {Jeong, Yujin and Kim, Yunji and Chun, Sanghyuk and Lee, Jiyoung},
  title   = {Read, Watch and Scream! Sound Generation from Text and Video},
  journal = {arXiv preprint arXiv:2407.05551},
  year    = {2024},
}
```

## License
```
ReWaS
Copyright (c) 2024-present NAVER Cloud Corp.
CC BY-NC-SA 4.0 (https://creativecommons.org/licenses/by-nc-sa/4.0/)
```

## Reference
We greatly appreciate the open-sourcing of the following code bases. Open-source code is the real-world infinity stone 💎!
- https://github.com/haoheliu/AudioLDM-training-finetuning
- https://github.com/lllyasviel/ControlNet
- https://github.com/v-iashin/Synchformer

--------------------------------------------------------------------------------
/audioldm/README.md:
--------------------------------------------------------------------------------
### Reference

Part of the code is borrowed from the following repos.
4 | 5 | https://github.com/haoheliu/AudioLDM 6 | -------------------------------------------------------------------------------- /audioldm/clap/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/naver-ai/rewas/4e09954d386583a4e62e7a1e28c2145ca1cfc411/audioldm/clap/__init__.py -------------------------------------------------------------------------------- /audioldm/clap/open_clip/__init__.py: -------------------------------------------------------------------------------- 1 | from .factory import ( 2 | list_models, 3 | create_model, 4 | create_model_and_transforms, 5 | add_model_config, 6 | ) 7 | from .loss import ClipLoss, gather_features, LPLoss, lp_gather_features, LPMetrics 8 | from .model import ( 9 | CLAP, 10 | CLAPTextCfg, 11 | CLAPVisionCfg, 12 | CLAPAudioCfp, 13 | convert_weights_to_fp16, 14 | trace_model, 15 | ) 16 | from .openai import load_openai_model, list_openai_models 17 | from .pretrained import ( 18 | list_pretrained, 19 | list_pretrained_tag_models, 20 | list_pretrained_model_tags, 21 | get_pretrained_url, 22 | download_pretrained, 23 | ) 24 | from .tokenizer import SimpleTokenizer, tokenize 25 | from .transform import image_transform 26 | -------------------------------------------------------------------------------- /audioldm/clap/open_clip/bert.py: -------------------------------------------------------------------------------- 1 | from transformers import BertTokenizer, BertModel 2 | 3 | tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") 4 | model = BertModel.from_pretrained("bert-base-uncased") 5 | text = "Replace me by any text you'd like." 6 | 7 | 8 | def bert_embeddings(text): 9 | # text = "Replace me by any text you'd like." 10 | encoded_input = tokenizer(text, return_tensors="pt") 11 | output = model(**encoded_input) 12 | return output 13 | 14 | 15 | from transformers import RobertaTokenizer, RobertaModel 16 | 17 | tokenizer = RobertaTokenizer.from_pretrained("roberta-base") 18 | model = RobertaModel.from_pretrained("roberta-base") 19 | text = "Replace me by any text you'd like." 20 | 21 | 22 | def Roberta_embeddings(text): 23 | # text = "Replace me by any text you'd like." 24 | encoded_input = tokenizer(text, return_tensors="pt") 25 | output = model(**encoded_input) 26 | return output 27 | 28 | 29 | from transformers import BartTokenizer, BartModel 30 | 31 | tokenizer = BartTokenizer.from_pretrained("facebook/bart-base") 32 | model = BartModel.from_pretrained("facebook/bart-base") 33 | text = "Replace me by any text you'd like." 34 | 35 | 36 | def bart_embeddings(text): 37 | # text = "Replace me by any text you'd like." 
38 | encoded_input = tokenizer(text, return_tensors="pt") 39 | output = model(**encoded_input) 40 | return output 41 | -------------------------------------------------------------------------------- /audioldm/clap/open_clip/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/naver-ai/rewas/4e09954d386583a4e62e7a1e28c2145ca1cfc411/audioldm/clap/open_clip/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /audioldm/clap/open_clip/linear_probe.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch.nn.functional as F 3 | from torch import nn 4 | from .model import MLPLayers 5 | 6 | 7 | class LinearProbe(nn.Module): 8 | def __init__(self, model, mlp, freeze, in_ch, out_ch, act=None): 9 | """ 10 | Args: 11 | model: nn.Module 12 | mlp: bool, if True, then use the MLP layer as the linear probe module 13 | freeze: bool, if Ture, then freeze all the CLAP model's layers when training the linear probe 14 | in_ch: int, the output channel from CLAP model 15 | out_ch: int, the output channel from linear probe (class_num) 16 | act: torch.nn.functional, the activation function before the loss function 17 | """ 18 | super().__init__() 19 | in_ch = 512 20 | self.clap_model = model 21 | self.clap_model.text_branch = None # to save memory 22 | self.freeze = freeze 23 | if mlp: 24 | self.lp_layer = MLPLayers(units=[in_ch, in_ch * 2, out_ch]) 25 | else: 26 | self.lp_layer = nn.Linear(in_ch, out_ch) 27 | 28 | if self.freeze: 29 | for param in self.clap_model.parameters(): 30 | param.requires_grad = False 31 | 32 | if act == "None": 33 | self.act = None 34 | elif act == "relu": 35 | self.act = nn.ReLU() 36 | elif act == "elu": 37 | self.act = nn.ELU() 38 | elif act == "prelu": 39 | self.act = nn.PReLU(num_parameters=in_ch) 40 | elif act == "softmax": 41 | self.act = nn.Softmax(dim=-1) 42 | elif act == "sigmoid": 43 | self.act = nn.Sigmoid() 44 | 45 | def forward(self, x, mix_lambda=None, device=None): 46 | """ 47 | Args: 48 | x: waveform, torch.tensor [batch, t_samples] / batch of mel_spec and longer list 49 | mix_lambda: torch.tensor [batch], the mixup lambda 50 | Returns: 51 | class_prob: torch.tensor [batch, class_num] 52 | 53 | """ 54 | # batchnorm cancel grandient 55 | if self.freeze: 56 | self.clap_model.eval() 57 | 58 | x = self.clap_model.audio_projection( 59 | self.clap_model.audio_branch(x, mixup_lambda=mix_lambda, device=device)[ 60 | "embedding" 61 | ] 62 | ) 63 | out = self.lp_layer(x) 64 | if self.act is not None: 65 | out = self.act(out) 66 | return out 67 | -------------------------------------------------------------------------------- /audioldm/clap/open_clip/model_configs/HTSAT-base.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "audio_cfg": { 4 | "audio_length": 1024, 5 | "clip_samples": 480000, 6 | "mel_bins": 64, 7 | "sample_rate": 48000, 8 | "window_size": 1024, 9 | "hop_size": 480, 10 | "fmin": 50, 11 | "fmax": 14000, 12 | "class_num": 527, 13 | "model_type": "HTSAT", 14 | "model_name": "base" 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 512, 20 | "heads": 8, 21 | "layers": 12 22 | } 23 | } -------------------------------------------------------------------------------- /audioldm/clap/open_clip/model_configs/HTSAT-large.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 2048, 3 | "audio_cfg": { 4 | "audio_length": 1024, 5 | "clip_samples": 480000, 6 | "mel_bins": 64, 7 | "sample_rate": 48000, 8 | "window_size": 1024, 9 | "hop_size": 480, 10 | "fmin": 50, 11 | "fmax": 14000, 12 | "class_num": 527, 13 | "model_type": "HTSAT", 14 | "model_name": "large" 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 512, 20 | "heads": 8, 21 | "layers": 12 22 | } 23 | } -------------------------------------------------------------------------------- /audioldm/clap/open_clip/model_configs/HTSAT-tiny-win-1536.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "audio_cfg": { 4 | "audio_length": 1024, 5 | "clip_samples": 480000, 6 | "mel_bins": 64, 7 | "sample_rate": 48000, 8 | "window_size": 1536, 9 | "hop_size": 480, 10 | "fmin": 50, 11 | "fmax": 14000, 12 | "class_num": 527, 13 | "model_type": "HTSAT", 14 | "model_name": "tiny" 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 512, 20 | "heads": 8, 21 | "layers": 12 22 | } 23 | } -------------------------------------------------------------------------------- /audioldm/clap/open_clip/model_configs/HTSAT-tiny.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "audio_cfg": { 4 | "audio_length": 1024, 5 | "clip_samples": 480000, 6 | "mel_bins": 64, 7 | "sample_rate": 48000, 8 | "window_size": 1024, 9 | "hop_size": 480, 10 | "fmin": 50, 11 | "fmax": 14000, 12 | "class_num": 527, 13 | "model_type": "HTSAT", 14 | "model_name": "tiny" 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 512, 20 | "heads": 8, 21 | "layers": 12 22 | } 23 | } -------------------------------------------------------------------------------- /audioldm/clap/open_clip/model_configs/PANN-10.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "audio_cfg": { 4 | "audio_length": 1024, 5 | "clip_samples": 480000, 6 | "mel_bins": 64, 7 | "sample_rate": 48000, 8 | "window_size": 1024, 9 | "hop_size": 480, 10 | "fmin": 50, 11 | "fmax": 14000, 12 | "class_num": 527, 13 | "model_type": "PANN", 14 | "model_name": "Cnn10" 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 512, 20 | "heads": 8, 21 | "layers": 12 22 | } 23 | } -------------------------------------------------------------------------------- /audioldm/clap/open_clip/model_configs/PANN-14-fmax-18k.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 2048, 3 | "audio_cfg": { 4 | "audio_length": 1024, 5 | "clip_samples": 480000, 6 | "mel_bins": 64, 7 | "sample_rate": 48000, 8 | "window_size": 1024, 9 | "hop_size": 480, 10 | "fmin": 50, 11 | "fmax": 18000, 12 | "class_num": 527, 13 | "model_type": "PANN", 14 | "model_name": "Cnn14" 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 512, 20 | "heads": 8, 21 | "layers": 12 22 | } 23 | } -------------------------------------------------------------------------------- /audioldm/clap/open_clip/model_configs/PANN-14-fmax-8k-20s.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 2048, 3 | "audio_cfg": { 4 | "audio_length": 1024, 5 | 
"clip_samples": 960000, 6 | "mel_bins": 64, 7 | "sample_rate": 48000, 8 | "window_size": 1024, 9 | "hop_size": 360, 10 | "fmin": 50, 11 | "fmax": 8000, 12 | "class_num": 527, 13 | "model_type": "PANN", 14 | "model_name": "Cnn14" 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 512, 20 | "heads": 8, 21 | "layers": 12 22 | } 23 | } -------------------------------------------------------------------------------- /audioldm/clap/open_clip/model_configs/PANN-14-tiny-transformer.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 2048, 3 | "audio_cfg": { 4 | "audio_length": 1024, 5 | "clip_samples": 480000, 6 | "mel_bins": 64, 7 | "sample_rate": 48000, 8 | "window_size": 1024, 9 | "hop_size": 480, 10 | "fmin": 50, 11 | "fmax": 14000, 12 | "class_num": 527, 13 | "model_type": "PANN", 14 | "model_name": "Cnn14" 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 512, 20 | "heads": 8, 21 | "layers": 4 22 | } 23 | } -------------------------------------------------------------------------------- /audioldm/clap/open_clip/model_configs/PANN-14-win-1536.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 2048, 3 | "audio_cfg": { 4 | "audio_length": 1024, 5 | "clip_samples": 480000, 6 | "mel_bins": 64, 7 | "sample_rate": 48000, 8 | "window_size": 1536, 9 | "hop_size": 480, 10 | "fmin": 50, 11 | "fmax": 14000, 12 | "class_num": 527, 13 | "model_type": "PANN", 14 | "model_name": "Cnn14" 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 512, 20 | "heads": 8, 21 | "layers": 12 22 | } 23 | } -------------------------------------------------------------------------------- /audioldm/clap/open_clip/model_configs/PANN-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 2048, 3 | "audio_cfg": { 4 | "audio_length": 1024, 5 | "clip_samples": 480000, 6 | "mel_bins": 64, 7 | "sample_rate": 48000, 8 | "window_size": 1024, 9 | "hop_size": 480, 10 | "fmin": 50, 11 | "fmax": 14000, 12 | "class_num": 527, 13 | "model_type": "PANN", 14 | "model_name": "Cnn14" 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 512, 20 | "heads": 8, 21 | "layers": 12 22 | } 23 | } -------------------------------------------------------------------------------- /audioldm/clap/open_clip/model_configs/PANN-6.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "audio_cfg": { 4 | "audio_length": 1024, 5 | "clip_samples": 480000, 6 | "mel_bins": 64, 7 | "sample_rate": 48000, 8 | "window_size": 1024, 9 | "hop_size": 480, 10 | "fmin": 50, 11 | "fmax": 14000, 12 | "class_num": 527, 13 | "model_type": "PANN", 14 | "model_name": "Cnn6" 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 512, 20 | "heads": 8, 21 | "layers": 12 22 | } 23 | } -------------------------------------------------------------------------------- /audioldm/clap/open_clip/model_configs/RN101-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": [ 7 | 3, 8 | 4, 9 | 23, 10 | 3 11 | ], 12 | "width": 64, 13 | "patch_size": null 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | 
"vocab_size": 49408, 18 | "width": 512, 19 | "heads": 8, 20 | "layers": 12 21 | } 22 | } -------------------------------------------------------------------------------- /audioldm/clap/open_clip/model_configs/RN101.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": [ 6 | 3, 7 | 4, 8 | 23, 9 | 3 10 | ], 11 | "width": 64, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 512, 18 | "heads": 8, 19 | "layers": 12 20 | } 21 | } -------------------------------------------------------------------------------- /audioldm/clap/open_clip/model_configs/RN50-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": [ 7 | 3, 8 | 4, 9 | 6, 10 | 3 11 | ], 12 | "width": 64, 13 | "patch_size": null 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 512, 19 | "heads": 8, 20 | "layers": 12 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /audioldm/clap/open_clip/model_configs/RN50.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": [ 6 | 3, 7 | 4, 8 | 6, 9 | 3 10 | ], 11 | "width": 64, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 512, 18 | "heads": 8, 19 | "layers": 12 20 | } 21 | } -------------------------------------------------------------------------------- /audioldm/clap/open_clip/model_configs/RN50x16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 384, 5 | "layers": [ 6 | 6, 7 | 8, 8 | 18, 9 | 8 10 | ], 11 | "width": 96, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 768, 18 | "heads": 12, 19 | "layers": 12 20 | } 21 | } -------------------------------------------------------------------------------- /audioldm/clap/open_clip/model_configs/RN50x4.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 288, 5 | "layers": [ 6 | 4, 7 | 6, 8 | 10, 9 | 6 10 | ], 11 | "width": 80, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 640, 18 | "heads": 10, 19 | "layers": 12 20 | } 21 | } -------------------------------------------------------------------------------- /audioldm/clap/open_clip/model_configs/ViT-B-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /audioldm/clap/open_clip/model_configs/ViT-B-32-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | 
"image_size": 224, 6 | "layers": 12, 7 | "width": 768, 8 | "patch_size": 32 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } -------------------------------------------------------------------------------- /audioldm/clap/open_clip/model_configs/ViT-B-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /audioldm/clap/open_clip/model_configs/ViT-L-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /audioldm/clap/open_clip/openai.py: -------------------------------------------------------------------------------- 1 | """ OpenAI pretrained model functions 2 | 3 | Adapted from https://github.com/openai/CLIP. Originally MIT License, Copyright (c) 2021 OpenAI. 4 | """ 5 | 6 | import os 7 | import warnings 8 | from typing import Union, List 9 | 10 | import torch 11 | 12 | from .model import build_model_from_openai_state_dict 13 | from .pretrained import ( 14 | get_pretrained_url, 15 | list_pretrained_tag_models, 16 | download_pretrained, 17 | ) 18 | 19 | __all__ = ["list_openai_models", "load_openai_model"] 20 | 21 | 22 | def list_openai_models() -> List[str]: 23 | """Returns the names of available CLIP models""" 24 | return list_pretrained_tag_models("openai") 25 | 26 | 27 | def load_openai_model( 28 | name: str, 29 | model_cfg, 30 | device: Union[str, torch.device] = "cuda" if torch.cuda.is_available() else "cpu", 31 | jit=True, 32 | cache_dir=os.path.expanduser("~/.cache/clip"), 33 | enable_fusion: bool = False, 34 | fusion_type: str = "None", 35 | ): 36 | """Load a CLIP model, preserve its text pretrained part, and set in the CLAP model 37 | 38 | Parameters 39 | ---------- 40 | name : str 41 | A model name listed by `clip.available_models()`, or the path to a model checkpoint containing the state_dict 42 | device : Union[str, torch.device] 43 | The device to put the loaded model 44 | jit : bool 45 | Whether to load the optimized JIT model (default) or more hackable non-JIT model. 
46 | 47 | Returns 48 | ------- 49 | model : torch.nn.Module 50 | The CLAP model 51 | preprocess : Callable[[PIL.Image], torch.Tensor] 52 | A torchvision transform that converts a PIL image into a tensor that the returned model can take as its input 53 | """ 54 | if get_pretrained_url(name, "openai"): 55 | model_path = download_pretrained( 56 | get_pretrained_url(name, "openai"), root=cache_dir 57 | ) 58 | elif os.path.isfile(name): 59 | model_path = name 60 | else: 61 | raise RuntimeError( 62 | f"Model {name} not found; available models = {list_openai_models()}" 63 | ) 64 | 65 | try: 66 | # loading JIT archive 67 | model = torch.jit.load(model_path, map_location=device if jit else "cpu").eval() 68 | state_dict = None 69 | except RuntimeError: 70 | # loading saved state dict 71 | if jit: 72 | warnings.warn( 73 | f"File {model_path} is not a JIT archive. Loading as a state dict instead" 74 | ) 75 | jit = False 76 | state_dict = torch.load(model_path, map_location="cpu") 77 | 78 | if not jit: 79 | try: 80 | model = build_model_from_openai_state_dict( 81 | state_dict or model.state_dict(), model_cfg, enable_fusion, fusion_type 82 | ).to(device) 83 | except KeyError: 84 | sd = {k[7:]: v for k, v in state_dict["state_dict"].items()} 85 | model = build_model_from_openai_state_dict( 86 | sd, model_cfg, enable_fusion, fusion_type 87 | ).to(device) 88 | 89 | if str(device) == "cpu": 90 | model.float() 91 | return model 92 | 93 | # patch the device names 94 | device_holder = torch.jit.trace( 95 | lambda: torch.ones([]).to(torch.device(device)), example_inputs=[] 96 | ) 97 | device_node = [ 98 | n 99 | for n in device_holder.graph.findAllNodes("prim::Constant") 100 | if "Device" in repr(n) 101 | ][-1] 102 | 103 | def patch_device(module): 104 | try: 105 | graphs = [module.graph] if hasattr(module, "graph") else [] 106 | except RuntimeError: 107 | graphs = [] 108 | 109 | if hasattr(module, "forward1"): 110 | graphs.append(module.forward1.graph) 111 | 112 | for graph in graphs: 113 | for node in graph.findAllNodes("prim::Constant"): 114 | if "value" in node.attributeNames() and str(node["value"]).startswith( 115 | "cuda" 116 | ): 117 | node.copyAttributes(device_node) 118 | 119 | model.apply(patch_device) 120 | patch_device(model.encode_audio) 121 | patch_device(model.encode_text) 122 | 123 | # patch dtype to float32 on CPU 124 | if str(device) == "cpu": 125 | float_holder = torch.jit.trace( 126 | lambda: torch.ones([]).float(), example_inputs=[] 127 | ) 128 | float_input = list(float_holder.graph.findNode("aten::to").inputs())[1] 129 | float_node = float_input.node() 130 | 131 | def patch_float(module): 132 | try: 133 | graphs = [module.graph] if hasattr(module, "graph") else [] 134 | except RuntimeError: 135 | graphs = [] 136 | 137 | if hasattr(module, "forward1"): 138 | graphs.append(module.forward1.graph) 139 | 140 | for graph in graphs: 141 | for node in graph.findAllNodes("aten::to"): 142 | inputs = list(node.inputs()) 143 | for i in [ 144 | 1, 145 | 2, 146 | ]: # dtype can be the second or third argument to aten::to() 147 | if inputs[i].node()["value"] == 5: 148 | inputs[i].node().copyAttributes(float_node) 149 | 150 | model.apply(patch_float) 151 | patch_float(model.encode_audio) 152 | patch_float(model.encode_text) 153 | model.float() 154 | 155 | model.audio_branch.audio_length = model.audio_cfg.audio_length 156 | return model 157 | -------------------------------------------------------------------------------- /audioldm/clap/open_clip/timm_model.py: 
-------------------------------------------------------------------------------- 1 | """ timm model adapter 2 | 3 | Wraps timm (https://github.com/rwightman/pytorch-image-models) models for use as a vision tower in CLIP model. 4 | """ 5 | from collections import OrderedDict 6 | 7 | import torch.nn as nn 8 | 9 | try: 10 | import timm 11 | from timm.models.layers import Mlp, to_2tuple 12 | from timm.models.layers.attention_pool2d import RotAttentionPool2d 13 | from timm.models.layers.attention_pool2d import ( 14 | AttentionPool2d as AbsAttentionPool2d, 15 | ) 16 | except ImportError as e: 17 | timm = None 18 | 19 | from .utils import freeze_batch_norm_2d 20 | 21 | 22 | class TimmModel(nn.Module): 23 | """timm model adapter 24 | # FIXME this adapter is a work in progress, may change in ways that break weight compat 25 | """ 26 | 27 | def __init__( 28 | self, 29 | model_name, 30 | embed_dim, 31 | image_size=224, 32 | pool="avg", 33 | proj="linear", 34 | drop=0.0, 35 | pretrained=False, 36 | ): 37 | super().__init__() 38 | if timm is None: 39 | raise RuntimeError("Please `pip install timm` to use timm models.") 40 | 41 | self.image_size = to_2tuple(image_size) 42 | self.trunk = timm.create_model(model_name, pretrained=pretrained) 43 | feat_size = self.trunk.default_cfg.get("pool_size", None) 44 | feature_ndim = 1 if not feat_size else 2 45 | if pool in ("abs_attn", "rot_attn"): 46 | assert feature_ndim == 2 47 | # if attn pooling used, remove both classifier and default pool 48 | self.trunk.reset_classifier(0, global_pool="") 49 | else: 50 | # reset global pool if pool config set, otherwise leave as network default 51 | reset_kwargs = dict(global_pool=pool) if pool else {} 52 | self.trunk.reset_classifier(0, **reset_kwargs) 53 | prev_chs = self.trunk.num_features 54 | 55 | head_layers = OrderedDict() 56 | if pool == "abs_attn": 57 | head_layers["pool"] = AbsAttentionPool2d( 58 | prev_chs, feat_size=feat_size, out_features=embed_dim 59 | ) 60 | prev_chs = embed_dim 61 | elif pool == "rot_attn": 62 | head_layers["pool"] = RotAttentionPool2d(prev_chs, out_features=embed_dim) 63 | prev_chs = embed_dim 64 | else: 65 | assert proj, "projection layer needed if non-attention pooling is used." 
66 | 67 | # NOTE attention pool ends with a projection layer, so proj should usually be set to '' if such pooling is used 68 | if proj == "linear": 69 | head_layers["drop"] = nn.Dropout(drop) 70 | head_layers["proj"] = nn.Linear(prev_chs, embed_dim) 71 | elif proj == "mlp": 72 | head_layers["mlp"] = Mlp(prev_chs, 2 * embed_dim, embed_dim, drop=drop) 73 | 74 | self.head = nn.Sequential(head_layers) 75 | 76 | def lock(self, unlocked_groups=0, freeze_bn_stats=False): 77 | """lock modules 78 | Args: 79 | unlocked_groups (int): leave last n layer groups unlocked (default: 0) 80 | """ 81 | if not unlocked_groups: 82 | # lock full model 83 | for param in self.trunk.parameters(): 84 | param.requires_grad = False 85 | if freeze_bn_stats: 86 | freeze_batch_norm_2d(self.trunk) 87 | else: 88 | # NOTE: partial freeze requires latest timm (master) branch and is subject to change 89 | try: 90 | # FIXME import here until API stable and in an official release 91 | from timm.models.helpers import group_parameters, group_modules 92 | except ImportError: 93 | raise RuntimeError( 94 | "Please install latest timm `pip install git+https://github.com/rwightman/pytorch-image-models`" 95 | ) 96 | matcher = self.trunk.group_matcher() 97 | gparams = group_parameters(self.trunk, matcher) 98 | max_layer_id = max(gparams.keys()) 99 | max_layer_id = max_layer_id - unlocked_groups 100 | for group_idx in range(max_layer_id + 1): 101 | group = gparams[group_idx] 102 | for param in group: 103 | self.trunk.get_parameter(param).requires_grad = False 104 | if freeze_bn_stats: 105 | gmodules = group_modules(self.trunk, matcher, reverse=True) 106 | gmodules = {k for k, v in gmodules.items() if v <= max_layer_id} 107 | freeze_batch_norm_2d(self.trunk, gmodules) 108 | 109 | def forward(self, x): 110 | x = self.trunk(x) 111 | x = self.head(x) 112 | return x 113 | -------------------------------------------------------------------------------- /audioldm/clap/open_clip/transform.py: -------------------------------------------------------------------------------- 1 | from torchvision.transforms import ( 2 | Normalize, 3 | Compose, 4 | RandomResizedCrop, 5 | InterpolationMode, 6 | ToTensor, 7 | Resize, 8 | CenterCrop, 9 | ) 10 | 11 | 12 | def _convert_to_rgb(image): 13 | return image.convert("RGB") 14 | 15 | 16 | def image_transform( 17 | image_size: int, 18 | is_train: bool, 19 | mean=(0.48145466, 0.4578275, 0.40821073), 20 | std=(0.26862954, 0.26130258, 0.27577711), 21 | ): 22 | normalize = Normalize(mean=mean, std=std) 23 | if is_train: 24 | return Compose( 25 | [ 26 | RandomResizedCrop( 27 | image_size, 28 | scale=(0.9, 1.0), 29 | interpolation=InterpolationMode.BICUBIC, 30 | ), 31 | _convert_to_rgb, 32 | ToTensor(), 33 | normalize, 34 | ] 35 | ) 36 | else: 37 | return Compose( 38 | [ 39 | Resize(image_size, interpolation=InterpolationMode.BICUBIC), 40 | CenterCrop(image_size), 41 | _convert_to_rgb, 42 | ToTensor(), 43 | normalize, 44 | ] 45 | ) 46 | -------------------------------------------------------------------------------- /audioldm/clap/open_clip/version.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.2.1" 2 | -------------------------------------------------------------------------------- /audioldm/clap/training/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/naver-ai/rewas/4e09954d386583a4e62e7a1e28c2145ca1cfc411/audioldm/clap/training/__init__.py 
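For reference, the `image_transform` helper in `transform.py` above builds the standard CLIP-style image preprocessing pipeline (crop/resize, RGB conversion, tensor conversion, normalization). A minimal usage sketch, assuming the repository root is on `PYTHONPATH` and using a placeholder image path:

```python
from PIL import Image

# image_transform is re-exported by audioldm/clap/open_clip/__init__.py (see above)
from audioldm.clap.open_clip import image_transform

preprocess_train = image_transform(224, is_train=True)   # RandomResizedCrop + normalize
preprocess_val = image_transform(224, is_train=False)    # Resize + CenterCrop + normalize

img = Image.open("frame.png")   # placeholder path to any image file
x = preprocess_val(img)         # float tensor of shape (3, 224, 224), normalized with CLIP mean/std
```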
-------------------------------------------------------------------------------- /audioldm/clap/training/audioset_textmap.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/naver-ai/rewas/4e09954d386583a4e62e7a1e28c2145ca1cfc411/audioldm/clap/training/audioset_textmap.npy -------------------------------------------------------------------------------- /audioldm/clap/training/distributed.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch 4 | import socket 5 | 6 | try: 7 | import horovod.torch as hvd 8 | except ImportError: 9 | hvd = None 10 | 11 | 12 | def is_global_master(args): 13 | return args.rank == 0 14 | 15 | 16 | def is_local_master(args): 17 | return args.local_rank == 0 18 | 19 | 20 | def is_master(args, local=False): 21 | return is_local_master(args) if local else is_global_master(args) 22 | 23 | 24 | def is_using_horovod(): 25 | # NOTE w/ horovod run, OMPI vars should be set, but w/ SLURM PMI vars will be set 26 | # Differentiating between horovod and DDP use via SLURM may not be possible, so horovod arg still required... 27 | ompi_vars = ["OMPI_COMM_WORLD_RANK", "OMPI_COMM_WORLD_SIZE"] 28 | pmi_vars = ["PMI_RANK", "PMI_SIZE"] 29 | if all([var in os.environ for var in ompi_vars]) or all( 30 | [var in os.environ for var in pmi_vars] 31 | ): 32 | return True 33 | else: 34 | return False 35 | 36 | 37 | def is_using_distributed(): 38 | if "WORLD_SIZE" in os.environ: 39 | return int(os.environ["WORLD_SIZE"]) > 1 40 | if "SLURM_NTASKS" in os.environ: 41 | return int(os.environ["SLURM_NTASKS"]) > 1 42 | return False 43 | 44 | 45 | def world_info_from_env(): 46 | local_rank = 0 47 | for v in ( 48 | "SLURM_LOCALID", 49 | "MPI_LOCALRANKID", 50 | "OMPI_COMM_WORLD_LOCAL_RANK", 51 | "LOCAL_RANK", 52 | ): 53 | if v in os.environ: 54 | local_rank = int(os.environ[v]) 55 | break 56 | global_rank = 0 57 | for v in ("SLURM_PROCID", "PMI_RANK", "OMPI_COMM_WORLD_RANK", "RANK"): 58 | if v in os.environ: 59 | global_rank = int(os.environ[v]) 60 | break 61 | world_size = 1 62 | for v in ("SLURM_NTASKS", "PMI_SIZE", "OMPI_COMM_WORLD_SIZE", "WORLD_SIZE"): 63 | if v in os.environ: 64 | world_size = int(os.environ[v]) 65 | break 66 | 67 | return local_rank, global_rank, world_size 68 | 69 | 70 | def init_distributed_device(args): 71 | # Distributed training = training on more than one GPU. 72 | # Works in both single and multi-node scenarios. 
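    # Rank / world-size information is read from the environment by the helpers
    # above (SLURM, MPI / Open MPI, or torchrun-style variables, in that order).
    # One of three initialization paths is then taken below: Horovod,
    # torch.distributed initialized from SLURM or Open MPI environment variables,
    # or torch.distributed initialized by torchrun / torch.distributed.launch.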
73 | args.distributed = False 74 | args.world_size = 1 75 | args.rank = 0 # global rank 76 | args.local_rank = 0 77 | if args.horovod: 78 | assert hvd is not None, "Horovod is not installed" 79 | hvd.init() 80 | world_size = int(os.environ["OMPI_COMM_WORLD_SIZE"]) 81 | world_rank = int(os.environ["OMPI_COMM_WORLD_RANK"]) 82 | local_rank = int(os.environ["OMPI_COMM_WORLD_LOCAL_RANK"]) 83 | args.local_rank = local_rank 84 | args.rank = world_rank 85 | args.world_size = world_size 86 | # args.local_rank = int(hvd.local_rank()) 87 | # args.rank = hvd.rank() 88 | # args.world_size = hvd.size() 89 | args.distributed = True 90 | os.environ["LOCAL_RANK"] = str(args.local_rank) 91 | os.environ["RANK"] = str(args.rank) 92 | os.environ["WORLD_SIZE"] = str(args.world_size) 93 | print( 94 | f"Distributed training: local_rank={args.local_rank}, " 95 | f"rank={args.rank}, world_size={args.world_size}, " 96 | f"hostname={socket.gethostname()}, pid={os.getpid()}" 97 | ) 98 | elif is_using_distributed(): 99 | if "SLURM_PROCID" in os.environ: 100 | # DDP via SLURM 101 | args.local_rank, args.rank, args.world_size = world_info_from_env() 102 | # SLURM var -> torch.distributed vars in case needed 103 | os.environ["LOCAL_RANK"] = str(args.local_rank) 104 | os.environ["RANK"] = str(args.rank) 105 | os.environ["WORLD_SIZE"] = str(args.world_size) 106 | torch.distributed.init_process_group( 107 | backend=args.dist_backend, 108 | init_method=args.dist_url, 109 | world_size=args.world_size, 110 | rank=args.rank, 111 | ) 112 | elif "OMPI_COMM_WORLD_SIZE" in os.environ: # using Summit cluster 113 | world_size = int(os.environ["OMPI_COMM_WORLD_SIZE"]) 114 | world_rank = int(os.environ["OMPI_COMM_WORLD_RANK"]) 115 | local_rank = int(os.environ["OMPI_COMM_WORLD_LOCAL_RANK"]) 116 | args.local_rank = local_rank 117 | args.rank = world_rank 118 | args.world_size = world_size 119 | torch.distributed.init_process_group( 120 | backend=args.dist_backend, 121 | init_method=args.dist_url, 122 | world_size=args.world_size, 123 | rank=args.rank, 124 | ) 125 | else: 126 | # DDP via torchrun, torch.distributed.launch 127 | args.local_rank, _, _ = world_info_from_env() 128 | torch.distributed.init_process_group( 129 | backend=args.dist_backend, init_method=args.dist_url 130 | ) 131 | args.world_size = torch.distributed.get_world_size() 132 | args.rank = torch.distributed.get_rank() 133 | args.distributed = True 134 | print( 135 | f"Distributed training: local_rank={args.local_rank}, " 136 | f"rank={args.rank}, world_size={args.world_size}, " 137 | f"hostname={socket.gethostname()}, pid={os.getpid()}" 138 | ) 139 | 140 | if torch.cuda.is_available(): 141 | if args.distributed and not args.no_set_device_rank: 142 | device = "cuda:%d" % args.local_rank 143 | else: 144 | device = "cuda:0" 145 | torch.cuda.set_device(device) 146 | else: 147 | device = "cpu" 148 | args.device = device 149 | device = torch.device(device) 150 | return device 151 | -------------------------------------------------------------------------------- /audioldm/clap/training/infer_demo.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | sys.path.append("src/clap") 4 | 5 | import os 6 | import torch 7 | import librosa 8 | from open_clip import create_model 9 | from training.data import get_audio_features 10 | from training.data import int16_to_float32, float32_to_int16 11 | from transformers import RobertaTokenizer 12 | 13 | tokenize = RobertaTokenizer.from_pretrained("roberta-base") 14 | 15 | 16 | def 
tokenizer(text): 17 | result = tokenize( 18 | text, 19 | padding="max_length", 20 | truncation=True, 21 | max_length=77, 22 | return_tensors="pt", 23 | ) 24 | return {k: v.squeeze(0) for k, v in result.items()} 25 | 26 | 27 | PRETRAINED_PATH = "/mnt/fast/nobackup/users/hl01486/projects/contrastive_pretraining/CLAP/assets/checkpoints/epoch_top_0_audioset_no_fusion.pt" 28 | WAVE_48k_PATH = "/mnt/fast/nobackup/users/hl01486/projects/contrastive_pretraining/CLAP/assets/audio/machine.wav" 29 | 30 | 31 | def infer_text(): 32 | device = "cuda:0" if torch.cuda.is_available() else "cpu" 33 | precision = "fp32" 34 | amodel = "HTSAT-tiny" # or 'PANN-14' 35 | tmodel = "roberta" # the best text encoder in our training 36 | enable_fusion = False # False if you do not want to use the fusion model 37 | fusion_type = "aff_2d" 38 | pretrained = PRETRAINED_PATH 39 | 40 | model, model_cfg = create_model( 41 | amodel, 42 | tmodel, 43 | pretrained, 44 | precision=precision, 45 | device=device, 46 | enable_fusion=enable_fusion, 47 | fusion_type=fusion_type, 48 | ) 49 | # load the text, can be a list (i.e. batch size) 50 | text_data = ["I love the contrastive learning", "I love the pretrain model"] 51 | # tokenize for roberta, if you want to tokenize for another text encoder, please refer to data.py#L43-90 52 | text_data = tokenizer(text_data) 53 | 54 | text_embed = model.get_text_embedding(text_data) 55 | print(text_embed.size()) 56 | 57 | 58 | def infer_audio(): 59 | device = "cuda:0" if torch.cuda.is_available() else "cpu" 60 | precision = "fp32" 61 | amodel = "HTSAT-tiny" # or 'PANN-14' 62 | tmodel = "roberta" # the best text encoder in our training 63 | enable_fusion = False # False if you do not want to use the fusion model 64 | fusion_type = "aff_2d" 65 | pretrained = PRETRAINED_PATH 66 | 67 | model, model_cfg = create_model( 68 | amodel, 69 | tmodel, 70 | pretrained, 71 | precision=precision, 72 | device=device, 73 | enable_fusion=enable_fusion, 74 | fusion_type=fusion_type, 75 | ) 76 | 77 | # load the waveform of the shape (T,), should resample to 48000 78 | audio_waveform, sr = librosa.load(WAVE_48k_PATH, sr=48000) 79 | # quantize 80 | audio_waveform = int16_to_float32(float32_to_int16(audio_waveform)) 81 | audio_waveform = torch.from_numpy(audio_waveform).float() 82 | audio_dict = {} 83 | 84 | # the 'fusion' truncate mode can be changed to 'rand_trunc' if run in unfusion mode 85 | import ipdb 86 | 87 | ipdb.set_trace() 88 | audio_dict = get_audio_features( 89 | audio_dict, 90 | audio_waveform, 91 | 480000, 92 | data_truncating="fusion", 93 | data_filling="repeatpad", 94 | audio_cfg=model_cfg["audio_cfg"], 95 | ) 96 | # can send a list to the model, to process many audio tracks in one time (i.e. 
batch size) 97 | audio_embed = model.get_audio_embedding([audio_dict]) 98 | print(audio_embed.size()) 99 | import ipdb 100 | 101 | ipdb.set_trace() 102 | 103 | 104 | if __name__ == "__main__": 105 | infer_text() 106 | infer_audio() 107 | -------------------------------------------------------------------------------- /audioldm/clap/training/logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | 4 | def setup_logging(log_file, level, include_host=False): 5 | if include_host: 6 | import socket 7 | 8 | hostname = socket.gethostname() 9 | formatter = logging.Formatter( 10 | f"%(asctime)s | {hostname} | %(levelname)s | %(message)s", 11 | datefmt="%Y-%m-%d,%H:%M:%S", 12 | ) 13 | else: 14 | formatter = logging.Formatter( 15 | "%(asctime)s | %(levelname)s | %(message)s", datefmt="%Y-%m-%d,%H:%M:%S" 16 | ) 17 | 18 | logging.root.setLevel(level) 19 | loggers = [logging.getLogger(name) for name in logging.root.manager.loggerDict] 20 | for logger in loggers: 21 | logger.setLevel(level) 22 | 23 | stream_handler = logging.StreamHandler() 24 | stream_handler.setFormatter(formatter) 25 | logging.root.addHandler(stream_handler) 26 | 27 | if log_file: 28 | file_handler = logging.FileHandler(filename=log_file) 29 | file_handler.setFormatter(formatter) 30 | logging.root.addHandler(file_handler) 31 | -------------------------------------------------------------------------------- /audioldm/clap/training/scheduler.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def assign_learning_rate(optimizer, new_lr): 5 | for param_group in optimizer.param_groups: 6 | param_group["lr"] = new_lr 7 | 8 | 9 | def _warmup_lr(base_lr, warmup_length, step): 10 | return base_lr * (step + 1) / warmup_length 11 | 12 | 13 | def cosine_lr(optimizer, base_lr, warmup_length, steps): 14 | def _lr_adjuster(step): 15 | if step < warmup_length: 16 | lr = _warmup_lr(base_lr, warmup_length, step) 17 | else: 18 | e = step - warmup_length 19 | es = steps - warmup_length 20 | lr = 0.5 * (1 + np.cos(np.pi * e / es)) * base_lr 21 | assign_learning_rate(optimizer, lr) 22 | return lr 23 | 24 | return _lr_adjuster 25 | -------------------------------------------------------------------------------- /audioldm/clap/training/zero_shot.py: -------------------------------------------------------------------------------- 1 | # NOTE: This script is currently not supported for CLAP. 
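# It is kept as-is from the original open_clip training code: the zero-shot
# classifier below is built from the ImageNet class names and evaluated with
# `encode_image`, so applying it to CLAP would require audio-oriented class
# prompts and the audio branch instead.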
2 | import logging 3 | from contextlib import suppress 4 | 5 | import torch 6 | import torch.nn.functional as F 7 | from tqdm import tqdm 8 | 9 | from open_clip import tokenize 10 | from .imagenet_zeroshot_data import imagenet_classnames, openai_imagenet_template 11 | 12 | 13 | def zero_shot_classifier(model, classnames, templates, args): 14 | with torch.no_grad(): 15 | zeroshot_weights = [] 16 | for classname in tqdm(classnames): 17 | texts = [template(classname) for template in templates] # format with class 18 | texts = tokenize(texts).to(args.device) # tokenize 19 | if args.distributed and not args.horovod: 20 | class_embeddings = model.module.encode_text(texts) 21 | else: 22 | class_embeddings = model.encode_text(texts) 23 | class_embedding = F.normalize(class_embeddings, dim=-1).mean(dim=0) 24 | class_embedding /= class_embedding.norm() 25 | zeroshot_weights.append(class_embedding) 26 | zeroshot_weights = torch.stack(zeroshot_weights, dim=1).to(args.device) 27 | return zeroshot_weights 28 | 29 | 30 | def accuracy(output, target, topk=(1,)): 31 | pred = output.topk(max(topk), 1, True, True)[1].t() 32 | correct = pred.eq(target.view(1, -1).expand_as(pred)) 33 | return [ 34 | float(correct[:k].reshape(-1).float().sum(0, keepdim=True).cpu().numpy()) 35 | for k in topk 36 | ] 37 | 38 | 39 | def run(model, classifier, dataloader, args): 40 | autocast = torch.cuda.amp.autocast if args.precision == "amp" else suppress 41 | with torch.no_grad(): 42 | top1, top5, n = 0.0, 0.0, 0.0 43 | for images, target in tqdm(dataloader, unit_scale=args.batch_size): 44 | images = images.to(args.device) 45 | target = target.to(args.device) 46 | 47 | with autocast(): 48 | # predict 49 | if args.distributed and not args.horovod: 50 | image_features = model.module.encode_image(images) 51 | else: 52 | image_features = model.encode_image(images) 53 | image_features = F.normalize(image_features, dim=-1) 54 | logits = 100.0 * image_features @ classifier 55 | 56 | # measure accuracy 57 | acc1, acc5 = accuracy(logits, target, topk=(1, 5)) 58 | top1 += acc1 59 | top5 += acc5 60 | n += images.size(0) 61 | 62 | top1 = top1 / n 63 | top5 = top5 / n 64 | return top1, top5 65 | 66 | 67 | def zero_shot_eval(model, data, epoch, args): 68 | if "imagenet-val" not in data and "imagenet-v2" not in data: 69 | return {} 70 | if args.zeroshot_frequency == 0: 71 | return {} 72 | if (epoch % args.zeroshot_frequency) != 0 and epoch != args.epochs: 73 | return {} 74 | 75 | logging.info("Starting zero-shot imagenet.") 76 | 77 | logging.info("Building zero-shot classifier") 78 | classifier = zero_shot_classifier( 79 | model, imagenet_classnames, openai_imagenet_template, args 80 | ) 81 | 82 | logging.info("Using classifier") 83 | results = {} 84 | if "imagenet-val" in data: 85 | top1, top5 = run(model, classifier, data["imagenet-val"].dataloader, args) 86 | results["imagenet-zeroshot-val-top1"] = top1 87 | results["imagenet-zeroshot-val-top5"] = top5 88 | if "imagenet-v2" in data: 89 | top1, top5 = run(model, classifier, data["imagenet-v2"].dataloader, args) 90 | results["imagenetv2-zeroshot-val-top1"] = top1 91 | results["imagenetv2-zeroshot-val-top5"] = top5 92 | 93 | logging.info("Finished zero-shot imagenet.") 94 | 95 | return results 96 | -------------------------------------------------------------------------------- /audioldm/diffusionmodules/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/naver-ai/rewas/4e09954d386583a4e62e7a1e28c2145ca1cfc411/audioldm/diffusionmodules/__init__.py -------------------------------------------------------------------------------- /audioldm/diffusionmodules/distributions.py: -------------------------------------------------------------------------------- 1 | # reference: https://github.com/haoheliu/AudioLDM-training-finetuning 2 | 3 | import torch 4 | import numpy as np 5 | 6 | 7 | class AbstractDistribution: 8 | def sample(self): 9 | raise NotImplementedError() 10 | 11 | def mode(self): 12 | raise NotImplementedError() 13 | 14 | 15 | class DiracDistribution(AbstractDistribution): 16 | def __init__(self, value): 17 | self.value = value 18 | 19 | def sample(self): 20 | return self.value 21 | 22 | def mode(self): 23 | return self.value 24 | 25 | 26 | class DiagonalGaussianDistribution(object): 27 | def __init__(self, parameters, deterministic=False): 28 | self.parameters = parameters 29 | self.mean, self.logvar = torch.chunk(parameters, 2, dim=1) 30 | self.logvar = torch.clamp(self.logvar, -30.0, 20.0) 31 | self.deterministic = deterministic 32 | self.std = torch.exp(0.5 * self.logvar) 33 | self.var = torch.exp(self.logvar) 34 | if self.deterministic: 35 | self.var = self.std = torch.zeros_like(self.mean).to( 36 | device=self.parameters.device 37 | ) 38 | 39 | def sample(self): 40 | x = self.mean + self.std * torch.randn(self.mean.shape).to( 41 | device=self.parameters.device 42 | ) 43 | return x 44 | 45 | def kl(self, other=None): 46 | if self.deterministic: 47 | return torch.Tensor([0.0]) 48 | else: 49 | if other is None: 50 | return 0.5 * torch.mean( 51 | torch.pow(self.mean, 2) + self.var - 1.0 - self.logvar, 52 | dim=[1, 2, 3], 53 | ) 54 | else: 55 | return 0.5 * torch.mean( 56 | torch.pow(self.mean - other.mean, 2) / other.var 57 | + self.var / other.var 58 | - 1.0 59 | - self.logvar 60 | + other.logvar, 61 | dim=[1, 2, 3], 62 | ) 63 | 64 | def nll(self, sample, dims=[1, 2, 3]): 65 | if self.deterministic: 66 | return torch.Tensor([0.0]) 67 | logtwopi = np.log(2.0 * np.pi) 68 | return 0.5 * torch.sum( 69 | logtwopi + self.logvar + torch.pow(sample - self.mean, 2) / self.var, 70 | dim=dims, 71 | ) 72 | 73 | def mode(self): 74 | return self.mean 75 | 76 | 77 | def normal_kl(mean1, logvar1, mean2, logvar2): 78 | """ 79 | source: https://github.com/openai/guided-diffusion/blob/27c20a8fab9cb472df5d6bdd6c8d11c8f430b924/guided_diffusion/losses.py#L12 80 | Compute the KL divergence between two gaussians. 81 | Shapes are automatically broadcasted, so batches can be compared to 82 | scalars, among other use cases. 83 | """ 84 | tensor = None 85 | for obj in (mean1, logvar1, mean2, logvar2): 86 | if isinstance(obj, torch.Tensor): 87 | tensor = obj 88 | break 89 | assert tensor is not None, "at least one argument must be a Tensor" 90 | 91 | # Force variances to be Tensors. Broadcasting helps convert scalars to 92 | # Tensors, but it does not work for torch.exp(). 
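    # The closed-form KL between two diagonal Gaussians,
    #   KL(N(mean1, exp(logvar1)) || N(mean2, exp(logvar2)))
    #     = 0.5 * (logvar2 - logvar1 + exp(logvar1 - logvar2)
    #              + (mean1 - mean2)**2 * exp(-logvar2) - 1),
    # is what the return expression below evaluates elementwise.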
93 | logvar1, logvar2 = [ 94 | x if isinstance(x, torch.Tensor) else torch.tensor(x).to(tensor) 95 | for x in (logvar1, logvar2) 96 | ] 97 | 98 | return 0.5 * ( 99 | -1.0 100 | + logvar2 101 | - logvar1 102 | + torch.exp(logvar1 - logvar2) 103 | + ((mean1 - mean2) ** 2) * torch.exp(-logvar2) 104 | ) 105 | -------------------------------------------------------------------------------- /audioldm/diffusionmodules/ema.py: -------------------------------------------------------------------------------- 1 | # reference: https://github.com/haoheliu/AudioLDM-training-finetuning 2 | 3 | import torch 4 | from torch import nn 5 | 6 | 7 | class LitEma(nn.Module): 8 | def __init__(self, model, decay=0.9999, use_num_upates=True): 9 | super().__init__() 10 | if decay < 0.0 or decay > 1.0: 11 | raise ValueError("Decay must be between 0 and 1") 12 | 13 | self.m_name2s_name = {} 14 | self.register_buffer("decay", torch.tensor(decay, dtype=torch.float32)) 15 | self.register_buffer( 16 | "num_updates", 17 | torch.tensor(0, dtype=torch.int) 18 | if use_num_upates 19 | else torch.tensor(-1, dtype=torch.int), 20 | ) 21 | 22 | for name, p in model.named_parameters(): 23 | if p.requires_grad: 24 | # remove as '.'-character is not allowed in buffers 25 | s_name = name.replace(".", "") 26 | self.m_name2s_name.update({name: s_name}) 27 | self.register_buffer(s_name, p.clone().detach().data) 28 | 29 | self.collected_params = [] 30 | 31 | def forward(self, model): 32 | decay = self.decay 33 | 34 | if self.num_updates >= 0: 35 | self.num_updates += 1 36 | decay = min(self.decay, (1 + self.num_updates) / (10 + self.num_updates)) 37 | 38 | one_minus_decay = 1.0 - decay 39 | 40 | with torch.no_grad(): 41 | m_param = dict(model.named_parameters()) 42 | shadow_params = dict(self.named_buffers()) 43 | 44 | for key in m_param: 45 | if m_param[key].requires_grad: 46 | sname = self.m_name2s_name[key] 47 | shadow_params[sname] = shadow_params[sname].type_as(m_param[key]) 48 | shadow_params[sname].sub_( 49 | one_minus_decay * (shadow_params[sname] - m_param[key]) 50 | ) 51 | else: 52 | assert not key in self.m_name2s_name 53 | 54 | def copy_to(self, model): 55 | m_param = dict(model.named_parameters()) 56 | shadow_params = dict(self.named_buffers()) 57 | for key in m_param: 58 | if m_param[key].requires_grad: 59 | m_param[key].data.copy_(shadow_params[self.m_name2s_name[key]].data) 60 | else: 61 | assert not key in self.m_name2s_name 62 | 63 | def store(self, parameters): 64 | """ 65 | Save the current parameters for restoring later. 66 | Args: 67 | parameters: Iterable of `torch.nn.Parameter`; the parameters to be 68 | temporarily stored. 69 | """ 70 | self.collected_params = [param.clone() for param in parameters] 71 | 72 | def restore(self, parameters): 73 | """ 74 | Restore the parameters stored with the `store` method. 75 | Useful to validate the model with EMA parameters without affecting the 76 | original optimization process. Store the parameters before the 77 | `copy_to` method. After validation (or model saving), use this to 78 | restore the former parameters. 79 | Args: 80 | parameters: Iterable of `torch.nn.Parameter`; the parameters to be 81 | updated with the stored parameters. 
82 | """ 83 | for c_param, param in zip(self.collected_params, parameters): 84 | param.data.copy_(c_param.data) 85 | -------------------------------------------------------------------------------- /audioldm/hifigan/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Jungil Kong 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /audioldm/hifigan/__init__.py: -------------------------------------------------------------------------------- 1 | from .models import Generator 2 | 3 | 4 | class AttrDict(dict): 5 | def __init__(self, *args, **kwargs): 6 | super(AttrDict, self).__init__(*args, **kwargs) 7 | self.__dict__ = self 8 | -------------------------------------------------------------------------------- /audioldm/hifigan/models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.nn import Conv1d, ConvTranspose1d 5 | from torch.nn.utils import weight_norm, remove_weight_norm 6 | 7 | LRELU_SLOPE = 0.1 8 | 9 | 10 | def init_weights(m, mean=0.0, std=0.01): 11 | classname = m.__class__.__name__ 12 | if classname.find("Conv") != -1: 13 | m.weight.data.normal_(mean, std) 14 | 15 | 16 | def get_padding(kernel_size, dilation=1): 17 | return int((kernel_size * dilation - dilation) / 2) 18 | 19 | 20 | class ResBlock(torch.nn.Module): 21 | def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)): 22 | super(ResBlock, self).__init__() 23 | self.h = h 24 | self.convs1 = nn.ModuleList( 25 | [ 26 | weight_norm( 27 | Conv1d( 28 | channels, 29 | channels, 30 | kernel_size, 31 | 1, 32 | dilation=dilation[0], 33 | padding=get_padding(kernel_size, dilation[0]), 34 | ) 35 | ), 36 | weight_norm( 37 | Conv1d( 38 | channels, 39 | channels, 40 | kernel_size, 41 | 1, 42 | dilation=dilation[1], 43 | padding=get_padding(kernel_size, dilation[1]), 44 | ) 45 | ), 46 | weight_norm( 47 | Conv1d( 48 | channels, 49 | channels, 50 | kernel_size, 51 | 1, 52 | dilation=dilation[2], 53 | padding=get_padding(kernel_size, dilation[2]), 54 | ) 55 | ), 56 | ] 57 | ) 58 | self.convs1.apply(init_weights) 59 | 60 | self.convs2 = nn.ModuleList( 61 | [ 62 | weight_norm( 63 | Conv1d( 64 | channels, 65 | channels, 66 | kernel_size, 67 | 1, 68 | dilation=1, 69 | padding=get_padding(kernel_size, 1), 70 | ) 71 | 
), 72 | weight_norm( 73 | Conv1d( 74 | channels, 75 | channels, 76 | kernel_size, 77 | 1, 78 | dilation=1, 79 | padding=get_padding(kernel_size, 1), 80 | ) 81 | ), 82 | weight_norm( 83 | Conv1d( 84 | channels, 85 | channels, 86 | kernel_size, 87 | 1, 88 | dilation=1, 89 | padding=get_padding(kernel_size, 1), 90 | ) 91 | ), 92 | ] 93 | ) 94 | self.convs2.apply(init_weights) 95 | 96 | def forward(self, x): 97 | for c1, c2 in zip(self.convs1, self.convs2): 98 | xt = F.leaky_relu(x, LRELU_SLOPE) 99 | xt = c1(xt) 100 | xt = F.leaky_relu(xt, LRELU_SLOPE) 101 | xt = c2(xt) 102 | x = xt + x 103 | return x 104 | 105 | def remove_weight_norm(self): 106 | for l in self.convs1: 107 | remove_weight_norm(l) 108 | for l in self.convs2: 109 | remove_weight_norm(l) 110 | 111 | 112 | class Generator(torch.nn.Module): 113 | def __init__(self, h): 114 | super(Generator, self).__init__() 115 | self.h = h 116 | self.num_kernels = len(h.resblock_kernel_sizes) 117 | self.num_upsamples = len(h.upsample_rates) 118 | self.conv_pre = weight_norm( 119 | Conv1d(h.num_mels, h.upsample_initial_channel, 7, 1, padding=3) 120 | ) 121 | resblock = ResBlock 122 | 123 | self.ups = nn.ModuleList() 124 | for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)): 125 | self.ups.append( 126 | weight_norm( 127 | ConvTranspose1d( 128 | h.upsample_initial_channel // (2**i), 129 | h.upsample_initial_channel // (2 ** (i + 1)), 130 | k, 131 | u, 132 | padding=(k - u) // 2, 133 | ) 134 | ) 135 | ) 136 | 137 | self.resblocks = nn.ModuleList() 138 | for i in range(len(self.ups)): 139 | ch = h.upsample_initial_channel // (2 ** (i + 1)) 140 | for j, (k, d) in enumerate( 141 | zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes) 142 | ): 143 | self.resblocks.append(resblock(h, ch, k, d)) 144 | 145 | self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3)) 146 | self.ups.apply(init_weights) 147 | self.conv_post.apply(init_weights) 148 | 149 | def forward(self, x): 150 | x = self.conv_pre(x) 151 | for i in range(self.num_upsamples): 152 | x = F.leaky_relu(x, LRELU_SLOPE) 153 | x = self.ups[i](x) 154 | xs = None 155 | for j in range(self.num_kernels): 156 | if xs is None: 157 | xs = self.resblocks[i * self.num_kernels + j](x) 158 | else: 159 | xs += self.resblocks[i * self.num_kernels + j](x) 160 | x = xs / self.num_kernels 161 | x = F.leaky_relu(x) 162 | x = self.conv_post(x) 163 | x = torch.tanh(x) 164 | 165 | return x 166 | 167 | def remove_weight_norm(self): 168 | # print("Removing weight norm...") 169 | for l in self.ups: 170 | remove_weight_norm(l) 171 | for l in self.resblocks: 172 | l.remove_weight_norm() 173 | remove_weight_norm(self.conv_pre) 174 | remove_weight_norm(self.conv_post) 175 | -------------------------------------------------------------------------------- /audioldm/latent_diffusion/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/naver-ai/rewas/4e09954d386583a4e62e7a1e28c2145ca1cfc411/audioldm/latent_diffusion/__init__.py -------------------------------------------------------------------------------- /audioldm/latent_diffusion/dpm_solver/__init__.py: -------------------------------------------------------------------------------- 1 | from .sampler import DPMSolverSampler 2 | -------------------------------------------------------------------------------- /audioldm/latent_diffusion/dpm_solver/sampler.py: -------------------------------------------------------------------------------- 1 | # reference: 
https://github.com/haoheliu/AudioLDM-training-finetuning 2 | 3 | """SAMPLING ONLY.""" 4 | 5 | import torch 6 | 7 | from .dpm_solver import NoiseScheduleVP, model_wrapper, DPM_Solver 8 | 9 | 10 | class DPMSolverSampler(object): 11 | def __init__(self, model, **kwargs): 12 | super().__init__() 13 | self.model = model 14 | to_torch = lambda x: x.clone().detach().to(torch.float32).to(model.device) 15 | self.register_buffer("alphas_cumprod", to_torch(model.alphas_cumprod)) 16 | 17 | def register_buffer(self, name, attr): 18 | if type(attr) == torch.Tensor: 19 | if attr.device != torch.device("cuda"): 20 | attr = attr.to(torch.device("cuda")) 21 | setattr(self, name, attr) 22 | 23 | @torch.no_grad() 24 | def sample( 25 | self, 26 | S, 27 | batch_size, 28 | shape, 29 | conditioning=None, 30 | callback=None, 31 | normals_sequence=None, 32 | img_callback=None, 33 | quantize_x0=False, 34 | eta=0.0, 35 | mask=None, 36 | x0=None, 37 | temperature=1.0, 38 | noise_dropout=0.0, 39 | score_corrector=None, 40 | corrector_kwargs=None, 41 | verbose=True, 42 | x_T=None, 43 | log_every_t=100, 44 | unconditional_guidance_scale=1.0, 45 | unconditional_conditioning=None, 46 | # this has to come in the same format as the conditioning, # e.g. as encoded tokens, ... 47 | **kwargs, 48 | ): 49 | if conditioning is not None: 50 | if isinstance(conditioning, dict): 51 | cbs = conditioning[list(conditioning.keys())[0]].shape[0] 52 | if cbs != batch_size: 53 | print( 54 | f"Warning: Got {cbs} conditionings but batch-size is {batch_size}" 55 | ) 56 | else: 57 | if conditioning.shape[0] != batch_size: 58 | print( 59 | f"Warning: Got {conditioning.shape[0]} conditionings but batch-size is {batch_size}" 60 | ) 61 | 62 | # sampling 63 | C, H, W = shape 64 | size = (batch_size, C, H, W) 65 | 66 | # print(f'Data shape for DPM-Solver sampling is {size}, sampling steps {S}') 67 | 68 | device = self.model.betas.device 69 | if x_T is None: 70 | img = torch.randn(size, device=device) 71 | else: 72 | img = x_T 73 | 74 | ns = NoiseScheduleVP("discrete", alphas_cumprod=self.alphas_cumprod) 75 | 76 | model_fn = model_wrapper( 77 | lambda x, t, c: self.model.apply_model(x, t, c), 78 | ns, 79 | model_type="noise", 80 | guidance_type="classifier-free", 81 | condition=conditioning, 82 | unconditional_condition=unconditional_conditioning, 83 | guidance_scale=unconditional_guidance_scale, 84 | ) 85 | 86 | dpm_solver = DPM_Solver(model_fn, ns, predict_x0=True, thresholding=False) 87 | x = dpm_solver.sample( 88 | img, 89 | steps=S, 90 | skip_type="time_uniform", 91 | method="multistep", 92 | order=2, 93 | lower_order_final=True, 94 | ) 95 | 96 | return x.to(device), None 97 | -------------------------------------------------------------------------------- /audioldm/latent_encoder/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/naver-ai/rewas/4e09954d386583a4e62e7a1e28c2145ca1cfc411/audioldm/latent_encoder/__init__.py -------------------------------------------------------------------------------- /audioldm/losses/__init__.py: -------------------------------------------------------------------------------- 1 | from .contperceptual import LPIPSWithDiscriminator 2 | -------------------------------------------------------------------------------- /audioldm/utilities/__init__.py: -------------------------------------------------------------------------------- 1 | from .tools import * 2 | from .data import * 3 | from .model_util import * 4 | 
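Usage sketch for DPMSolverSampler defined above (editorial illustration, not part of the repository). The names are assumptions: `ldm` stands for a trained latent-diffusion wrapper exposing device, betas, alphas_cumprod and apply_model(), and `cond` / `uncond` are the conditional and null conditioning tensors used for classifier-free guidance; the latent shape follows the (C, H, W) convention unpacked inside sample().

sampler = DPMSolverSampler(ldm)
latents, _ = sampler.sample(
    S=25,                                # number of DPM-Solver steps (illustrative choice)
    batch_size=cond.shape[0],
    shape=(8, 128, 16),                  # (C, H, W), e.g. latent_embed_dim x latent_t_size x latent_f_size
    conditioning=cond,
    unconditional_conditioning=uncond,
    unconditional_guidance_scale=3.5,    # same guidance scale as used elsewhere in the configs
)
# Decoding the returned latents to a mel spectrogram (and then to a waveform) is handled
# by the surrounding pipeline, not by the sampler itself.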
-------------------------------------------------------------------------------- /audioldm/utilities/audio/__init__.py: -------------------------------------------------------------------------------- 1 | from .audio_processing import * 2 | from .stft import * 3 | from .tools import * 4 | -------------------------------------------------------------------------------- /audioldm/utilities/audio/audio_processing.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import librosa.util as librosa_util 4 | from scipy.signal import get_window 5 | 6 | 7 | def window_sumsquare( 8 | window, 9 | n_frames, 10 | hop_length, 11 | win_length, 12 | n_fft, 13 | dtype=np.float32, 14 | norm=None, 15 | ): 16 | """ 17 | # from librosa 0.6 18 | Compute the sum-square envelope of a window function at a given hop length. 19 | 20 | This is used to estimate modulation effects induced by windowing 21 | observations in short-time fourier transforms. 22 | 23 | Parameters 24 | ---------- 25 | window : string, tuple, number, callable, or list-like 26 | Window specification, as in `get_window` 27 | 28 | n_frames : int > 0 29 | The number of analysis frames 30 | 31 | hop_length : int > 0 32 | The number of samples to advance between frames 33 | 34 | win_length : [optional] 35 | The length of the window function. By default, this matches `n_fft`. 36 | 37 | n_fft : int > 0 38 | The length of each analysis frame. 39 | 40 | dtype : np.dtype 41 | The data type of the output 42 | 43 | Returns 44 | ------- 45 | wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))` 46 | The sum-squared envelope of the window function 47 | """ 48 | if win_length is None: 49 | win_length = n_fft 50 | 51 | n = n_fft + hop_length * (n_frames - 1) 52 | x = np.zeros(n, dtype=dtype) 53 | 54 | # Compute the squared window at the desired length 55 | win_sq = get_window(window, win_length, fftbins=True) 56 | win_sq = librosa_util.normalize(win_sq, norm=norm) ** 2 57 | win_sq = librosa_util.pad_center(win_sq, n_fft) 58 | 59 | # Fill the envelope 60 | for i in range(n_frames): 61 | sample = i * hop_length 62 | x[sample : min(n, sample + n_fft)] += win_sq[: max(0, min(n_fft, n - sample))] 63 | return x 64 | 65 | 66 | def griffin_lim(magnitudes, stft_fn, n_iters=30): 67 | """ 68 | PARAMS 69 | ------ 70 | magnitudes: spectrogram magnitudes 71 | stft_fn: STFT class with transform (STFT) and inverse (ISTFT) methods 72 | """ 73 | 74 | angles = np.angle(np.exp(2j * np.pi * np.random.rand(*magnitudes.size()))) 75 | angles = angles.astype(np.float32) 76 | angles = torch.autograd.Variable(torch.from_numpy(angles)) 77 | signal = stft_fn.inverse(magnitudes, angles).squeeze(1) 78 | 79 | for i in range(n_iters): 80 | _, angles = stft_fn.transform(signal) 81 | signal = stft_fn.inverse(magnitudes, angles).squeeze(1) 82 | return signal 83 | 84 | 85 | def dynamic_range_compression(x, normalize_fun=torch.log, C=1, clip_val=1e-5): 86 | """ 87 | PARAMS 88 | ------ 89 | C: compression factor 90 | """ 91 | return normalize_fun(torch.clamp(x, min=clip_val) * C) 92 | 93 | 94 | def dynamic_range_decompression(x, C=1): 95 | """ 96 | PARAMS 97 | ------ 98 | C: compression factor used to compress 99 | """ 100 | return torch.exp(x) / C 101 | -------------------------------------------------------------------------------- /audioldm/utilities/audio/tools.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from scipy.io.wavfile 
import write 4 | import torchaudio 5 | 6 | from audioldm.utilities.audio.audio_processing import griffin_lim 7 | 8 | 9 | def get_mel_from_wav(audio, _stft): 10 | audio = torch.clip(torch.FloatTensor(audio).unsqueeze(0), -1, 1) 11 | audio = torch.autograd.Variable(audio, requires_grad=False) 12 | melspec, magnitudes, phases, energy = _stft.mel_spectrogram(audio) 13 | melspec = torch.squeeze(melspec, 0).numpy().astype(np.float32) 14 | magnitudes = torch.squeeze(magnitudes, 0).numpy().astype(np.float32) 15 | energy = torch.squeeze(energy, 0).numpy().astype(np.float32) 16 | return melspec, magnitudes, energy 17 | 18 | 19 | def inv_mel_spec(mel, out_filename, _stft, griffin_iters=60): 20 | mel = torch.stack([mel]) 21 | mel_decompress = _stft.spectral_de_normalize(mel) 22 | mel_decompress = mel_decompress.transpose(1, 2).data.cpu() 23 | spec_from_mel_scaling = 1000 24 | spec_from_mel = torch.mm(mel_decompress[0], _stft.mel_basis) 25 | spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0) 26 | spec_from_mel = spec_from_mel * spec_from_mel_scaling 27 | 28 | audio = griffin_lim( 29 | torch.autograd.Variable(spec_from_mel[:, :, :-1]), _stft._stft_fn, griffin_iters 30 | ) 31 | 32 | audio = audio.squeeze() 33 | audio = audio.cpu().numpy() 34 | audio_path = out_filename 35 | write(audio_path, _stft.sampling_rate, audio) 36 | 37 | def read_wav_file(filename, segment_length): 38 | # waveform, sr = librosa.load(filename, sr=None, mono=True) # 4 times slower 39 | waveform, sr = torchaudio.load(filename) # Faster!!! 40 | waveform = torchaudio.functional.resample(waveform, orig_freq=sr, new_freq=16000) 41 | waveform = waveform.numpy()[0, ...] 42 | waveform = normalize_wav(waveform) 43 | waveform = waveform[None, ...] 44 | waveform = pad_wav(waveform, segment_length) 45 | 46 | waveform = waveform / np.max(np.abs(waveform)) 47 | waveform = 0.5 * waveform 48 | 49 | return waveform -------------------------------------------------------------------------------- /audioldm/utilities/data/__init__.py: -------------------------------------------------------------------------------- 1 | from .dataset import Dataset 2 | -------------------------------------------------------------------------------- /audioldm/utilities/data/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | import subprocess 4 | import random 5 | import numpy as np 6 | import soundfile as sf 7 | import torch 8 | import torchvision 9 | from moviepy.editor import VideoFileClip, AudioFileClip 10 | 11 | from encoder.encoder_utils import which_ffmpeg 12 | 13 | def get_video_and_audio(path, get_meta=False, duration=5, start_sec=0, end_sec=None, random_start = False): 14 | rgb, audio, meta = torchvision.io.read_video(str(path), start_sec, end_sec, 'sec', output_format='TCHW') 15 | assert meta['video_fps'], f'No video fps for {path}' 16 | 17 | vlen = int(duration * meta['video_fps']) 18 | meta_out = {'video': {'fps': [meta['video_fps']]}} 19 | 20 | 21 | if random_start: 22 | stx = random.randint(0, int(rgb.size(0)/meta['video_fps']) - duration) 23 | else: 24 | stx = 0 25 | 26 | rgb = rgb[int(stx*meta['video_fps']):int(stx*meta['video_fps']+vlen), :, :, :] 27 | 28 | if rgb.shape[0] < vlen: 29 | 30 | rgb = torch.cat([rgb, rgb, rgb], dim=0) 31 | rgb = rgb[int(stx*meta['video_fps']):int(stx*meta['video_fps']+vlen), :, :, :] 32 | 33 | 34 | if meta.get('audio_fps'): 35 | alen = int(duration * meta['audio_fps']) 36 | audio = audio.mean(dim=0) 37 | audio = 
audio[stx*meta['audio_fps']:(stx*meta['audio_fps']+alen)] 38 | meta_out['audio'] = {'framerate': [meta['audio_fps']]} 39 | else: 40 | meta_out['audio'] = {'framerate': [16000]} 41 | 42 | return rgb, audio, meta_out 43 | 44 | 45 | def save_wave(waveform, savepath, name="outwav"): 46 | if type(name) is not list: 47 | name = [name] * waveform.shape[0] 48 | 49 | paths = [] 50 | for i in range(waveform.shape[0]): 51 | path = os.path.join( 52 | savepath, 53 | "%s_%s.wav" 54 | % ( 55 | os.path.basename(name[i]) 56 | if (not ".wav" in name[i]) 57 | else os.path.basename(name[i]).split(".")[0], 58 | i, 59 | ), 60 | ) 61 | paths.append(path) 62 | print("Save audio to %s" % path) 63 | sf.write(path, waveform[i, 0], samplerate=16000) 64 | 65 | return paths 66 | 67 | def save_video(audio_path, video_path): 68 | 69 | video_clip = VideoFileClip(video_path) 70 | video_clip = video_clip.subclip(0, 5) # generated audio duration is 5 seconds. 71 | 72 | audio_clip = AudioFileClip(audio_path) 73 | video_clip = video_clip.set_audio(audio_clip) 74 | 75 | # Output file path for the final video with audio 76 | out_video_path = audio_path.replace('.wav', '.mp4') 77 | 78 | # Write the video clip with the audio to a new file 79 | video_clip.write_videofile(out_video_path, audio_codec='aac') 80 | 81 | # Close the clips 82 | video_clip.close() 83 | audio_clip.close() 84 | 85 | return 86 | 87 | def re_encode_video(new_path, path, vfps=25, afps=16000, in_size=256): 88 | assert which_ffmpeg() != '', 'Is ffmpeg installed? Check if the conda environment is activated.' 89 | 90 | os.makedirs(new_path, exist_ok=True) 91 | 92 | new_path += f"/{Path(path).stem}_{vfps}fps_{in_size}side_{afps}hz.mp4" 93 | new_path = str(new_path) 94 | cmd = f"{which_ffmpeg()}" 95 | # no info/error printing 96 | cmd += " -hide_banner -loglevel panic" 97 | cmd += f" -y -i {path}" 98 | # 1) change fps, 2) resize: min(H,W)=MIN_SIDE (vertical vids are supported), 3) change audio framerate 99 | cmd += f" -vf fps={vfps},scale=iw*{in_size}/'min(iw,ih)':ih*{in_size}/'min(iw,ih)',crop='trunc(iw/2)'*2:'trunc(ih/2)'*2" 100 | cmd += f" -ar {afps}" 101 | cmd += f" {new_path}" 102 | subprocess.call(cmd.split()) 103 | return new_path 104 | -------------------------------------------------------------------------------- /basketball_bounce.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/naver-ai/rewas/4e09954d386583a4e62e7a1e28c2145ca1cfc411/basketball_bounce.mp4 -------------------------------------------------------------------------------- /configs/audioldm_m_rewas.yaml: -------------------------------------------------------------------------------- 1 | metadata_root: "audioldm_train/config/2023_08_23_reproduce_audioldm/dataset_root.json" 2 | log_directory: ./logs/audioldm_vggsound 3 | project: "audioldm_vggsound" 4 | precision: "high" 5 | data_root: "/path/to/dataset" 6 | 7 | variables: 8 | sampling_rate: &sampling_rate 16000 9 | mel_bins: &mel_bins 64 10 | latent_embed_dim: &latent_embed_dim 8 11 | latent_t_size: &latent_t_size 128 12 | latent_f_size: &latent_f_size 16 13 | in_channels: &unet_in_channels 8 14 | optimize_ddpm_parameter: &optimize_ddpm_parameter true 15 | optimize_gpt: &optimize_gpt true 16 | warmup_steps: &warmup_steps 2000 17 | 18 | data: 19 | train: ["vggsound"] 20 | val: "vggsound" 21 | test: "vggsound" 22 | class_label_indices: "vggsound.tsv" 23 | dataloader_add_ons: [] 24 | dropout: 0.3 25 | train_mode: "energy" 26 | 27 | step: 28 | validation_every_n_epochs: 5 29 | 
save_checkpoint_every_n_steps: 5000 30 | max_steps: 800000 31 | save_top_k: 1 32 | 33 | preprocessing: 34 | audio: 35 | sampling_rate: *sampling_rate 36 | max_wav_value: 32768.0 37 | duration: 5.12 38 | stft: 39 | filter_length: 1024 40 | hop_length: 160 41 | win_length: 1024 42 | mel: 43 | n_mel_channels: *mel_bins 44 | mel_fmin: 0 45 | mel_fmax: 8000 46 | 47 | augmentation: 48 | mixup: 0.0 49 | 50 | model: 51 | target: audioldm.rewas.ReWaS # audioldm_train.modules.latent_diffusion.ddpm.LatentDiffusion 52 | params: 53 | # Autoencoder 54 | first_stage_config: 55 | base_learning_rate: 8.0e-06 56 | target: audioldm.latent_encoder.autoencoder.AutoencoderKL 57 | params: 58 | reload_from_ckpt: ./ckpts/vae_mel_16k_64bins.ckpt 59 | sampling_rate: *sampling_rate 60 | batchsize: 4 61 | monitor: val/rec_loss 62 | image_key: fbank 63 | subband: 1 64 | embed_dim: *latent_embed_dim 65 | time_shuffle: 1 66 | lossconfig: 67 | target: audioldm.losses.LPIPSWithDiscriminator 68 | params: 69 | disc_start: 50001 70 | kl_weight: 1000.0 71 | disc_weight: 0.5 72 | disc_in_channels: 1 73 | ddconfig: 74 | double_z: true 75 | mel_bins: *mel_bins # The frequency bins of mel spectrogram 76 | z_channels: 8 77 | resolution: 128 # 256 78 | downsample_time: false 79 | in_channels: 1 80 | out_ch: 1 81 | ch: 128 82 | ch_mult: 83 | - 1 84 | - 2 85 | - 4 86 | num_res_blocks: 2 87 | attn_resolutions: [] 88 | dropout: 0.0 89 | hifigan_ckpt: "./ckpts/hifigan_16k_64bins.ckpt" 90 | 91 | 92 | control_stage_config: 93 | target: audioldm.rewas.Adapter 94 | params: 95 | image_size: 64 96 | extra_film_condition_dim: 512 # If you use film as extra condition, set this parameter. For example if you have two conditioning vectors each have dimension 512, then this number would be 1024 97 | in_channels: *unet_in_channels # The input channel of the UNet model 98 | model_channels: 192 99 | hint_channels: 1 100 | attention_resolutions: 101 | - 8 102 | - 4 103 | - 2 104 | num_res_blocks: 2 105 | channel_mult: 106 | - 1 107 | - 2 108 | - 3 109 | - 5 110 | num_head_channels: 32 111 | use_spatial_transformer: true 112 | transformer_depth: 1 113 | extra_sa_layer: False 114 | use_checkpoint: true 115 | legacy: False 116 | 117 | # Other parameters 118 | base_learning_rate: 1.0e-5 119 | warmup_steps: *warmup_steps 120 | optimize_ddpm_parameter: *optimize_ddpm_parameter 121 | sampling_rate: *sampling_rate 122 | batchsize: 2 123 | linear_start: 0.0015 124 | linear_end: 0.0195 125 | num_timesteps_cond: 1 126 | log_every_t: 200 127 | timesteps: 1000 128 | unconditional_prob_cfg: 0.1 129 | parameterization: eps 130 | first_stage_key: fbank 131 | latent_t_size: *latent_t_size 132 | latent_f_size: *latent_f_size 133 | channels: *latent_embed_dim 134 | monitor: val/loss_simple_ema 135 | scale_by_std: true 136 | control_key: "hint" 137 | only_mid_control: False 138 | unet_config: 139 | target: audioldm.rewas.ControlledUnetModel 140 | params: 141 | image_size: 64 142 | extra_film_condition_dim: 512 # If you use film as extra condition, set this parameter. 
For example if you have two conditioning vectors each have dimension 512, then this number would be 1024 143 | in_channels: *unet_in_channels # The input channel of the UNet model 144 | out_channels: *latent_embed_dim 145 | model_channels: 192 146 | attention_resolutions: 147 | - 8 148 | - 4 149 | - 2 150 | num_res_blocks: 2 151 | channel_mult: 152 | - 1 153 | - 2 154 | - 3 155 | - 5 156 | num_head_channels: 32 157 | use_spatial_transformer: true 158 | transformer_depth: 1 159 | extra_sa_layer: False 160 | 161 | cond_stage_config: 162 | film_clap_cond1: 163 | cond_stage_key: text 164 | conditioning_key: film 165 | target: audioldm.conditional_models.CLAPAudioEmbeddingClassifierFreev2 166 | params: 167 | pretrained_path: ./ckpts/clap_music_speech_audioset_epoch_15_esc_89.98.pt 168 | sampling_rate: 16000 169 | embed_mode: text 170 | amodel: HTSAT-base 171 | 172 | evaluation_params: 173 | unconditional_guidance_scale: 3.5 174 | ddim_sampling_steps: 200 175 | n_candidates_per_samples: 3 176 | -------------------------------------------------------------------------------- /configs/dataset_root.json: -------------------------------------------------------------------------------- 1 | { 2 | "greatesthits": "/path/to/Greatest_Hits/data/", 3 | "vggsound": "/path/to/vggsound/audiodata/", 4 | "comments":{ 5 | }, 6 | 7 | "metadata":{ 8 | "path": { 9 | "greatesthits":{ 10 | "train": "/path/to/Greatest_Hits/metadata/greatesthits_train_label.json", 11 | "test": "/path/to/Greatest_Hits/metadata/greatesthits_test_label.json", 12 | "val": "/path/to/Greatest_Hits/metadata/greatesthits_val_label.json", 13 | "class_label_indices": "/path/to/Greatest_Hits/metadata/class_labels_indices.csv" 14 | }, 15 | "vggsound":{ 16 | "train": "/path/to/vggsound/metadata/vggsound_train_label.json", 17 | "test": "/path/to/vggsound/metadata/vggsound_test_label.json", 18 | "val": "/path/to/vggsound/metadata/vggsound_valid_label.json", 19 | "class_label_indices": "./vggsound.tsv" 20 | } 21 | } 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /encoder/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Vladimir Iashin 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
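Illustrative sketch (not part of the repository) of how the two configuration files above fit together: configs/audioldm_m_rewas.yaml is the nested training/model configuration, while configs/dataset_root.json maps each dataset name to its audio directory and its split-level metadata files. The loader below uses plain pyyaml/json and is an assumption about usage, not the repository's own loading code.

import json
import yaml

with open("configs/audioldm_m_rewas.yaml") as f:
    cfg = yaml.safe_load(f)                      # YAML anchors such as *sampling_rate are resolved here
sampling_rate = cfg["preprocessing"]["audio"]["sampling_rate"]            # 16000
guidance = cfg["evaluation_params"]["unconditional_guidance_scale"]       # 3.5

with open("configs/dataset_root.json") as f:
    roots = json.load(f)
vggsound_audio_dir = roots["vggsound"]                                    # "/path/to/vggsound/audiodata/"
vggsound_train_meta = roots["metadata"]["path"]["vggsound"]["train"]      # split-level metadata JSON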
-------------------------------------------------------------------------------- /encoder/README.md: -------------------------------------------------------------------------------- 1 | ### Reference 2 | 3 | Part of the code is borrowed from the following repos. 4 | 5 | https://github.com/v-iashin/Synchformer 6 | -------------------------------------------------------------------------------- /encoder/model/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/naver-ai/rewas/4e09954d386583a4e62e7a1e28c2145ca1cfc411/encoder/model/.DS_Store -------------------------------------------------------------------------------- /encoder/model/modules/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/naver-ai/rewas/4e09954d386583a4e62e7a1e28c2145ca1cfc411/encoder/model/modules/.DS_Store -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/naver-ai/rewas/4e09954d386583a4e62e7a1e28c2145ca1cfc411/encoder/model/modules/feat_extractors/.DS_Store -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/__init__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('model/modules/feat_extractors') 3 | sys.path.append('model/modules/feat_extractors/train_clip_src') 4 | -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/__init__.py: -------------------------------------------------------------------------------- 1 | from .coca_model import CoCa 2 | from .constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD 3 | from .factory import create_model, create_model_from_pretrained, get_tokenizer, create_loss 4 | from .factory import list_models, add_model_config, get_model_config, load_checkpoint 5 | from .loss import ClipLoss, DistillClipLoss, CoCaLoss 6 | from .model import CLIP, CustomTextCLIP, CLIPTextCfg, CLIPVisionCfg, \ 7 | convert_weights_to_lp, convert_weights_to_fp16, trace_model, get_cast_dtype 8 | from .openai import load_openai_model, list_openai_models 9 | from .pretrained import list_pretrained, list_pretrained_models_by_tag, list_pretrained_tags_by_model, \ 10 | get_pretrained_url, download_pretrained_from_url, is_pretrained_cfg, get_pretrained_cfg, download_pretrained 11 | from .push_to_hf_hub import push_pretrained_to_hf_hub, push_to_hf_hub 12 | from .tokenizer import SimpleTokenizer, tokenize, decode 13 | from .transform import image_transform, AugmentationCfg 14 | -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/naver-ai/rewas/4e09954d386583a4e62e7a1e28c2145ca1cfc411/encoder/model/modules/feat_extractors/train_clip_src/open_clip/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/constants.py: 
-------------------------------------------------------------------------------- 1 | OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073) 2 | OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711) 3 | -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/generation_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/naver-ai/rewas/4e09954d386583a4e62e7a1e28c2145ca1cfc411/encoder/model/modules/feat_extractors/train_clip_src/open_clip/generation_utils.py -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/hf_configs.py: -------------------------------------------------------------------------------- 1 | # HF architecture dict: 2 | arch_dict = { 3 | # https://huggingface.co/docs/transformers/model_doc/roberta#roberta 4 | "roberta": { 5 | "config_names": { 6 | "context_length": "max_position_embeddings", 7 | "vocab_size": "vocab_size", 8 | "width": "hidden_size", 9 | "heads": "num_attention_heads", 10 | "layers": "num_hidden_layers", 11 | "layer_attr": "layer", 12 | "token_embeddings_attr": "embeddings" 13 | }, 14 | "pooler": "mean_pooler", 15 | }, 16 | # https://huggingface.co/docs/transformers/model_doc/xlm-roberta#transformers.XLMRobertaConfig 17 | "xlm-roberta": { 18 | "config_names": { 19 | "context_length": "max_position_embeddings", 20 | "vocab_size": "vocab_size", 21 | "width": "hidden_size", 22 | "heads": "num_attention_heads", 23 | "layers": "num_hidden_layers", 24 | "layer_attr": "layer", 25 | "token_embeddings_attr": "embeddings" 26 | }, 27 | "pooler": "mean_pooler", 28 | }, 29 | # https://huggingface.co/docs/transformers/model_doc/mt5#mt5 30 | "mt5": { 31 | "config_names": { 32 | # unlimited seqlen 33 | # https://github.com/google-research/text-to-text-transfer-transformer/issues/273 34 | # https://github.com/huggingface/transformers/blob/v4.24.0/src/transformers/models/t5/modeling_t5.py#L374 35 | "context_length": "", 36 | "vocab_size": "vocab_size", 37 | "width": "d_model", 38 | "heads": "num_heads", 39 | "layers": "num_layers", 40 | "layer_attr": "block", 41 | "token_embeddings_attr": "embed_tokens" 42 | }, 43 | "pooler": "mean_pooler", 44 | }, 45 | } 46 | -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/RN101-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": [ 7 | 3, 8 | 4, 9 | 23, 10 | 3 11 | ], 12 | "width": 64, 13 | "patch_size": null 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 512, 19 | "heads": 8, 20 | "layers": 12 21 | } 22 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/RN101.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": [ 6 | 3, 7 | 4, 8 | 23, 9 | 3 10 | ], 11 | "width": 64, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 512, 18 | "heads": 8, 19 | "layers": 12 20 | } 21 | } 
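Illustrative sketch (not part of the repository) of what arch_dict in hf_configs.py above is for: it maps open_clip's generic field names onto the attribute names of each Hugging Face text-encoder config, so different architectures can be queried uniformly. The checkpoint name and AutoConfig usage below are assumptions made for the example only.

from transformers import AutoConfig

hf_config = AutoConfig.from_pretrained("roberta-base")   # assumed example checkpoint
names = arch_dict["roberta"]["config_names"]             # arch_dict from hf_configs.py above

width = getattr(hf_config, names["width"])    # -> hf_config.hidden_size
heads = getattr(hf_config, names["heads"])    # -> hf_config.num_attention_heads
layers = getattr(hf_config, names["layers"])  # -> hf_config.num_hidden_layers
pooler = arch_dict["roberta"]["pooler"]       # "mean_pooler"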
-------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/RN50-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": [ 7 | 3, 8 | 4, 9 | 6, 10 | 3 11 | ], 12 | "width": 64, 13 | "patch_size": null 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 512, 19 | "heads": 8, 20 | "layers": 12 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/RN50.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": [ 6 | 3, 7 | 4, 8 | 6, 9 | 3 10 | ], 11 | "width": 64, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 512, 18 | "heads": 8, 19 | "layers": 12 20 | } 21 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/RN50x16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 384, 5 | "layers": [ 6 | 6, 7 | 8, 8 | 18, 9 | 8 10 | ], 11 | "width": 96, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 768, 18 | "heads": 12, 19 | "layers": 12 20 | } 21 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/RN50x4.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 288, 5 | "layers": [ 6 | 4, 7 | 6, 8 | 10, 9 | 6 10 | ], 11 | "width": 80, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 640, 18 | "heads": 10, 19 | "layers": 12 20 | } 21 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/RN50x64.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 448, 5 | "layers": [ 6 | 3, 7 | 15, 8 | 36, 9 | 10 10 | ], 11 | "width": 128, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 1024, 18 | "heads": 16, 19 | "layers": 12 20 | } 21 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/ViT-B-16-plus-240.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 240, 5 | "layers": 12, 6 | "width": 896, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 640, 13 | "heads": 10, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- 
/encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/ViT-B-16-plus.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 896, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 640, 13 | "heads": 10, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/ViT-B-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/ViT-B-32-plus-256.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 256, 5 | "layers": 12, 6 | "width": 896, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 640, 13 | "heads": 10, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/ViT-B-32-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": 12, 7 | "width": 768, 8 | "patch_size": 32 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/ViT-B-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/ViT-H-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 14 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 1024, 14 | "heads": 16, 15 | "layers": 24 16 | } 17 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/ViT-H-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 
80, 8 | "patch_size": 16 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 1024, 14 | "heads": 16, 15 | "layers": 24 16 | } 17 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/ViT-L-14-280.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 280, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/ViT-L-14-336.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 336, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/ViT-L-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/ViT-L-16-320.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 320, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/ViT-L-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/ViT-M-16-alt.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 384, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 512, 7 | "patch_size": 16, 8 | "ls_init_value": 1e-4 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 384, 14 | "heads": 6, 15 | "layers": 12 16 | } 17 | } -------------------------------------------------------------------------------- 
/encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/ViT-M-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 512, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/ViT-M-32-alt.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 384, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 512, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 384, 13 | "heads": 6, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/ViT-M-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 512, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/ViT-S-16-alt.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 256, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 384, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 256, 13 | "heads": 4, 14 | "layers": 10 15 | } 16 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/ViT-S-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 384, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 384, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 384, 13 | "heads": 6, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/ViT-S-32-alt.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 256, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 384, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 256, 13 | "heads": 4, 14 | "layers": 10 15 | } 16 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/ViT-S-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 384, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 384, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | 
"vocab_size": 49408, 12 | "width": 384, 13 | "heads": 6, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/ViT-bigG-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1280, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 48, 6 | "width": 1664, 7 | "head_width": 104, 8 | "mlp_ratio": 4.9231, 9 | "patch_size": 14 10 | }, 11 | "text_cfg": { 12 | "context_length": 77, 13 | "vocab_size": 49408, 14 | "width": 1280, 15 | "heads": 20, 16 | "layers": 32 17 | } 18 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/ViT-e-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1280, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 56, 6 | "width": 1792, 7 | "head_width": 112, 8 | "mlp_ratio": 8.5715, 9 | "patch_size": 14 10 | }, 11 | "text_cfg": { 12 | "context_length": 77, 13 | "vocab_size": 49408, 14 | "width": 1280, 15 | "heads": 20, 16 | "layers": 36 17 | } 18 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/ViT-g-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 40, 6 | "width": 1408, 7 | "head_width": 88, 8 | "mlp_ratio": 4.3637, 9 | "patch_size": 14 10 | }, 11 | "text_cfg": { 12 | "context_length": 77, 13 | "vocab_size": 49408, 14 | "width": 1024, 15 | "heads": 16, 16 | "layers": 24 17 | } 18 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/coca_ViT-B-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 32, 8 | "attentional_pool": true, 9 | "attn_pooler_heads": 8, 10 | "output_tokens": true 11 | }, 12 | "text_cfg": { 13 | "context_length": 76, 14 | "vocab_size": 49408, 15 | "width": 512, 16 | "heads": 8, 17 | "layers": 12, 18 | "embed_cls": true, 19 | "output_tokens": true 20 | }, 21 | "multimodal_cfg": { 22 | "context_length": 76, 23 | "vocab_size": 49408, 24 | "width": 512, 25 | "heads": 8, 26 | "layers": 12, 27 | "attn_pooler_heads": 8 28 | }, 29 | "custom_text": true 30 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/coca_ViT-L-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14, 8 | "attentional_pool": true, 9 | "attn_pooler_heads": 8, 10 | "output_tokens": true 11 | }, 12 | "text_cfg": { 13 | "context_length": 76, 14 | "vocab_size": 49408, 15 | "width": 768, 16 | "heads": 12, 17 | "layers": 12, 18 | "embed_cls": true, 19 | "output_tokens": true 20 | }, 21 | "multimodal_cfg": { 22 | "context_length": 76, 23 | "vocab_size": 49408, 24 | "width": 768, 25 | "heads": 12, 26 | "layers": 12, 27 | "attn_pooler_heads": 12 28 | }, 29 | 
"custom_text": true 30 | } 31 | -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/coca_base.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "multimodal_cfg": { 4 | "width": 768, 5 | "context_length": 76, 6 | "vocab_size": 64000, 7 | "mlp_ratio": 4, 8 | "layers": 12, 9 | "dim_head": 64, 10 | "heads": 12, 11 | "n_queries": 256, 12 | "attn_pooler_heads": 8 13 | }, 14 | "vision_cfg": { 15 | "image_size": 288, 16 | "layers": 12, 17 | "width": 768, 18 | "patch_size": 18, 19 | "output_tokens": true 20 | }, 21 | "text_cfg": { 22 | "context_length": 76, 23 | "vocab_size": 64000, 24 | "layers": 12, 25 | "heads": 12, 26 | "width": 768, 27 | "embed_cls": true, 28 | "output_tokens": true 29 | }, 30 | "custom_text": true 31 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/coca_roberta-ViT-B-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 32, 8 | "output_tokens": true 9 | }, 10 | "text_cfg": { 11 | "hf_model_name": "roberta-base", 12 | "hf_tokenizer_name": "roberta-base", 13 | "proj": "linear", 14 | "width": 768, 15 | "output_tokens": true 16 | }, 17 | "multimodal_cfg": { 18 | "context_length": 76, 19 | "width": 768, 20 | "heads": 8, 21 | "layers": 12 22 | }, 23 | "custom_text": true 24 | } 25 | -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/convnext_base.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_base", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 224 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 512, 16 | "heads": 8, 17 | "layers": 12 18 | } 19 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/convnext_base_w.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_base", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 256 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 640, 16 | "heads": 10, 17 | "layers": 12 18 | } 19 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/convnext_base_w_320.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_base", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 320 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 
49408, 15 | "width": 640, 16 | "heads": 10, 17 | "layers": 12 18 | } 19 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/convnext_large.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_large", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 224 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 768, 16 | "heads": 12, 17 | "layers": 12 18 | } 19 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/convnext_large_d.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_large", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "mlp", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 256 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 768, 16 | "heads": 12, 17 | "layers": 16 18 | } 19 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/convnext_large_d_320.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_large", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "mlp", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 320 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 768, 16 | "heads": 12, 17 | "layers": 16 18 | } 19 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/convnext_small.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_small", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 224 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 512, 16 | "heads": 8, 17 | "layers": 12 18 | } 19 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/convnext_tiny.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_tiny", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 224 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 512, 16 | "heads": 8, 17 | "layers": 12 18 | } 19 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/convnext_xlarge.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_xlarge", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 256 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 1024, 16 | "heads": 16, 17 | "layers": 20 18 | } 19 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/convnext_xxlarge.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_xxlarge", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 256 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 1024, 16 | "heads": 16, 17 | "layers": 24 18 | } 19 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/convnext_xxlarge_320.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_xxlarge", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 320 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 1024, 16 | "heads": 16, 17 | "layers": 24 18 | } 19 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/mt5-base-ViT-B-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "hf_model_name": "google/mt5-base", 11 | "hf_tokenizer_name": "google/mt5-base", 12 | "proj": "mlp", 13 | "pooler_type": "mean_pooler" 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/mt5-xl-ViT-H-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 14 9 | }, 10 | "text_cfg": { 11 | "hf_model_name": "google/mt5-xl", 12 | "hf_tokenizer_name": "google/mt5-xl", 13 | "proj": "mlp", 14 | "pooler_type": "mean_pooler" 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/roberta-ViT-B-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": 12, 7 | "width": 768, 8 | "patch_size": 32 9 | }, 10 | "text_cfg": { 11 | "hf_model_name": "roberta-base", 12 | "hf_tokenizer_name": "roberta-base", 13 | "proj": "mlp", 14 | "pooler_type": "mean_pooler" 15 | 
} 16 | } 17 | -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/swin_base_patch4_window7_224.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "timm_model_name": "swin_base_patch4_window7_224", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 640, 14 | "heads": 10, 15 | "layers": 12 16 | } 17 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/vit_medium_patch16_gap_256.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "vit_medium_patch16_gap_256", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 256 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/vit_relpos_medium_patch16_cls_224.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "vit_relpos_medium_patch16_cls_224", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/xlm-roberta-base-ViT-B-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "hf_model_name": "xlm-roberta-base", 11 | "hf_tokenizer_name": "xlm-roberta-base", 12 | "proj": "mlp", 13 | "pooler_type": "mean_pooler" 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/model_configs/xlm-roberta-large-ViT-H-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 14 9 | }, 10 | "text_cfg": { 11 | "hf_model_name": "xlm-roberta-large", 12 | "hf_tokenizer_name": "xlm-roberta-large", 13 | "proj": "mlp", 14 | "pooler_type": "mean_pooler" 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/openai.py: -------------------------------------------------------------------------------- 1 | """ OpenAI pretrained model functions 2 | 3 | Adapted from https://github.com/openai/CLIP. Originally MIT License, Copyright (c) 2021 OpenAI. 
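A hedged usage sketch (the model name, device, and flags below are illustrative examples, not values prescribed by this repository; `preprocessed_images` and `tokenized_texts` are placeholder tensors):

    model = load_openai_model("ViT-B-32", device="cpu", jit=False)
    image_features = model.encode_image(preprocessed_images)   # placeholder image batch
    text_features = model.encode_text(tokenized_texts)         # placeholder token batch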
4 | """ 5 | 6 | import os 7 | import warnings 8 | from typing import List, Optional, Union 9 | 10 | import torch 11 | 12 | from .model import build_model_from_openai_state_dict, convert_weights_to_lp, get_cast_dtype 13 | from .pretrained import get_pretrained_url, list_pretrained_models_by_tag, download_pretrained_from_url 14 | 15 | __all__ = ["list_openai_models", "load_openai_model"] 16 | 17 | 18 | def list_openai_models() -> List[str]: 19 | """Returns the names of available CLIP models""" 20 | return list_pretrained_models_by_tag('openai') 21 | 22 | 23 | def load_openai_model( 24 | name: str, 25 | precision: Optional[str] = None, 26 | device: Optional[Union[str, torch.device]] = None, 27 | jit: bool = True, 28 | cache_dir: Optional[str] = None, 29 | ): 30 | """Load a CLIP model 31 | 32 | Parameters 33 | ---------- 34 | name : str 35 | A model name listed by `clip.available_models()`, or the path to a model checkpoint containing the state_dict 36 | precision: str 37 | Model precision, if None defaults to 'fp32' if device == 'cpu' else 'fp16'. 38 | device : Union[str, torch.device] 39 | The device to put the loaded model 40 | jit : bool 41 | Whether to load the optimized JIT model (default) or more hackable non-JIT model. 42 | cache_dir : Optional[str] 43 | The directory to cache the downloaded model weights 44 | 45 | Returns 46 | ------- 47 | model : torch.nn.Module 48 | The CLIP model 49 | preprocess : Callable[[PIL.Image], torch.Tensor] 50 | A torchvision transform that converts a PIL image into a tensor that the returned model can take as its input 51 | """ 52 | if device is None: 53 | device = "cuda" if torch.cuda.is_available() else "cpu" 54 | if precision is None: 55 | precision = 'fp32' if device == 'cpu' else 'fp16' 56 | 57 | if get_pretrained_url(name, 'openai'): 58 | model_path = download_pretrained_from_url(get_pretrained_url(name, 'openai'), cache_dir=cache_dir) 59 | elif os.path.isfile(name): 60 | model_path = name 61 | else: 62 | raise RuntimeError(f"Model {name} not found; available models = {list_openai_models()}") 63 | 64 | try: 65 | # loading JIT archive 66 | model = torch.jit.load(model_path, map_location=device if jit else "cpu").eval() 67 | state_dict = None 68 | except RuntimeError: 69 | # loading saved state dict 70 | if jit: 71 | warnings.warn(f"File {model_path} is not a JIT archive. 
Loading as a state dict instead") 72 | jit = False 73 | state_dict = torch.load(model_path, map_location="cpu") 74 | 75 | if not jit: 76 | # Build a non-jit model from the OpenAI jitted model state dict 77 | cast_dtype = get_cast_dtype(precision) 78 | try: 79 | model = build_model_from_openai_state_dict(state_dict or model.state_dict(), cast_dtype=cast_dtype) 80 | except KeyError: 81 | sd = {k[7:]: v for k, v in state_dict["state_dict"].items()} 82 | model = build_model_from_openai_state_dict(sd, cast_dtype=cast_dtype) 83 | 84 | # model from OpenAI state dict is in manually cast fp16 mode, must be converted for AMP/fp32/bf16 use 85 | model = model.to(device) 86 | if precision.startswith('amp') or precision == 'fp32': 87 | model.float() 88 | elif precision == 'bf16': 89 | convert_weights_to_lp(model, dtype=torch.bfloat16) 90 | 91 | return model 92 | 93 | # patch the device names 94 | device_holder = torch.jit.trace(lambda: torch.ones([]).to(torch.device(device)), example_inputs=[]) 95 | device_node = [n for n in device_holder.graph.findAllNodes("prim::Constant") if "Device" in repr(n)][-1] 96 | 97 | def patch_device(module): 98 | try: 99 | graphs = [module.graph] if hasattr(module, "graph") else [] 100 | except RuntimeError: 101 | graphs = [] 102 | 103 | if hasattr(module, "forward1"): 104 | graphs.append(module.forward1.graph) 105 | 106 | for graph in graphs: 107 | for node in graph.findAllNodes("prim::Constant"): 108 | if "value" in node.attributeNames() and str(node["value"]).startswith("cuda"): 109 | node.copyAttributes(device_node) 110 | 111 | model.apply(patch_device) 112 | patch_device(model.encode_image) 113 | patch_device(model.encode_text) 114 | 115 | # patch dtype to float32 (typically for CPU) 116 | if precision == 'fp32': 117 | float_holder = torch.jit.trace(lambda: torch.ones([]).float(), example_inputs=[]) 118 | float_input = list(float_holder.graph.findNode("aten::to").inputs())[1] 119 | float_node = float_input.node() 120 | 121 | def patch_float(module): 122 | try: 123 | graphs = [module.graph] if hasattr(module, "graph") else [] 124 | except RuntimeError: 125 | graphs = [] 126 | 127 | if hasattr(module, "forward1"): 128 | graphs.append(module.forward1.graph) 129 | 130 | for graph in graphs: 131 | for node in graph.findAllNodes("aten::to"): 132 | inputs = list(node.inputs()) 133 | for i in [1, 2]: # dtype can be the second or third argument to aten::to() 134 | if inputs[i].node()["value"] == 5: 135 | inputs[i].node().copyAttributes(float_node) 136 | 137 | model.apply(patch_float) 138 | patch_float(model.encode_image) 139 | patch_float(model.encode_text) 140 | model.float() 141 | 142 | # ensure image_size attr available at consistent location for both jit and non-jit 143 | model.visual.image_size = model.input_resolution.item() 144 | return model 145 | -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/timm_model.py: -------------------------------------------------------------------------------- 1 | """ timm model adapter 2 | 3 | Wraps timm (https://github.com/rwightman/pytorch-image-models) models for use as a vision tower in CLIP model. 
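A hedged construction sketch (assumes timm is installed; the timm model name and embedding width are illustrative, not defaults of this adapter):

    import torch
    tower = TimmModel("convnext_base", embed_dim=512, image_size=224, proj="linear")
    feats = tower(torch.randn(2, 3, 224, 224))  # pooled trunk features projected to (2, 512)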
4 | """ 5 | import logging 6 | from collections import OrderedDict 7 | 8 | import torch 9 | import torch.nn as nn 10 | 11 | try: 12 | import timm 13 | from timm.models.layers import Mlp, to_2tuple 14 | try: 15 | # old timm imports < 0.8.1 16 | from timm.models.layers.attention_pool2d import RotAttentionPool2d 17 | from timm.models.layers.attention_pool2d import AttentionPool2d as AbsAttentionPool2d 18 | except ImportError: 19 | # new timm imports >= 0.8.1 20 | from timm.layers import RotAttentionPool2d 21 | from timm.layers import AttentionPool2d as AbsAttentionPool2d 22 | except ImportError: 23 | timm = None 24 | 25 | from .utils import freeze_batch_norm_2d 26 | 27 | 28 | class TimmModel(nn.Module): 29 | """ timm model adapter 30 | # FIXME this adapter is a work in progress, may change in ways that break weight compat 31 | """ 32 | 33 | def __init__( 34 | self, 35 | model_name, 36 | embed_dim, 37 | image_size=224, 38 | pool='avg', 39 | proj='linear', 40 | proj_bias=False, 41 | drop=0., 42 | drop_path=None, 43 | pretrained=False, 44 | ): 45 | super().__init__() 46 | if timm is None: 47 | raise RuntimeError("Please `pip install timm` to use timm models.") 48 | 49 | self.image_size = to_2tuple(image_size) 50 | timm_kwargs = {} 51 | if drop_path is not None: 52 | timm_kwargs['drop_path_rate'] = drop_path 53 | self.trunk = timm.create_model(model_name, pretrained=pretrained, **timm_kwargs) 54 | feat_size = self.trunk.default_cfg.get('pool_size', None) 55 | feature_ndim = 1 if not feat_size else 2 56 | if pool in ('abs_attn', 'rot_attn'): 57 | assert feature_ndim == 2 58 | # if attn pooling used, remove both classifier and default pool 59 | self.trunk.reset_classifier(0, global_pool='') 60 | else: 61 | # reset global pool if pool config set, otherwise leave as network default 62 | reset_kwargs = dict(global_pool=pool) if pool else {} 63 | self.trunk.reset_classifier(0, **reset_kwargs) 64 | prev_chs = self.trunk.num_features 65 | 66 | head_layers = OrderedDict() 67 | if pool == 'abs_attn': 68 | head_layers['pool'] = AbsAttentionPool2d(prev_chs, feat_size=feat_size, out_features=embed_dim) 69 | prev_chs = embed_dim 70 | elif pool == 'rot_attn': 71 | head_layers['pool'] = RotAttentionPool2d(prev_chs, out_features=embed_dim) 72 | prev_chs = embed_dim 73 | else: 74 | assert proj, 'projection layer needed if non-attention pooling is used.' 
75 | 76 | # NOTE attention pool ends with a projection layer, so proj should usually be set to '' if such pooling is used 77 | if proj == 'linear': 78 | head_layers['drop'] = nn.Dropout(drop) 79 | head_layers['proj'] = nn.Linear(prev_chs, embed_dim, bias=proj_bias) 80 | elif proj == 'mlp': 81 | head_layers['mlp'] = Mlp(prev_chs, 2 * embed_dim, embed_dim, drop=(drop, 0), bias=(True, proj_bias)) 82 | 83 | self.head = nn.Sequential(head_layers) 84 | 85 | def lock(self, unlocked_groups=0, freeze_bn_stats=False): 86 | """ lock modules 87 | Args: 88 | unlocked_groups (int): leave last n layer groups unlocked (default: 0) 89 | """ 90 | if not unlocked_groups: 91 | # lock full model 92 | for param in self.trunk.parameters(): 93 | param.requires_grad = False 94 | if freeze_bn_stats: 95 | freeze_batch_norm_2d(self.trunk) 96 | else: 97 | # NOTE: partial freeze requires latest timm (master) branch and is subject to change 98 | try: 99 | # FIXME import here until API stable and in an official release 100 | from timm.models.helpers import group_parameters, group_modules 101 | except ImportError: 102 | raise RuntimeError( 103 | 'Please install latest timm `pip install git+https://github.com/rwightman/pytorch-image-models`') 104 | matcher = self.trunk.group_matcher() 105 | gparams = group_parameters(self.trunk, matcher) 106 | max_layer_id = max(gparams.keys()) 107 | max_layer_id = max_layer_id - unlocked_groups 108 | for group_idx in range(max_layer_id + 1): 109 | group = gparams[group_idx] 110 | for param in group: 111 | self.trunk.get_parameter(param).requires_grad = False 112 | if freeze_bn_stats: 113 | gmodules = group_modules(self.trunk, matcher, reverse=True) 114 | gmodules = {k for k, v in gmodules.items() if v <= max_layer_id} 115 | freeze_batch_norm_2d(self.trunk, gmodules) 116 | 117 | @torch.jit.ignore 118 | def set_grad_checkpointing(self, enable=True): 119 | try: 120 | self.trunk.set_grad_checkpointing(enable) 121 | except Exception as e: 122 | logging.warning('grad checkpointing not supported for this timm image tower, continuing without...') 123 | 124 | def forward(self, x): 125 | x = self.trunk(x) 126 | x = self.head(x) 127 | return x 128 | -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/transform.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from dataclasses import dataclass, asdict 3 | from typing import Any, Dict, Optional, Sequence, Tuple, Union 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torchvision.transforms.functional as F 8 | 9 | from torchvision.transforms import Normalize, Compose, RandomResizedCrop, InterpolationMode, ToTensor, Resize, \ 10 | CenterCrop 11 | 12 | from .constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD 13 | 14 | 15 | @dataclass 16 | class AugmentationCfg: 17 | scale: Tuple[float, float] = (0.9, 1.0) 18 | ratio: Optional[Tuple[float, float]] = None 19 | color_jitter: Optional[Union[float, Tuple[float, float, float]]] = None 20 | interpolation: Optional[str] = None 21 | re_prob: Optional[float] = None 22 | re_count: Optional[int] = None 23 | use_timm: bool = False 24 | 25 | 26 | class ResizeMaxSize(nn.Module): 27 | 28 | def __init__(self, max_size, interpolation=InterpolationMode.BICUBIC, fn='max', fill=0): 29 | super().__init__() 30 | if not isinstance(max_size, int): 31 | raise TypeError(f"Size should be int. 
Got {type(max_size)}") 32 | self.max_size = max_size 33 | self.interpolation = interpolation 34 | self.fn = min if fn == 'min' else min 35 | self.fill = fill 36 | 37 | def forward(self, img): 38 | if isinstance(img, torch.Tensor): 39 | height, width = img.shape[:2] 40 | else: 41 | width, height = img.size 42 | scale = self.max_size / float(max(height, width)) 43 | if scale != 1.0: 44 | new_size = tuple(round(dim * scale) for dim in (height, width)) 45 | img = F.resize(img, new_size, self.interpolation) 46 | pad_h = self.max_size - new_size[0] 47 | pad_w = self.max_size - new_size[1] 48 | img = F.pad(img, padding=[pad_w//2, pad_h//2, pad_w - pad_w//2, pad_h - pad_h//2], fill=self.fill) 49 | return img 50 | 51 | 52 | def _convert_to_rgb(image): 53 | return image.convert('RGB') 54 | 55 | 56 | def image_transform( 57 | image_size: int, 58 | is_train: bool, 59 | mean: Optional[Tuple[float, ...]] = None, 60 | std: Optional[Tuple[float, ...]] = None, 61 | resize_longest_max: bool = False, 62 | fill_color: int = 0, 63 | aug_cfg: Optional[Union[Dict[str, Any], AugmentationCfg]] = None, 64 | ): 65 | mean = mean or OPENAI_DATASET_MEAN 66 | if not isinstance(mean, (list, tuple)): 67 | mean = (mean,) * 3 68 | 69 | std = std or OPENAI_DATASET_STD 70 | if not isinstance(std, (list, tuple)): 71 | std = (std,) * 3 72 | 73 | if isinstance(image_size, (list, tuple)) and image_size[0] == image_size[1]: 74 | # for square size, pass size as int so that Resize() uses aspect preserving shortest edge 75 | image_size = image_size[0] 76 | 77 | if isinstance(aug_cfg, dict): 78 | aug_cfg = AugmentationCfg(**aug_cfg) 79 | else: 80 | aug_cfg = aug_cfg or AugmentationCfg() 81 | normalize = Normalize(mean=mean, std=std) 82 | if is_train: 83 | aug_cfg_dict = {k: v for k, v in asdict(aug_cfg).items() if v is not None} 84 | use_timm = aug_cfg_dict.pop('use_timm', False) 85 | if use_timm: 86 | from timm.data import create_transform # timm can still be optional 87 | if isinstance(image_size, (tuple, list)): 88 | assert len(image_size) >= 2 89 | input_size = (3,) + image_size[-2:] 90 | else: 91 | input_size = (3, image_size, image_size) 92 | # by default, timm aug randomly alternates bicubic & bilinear for better robustness at inference time 93 | aug_cfg_dict.setdefault('interpolation', 'random') 94 | aug_cfg_dict.setdefault('color_jitter', None) # disable by default 95 | train_transform = create_transform( 96 | input_size=input_size, 97 | is_training=True, 98 | hflip=0., 99 | mean=mean, 100 | std=std, 101 | re_mode='pixel', 102 | **aug_cfg_dict, 103 | ) 104 | else: 105 | train_transform = Compose([ 106 | RandomResizedCrop( 107 | image_size, 108 | scale=aug_cfg_dict.pop('scale'), 109 | interpolation=InterpolationMode.BICUBIC, 110 | ), 111 | _convert_to_rgb, 112 | ToTensor(), 113 | normalize, 114 | ]) 115 | if aug_cfg_dict: 116 | warnings.warn(f'Unused augmentation cfg items, specify `use_timm` to use ({list(aug_cfg_dict.keys())}).') 117 | return train_transform 118 | else: 119 | if resize_longest_max: 120 | transforms = [ 121 | ResizeMaxSize(image_size, fill=fill_color) 122 | ] 123 | else: 124 | transforms = [ 125 | Resize(image_size, interpolation=InterpolationMode.BICUBIC), 126 | CenterCrop(image_size), 127 | ] 128 | transforms.extend([ 129 | _convert_to_rgb, 130 | ToTensor(), 131 | normalize, 132 | ]) 133 | return Compose(transforms) 134 | -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/utils.py: 
-------------------------------------------------------------------------------- 1 | from itertools import repeat 2 | import collections.abc 3 | 4 | from torch import nn as nn 5 | from torchvision.ops.misc import FrozenBatchNorm2d 6 | 7 | 8 | def freeze_batch_norm_2d(module, module_match={}, name=''): 9 | """ 10 | Converts all `BatchNorm2d` and `SyncBatchNorm` layers of provided module into `FrozenBatchNorm2d`. If `module` is 11 | itself an instance of either `BatchNorm2d` or `SyncBatchNorm`, it is converted into `FrozenBatchNorm2d` and 12 | returned. Otherwise, the module is walked recursively and submodules are converted in place. 13 | 14 | Args: 15 | module (torch.nn.Module): Any PyTorch module. 16 | module_match (dict): Dictionary of full module names to freeze (all if empty) 17 | name (str): Full module name (prefix) 18 | 19 | Returns: 20 | torch.nn.Module: Resulting module 21 | 22 | Inspired by https://github.com/pytorch/pytorch/blob/a5895f85be0f10212791145bfedc0261d364f103/torch/nn/modules/batchnorm.py#L762 23 | """ 24 | res = module 25 | is_match = True 26 | if module_match: 27 | is_match = name in module_match 28 | if is_match and isinstance(module, (nn.modules.batchnorm.BatchNorm2d, nn.modules.batchnorm.SyncBatchNorm)): 29 | res = FrozenBatchNorm2d(module.num_features) 30 | res.num_features = module.num_features 31 | res.affine = module.affine 32 | if module.affine: 33 | res.weight.data = module.weight.data.clone().detach() 34 | res.bias.data = module.bias.data.clone().detach() 35 | res.running_mean.data = module.running_mean.data 36 | res.running_var.data = module.running_var.data 37 | res.eps = module.eps 38 | else: 39 | for child_name, child in module.named_children(): 40 | full_child_name = '.'.join([name, child_name]) if name else child_name 41 | new_child = freeze_batch_norm_2d(child, module_match, full_child_name) 42 | if new_child is not child: 43 | res.add_module(child_name, new_child) 44 | return res 45 | 46 | 47 | # From PyTorch internals 48 | def _ntuple(n): 49 | def parse(x): 50 | if isinstance(x, collections.abc.Iterable): 51 | return x 52 | return tuple(repeat(x, n)) 53 | return parse 54 | 55 | 56 | to_1tuple = _ntuple(1) 57 | to_2tuple = _ntuple(2) 58 | to_3tuple = _ntuple(3) 59 | to_4tuple = _ntuple(4) 60 | to_ntuple = lambda n, x: _ntuple(n)(x) 61 | -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/open_clip/version.py: -------------------------------------------------------------------------------- 1 | __version__ = '2.16.0' 2 | -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/training/.gitignore: -------------------------------------------------------------------------------- 1 | logs/ 2 | -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/training/__init__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('scripts') 3 | -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/training/distributed.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch 4 | import torch.distributed as dist 5 | 6 | 7 | def is_global_master(args): 8 | return args.rank == 0 9 | 10 | 11 | def is_local_master(args): 12 | 
return args.local_rank == 0 13 | 14 | 15 | def is_master(args, local=False): 16 | return is_local_master(args) if local else is_global_master(args) 17 | 18 | 19 | def is_using_distributed(): 20 | if 'WORLD_SIZE' in os.environ: 21 | return int(os.environ['WORLD_SIZE']) > 1 22 | if 'SLURM_NTASKS' in os.environ: 23 | return int(os.environ['SLURM_NTASKS']) > 1 24 | return False 25 | 26 | 27 | def world_info_from_env(): 28 | local_rank = 0 29 | for v in ('LOCAL_RANK', 'MPI_LOCALRANKID', 'SLURM_LOCALID', 'OMPI_COMM_WORLD_LOCAL_RANK'): 30 | if v in os.environ: 31 | local_rank = int(os.environ[v]) 32 | break 33 | global_rank = 0 34 | for v in ('RANK', 'PMI_RANK', 'SLURM_PROCID', 'OMPI_COMM_WORLD_RANK'): 35 | if v in os.environ: 36 | global_rank = int(os.environ[v]) 37 | break 38 | world_size = 1 39 | for v in ('WORLD_SIZE', 'PMI_SIZE', 'SLURM_NTASKS', 'OMPI_COMM_WORLD_SIZE'): 40 | if v in os.environ: 41 | world_size = int(os.environ[v]) 42 | break 43 | 44 | # print('local_rank=%d global_rank=%d world_size=%d' % (local_rank, global_rank, world_size)) 45 | 46 | # # get environemnt vars from os.environ, sort them by key and save to a file unique for each rank 47 | # sorted_env_vars = sorted(os.environ.items(), key=lambda x: x[0]) 48 | # env_vars_path = os.path.join(f"env_vars_{global_rank}.txt") 49 | # with open(env_vars_path, "w") as f: 50 | # for key, value in sorted_env_vars: 51 | # f.write(f"{key}={value}\n") 52 | 53 | return local_rank, global_rank, world_size 54 | 55 | 56 | def init_distributed_device(args): 57 | # Distributed training = training on more than one GPU. 58 | # Works in both single and multi-node scenarios. 59 | args.distributed = False 60 | args.world_size = 1 61 | args.rank = 0 # global rank 62 | args.local_rank = 0 63 | if is_using_distributed(): 64 | if 'SLURM_PROCID' in os.environ: 65 | # DDP via SLURM 66 | args.local_rank, args.rank, args.world_size = world_info_from_env() 67 | # SLURM var -> torch.distributed vars in case needed 68 | os.environ['LOCAL_RANK'] = str(args.local_rank) 69 | os.environ['RANK'] = str(args.rank) 70 | os.environ['WORLD_SIZE'] = str(args.world_size) 71 | torch.distributed.init_process_group( 72 | backend=args.training.dist_backend, 73 | init_method=args.training.dist_url, 74 | world_size=args.world_size, 75 | rank=args.rank, 76 | ) 77 | else: 78 | # DDP via torchrun, torch.distributed.launch 79 | args.local_rank, _, _ = world_info_from_env() 80 | torch.distributed.init_process_group( 81 | backend=args.training.dist_backend, 82 | init_method=args.training.dist_url) 83 | args.world_size = torch.distributed.get_world_size() 84 | args.rank = torch.distributed.get_rank() 85 | args.distributed = True 86 | 87 | if torch.cuda.is_available(): 88 | if args.distributed and not args.training.no_set_device_rank: 89 | device = 'cuda:%d' % args.local_rank 90 | else: 91 | device = 'cuda:0' 92 | torch.cuda.set_device(device) 93 | else: 94 | device = 'cpu' 95 | args.device = device 96 | device = torch.device(device) 97 | return device 98 | 99 | 100 | def broadcast_object(args, obj, src=0): 101 | # broadcast a pickle-able python object from rank-0 to all ranks 102 | if args.rank == src: 103 | objects = [obj] 104 | else: 105 | objects = [None] 106 | dist.broadcast_object_list(objects, src=src) 107 | return objects[0] 108 | 109 | 110 | def all_gather_object(args, obj, dst=0): 111 | # gather a pickle-able python object across all ranks 112 | objects = [None for _ in range(args.world_size)] 113 | dist.all_gather_object(objects, obj) 114 | return objects 115 | 
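The distributed helpers above resolve local rank, global rank, and world size from whichever launcher populated the environment (torchrun, SLURM, or MPI). Below is a minimal, hedged sketch of exercising them in a single-process setting; the hard-coded environment values and the `args` namespace are illustrative stand-ins, and the import path assumes this module is importable as `training.distributed`.

    import os
    from types import SimpleNamespace

    from training.distributed import is_master, is_using_distributed, world_info_from_env

    # Values that torchrun/SLURM would normally export; hard-coded here for illustration only.
    os.environ.setdefault("RANK", "0")
    os.environ.setdefault("LOCAL_RANK", "0")
    os.environ.setdefault("WORLD_SIZE", "1")

    local_rank, global_rank, world_size = world_info_from_env()
    args = SimpleNamespace(rank=global_rank, local_rank=local_rank, world_size=world_size)

    print(is_using_distributed())  # False: WORLD_SIZE=1 means single-process
    print(is_master(args))         # True on the global rank-0 process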
-------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/training/file_utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import multiprocessing 4 | import subprocess 5 | import time 6 | import fsspec 7 | import torch 8 | from tqdm import tqdm 9 | 10 | def remote_sync_s3(local_dir, remote_dir): 11 | # skip epoch_latest which can change during sync. 12 | result = subprocess.run(["aws", "s3", "sync", local_dir, remote_dir, '--exclude', '*epoch_latest.pt'], stdout=subprocess.PIPE, stderr=subprocess.PIPE) 13 | if result.returncode != 0: 14 | logging.error(f"Error: Failed to sync with S3 bucket {result.stderr.decode('utf-8')}") 15 | return False 16 | 17 | logging.info(f"Successfully synced with S3 bucket") 18 | return True 19 | 20 | def remote_sync_fsspec(local_dir, remote_dir): 21 | # FIXME currently this is slow and not recommended. Look into speeding up. 22 | a = fsspec.get_mapper(local_dir) 23 | b = fsspec.get_mapper(remote_dir) 24 | 25 | for k in a: 26 | # skip epoch_latest which can change during sync. 27 | if 'epoch_latest.pt' in k: 28 | continue 29 | 30 | logging.info(f'Attempting to sync {k}') 31 | if k in b and len(a[k]) == len(b[k]): 32 | logging.debug(f'Skipping remote sync for {k}.') 33 | continue 34 | 35 | try: 36 | logging.info(f'Successful sync for {k}.') 37 | b[k] = a[k] 38 | except Exception as e: 39 | logging.info(f'Error during remote sync for {k}: {e}') 40 | return False 41 | 42 | return True 43 | 44 | def remote_sync(local_dir, remote_dir, protocol): 45 | logging.info('Starting remote sync.') 46 | if protocol == 's3': 47 | return remote_sync_s3(local_dir, remote_dir) 48 | elif protocol == 'fsspec': 49 | return remote_sync_fsspec(local_dir, remote_dir) 50 | else: 51 | logging.error('Remote protocol not known') 52 | return False 53 | 54 | def keep_running_remote_sync(sync_every, local_dir, remote_dir, protocol): 55 | while True: 56 | time.sleep(sync_every) 57 | remote_sync(local_dir, remote_dir, protocol) 58 | 59 | def start_sync_process(sync_every, local_dir, remote_dir, protocol): 60 | p = multiprocessing.Process(target=keep_running_remote_sync, args=(sync_every, local_dir, remote_dir, protocol)) 61 | return p 62 | 63 | # Note: we are not currently using this save function. 
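# A hedged usage sketch for the checkpoint helpers defined below; the s3 paths are
# illustrative placeholders, not locations used by this repository:
#
#     if check_exists("s3://bucket/exp/checkpoints/epoch_latest.pt"):
#         ckpt = pt_load("s3://bucket/exp/checkpoints/epoch_latest.pt", map_location="cpu")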
64 | def pt_save(pt_obj, file_path): 65 | of = fsspec.open(file_path, "wb") 66 | with of as f: 67 | torch.save(pt_obj, file_path) 68 | 69 | def pt_load(file_path, map_location=None): 70 | if file_path.startswith('s3'): 71 | logging.info('Loading remote checkpoint, which may take a bit.') 72 | of = fsspec.open(file_path, "rb") 73 | with of as f: 74 | out = torch.load(f, map_location=map_location) 75 | return out 76 | 77 | def check_exists(file_path): 78 | try: 79 | with fsspec.open(file_path): 80 | pass 81 | except FileNotFoundError: 82 | return False 83 | return True 84 | -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/training/logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | 4 | def setup_logging(log_file, level, include_host=False): 5 | if include_host: 6 | import socket 7 | hostname = socket.gethostname() 8 | formatter = logging.Formatter( 9 | f'%(asctime)s | {hostname} | %(levelname)s | %(message)s', datefmt='%Y-%m-%d,%H:%M:%S') 10 | else: 11 | formatter = logging.Formatter('%(asctime)s | %(levelname)s | %(message)s', datefmt='%Y-%m-%d,%H:%M:%S') 12 | 13 | logging.root.setLevel(level) 14 | loggers = [logging.getLogger(name) for name in logging.root.manager.loggerDict] 15 | for logger in loggers: 16 | if logger.name.startswith(('transformers', )): # these guys are too verbose at INFO 17 | logger.setLevel(logging.WARNING) 18 | else: 19 | logger.setLevel(level) 20 | 21 | stream_handler = logging.StreamHandler() 22 | stream_handler.setFormatter(formatter) 23 | logging.root.addHandler(stream_handler) 24 | 25 | if log_file: 26 | file_handler = logging.FileHandler(filename=log_file) 27 | file_handler.setFormatter(formatter) 28 | logging.root.addHandler(file_handler) 29 | -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/training/precision.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from contextlib import suppress 3 | 4 | 5 | def get_autocast(precision): 6 | if precision == 'amp': 7 | return torch.cuda.amp.autocast 8 | elif precision == 'amp_bfloat16' or precision == 'amp_bf16': 9 | # amp_bfloat16 is more stable than amp float16 for clip training 10 | return lambda: torch.cuda.amp.autocast(dtype=torch.bfloat16) 11 | else: 12 | return suppress 13 | -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/training/profile.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import torch 4 | import open_clip 5 | import pandas as pd 6 | from fvcore.nn import FlopCountAnalysis, flop_count_str, ActivationCountAnalysis 7 | 8 | 9 | parser = argparse.ArgumentParser(description='OpenCLIP Profiler') 10 | 11 | # benchmark specific args 12 | parser.add_argument('--model', metavar='NAME', default='', 13 | help='model(s) to profile') 14 | parser.add_argument('--results-file', default='', type=str, metavar='FILENAME', 15 | help='Output csv file for results') 16 | 17 | 18 | def profile_fvcore( 19 | model, 20 | image_input_size=(3, 224, 224), 21 | text_input_size=(77,), 22 | batch_size=1, 23 | detailed=False, 24 | force_cpu=False 25 | ): 26 | if force_cpu: 27 | model = model.to('cpu') 28 | device, dtype = next(model.parameters()).device, next(model.parameters()).dtype 29 | 
example_image_input = torch.ones((batch_size,) + image_input_size, device=device, dtype=dtype) 30 | example_text_input = torch.ones((batch_size,) + text_input_size, device=device, dtype=torch.int64) 31 | fca = FlopCountAnalysis(model, (example_image_input, example_text_input)) 32 | aca = ActivationCountAnalysis(model, (example_image_input, example_text_input)) 33 | if detailed: 34 | fcs = flop_count_str(fca) 35 | print(fcs) 36 | return fca.total(), aca.total() 37 | 38 | 39 | def profile_fvcore_text( 40 | model, 41 | text_input_size=(77,), 42 | batch_size=1, 43 | detailed=False, 44 | force_cpu=False 45 | ): 46 | if force_cpu: 47 | model = model.to('cpu') 48 | device = next(model.parameters()).device 49 | example_input = torch.ones((batch_size,) + text_input_size, device=device, dtype=torch.int64) 50 | fca = FlopCountAnalysis(model, example_input) 51 | aca = ActivationCountAnalysis(model, example_input) 52 | if detailed: 53 | fcs = flop_count_str(fca) 54 | print(fcs) 55 | return fca.total(), aca.total() 56 | 57 | 58 | def profile_fvcore_image( 59 | model, 60 | image_input_size=(3, 224, 224), 61 | batch_size=1, 62 | detailed=False, 63 | force_cpu=False 64 | ): 65 | if force_cpu: 66 | model = model.to('cpu') 67 | device, dtype = next(model.parameters()).device, next(model.parameters()).dtype 68 | example_input = torch.ones((batch_size,) + image_input_size, device=device, dtype=dtype) 69 | fca = FlopCountAnalysis(model, example_input) 70 | aca = ActivationCountAnalysis(model, example_input) 71 | if detailed: 72 | fcs = flop_count_str(fca) 73 | print(fcs) 74 | return fca.total(), aca.total() 75 | 76 | 77 | def count_params(model): 78 | return sum([m.numel() for m in model.parameters()]) 79 | 80 | 81 | def profile_model(model_name): 82 | model = open_clip.create_model(model_name, force_custom_text=True, pretrained_hf=False) 83 | model.eval() 84 | if torch.cuda.is_available(): 85 | model = model.cuda() 86 | 87 | if isinstance(model.visual.image_size, (tuple, list)): 88 | image_input_size = (3,) + tuple(model.visual.image_size[-2:]) 89 | else: 90 | image_input_size = (3, model.visual.image_size, model.visual.image_size) 91 | text_input_size = (77,) 92 | 93 | results = {} 94 | results['model'] = model_name 95 | results['image_size'] = image_input_size[1] 96 | 97 | model_cfg = open_clip.get_model_config(model_name) 98 | if model_cfg: 99 | vision_cfg = open_clip.CLIPVisionCfg(**model_cfg['vision_cfg']) 100 | text_cfg = open_clip.CLIPTextCfg(**model_cfg['text_cfg']) 101 | results['image_width'] = int(vision_cfg.width) 102 | results['text_width'] = int(text_cfg.width) 103 | results['embed_dim'] = int(model_cfg['embed_dim']) 104 | else: 105 | results['image_width'] = 0 106 | results['text_width'] = 0 107 | results['embed_dim'] = 0 108 | 109 | retries = 2 110 | while retries: 111 | retries -= 1 112 | try: 113 | macs, acts = profile_fvcore( 114 | model, image_input_size=image_input_size, text_input_size=text_input_size, force_cpu=not retries) 115 | 116 | image_macs, image_acts = profile_fvcore_image( 117 | model.visual, image_input_size=image_input_size, force_cpu=not retries) 118 | 119 | text_macs, text_acts = profile_fvcore_text( 120 | model.text, text_input_size=text_input_size, force_cpu=not retries) 121 | 122 | results['gmacs'] = round(macs / 1e9, 2) 123 | results['macts'] = round(acts / 1e6, 2) 124 | results['mparams'] = round(count_params(model) / 1e6, 2) 125 | results['image_gmacs'] = round(image_macs / 1e9, 2) 126 | results['image_macts'] = round(image_acts / 1e6, 2) 127 | results['image_mparams'] 
= round(count_params(model.visual) / 1e6, 2) 128 | results['text_gmacs'] = round(text_macs / 1e9, 2) 129 | results['text_macts'] = round(text_acts / 1e6, 2) 130 | results['text_mparams'] = round(count_params(model.text) / 1e6, 2) 131 | except RuntimeError as e: 132 | pass 133 | return results 134 | 135 | 136 | def main(): 137 | args = parser.parse_args() 138 | 139 | # FIXME accept a text file name to allow lists of models in txt/csv 140 | if args.model == 'all': 141 | parsed_model = open_clip.list_models() 142 | else: 143 | parsed_model = args.model.split(',') 144 | 145 | results = [] 146 | for m in parsed_model: 147 | row = profile_model(m) 148 | results.append(row) 149 | 150 | df = pd.DataFrame(results, columns=results[0].keys()) 151 | df = df.sort_values('gmacs') 152 | print(df) 153 | if args.results_file: 154 | df.to_csv(args.results_file, index=False) 155 | 156 | 157 | if __name__ == '__main__': 158 | main() 159 | -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/training/scheduler.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def assign_learning_rate(optimizer, new_lr): 5 | for param_group in optimizer.param_groups: 6 | param_group["lr"] = new_lr 7 | 8 | 9 | def _warmup_lr(base_lr, warmup_length, step): 10 | return base_lr * (step + 1) / warmup_length 11 | 12 | 13 | def const_lr(optimizer, base_lr, warmup_length, steps): 14 | def _lr_adjuster(step): 15 | if step < warmup_length: 16 | lr = _warmup_lr(base_lr, warmup_length, step) 17 | else: 18 | lr = base_lr 19 | assign_learning_rate(optimizer, lr) 20 | return lr 21 | return _lr_adjuster 22 | 23 | 24 | def const_lr_cooldown(optimizer, base_lr, warmup_length, steps, cooldown_steps, cooldown_power=1.0, cooldown_end_lr=0.): 25 | def _lr_adjuster(step): 26 | start_cooldown_step = steps - cooldown_steps 27 | if step < warmup_length: 28 | lr = _warmup_lr(base_lr, warmup_length, step) 29 | else: 30 | if step < start_cooldown_step: 31 | lr = base_lr 32 | else: 33 | e = step - start_cooldown_step 34 | es = steps - start_cooldown_step 35 | # linear decay if power == 1; polynomial decay otherwise; 36 | decay = (1 - (e/es)) ** cooldown_power 37 | lr = decay * (base_lr - cooldown_end_lr) + cooldown_end_lr 38 | assign_learning_rate(optimizer, lr) 39 | return lr 40 | return _lr_adjuster 41 | 42 | 43 | def cosine_lr(optimizer, base_lr, warmup_length, steps): 44 | def _lr_adjuster(step): 45 | if step < warmup_length: 46 | lr = _warmup_lr(base_lr, warmup_length, step) 47 | else: 48 | e = step - warmup_length 49 | es = steps - warmup_length 50 | lr = 0.5 * (1 + np.cos(np.pi * e / es)) * base_lr 51 | assign_learning_rate(optimizer, lr) 52 | return lr 53 | return _lr_adjuster 54 | -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/train_clip_src/training/zero_shot.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import torch 4 | import torch.nn.functional as F 5 | from tqdm import tqdm 6 | 7 | from open_clip import get_cast_dtype, get_tokenizer 8 | from .precision import get_autocast 9 | from .imagenet_zeroshot_data import imagenet_classnames, openai_imagenet_template 10 | 11 | 12 | def zero_shot_classifier(model, classnames, templates, args): 13 | tokenizer = get_tokenizer(args.model) 14 | with torch.no_grad(): 15 | zeroshot_weights = [] 16 | for classname in 
tqdm(classnames): 17 | texts = [template(classname) for template in templates] # format with class 18 | texts = tokenizer(texts).to(args.device) # tokenize 19 | if args.distributed: 20 | class_embeddings = model.module.encode_text(texts) 21 | else: 22 | class_embeddings = model.encode_text(texts) 23 | class_embedding = F.normalize(class_embeddings, dim=-1).mean(dim=0) 24 | class_embedding /= class_embedding.norm() 25 | zeroshot_weights.append(class_embedding) 26 | zeroshot_weights = torch.stack(zeroshot_weights, dim=1).to(args.device) 27 | return zeroshot_weights 28 | 29 | 30 | def accuracy(output, target, topk=(1,)): 31 | pred = output.topk(max(topk), 1, True, True)[1].t() 32 | correct = pred.eq(target.view(1, -1).expand_as(pred)) 33 | return [float(correct[:k].reshape(-1).float().sum(0, keepdim=True).cpu().numpy()) for k in topk] 34 | 35 | 36 | def run(model, classifier, dataloader, args): 37 | autocast = get_autocast(args.precision) 38 | cast_dtype = get_cast_dtype(args.precision) 39 | with torch.no_grad(): 40 | top1, top5, n = 0., 0., 0. 41 | for images, target in tqdm(dataloader, unit_scale=args.batch_size): 42 | images = images.to(args.device) 43 | if cast_dtype is not None: 44 | images = images.to(dtype=cast_dtype) 45 | target = target.to(args.device) 46 | 47 | with autocast(): 48 | # predict 49 | if args.distributed: 50 | image_features = model.module.encode_image(images) 51 | else: 52 | image_features = model.encode_image(images) 53 | image_features = F.normalize(image_features, dim=-1) 54 | logits = 100. * image_features @ classifier 55 | 56 | # measure accuracy 57 | acc1, acc5 = accuracy(logits, target, topk=(1, 5)) 58 | top1 += acc1 59 | top5 += acc5 60 | n += images.size(0) 61 | 62 | top1 = (top1 / n) 63 | top5 = (top5 / n) 64 | return top1, top5 65 | 66 | 67 | def zero_shot_eval(model, data, epoch, args): 68 | if 'imagenet-val' not in data and 'imagenet-v2' not in data: 69 | return {} 70 | if args.zeroshot_frequency == 0: 71 | return {} 72 | if (epoch % args.zeroshot_frequency) != 0 and epoch != args.epochs: 73 | return {} 74 | 75 | logging.info('Starting zero-shot imagenet.') 76 | 77 | logging.info('Building zero-shot classifier') 78 | classifier = zero_shot_classifier(model, imagenet_classnames, openai_imagenet_template, args) 79 | 80 | logging.info('Using classifier') 81 | results = {} 82 | if 'imagenet-val' in data: 83 | top1, top5 = run(model, classifier, data['imagenet-val'].dataloader, args) 84 | results['imagenet-zeroshot-val-top1'] = top1 85 | results['imagenet-zeroshot-val-top5'] = top5 86 | if 'imagenet-v2' in data: 87 | top1, top5 = run(model, classifier, data['imagenet-v2'].dataloader, args) 88 | results['imagenetv2-zeroshot-val-top1'] = top1 89 | results['imagenetv2-zeroshot-val-top5'] = top5 90 | 91 | logging.info('Finished zero-shot imagenet.') 92 | 93 | return results 94 | -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/visual/__init__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('model/modules/feat_extractors/visual') # nopep8 3 | -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/visual/motionformer_src/divided_224_16x4.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: Ssv2 4 | BATCH_SIZE: 32 5 | EVAL_PERIOD: 5 6 | CHECKPOINT_PERIOD: 5 7 | AUTO_RESUME: True 8 | 
CHECKPOINT_EPOCH_RESET: True 9 | CHECKPOINT_FILE_PATH: /checkpoint/fmetze/neurips_sota/40944587/checkpoints/checkpoint_epoch_00035.pyth 10 | DATA: 11 | NUM_FRAMES: 16 12 | SAMPLING_RATE: 4 13 | TRAIN_JITTER_SCALES: [256, 320] 14 | TRAIN_CROP_SIZE: 224 15 | TEST_CROP_SIZE: 224 16 | INPUT_CHANNEL_NUM: [3] 17 | MEAN: [0.5, 0.5, 0.5] 18 | STD: [0.5, 0.5, 0.5] 19 | PATH_TO_DATA_DIR: /private/home/mandelapatrick/slowfast/data/ssv2 20 | PATH_PREFIX: /datasets01/SomethingV2/092720/20bn-something-something-v2-frames 21 | INV_UNIFORM_SAMPLE: True 22 | RANDOM_FLIP: False 23 | REVERSE_INPUT_CHANNEL: True 24 | USE_RAND_AUGMENT: True 25 | RE_PROB: 0.0 26 | USE_REPEATED_AUG: False 27 | USE_RANDOM_RESIZE_CROPS: False 28 | COLORJITTER: False 29 | GRAYSCALE: False 30 | GAUSSIAN: False 31 | SOLVER: 32 | BASE_LR: 1e-4 33 | LR_POLICY: steps_with_relative_lrs 34 | LRS: [1, 0.1, 0.01] 35 | STEPS: [0, 20, 30] 36 | MAX_EPOCH: 35 37 | MOMENTUM: 0.9 38 | WEIGHT_DECAY: 5e-2 39 | WARMUP_EPOCHS: 0.0 40 | OPTIMIZING_METHOD: adamw 41 | USE_MIXED_PRECISION: True 42 | SMOOTHING: 0.2 43 | SLOWFAST: 44 | ALPHA: 8 45 | VIT: 46 | PATCH_SIZE: 16 47 | PATCH_SIZE_TEMP: 2 48 | CHANNELS: 3 49 | EMBED_DIM: 768 50 | DEPTH: 12 51 | NUM_HEADS: 12 52 | MLP_RATIO: 4 53 | QKV_BIAS: True 54 | VIDEO_INPUT: True 55 | TEMPORAL_RESOLUTION: 8 56 | USE_MLP: True 57 | DROP: 0.0 58 | POS_DROPOUT: 0.0 59 | DROP_PATH: 0.2 60 | IM_PRETRAINED: True 61 | HEAD_DROPOUT: 0.0 62 | HEAD_ACT: tanh 63 | PRETRAINED_WEIGHTS: vit_1k 64 | ATTN_LAYER: divided 65 | MODEL: 66 | NUM_CLASSES: 174 67 | ARCH: slow 68 | MODEL_NAME: VisionTransformer 69 | LOSS_FUNC: cross_entropy 70 | TEST: 71 | ENABLE: True 72 | DATASET: Ssv2 73 | BATCH_SIZE: 64 74 | NUM_ENSEMBLE_VIEWS: 1 75 | NUM_SPATIAL_CROPS: 3 76 | DATA_LOADER: 77 | NUM_WORKERS: 4 78 | PIN_MEMORY: True 79 | NUM_GPUS: 8 80 | NUM_SHARDS: 4 81 | RNG_SEED: 0 82 | OUTPUT_DIR: . 
83 | TENSORBOARD: 84 | ENABLE: True 85 | -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/visual/motionformer_src/joint_224_16x4.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: Ssv2 4 | BATCH_SIZE: 32 5 | EVAL_PERIOD: 5 6 | CHECKPOINT_PERIOD: 5 7 | AUTO_RESUME: True 8 | CHECKPOINT_EPOCH_RESET: True 9 | CHECKPOINT_FILE_PATH: /checkpoint/fmetze/neurips_sota/40944587/checkpoints/checkpoint_epoch_00035.pyth 10 | DATA: 11 | NUM_FRAMES: 16 12 | SAMPLING_RATE: 4 13 | TRAIN_JITTER_SCALES: [256, 320] 14 | TRAIN_CROP_SIZE: 224 15 | TEST_CROP_SIZE: 224 16 | INPUT_CHANNEL_NUM: [3] 17 | MEAN: [0.5, 0.5, 0.5] 18 | STD: [0.5, 0.5, 0.5] 19 | PATH_TO_DATA_DIR: /private/home/mandelapatrick/slowfast/data/ssv2 20 | PATH_PREFIX: /datasets01/SomethingV2/092720/20bn-something-something-v2-frames 21 | INV_UNIFORM_SAMPLE: True 22 | RANDOM_FLIP: False 23 | REVERSE_INPUT_CHANNEL: True 24 | USE_RAND_AUGMENT: False 25 | RE_PROB: 0.0 26 | USE_REPEATED_AUG: False 27 | USE_RANDOM_RESIZE_CROPS: False 28 | COLORJITTER: False 29 | GRAYSCALE: False 30 | GAUSSIAN: False 31 | SOLVER: 32 | BASE_LR: 1e-4 33 | LR_POLICY: steps_with_relative_lrs 34 | LRS: [1, 0.1, 0.01] 35 | STEPS: [0, 20, 30] 36 | MAX_EPOCH: 35 37 | MOMENTUM: 0.9 38 | WEIGHT_DECAY: 5e-2 39 | WARMUP_EPOCHS: 0.0 40 | OPTIMIZING_METHOD: adamw 41 | USE_MIXED_PRECISION: True 42 | SMOOTHING: 0.2 43 | SLOWFAST: 44 | ALPHA: 8 45 | VIT: 46 | PATCH_SIZE: 16 47 | PATCH_SIZE_TEMP: 2 48 | CHANNELS: 3 49 | EMBED_DIM: 768 50 | DEPTH: 12 51 | NUM_HEADS: 12 52 | MLP_RATIO: 4 53 | QKV_BIAS: True 54 | VIDEO_INPUT: True 55 | TEMPORAL_RESOLUTION: 8 56 | USE_MLP: True 57 | DROP: 0.0 58 | POS_DROPOUT: 0.0 59 | DROP_PATH: 0.2 60 | IM_PRETRAINED: True 61 | HEAD_DROPOUT: 0.0 62 | HEAD_ACT: tanh 63 | PRETRAINED_WEIGHTS: vit_1k 64 | ATTN_LAYER: joint 65 | MODEL: 66 | NUM_CLASSES: 174 67 | ARCH: slow 68 | MODEL_NAME: VisionTransformer 69 | LOSS_FUNC: cross_entropy 70 | TEST: 71 | ENABLE: True 72 | DATASET: Ssv2 73 | BATCH_SIZE: 64 74 | NUM_ENSEMBLE_VIEWS: 1 75 | NUM_SPATIAL_CROPS: 3 76 | DATA_LOADER: 77 | NUM_WORKERS: 4 78 | PIN_MEMORY: True 79 | NUM_GPUS: 8 80 | NUM_SHARDS: 4 81 | RNG_SEED: 0 82 | OUTPUT_DIR: . 
83 | TENSORBOARD: 84 | ENABLE: True 85 | -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/visual/motionformer_src/motionformer_224_16x4.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: Ssv2 4 | BATCH_SIZE: 32 5 | EVAL_PERIOD: 5 6 | CHECKPOINT_PERIOD: 5 7 | AUTO_RESUME: True 8 | CHECKPOINT_EPOCH_RESET: True 9 | CHECKPOINT_FILE_PATH: /checkpoint/fmetze/neurips_sota/40944587/checkpoints/checkpoint_epoch_00035.pyth 10 | DATA: 11 | NUM_FRAMES: 16 12 | SAMPLING_RATE: 4 13 | TRAIN_JITTER_SCALES: [256, 320] 14 | TRAIN_CROP_SIZE: 224 15 | TEST_CROP_SIZE: 224 16 | INPUT_CHANNEL_NUM: [3] 17 | MEAN: [0.5, 0.5, 0.5] 18 | STD: [0.5, 0.5, 0.5] 19 | PATH_TO_DATA_DIR: /private/home/mandelapatrick/slowfast/data/ssv2 20 | PATH_PREFIX: /datasets01/SomethingV2/092720/20bn-something-something-v2-frames 21 | INV_UNIFORM_SAMPLE: True 22 | RANDOM_FLIP: False 23 | REVERSE_INPUT_CHANNEL: True 24 | USE_RAND_AUGMENT: True 25 | RE_PROB: 0.0 26 | USE_REPEATED_AUG: False 27 | USE_RANDOM_RESIZE_CROPS: False 28 | COLORJITTER: False 29 | GRAYSCALE: False 30 | GAUSSIAN: False 31 | SOLVER: 32 | BASE_LR: 1e-4 33 | LR_POLICY: steps_with_relative_lrs 34 | LRS: [1, 0.1, 0.01] 35 | STEPS: [0, 20, 30] 36 | MAX_EPOCH: 35 37 | MOMENTUM: 0.9 38 | WEIGHT_DECAY: 5e-2 39 | WARMUP_EPOCHS: 0.0 40 | OPTIMIZING_METHOD: adamw 41 | USE_MIXED_PRECISION: True 42 | SMOOTHING: 0.2 43 | SLOWFAST: 44 | ALPHA: 8 45 | VIT: 46 | PATCH_SIZE: 16 47 | PATCH_SIZE_TEMP: 2 48 | CHANNELS: 3 49 | EMBED_DIM: 768 50 | DEPTH: 12 51 | NUM_HEADS: 12 52 | MLP_RATIO: 4 53 | QKV_BIAS: True 54 | VIDEO_INPUT: True 55 | TEMPORAL_RESOLUTION: 8 56 | USE_MLP: True 57 | DROP: 0.0 58 | POS_DROPOUT: 0.0 59 | DROP_PATH: 0.2 60 | IM_PRETRAINED: True 61 | HEAD_DROPOUT: 0.0 62 | HEAD_ACT: tanh 63 | PRETRAINED_WEIGHTS: vit_1k 64 | ATTN_LAYER: trajectory 65 | MODEL: 66 | NUM_CLASSES: 174 67 | ARCH: slow 68 | MODEL_NAME: VisionTransformer 69 | LOSS_FUNC: cross_entropy 70 | TEST: 71 | ENABLE: True 72 | DATASET: Ssv2 73 | BATCH_SIZE: 64 74 | NUM_ENSEMBLE_VIEWS: 1 75 | NUM_SPATIAL_CROPS: 3 76 | DATA_LOADER: 77 | NUM_WORKERS: 4 78 | PIN_MEMORY: True 79 | NUM_GPUS: 8 80 | NUM_SHARDS: 4 81 | RNG_SEED: 0 82 | OUTPUT_DIR: . 83 | TENSORBOARD: 84 | ENABLE: True 85 | -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/visual/motionformer_src/nystrom_helper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | from einops import rearrange, repeat 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as Fn 8 | import math 9 | 10 | 11 | def iterative_inv(mat, n_iter = 6, init_option="exact"): 12 | I = torch.eye(mat.size(-2), device = mat.device) 13 | K = mat 14 | 15 | if init_option == "original": 16 | # This original implementation is more conservative to compute coefficient of Z_0. 17 | V = 1. / torch.max(torch.sum(K, dim = -2)) * K.transpose(-1, -2) 18 | elif init_option == "arbitrary_input": 19 | # sum = 1 for softmax input but not for exp 20 | a1 = torch.max(torch.sum(torch.abs(K), dim = -2, keepdim=True), dim=-1, keepdim=True).values 21 | a2 = torch.max(torch.sum(torch.abs(K), dim = -1, keepdim=True), dim=-2, keepdim=True).values 22 | V = 1. 
/ (a1 * a2) * K.transpose(-1, -2) 23 | else: # The entries of K are positive and ||K||_{\infty} = 1 due to softmax 24 | # This is the exact coefficient computation, 25 | # 1 / ||K||_1, of initialization of Z_0, leading to faster convergence. 26 | V = 1. / torch.max( 27 | torch.sum(K, dim = -2), dim = -1).values.unsqueeze(-1).unsqueeze(-1) * K.transpose(-1, -2) 28 | 29 | for _ in range(n_iter): 30 | KV = torch.matmul(K, V) 31 | V = torch.matmul(0.25 * V, 13 * I - torch.matmul(KV, 15 * I - torch.matmul(KV, 7 * I - KV))) 32 | return V 33 | 34 | 35 | def nystrom_spatial_attn( 36 | q, k, v, landmarks=64, num_frames=None, inv_iters=6, 37 | use_full_matrix=False, use_spatial_landmarks=False, return_attn=False 38 | ): 39 | 40 | """ 41 | Compute full space-time attention but only softmax over spatial dimension 42 | """ 43 | B, N, D = k.shape 44 | F = num_frames 45 | scale = D ** -0.5 46 | q = q * scale 47 | if use_full_matrix: 48 | queries_landmarks = q.clone() 49 | keys_landmarks = k.clone() 50 | else: 51 | segs = N // landmarks 52 | with torch.no_grad(): 53 | if use_spatial_landmarks: 54 | # transpose spatial and temporal dimensions 55 | q2 = rearrange(q, 'b (f p) d -> b (p f) d', f=F) 56 | k2 = rearrange(k, 'b (f p) d -> b (p f) d', f=F) 57 | if (N % landmarks == 0): 58 | keys_landmarks = k2.reshape(B, landmarks, N // landmarks, D).mean(dim = -2) 59 | queries_landmarks = q2.reshape(B, landmarks, N // landmarks, D).mean(dim = -2) 60 | else: 61 | num_k = (segs + 1) * landmarks - N 62 | keys_landmarks_f = k2[:, :num_k * segs, :].reshape( 63 | B, num_k, segs, D).mean(dim = -2) 64 | keys_landmarks_l = k2[:, num_k * segs:, :].reshape( 65 | B, landmarks - num_k, segs + 1, D).mean(dim = -2) 66 | keys_landmarks = torch.cat((keys_landmarks_f, keys_landmarks_l), dim = -2) 67 | 68 | queries_landmarks_f = q2[:, :num_k * segs, :].reshape( 69 | B, num_k, segs, D).mean(dim = -2) 70 | queries_landmarks_l = q2[:, num_k * segs:, :].reshape( 71 | B, landmarks - num_k, segs + 1, D).mean(dim = -2) 72 | queries_landmarks = torch.cat((queries_landmarks_f, queries_landmarks_l), dim = -2) 73 | else: 74 | if (N % landmarks == 0): 75 | keys_landmarks = k.reshape( 76 | B, landmarks, N // landmarks, D).mean(dim = -2) 77 | queries_landmarks = q.reshape( 78 | B, landmarks, N // landmarks, D).mean(dim = -2) 79 | else: 80 | num_k = (segs + 1) * landmarks - N 81 | keys_landmarks_f = k[:, :num_k * segs, :].reshape( 82 | B, num_k, segs, D).mean(dim = -2) 83 | keys_landmarks_l = k[:, num_k * segs:, :].reshape( 84 | B, landmarks - num_k, segs + 1, D).mean(dim = -2) 85 | keys_landmarks = torch.cat((keys_landmarks_f, keys_landmarks_l), dim = -2) 86 | 87 | queries_landmarks_f = q[:, :num_k * segs, :].reshape( 88 | B, num_k, segs, D).mean(dim = -2) 89 | queries_landmarks_l = q[:, num_k * segs:, :].reshape( 90 | B, landmarks - num_k, segs + 1, D).mean(dim = -2) 91 | queries_landmarks = torch.cat((queries_landmarks_f, queries_landmarks_l), dim = -2) 92 | 93 | kernel_1 = Fn.softmax( 94 | torch.matmul(q, keys_landmarks.transpose(-1, -2)), dim = -1) 95 | kernel_2 = Fn.softmax( 96 | torch.matmul(queries_landmarks, keys_landmarks.transpose(-1, -2)), dim = -1) 97 | kernel_3 = Fn.softmax( 98 | rearrange(torch.matmul( 99 | queries_landmarks, k.transpose(-1, -2)), 'b l (f p) -> b l f p', f=F), dim = -1) 100 | attn = torch.matmul(kernel_1, iterative_inv(kernel_2, n_iter=inv_iters)) 101 | 102 | v = rearrange(v, 'b (f p) d -> b f p d', f=F) 103 | x = torch.einsum( 104 | 'b n l, b l f d -> b n f d', 105 | attn, torch.einsum('b l f p, b f p d -> b l f 
d', kernel_3, v) 106 | ) 107 | 108 | if return_attn: 109 | attn = torch.einsum('b m l, b l f p -> b m f p', attn, kernel_3) 110 | return x, attn 111 | 112 | return x -------------------------------------------------------------------------------- /encoder/model/modules/feat_extractors/visual/motionformer_src/orthoformer_helper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | from einops import rearrange, repeat 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as Fn 8 | import math 9 | 10 | 11 | def orthogonal_landmarks(q, k, num_landmarks=64, subsample_fraction=1.0): 12 | """ 13 | Construct set of landmarks by recursively selecting new landmarks 14 | that are maximally orthogonal to the existing set. 15 | Returns near orthogonal landmarks with shape (B, M, D). 16 | """ 17 | if subsample_fraction < 1.0: 18 | # Need at least M/2 samples of queries and keys 19 | num_samples = max(int(subsample_fraction * q.size(-2)), num_landmarks) 20 | q_unnormalised = q[:, torch.randint(q.size(-2), (num_samples,), device=q.device), :] # (B, N, D) 21 | else: 22 | # (B, N, D) 23 | q_unnormalised = q 24 | 25 | # may need to change default eps to eps=1e-8 for mixed precision compatibility 26 | qk = Fn.normalize(q_unnormalised, p=2, dim=-1) 27 | B, N, D = qk.shape 28 | 29 | selected_mask = torch.zeros((B, N, 1), device=qk.device) 30 | landmark_mask = torch.ones((B, 1, 1), dtype=selected_mask.dtype, device=qk.device) 31 | 32 | # Get initial random landmark 33 | random_idx = torch.randint(qk.size(-2), (B, 1, 1), device=qk.device) 34 | selected_landmark = qk[torch.arange(qk.size(0)), random_idx.view(-1), :].view(B, D) 35 | selected_mask.scatter_(-2, random_idx, landmark_mask) 36 | 37 | # Selected landmarks 38 | selected_landmarks = torch.empty((B, num_landmarks, D), device=qk.device, dtype=qk.dtype) 39 | selected_landmarks[:, 0, :] = selected_landmark 40 | 41 | # Store computed cosine similarities 42 | cos_sims = torch.empty((B, N, num_landmarks), device=qk.device, dtype=qk.dtype) 43 | 44 | for M in range(1, num_landmarks): 45 | # Calculate absolute cosine similarity between selected and unselected landmarks 46 | # (B, N, D) * (B, D) -> (B, N) 47 | cos_sim = torch.einsum('b n d, b d -> b n', qk, selected_landmark).abs() 48 | cos_sims[:, :, M - 1] = cos_sim 49 | # (B, N, M) cosine similarities of current set of landmarks wrt all queries and keys 50 | cos_sim_set = cos_sims[:, :, :M] 51 | 52 | # Get orthogonal landmark: landmark with smallest absolute cosine similarity: 53 | # set cosine similarity for already selected landmarks to > 1 54 | cos_sim_set.view(-1, M)[selected_mask.flatten().bool(), :] = 10 55 | # (B,) - want max for non 56 | selected_landmark_idx = cos_sim_set.amax(-1).argmin(-1) 57 | selected_landmark = qk[torch.arange(qk.size(0)), selected_landmark_idx, :].view(B, D) 58 | 59 | # Add most orthogonal landmark to selected landmarks: 60 | selected_landmarks[:, M, :] = selected_landmark 61 | 62 | # Removed selected indices from non-selected mask: 63 | selected_mask.scatter_(-2, selected_landmark_idx.unsqueeze(-1).unsqueeze(-1), landmark_mask) 64 | landmarks = torch.masked_select( 65 | q_unnormalised, selected_mask.bool()).reshape(B, -1, D) # (B, M, D) 66 | return landmarks # (B, M, D) 67 | 68 | 69 | def orthoformer( 70 | q, k, v, num_landmarks=64, subsample_fraction=1.0, 71 | num_frames=None, shared_landmarks=True, return_attn=False 72 | 
): 73 | """ 74 | Computes spatial attention for all pairs of frames. 75 | The attention matrix is approximated using 76 | intermediate landmarks taken from the queries and keys. 77 | The landmarks can be unique (to each frame) or 78 | shared (a common set of landmarks across frames). 79 | """ 80 | B, N, D = k.shape 81 | F = num_frames 82 | L = num_landmarks 83 | P = N // F 84 | 85 | scale = D ** -0.25 86 | q = q * scale 87 | k = k * scale 88 | 89 | if shared_landmarks: 90 | with torch.no_grad(): 91 | landmarks = orthogonal_landmarks(q, k, num_landmarks, subsample_fraction) 92 | kernel_1 = Fn.softmax(torch.matmul(q, landmarks.transpose(-1, -2)), dim=-1) 93 | kernel_2 = Fn.softmax( 94 | rearrange(torch.matmul( 95 | landmarks, k.transpose(-1, -2)), 'b l (f p) -> b l f p', f=F), dim=-1) 96 | v = rearrange(v, 'b (f p) d -> b f p d', f=F) 97 | x = torch.einsum('b l f p, b f p d -> b l f d', kernel_2, v) 98 | x = torch.einsum('b n l, b l f d -> b n f d', kernel_1, x) 99 | if return_attn: 100 | attn = torch.einsum('b m l, b l f p -> b m f p', kernel_1, kernel_2) 101 | return x, attn 102 | else: 103 | q = rearrange(q, 'b (f p) d -> (b f) p d', f=F) 104 | k = rearrange(k, 'b (g q) d -> (b g) q d', g=F) 105 | with torch.no_grad(): 106 | landmarks = orthogonal_landmarks(q, k, num_landmarks, subsample_fraction) 107 | landmarks = rearrange(landmarks, '(b f) l d -> b f l d', f=F) 108 | q = rearrange(q, '(b f) p d -> b f 1 p d', f=F) 109 | k = rearrange(k, '(b g) q d -> b 1 g q d', g=F) 110 | v = rearrange(v, 'b (g q) d -> b 1 g q d', g=F) 111 | kernel_1 = Fn.softmax( 112 | torch.matmul(q, landmarks.unsqueeze(-4).transpose(-1, -2)), dim=-1) 113 | kernel_2 = Fn.softmax( 114 | torch.matmul(landmarks.unsqueeze(-3), k.transpose(-1, -2)), dim=-1) 115 | x = torch.matmul(kernel_1, torch.matmul(kernel_2, v)) 116 | x = rearrange(x, 'b f g p d -> b (f p) g d') 117 | if return_attn: 118 | attn = torch.matmul(kernel_1, kernel_2) 119 | attn = rearrange(attn, 'b f g p q -> b (f p) g q') 120 | return x, attn 121 | 122 | return x -------------------------------------------------------------------------------- /encoder/phi.py: -------------------------------------------------------------------------------- 1 | # ReWaS 2 | # Copyright (c) 2024-present NAVER Cloud Corp. 
3 | # CC BY-NC-SA 4.0 (https://creativecommons.org/licenses/by-nc-sa/4.0/) 4 | 5 | import torch.nn as nn 6 | from einops.layers.torch import Rearrange, Reduce 7 | import einops 8 | from functools import partial 9 | from encoder.transformer import MultiheadAttention, SimpleTransformer 10 | import torch 11 | 12 | class Phi(nn.Module): 13 | def __init__(self, input_dim=768, out_dim=1, proj_dims=[768, 128, 64, 16, 1]): 14 | super().__init__() 15 | 16 | self.projection1 = nn.Sequential( 17 | nn.Linear(input_dim, input_dim), 18 | ) 19 | 20 | self.hint_blocks = SimpleTransformer( 21 | attn_target = partial( 22 | MultiheadAttention, 23 | embed_dim=input_dim, 24 | num_heads=8, 25 | bias=True, 26 | add_bias_kv=True, 27 | ), 28 | embed_dim = input_dim, 29 | num_blocks = 3, 30 | weight_init_style = "pytorch", # possible values jax or pytorch 31 | ) 32 | 33 | self.projection2 = nn.Sequential( 34 | nn.Linear(768,128), 35 | nn.ReLU(), 36 | nn.Linear(128,64), 37 | nn.ReLU(), 38 | nn.Linear(64,16), 39 | nn.ReLU(), 40 | nn.Linear(16,1), 41 | ) 42 | 43 | def forward(self, x): 44 | x = self.projection1(x) 45 | x = self.hint_blocks(x) 46 | x = self.projection2(x) 47 | return x -------------------------------------------------------------------------------- /eval_MAE.py: -------------------------------------------------------------------------------- 1 | # ReWaS 2 | # Copyright (c) 2024-present NAVER Cloud Corp. 3 | # CC BY-NC-SA 4.0 (https://creativecommons.org/licenses/by-nc-sa/4.0/) 4 | 5 | import argparse 6 | import time 7 | import numpy as np 8 | import einops 9 | import os 10 | import torch 11 | from torch.utils.data import DataLoader 12 | from audioldm.utilities.data.dataset import AudioDataset 13 | import yaml 14 | from tqdm import tqdm 15 | 16 | 17 | def get_audio(audio): 18 | audio = torch.mean(audio, axis=1) 19 | return audio 20 | 21 | def collate_fn(batch): 22 | batch = list(filter(lambda x: x is not None, batch)) 23 | return torch.utils.data.dataloader.default_collate(batch) 24 | 25 | def filter_common_keys(dict_a, dict_b): 26 | # Find the common keys between dict_a and dict_b 27 | common_keys = dict_a.keys() & dict_b.keys() 28 | 29 | # Create new dictionaries with only the common keys 30 | filtered_dict_a = {key: dict_a[key] for key in common_keys} 31 | filtered_dict_b = {key: dict_b[key] for key in common_keys} 32 | sorted_dict_a = dict(sorted(filtered_dict_a.items())) 33 | sorted_dict_b = dict(sorted(filtered_dict_b.items())) 34 | return sorted_dict_a, sorted_dict_b 35 | 36 | 37 | 38 | def main(args): 39 | batch_size = args.batch_size 40 | configs = yaml.load(open(args.config, "r"), Loader=yaml.FullLoader) 41 | dataloader_add_ons = configs["data"]["dataloader_add_ons"] 42 | 43 | generated_dataset = AudioDataset(configs, split="video_control", add_ons=dataloader_add_ons) 44 | generated_dataloader = DataLoader(generated_dataset, num_workers=8, batch_size=batch_size, shuffle=True,drop_last =True) 45 | 46 | gtdataset = AudioDataset(configs, split="gt", add_ons=dataloader_add_ons) 47 | gt_dataloader = DataLoader(gtdataset, num_workers=args.num_workers, batch_size=batch_size, shuffle=True,drop_last =True) 48 | 49 | gt_energy = {} 50 | test_energy = {} 51 | 52 | for idx, item in tqdm(enumerate(gt_dataloader)): 53 | name = str(item['fname'][0]).split("/")[-1] 54 | gt_mel = item['log_mel_spec'] 55 | energy = torch.mean(gt_mel, dim=2) 56 | gt_energy[f'{name}'] = energy 57 | 58 | for idx, item in tqdm(enumerate(generated_dataloader)): 59 | name = str(item['fname'][0]).split("/")[-1] 60 | pred_mel = 
item['log_mel_spec'] 61 | energy = torch.mean(pred_mel, dim=2) 62 | test_energy[f'{name}'] = energy 63 | 64 | gt_energy, test_energy = filter_common_keys(gt_energy, test_energy) 65 | print(gt_energy.keys()) 66 | print(test_energy.keys()) 67 | gt_energy = torch.cat(list(gt_energy.values()),dim=0) 68 | test_energy = torch.cat(list(test_energy.values()),dim=0) 69 | loss = torch.nn.L1Loss() 70 | MAE = loss(gt_energy, test_energy) 71 | print(f"###### number of samples: {gt_energy.shape[0]}") 72 | print(f"###### MAE: {MAE}") 73 | 74 | 75 | if __name__ == '__main__': 76 | parser = argparse.ArgumentParser() 77 | parser.add_argument('--device', default='cuda', type=str) 78 | parser.add_argument('--config', default='configs/audioldm_m_rewas.yaml', type=str) 79 | parser.add_argument('--batch_size', default=128, type=int) 80 | parser.add_argument('--save_path', default="outputs", type=str) 81 | parser.add_argument('--num_workers', default=16, type=int) 82 | args = parser.parse_args() 83 | main(args) 84 | -------------------------------------------------------------------------------- /evaluation/clap/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE -------------------------------------------------------------------------------- /evaluation/clap/__init__.py: -------------------------------------------------------------------------------- 1 | from . import clap 2 | from . import audio 3 | from .
import utils -------------------------------------------------------------------------------- /evaluation/clap/clap.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn.functional as F 4 | from torch import nn 5 | from transformers import AutoModel 6 | from .audio import get_audio_encoder 7 | 8 | class Projection(nn.Module): 9 | def __init__(self, d_in: int, d_out: int, p: float=0.5) -> None: 10 | super().__init__() 11 | self.linear1 = nn.Linear(d_in, d_out, bias=False) 12 | self.linear2 = nn.Linear(d_out, d_out, bias=False) 13 | self.layer_norm = nn.LayerNorm(d_out) 14 | self.drop = nn.Dropout(p) 15 | 16 | def forward(self, x: torch.Tensor) -> torch.Tensor: 17 | embed1 = self.linear1(x) 18 | embed2 = self.drop(self.linear2(F.gelu(embed1))) 19 | embeds = self.layer_norm(embed1 + embed2) 20 | return embeds 21 | 22 | class AudioEncoder(nn.Module): 23 | def __init__(self, audioenc_name:str, d_in: int, d_out: int, sample_rate: int, window_size: int, 24 | hop_size: int, mel_bins: int, fmin: int, fmax: int, classes_num: int) -> None: 25 | super().__init__() 26 | 27 | audio_encoder = get_audio_encoder(audioenc_name) 28 | 29 | self.base = audio_encoder( 30 | sample_rate, window_size, 31 | hop_size, mel_bins, fmin, fmax, 32 | classes_num, d_in) 33 | 34 | self.projection = Projection(d_in, d_out) 35 | 36 | def forward(self, x): 37 | out_dict = self.base(x) 38 | audio_features, audio_classification_output = out_dict['embedding'], out_dict['clipwise_output'] 39 | projected_vec = self.projection(audio_features) 40 | return projected_vec, audio_classification_output 41 | 42 | class TextEncoder(nn.Module): 43 | def __init__(self, d_out: int, text_model: str, transformer_embed_dim: int) -> None: 44 | super().__init__() 45 | self.base = AutoModel.from_pretrained(text_model) 46 | 47 | self.projection = Projection(transformer_embed_dim, d_out) 48 | 49 | def forward(self, x): 50 | out = self.base(**x)[0] 51 | out = out[:, 0, :] # get CLS token output 52 | projected_vec = self.projection(out) 53 | return projected_vec 54 | 55 | class CLAP(nn.Module): 56 | def __init__(self, 57 | # audio 58 | audioenc_name: str, 59 | sample_rate: int, 60 | window_size: int, 61 | hop_size: int, 62 | mel_bins: int, 63 | fmin: int, 64 | fmax: int, 65 | classes_num: int, 66 | out_emb: int, 67 | # text 68 | text_model: str, 69 | transformer_embed_dim: int, 70 | # common 71 | d_proj: int, 72 | ): 73 | super().__init__() 74 | 75 | 76 | self.audio_encoder = AudioEncoder( 77 | audioenc_name, out_emb, d_proj, 78 | sample_rate, window_size, hop_size, mel_bins, fmin, fmax, classes_num) 79 | 80 | self.caption_encoder = TextEncoder( 81 | d_proj, text_model, transformer_embed_dim 82 | ) 83 | 84 | self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07)) 85 | 86 | def forward(self, audio, text): 87 | audio_embed, _ = self.audio_encoder(audio) 88 | caption_embed = self.caption_encoder(text) 89 | 90 | return caption_embed, audio_embed, self.logit_scale.exp() -------------------------------------------------------------------------------- /evaluation/clap/clap_config.yml: -------------------------------------------------------------------------------- 1 | # TEXT ENCODER CONFIG 2 | text_model: 'bert-base-uncased' 3 | text_len: 100 4 | transformer_embed_dim: 768 5 | freeze_text_encoder_weights: True 6 | 7 | # AUDIO ENCODER CONFIG 8 | audioenc_name: 'Cnn14' 9 | out_emb: 2048 10 | sampling_rate: 44100 11 | duration: 9 12 | fmin: 50 13 | fmax: 14000 14 
| n_fft: 1028 15 | hop_size: 320 16 | mel_bins: 64 17 | window_size: 1024 18 | 19 | # PROJECTION SPACE CONFIG 20 | d_proj: 1024 21 | temperature: 0.003 22 | 23 | # TRAINING AND EVALUATION CONFIG 24 | num_classes: 527 25 | batch_size: 1024 26 | demo: False -------------------------------------------------------------------------------- /evaluation/clap/utils.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import yaml 3 | import sys 4 | 5 | def read_config_as_args(config_path,args=None,is_config_str=False): 6 | return_dict = {} 7 | 8 | if config_path is not None: 9 | if is_config_str: 10 | yml_config = yaml.load(config_path, Loader=yaml.FullLoader) 11 | else: 12 | with open(config_path, "r") as f: 13 | yml_config = yaml.load(f, Loader=yaml.FullLoader) 14 | 15 | if args != None: 16 | for k, v in yml_config.items(): 17 | if k in args.__dict__: 18 | args.__dict__[k] = v 19 | else: 20 | sys.stderr.write("Ignored unknown parameter {} in yaml.\n".format(k)) 21 | else: 22 | for k, v in yml_config.items(): 23 | return_dict[k] = v 24 | 25 | args = args if args != None else return_dict 26 | return argparse.Namespace(**args) 27 | -------------------------------------------------------------------------------- /evaluation/clap_score.py: -------------------------------------------------------------------------------- 1 | # reference: https://github.com/Text-to-Audio/Make-An-Audio/blob/main/wav_evaluation/cal_clap_score.py 2 | 3 | import pathlib 4 | import sys 5 | import os 6 | directory = pathlib.Path(os.getcwd()) 7 | sys.path.append(str(directory)) 8 | import torch 9 | import numpy as np 10 | from clap.CLAPWrapper import CLAPWrapper 11 | import argparse 12 | from tqdm import tqdm 13 | import pandas as pd 14 | import json 15 | 16 | def add_audio_path(df): 17 | df['audio_path'] = df.apply(lambda x:x['mel_path'].replace('.npy','.wav'),axis=1) 18 | return df 19 | 20 | def build_tsv_from_wavs(root_dir, dataset): 21 | 22 | wavfiles = os.listdir(root_dir) 23 | # wavfiles = list(filter(lambda x:x.endswith('.wav') and x[-6:-4]!='gt',wavfiles)) 24 | print(f'###### number of samples: {len(wavfiles)}') 25 | 26 | dict_list = [] 27 | for wavfile in wavfiles: 28 | tmpd = {'audio_path':os.path.join(root_dir, wavfile)} 29 | if dataset == 'vggsound': 30 | caption = ' '.join(wavfile.split('_')[:-1]) 31 | tmpd['caption'] = caption 32 | dict_list.append(tmpd) 33 | 34 | df = pd.DataFrame.from_dict(dict_list) 35 | tsv_path = f'{os.path.basename(root_dir)}.tsv' 36 | tsv_path = os.path.join('./tmp/', tsv_path) 37 | df.to_csv(tsv_path, sep='\t', index=False) 38 | 39 | return tsv_path 40 | 41 | def cal_score_by_tsv(tsv_path, clap_model, cutoff=5): 42 | df = pd.read_csv(tsv_path, sep='\t') 43 | clap_scores = [] 44 | if not ('audio_path' in df.columns): 45 | df = add_audio_path(df) 46 | caption_list,audio_list = [],[] 47 | with torch.no_grad(): 48 | for idx,t in enumerate(tqdm(df.itertuples()), start=1): 49 | caption_list.append(getattr(t,'caption')) 50 | audio_list.append(getattr(t,'audio_path')) 51 | if idx % 20 == 0: 52 | text_embeddings = clap_model.get_text_embeddings(caption_list) 53 | audio_embeddings = clap_model.get_audio_embeddings(audio_list, resample=True, cutoff=5) 54 | score_mat = clap_model.compute_similarity(audio_embeddings, text_embeddings,use_logit_scale=False) 55 | score = score_mat.diagonal() 56 | clap_scores.append(score.cpu().numpy()) 57 | audio_list = [] 58 | caption_list = [] 59 | return np.mean(np.array(clap_scores).flatten()) 60 | 61 | def 
add_clap_score_to_tsv(tsv_path, clap_model): 62 | df = pd.read_csv(tsv_path,sep='\t') 63 | clap_scores_dict = {} 64 | with torch.no_grad(): 65 | for idx,t in enumerate(tqdm(df.itertuples()),start=1): 66 | text_embeddings = clap_model.get_text_embeddings([getattr(t,'caption')]) # embeddings are already normalized 67 | audio_embeddings = clap_model.get_audio_embeddings([getattr(t,'audio_path')], resample=True) 68 | score = clap_model.compute_similarity(audio_embeddings, text_embeddings,use_logit_scale=False) 69 | clap_scores_dict[idx] = score.cpu().numpy() 70 | df['clap_score'] = clap_scores_dict 71 | df.to_csv(tsv_path[:-4]+'_clap.tsv',sep='\t',index=False) 72 | 73 | 74 | if __name__ == '__main__': 75 | parser = argparse.ArgumentParser() 76 | parser.add_argument('--dataset', type=str, default='vggsound') 77 | parser.add_argument('--tsv_path', type=str, default='') 78 | parser.add_argument('--wav_dir', type=str) 79 | parser.add_argument('--mean', type=bool, default=True) 80 | parser.add_argument('--ckpt_path', default="clap") 81 | args = parser.parse_args() 82 | 83 | if args.tsv_path: 84 | tsv_path = args.tsv_path 85 | else: 86 | tsv_path = os.path.join('./tmp/', f'{os.path.basename(args.wav_dir)}.tsv') 87 | 88 | if not os.path.exists(tsv_path): 89 | print("Result tsv does not exist, building it") 90 | tsv_path = build_tsv_from_wavs(args.wav_dir, args.dataset) 91 | 92 | clap_model = CLAPWrapper( 93 | os.path.join(args.ckpt_path, 'CLAP_weights_2022.pth'), 94 | os.path.join(args.ckpt_path, 'clap_config.yml'), 95 | use_cuda=True) 96 | 97 | clap_score = cal_score_by_tsv(tsv_path, clap_model, cutoff=5) 98 | out = args.wav_dir if args.wav_dir else args.tsv_path 99 | 100 | print(f"Clap score for {out} is: {clap_score}") -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pytorch-lightning 2 | scipy 3 | share==1.0.4 4 | taming-transformers==0.0.1 5 | torch 6 | torchaudio 7 | torchlibrosa 8 | torchmetrics 9 | tqdm 10 | transformers 11 | omegaconf 12 | h5py 13 | braceexpand 14 | webdataset 15 | progressbar 16 | timm 17 | moviepy 18 | wget 19 | numpy -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | # ReWaS 2 | # Copyright (c) 2024-present NAVER Cloud Corp. 3 | # CC BY-NC-SA 4.0 (https://creativecommons.org/licenses/by-nc-sa/4.0/) 4 | 5 | import os 6 | 7 | from audioldm.pipeline import rewas_generation, build_control_model 8 | 9 | import json 10 | import argparse 11 | import pandas as pd 12 | from random import shuffle 13 | from omegaconf import OmegaConf 14 | from tqdm import tqdm 15 | 16 | import warnings 17 | warnings.filterwarnings("ignore") 18 | 19 | import torch 20 | import torch.nn as nn 21 | from torch.utils.data.distributed import DistributedSampler 22 | 23 | from utils import seed_everything 24 | from encoder.encoder_utils import patch_config, get_pretrained 25 | from encoder.phi import Phi 26 | 27 | 28 | def main(args): 29 | 30 | seed = args.seed 31 | seed_everything(seed) 32 | 33 | assert os.path.isfile(args.ckpt_path), "check checkpoints in ckpt_path!"
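    # The calls below set up the ReWaS inference stack: build_control_model loads the
    # ControlNet-style AudioLDM from the ReWaS checkpoint, the Synchformer config
    # (configs/cfg-<exp>.yaml) and pretrained weights (get_pretrained) provide the video
    # feature extractor, and Phi (encoder/phi.py) projects the video features (768-d by
    # default) to the 1-d energy control signal that rewas_generation consumes alongside
    # the text prompt.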
34 | 35 | control_type = args.control_type 36 | save_path = args.save_path 37 | os.makedirs(save_path, exist_ok=True) 38 | 39 | audioldm_control = build_control_model( 40 | ckpt_path = args.ckpt_path, 41 | control_type = args.control_type, 42 | config = args.config, 43 | model_name = args.model_name) 44 | 45 | cfg_path = f'./configs/cfg-{args.synchformer_exp}.yaml' 46 | synchformer_cfg = OmegaConf.load(cfg_path) 47 | synchformer_cfg = patch_config(synchformer_cfg) 48 | 49 | 50 | video_encoder = get_pretrained(args.synchformer_exp, 0) 51 | 52 | phi = Phi() 53 | resume_params = torch.load(args.phi_ckpt_path) 54 | resume_new = {k.replace("module.",""): v for k, v in resume_params.items()} 55 | phi.load_state_dict(resume_new) 56 | 57 | phi.eval() 58 | phi = nn.DataParallel(phi, device_ids=[i for i in range(torch.cuda.device_count())]) 59 | 60 | print(f'Generate data list: {args.testlist}') 61 | 62 | with open(args.testlist, 'rb') as f: 63 | datalist = list(map(json.loads, f)) 64 | 65 | 66 | for x in tqdm(datalist): 67 | prompt = x['prompt'] 68 | videopath = x['video_name'] 69 | 70 | waveform = rewas_generation( 71 | audioldm_control, 72 | prompt, 73 | videopath, 74 | args.control_type, 75 | args.synchformer_exp, 76 | synchformer_cfg, 77 | video_encoder, 78 | phi, 79 | args.file_path, 80 | seed, 81 | duration=args.duration, 82 | guidance_scale=args.guidance_scale, 83 | ddim_steps=args.ddim_steps, 84 | n_candidate_gen_per_text=args.n_candidate_gen_per_cond, 85 | batchsize=args.batchsize, 86 | save_path=save_path, 87 | re_encode=args.re_encode, 88 | local_rank=0 89 | ) 90 | 91 | if args.re_encode: 92 | os.rmdir('.cache/') 93 | 94 | if __name__ == '__main__': 95 | 96 | parser = argparse.ArgumentParser() 97 | 98 | 99 | parser.add_argument( 100 | "--testlist", 101 | type=str, 102 | default="test_samples.json", 103 | ) 104 | 105 | parser.add_argument( 106 | "--datadir", 107 | type=str, 108 | default="/path/to/video", 109 | ) 110 | 111 | parser.add_argument( 112 | "-f", 113 | "--file_path", 114 | type=str, 115 | default=None, 116 | ) 117 | 118 | parser.add_argument( 119 | "-s", 120 | "--save_path", 121 | type=str, 122 | help="The path to save model output", 123 | default="./results", 124 | ) 125 | 126 | parser.add_argument( 127 | "--model_name", 128 | type=str, 129 | help="The checkpoint you are going to use", 130 | default="audioldm-m-full", 131 | ) 132 | 133 | parser.add_argument( 134 | "-ckpt", 135 | "--ckpt_path", 136 | type=str, 137 | help="The path to the pretrained .ckpt model", 138 | default="ckpts/audioldm_m_rewas_vggsound.ckpt", 139 | ) 140 | 141 | parser.add_argument( 142 | "--phi_ckpt_path", 143 | type=str, 144 | help="The path to the pretrained Phi projector .ckpt", 145 | default="ckpts/phi_vggsound.ckpt", 146 | ) 147 | 148 | parser.add_argument( 149 | "--synchformer_exp", 150 | type=str, 151 | help="The name of the Synchformer experiment", 152 | default="24-01-04T16-39-21", 153 | ) 154 | 155 | parser.add_argument( 156 | "-b", 157 | "--batchsize", 158 | type=int, 159 | default=1, 160 | help="How many samples to generate at the same time", 161 | ) 162 | 163 | parser.add_argument( 164 | "--ddim_steps", 165 | type=int, 166 | default=200, 167 | help="The number of DDIM sampling steps", 168 | ) 169 | 170 | parser.add_argument( 171 | "-gs", 172 | "--guidance_scale", 173 | type=float, 174 | default=3, 175 | help="Guidance scale (Large => better quality and relevance to text; Small => better diversity)", 176 | ) 177 | 178 | parser.add_argument( 179 | "-dur", 180 | "--duration", 181 | type=float, 182 | default=5.0,
183 | help="The duration of the samples", 184 | ) 185 | 186 | parser.add_argument( 187 | "-n", 188 | "--n_candidate_gen_per_cond", 189 | type=int, 190 | default=1, 191 | help="The number of generated samples per condition. A larger value usually leads to better quality with heavier computation", 192 | ) 193 | 194 | parser.add_argument( 195 | "--seed", 196 | type=int, 197 | default=42, 198 | help="Changing this value (any integer) will lead to a different generation result.", 199 | ) 200 | 201 | parser.add_argument( 202 | "--config", 203 | type=str, 204 | default="configs/audioldm_m_rewas.yaml", 205 | ) 206 | 207 | 208 | parser.add_argument( 209 | "--control_type", 210 | type=str, 211 | default="energy_video", 212 | choices=["energy_audio", "energy_video"] 213 | ) 214 | 215 | parser.add_argument('--re_encode', action='store_true') 216 | 217 | 218 | args = parser.parse_args() 219 | 220 | main(args) 221 | 222 | 223 | -------------------------------------------------------------------------------- /test_samples.json: -------------------------------------------------------------------------------- 1 | {"video_name": "./basketball_bounce.mp4", "prompt": "basketball bounce"} 2 | -------------------------------------------------------------------------------- /tool_add_adapter.py: -------------------------------------------------------------------------------- 1 | # reference: https://github.com/lllyasviel/ControlNet/blob/main/tool_add_control.py 2 | 3 | import sys 4 | import os 5 | import argparse 6 | import torch 7 | from omegaconf import OmegaConf 8 | from encoder.encoder_utils import instantiate_from_config 9 | 10 | 11 | def get_node_name(name, parent_name): 12 | if len(name) <= len(parent_name): 13 | return False, '' 14 | p = name[:len(parent_name)] 15 | if p != parent_name: 16 | return False, '' 17 | return True, name[len(parent_name):] 18 | 19 | def main(): 20 | parser = argparse.ArgumentParser(description="Adding adapter architecture to audioldm.") 21 | parser.add_argument("--input_path", type=str, required=True, help="path to pretrained model weights") 22 | parser.add_argument("--output_path", type=str, required=True, help="path to save output model weights") 23 | parser.add_argument("--config_path", type=str, default="configs/audioldm_m_rewas.yaml") 24 | args = parser.parse_args() 25 | 26 | assert os.path.exists(args.input_path), 'Input model does not exist.' 27 | assert not os.path.exists(args.output_path), 'Output filename already exists.'
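    # The remainder of main() instantiates the ReWaS model from the YAML config and
    # initializes it from the pretrained AudioLDM weights: keys starting with
    # 'control_' are filled from the matching 'model.diffusion_' entries
    # (a ControlNet-style copy of the UNet weights), every other key is copied by
    # name, and keys absent from the checkpoint keep their scratch initialization.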
28 | 29 | config = OmegaConf.load(args.config_path) 30 | model = instantiate_from_config(config.model).cpu() 31 | print(f'Loaded model config from [{args.config_path}]') 32 | 33 | pretrained_weights = torch.load(args.input_path) 34 | if 'state_dict' in pretrained_weights: 35 | pretrained_weights = pretrained_weights['state_dict'] 36 | 37 | scratch_dict = model.state_dict() 38 | 39 | target_dict = {} 40 | for k in scratch_dict.keys(): 41 | is_control, name = get_node_name(k, 'control_') 42 | if is_control: 43 | copy_k = 'model.diffusion_' + name 44 | print(f'control add: {copy_k}') 45 | else: 46 | copy_k = k 47 | if copy_k in pretrained_weights: 48 | target_dict[k] = pretrained_weights[copy_k].clone() 49 | else: 50 | target_dict[k] = scratch_dict[k].clone() 51 | 52 | model.load_state_dict(target_dict, strict=True) 53 | torch.save(model.state_dict(), args.output_path) 54 | 55 | print(f'Model saved in {args.output_path}') 56 | 57 | if __name__ == "__main__": 58 | main() -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | # reference: https://github.com/haoheliu/AudioLDM/blob/main/audioldm/utils.py 2 | 3 | import subprocess 4 | import json 5 | import os 6 | import soundfile as sf 7 | 8 | 9 | import torch 10 | import torchvision 11 | import torchaudio 12 | 13 | def default_audioldm_config(model_name="audioldm-s-full"): 14 | basic_config = { 15 | "wave_file_save_path": "./output", 16 | "id": { 17 | "version": "v1", 18 | "name": "default", 19 | "root": "/mnt/fast/nobackup/users/hl01486/projects/general_audio_generation/AudioLDM-python/config/default/latent_diffusion.yaml", 20 | }, 21 | "preprocessing": { 22 | "audio": {"sampling_rate": 16000, "max_wav_value": 32768}, 23 | "stft": {"filter_length": 1024, "hop_length": 160, "win_length": 1024}, 24 | "mel": { 25 | "n_mel_channels": 64, 26 | "mel_fmin": 0, 27 | "mel_fmax": 8000, 28 | "freqm": 0, 29 | "timem": 0, 30 | "blur": False, 31 | "mean": -4.63, 32 | "std": 2.74, 33 | "target_length": 1024, 34 | }, 35 | }, 36 | "model": { 37 | "device": "cuda", 38 | "target": "audioldm.pipline.LatentDiffusion", 39 | "params": { 40 | "base_learning_rate": 5e-06, 41 | "linear_start": 0.0015, 42 | "linear_end": 0.0195, 43 | "num_timesteps_cond": 1, 44 | "log_every_t": 200, 45 | "timesteps": 1000, 46 | "first_stage_key": "fbank", 47 | "cond_stage_key": "waveform", 48 | "latent_t_size": 256, 49 | "latent_f_size": 16, 50 | "channels": 8, 51 | "cond_stage_trainable": True, 52 | "conditioning_key": "film", 53 | "monitor": "val/loss_simple_ema", 54 | "scale_by_std": True, 55 | "unet_config": { 56 | "target": "audioldm.latent_diffusion.openaimodel.UNetModel", 57 | "params": { 58 | "image_size": 64, 59 | "extra_film_condition_dim": 512, 60 | "extra_film_use_concat": True, 61 | "in_channels": 8, 62 | "out_channels": 8, 63 | "model_channels": 128, 64 | "attention_resolutions": [8, 4, 2], 65 | "num_res_blocks": 2, 66 | "channel_mult": [1, 2, 3, 5], 67 | "num_head_channels": 32, 68 | "use_spatial_transformer": True, 69 | }, 70 | }, 71 | "first_stage_config": { 72 | "base_learning_rate": 4.5e-05, 73 | "target": "audioldm.variational_autoencoder.autoencoder.AutoencoderKL", 74 | "params": { 75 | "monitor": "val/rec_loss", 76 | "image_key": "fbank", 77 | "subband": 1, 78 | "embed_dim": 8, 79 | "time_shuffle": 1, 80 | "ddconfig": { 81 | "double_z": True, 82 | "z_channels": 8, 83 | "resolution": 256, 84 | "downsample_time": False, 85 | "in_channels": 1, 86 | 
"out_ch": 1, 87 | "ch": 128, 88 | "ch_mult": [1, 2, 4], 89 | "num_res_blocks": 2, 90 | "attn_resolutions": [], 91 | "dropout": 0.0, 92 | }, 93 | }, 94 | }, 95 | "cond_stage_config": { 96 | "target": "audioldm.clap.encoders.CLAPAudioEmbeddingClassifierFreev2", 97 | "params": { 98 | "key": "waveform", 99 | "sampling_rate": 16000, 100 | "embed_mode": "audio", 101 | "unconditional_prob": 0.1, 102 | }, 103 | }, 104 | }, 105 | }, 106 | } 107 | 108 | if("-l-" in model_name): 109 | basic_config["model"]["params"]["unet_config"]["params"]["model_channels"] = 256 110 | basic_config["model"]["params"]["unet_config"]["params"]["num_head_channels"] = 64 111 | elif("-m-" in model_name): 112 | basic_config["model"]["params"]["unet_config"]["params"]["model_channels"] = 192 113 | basic_config["model"]["params"]["cond_stage_config"]["params"]["amodel"] = "HTSAT-base" # This model use a larger HTAST 114 | 115 | return basic_config 116 | 117 | 118 | def load_json(fname): 119 | with open(fname, "r") as f: 120 | data = json.load(f) 121 | return data 122 | 123 | 124 | def read_json(dataset_json_file): 125 | with open(dataset_json_file, "r") as fp: 126 | data_json = json.load(fp) 127 | return data_json["data"] 128 | 129 | 130 | def seed_everything(seed): 131 | import random, os 132 | import numpy as np 133 | import torch 134 | 135 | random.seed(seed) 136 | os.environ["PYTHONHASHSEED"] = str(seed) 137 | np.random.seed(seed) 138 | torch.manual_seed(seed) 139 | torch.cuda.manual_seed(seed) 140 | torch.backends.cudnn.deterministic = True 141 | torch.backends.cudnn.benchmark = True 142 | --------------------------------------------------------------------------------