├── README.md ├── assets ├── emotion_temp.wav ├── example.png ├── framework.png ├── librispeech_temp.wav ├── logo.png ├── loss_plot_outliers_replaced.pdf ├── question.wav └── temp.wav ├── cosyvoice ├── __init__.py ├── bin │ ├── inference.py │ └── train.py ├── cli │ ├── __init__.py │ ├── cosyvoice.py │ ├── frontend.py │ └── model.py ├── dataset │ ├── __init__.py │ ├── dataset.py │ └── processor.py ├── flow │ ├── decoder.py │ ├── flow.py │ ├── flow_gradtts.py │ ├── flow_matching.py │ ├── flow_matching_dit.py │ ├── length_regulator.py │ └── stable │ │ ├── adp.py │ │ ├── blocks.py │ │ ├── dit.py │ │ ├── dit_v2.py │ │ ├── sampling.py │ │ ├── stable_diffusion.py │ │ ├── stable_diffusion_test.py │ │ ├── transformer.py │ │ └── transformer_use_mask.py ├── hifigan │ ├── f0_predictor.py │ └── generator.py ├── llm │ └── llm.py ├── transformer │ ├── __init__.py │ ├── activation.py │ ├── attention.py │ ├── convolution.py │ ├── decoder.py │ ├── decoder_layer.py │ ├── embedding.py │ ├── encoder.py │ ├── encoder_layer.py │ ├── label_smoothing_loss.py │ ├── positionwise_feed_forward.py │ └── subsampling.py ├── utils │ ├── __init__.py │ ├── block_mask_util.py │ ├── class_utils.py │ ├── common.py │ ├── executor.py │ ├── file_utils.py │ ├── frontend_utils.py │ ├── mask.py │ ├── scheduler.py │ └── train_utils.py ├── vocab_16K.yaml └── vocab_6K.yaml ├── demo.py ├── inference.py ├── openomni ├── __init__.py ├── constants.py ├── conversation.py ├── eval │ ├── cvbench_eval.py │ ├── eval_gpt_review.py │ ├── eval_gpt_review_bench.py │ ├── eval_gpt_review_visual.py │ ├── eval_pope.py │ ├── eval_science_qa.py │ ├── eval_science_qa_gpt4.py │ ├── eval_science_qa_gpt4_requery.py │ ├── eval_textvqa.py │ ├── generate_webpage_data_from_table.py │ ├── m4c_evaluator.py │ ├── mm_vet_eval.py │ ├── mminst_eval.py │ ├── mmvp_eval.py │ ├── model_qa.py │ ├── model_vqa_blink.py │ ├── model_vqa_cvbench.py │ ├── model_vqa_gqa.py │ ├── model_vqa_loader.py │ ├── model_vqa_mia.py │ ├── model_vqa_mmbench.py │ ├── model_vqa_mminst.py │ ├── model_vqa_mminst2.py │ ├── model_vqa_mmvp.py │ ├── model_vqa_science.py │ ├── model_vqa_test.py │ ├── model_vqa_textvqa.py │ ├── model_vqa_vqa2.py │ ├── omni_eval.py │ ├── ov_odssey_eval.py │ ├── qa_baseline_gpt35.py │ ├── qwen2 │ │ ├── aishell2_eval.jsonl │ │ ├── asr_eval.py │ │ ├── et2s_eval.py │ │ ├── librispeech_eval.jsonl │ │ ├── omni_eval.py │ │ ├── openomni_emotion_val.json │ │ ├── ov_ossey_eval.py │ │ ├── t2s_eval.py │ │ └── wenetspeech_eval.json │ ├── run_llava.py │ └── summarize_gpt_review.py ├── flow_inference.py ├── mm_utils.py ├── model │ ├── __init__.py │ ├── apply_delta.py │ ├── builder.py │ ├── consolidate.py │ ├── language_model │ │ ├── llava_her_llama.py │ │ ├── llava_her_qwen.py │ │ ├── llava_llama.py │ │ ├── llava_mistral.py │ │ ├── llava_mpt.py │ │ └── llava_qwen.py │ ├── llava_arch.py │ ├── llava_her_arch.py │ ├── make_delta.py │ ├── multimodal_encoder │ │ ├── builder.py │ │ └── clip_encoder.py │ ├── multimodal_projector │ │ └── builder.py │ ├── speech_encoder │ │ ├── builder.py │ │ └── speech_encoder.py │ ├── speech_generator_ar │ │ ├── builder.py │ │ ├── generation.py │ │ └── speech_generator.py │ ├── speech_generator_ctc │ │ ├── builder.py │ │ ├── generation.py │ │ └── speech_generator.py │ ├── speech_projector │ │ ├── builder.py │ │ └── speech_projector.py │ ├── utils.py │ ├── visual_encoder │ │ ├── builder.py │ │ └── clip_encoder.py │ └── visual_projector │ │ └── builder.py ├── serve │ ├── __init__.py │ ├── cli.py │ ├── controller.py │ ├── examples │ │ ├── extreme_ironing.jpg │ 
│ └── waterview.jpg │ ├── gradio_web_server.py │ ├── model_worker.py │ ├── register_worker.py │ ├── sglang_worker.py │ └── test_message.py ├── train │ ├── llama_flash_attn_monkey_patch.py │ ├── llama_xformers_attn_monkey_patch.py │ ├── llava_trainer.py │ ├── train.py │ ├── train_mem.py │ └── train_xformers.py └── utils.py ├── pyproject.toml ├── requirements.txt ├── scripts ├── clear.sh ├── convert_gqa_for_eval.py ├── convert_mmbench_for_submission.py ├── convert_mmvet_for_eval.py ├── convert_seed_for_submission.py ├── convert_sqa_to_llava.py ├── convert_sqa_to_llava_base_prompt.py ├── convert_vizwiz_for_submission.py ├── convert_vqav2_for_submission.py ├── mmevol │ ├── eval │ │ ├── gqa.sh │ │ ├── mmbench.sh │ │ ├── mmbench_cn.sh │ │ ├── mme.sh │ │ ├── seed.sh │ │ ├── sqa.sh │ │ └── textvqa.sh │ └── train │ │ ├── llama3 │ │ ├── finetune.sh │ │ └── pretrain.sh │ │ └── qwen2 │ │ ├── finetune.sh │ │ └── pretrain.sh ├── train │ ├── llama3 │ │ ├── asr_finetune.sh │ │ ├── image2text_finetune.sh │ │ ├── image2text_pretrain.sh │ │ ├── speech2text_pretrain.sh │ │ ├── text2speech_dpo.sh │ │ ├── text2speech_pretrain.sh │ │ └── text2speech_pretrain_ctc.sh │ └── qwen2 │ │ ├── asr_finetune.sh │ │ ├── image2text_finetune.sh │ │ ├── image2text_pretrain.sh │ │ ├── speech2text_pretrain.sh │ │ ├── text2speech_dpo.sh │ │ ├── text2speech_pretrain.sh │ │ ├── text2speech_pretrain_6k.sh │ │ └── text2speech_pretrain_ctc.sh ├── zero2.json ├── zero3.json └── zero3_offload.json └── vlmevalkit ├── run.py ├── script └── run_inference_2.sh └── vlmeval ├── __init__.py ├── api ├── __init__.py ├── base.py ├── gpt.py └── gpt_int.py ├── config.py ├── evaluate ├── OCRBench.py ├── __init__.py ├── coco_eval.py ├── llavabench.py ├── mathvista_eval.py ├── misc.py ├── mmvet_eval.py ├── multiple_choice.py ├── vqa_eval.py └── yes_or_no.py ├── inference.py ├── smp ├── __init__.py ├── file.py ├── log.py ├── misc.py └── vlm.py ├── utils ├── __init__.py ├── custom_prompt.py ├── dataset.py ├── dataset_config.py ├── matching_util.py └── mp_util.py └── vlm ├── __init__.py ├── base.py ├── openomni_llama.py └── openomni_qwen.py /assets/emotion_temp.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RainBowLuoCS/OpenOmni/2892efff5fb4155a2f0b775aaa1f316aa69e289a/assets/emotion_temp.wav -------------------------------------------------------------------------------- /assets/example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RainBowLuoCS/OpenOmni/2892efff5fb4155a2f0b775aaa1f316aa69e289a/assets/example.png -------------------------------------------------------------------------------- /assets/framework.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RainBowLuoCS/OpenOmni/2892efff5fb4155a2f0b775aaa1f316aa69e289a/assets/framework.png -------------------------------------------------------------------------------- /assets/librispeech_temp.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RainBowLuoCS/OpenOmni/2892efff5fb4155a2f0b775aaa1f316aa69e289a/assets/librispeech_temp.wav -------------------------------------------------------------------------------- /assets/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RainBowLuoCS/OpenOmni/2892efff5fb4155a2f0b775aaa1f316aa69e289a/assets/logo.png 
-------------------------------------------------------------------------------- /assets/loss_plot_outliers_replaced.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RainBowLuoCS/OpenOmni/2892efff5fb4155a2f0b775aaa1f316aa69e289a/assets/loss_plot_outliers_replaced.pdf -------------------------------------------------------------------------------- /assets/question.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RainBowLuoCS/OpenOmni/2892efff5fb4155a2f0b775aaa1f316aa69e289a/assets/question.wav -------------------------------------------------------------------------------- /assets/temp.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RainBowLuoCS/OpenOmni/2892efff5fb4155a2f0b775aaa1f316aa69e289a/assets/temp.wav -------------------------------------------------------------------------------- /cosyvoice/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RainBowLuoCS/OpenOmni/2892efff5fb4155a2f0b775aaa1f316aa69e289a/cosyvoice/__init__.py -------------------------------------------------------------------------------- /cosyvoice/cli/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RainBowLuoCS/OpenOmni/2892efff5fb4155a2f0b775aaa1f316aa69e289a/cosyvoice/cli/__init__.py -------------------------------------------------------------------------------- /cosyvoice/cli/cosyvoice.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | import os 15 | import torch 16 | from hyperpyyaml import load_hyperpyyaml 17 | from modelscope import snapshot_download 18 | from cosyvoice.cli.frontend import CosyVoiceFrontEnd 19 | from cosyvoice.cli.model import CosyVoiceModel 20 | 21 | class CosyVoice: 22 | 23 | def __init__(self, model_dir): 24 | instruct = True if '-Instruct' in model_dir else False 25 | self.model_dir = model_dir 26 | if not os.path.exists(model_dir): 27 | model_dir = snapshot_download(model_dir) 28 | with open('{}/cosyvoice.yaml'.format(model_dir), 'r') as f: 29 | configs = load_hyperpyyaml(f) 30 | self.frontend = CosyVoiceFrontEnd(configs['get_tokenizer'], 31 | configs['feat_extractor'], 32 | '{}/campplus.onnx'.format(model_dir), 33 | '{}/speech_tokenizer_v1.onnx'.format(model_dir), 34 | '{}/spk2info.pt'.format(model_dir), 35 | instruct, 36 | configs['allowed_special']) 37 | self.model = CosyVoiceModel(configs['llm'], configs['flow'], configs['hift']) 38 | self.model.load('{}/llm.pt'.format(model_dir), 39 | '{}/flow.pt'.format(model_dir), 40 | '{}/hift.pt'.format(model_dir)) 41 | del configs 42 | 43 | def list_avaliable_spks(self): 44 | spks = list(self.frontend.spk2info.keys()) 45 | return spks 46 | 47 | def inference_sft(self, tts_text, spk_id): 48 | tts_speeches = [] 49 | for i in self.frontend.text_normalize(tts_text, split=True): 50 | model_input = self.frontend.frontend_sft(i, spk_id) 51 | model_output = self.model.inference(**model_input) 52 | tts_speeches.append(model_output['tts_speech']) 53 | return {'tts_speech': torch.concat(tts_speeches, dim=1)} 54 | 55 | def inference_zero_shot(self, tts_text, prompt_text, prompt_speech_16k): 56 | prompt_text = self.frontend.text_normalize(prompt_text, split=False) 57 | tts_speeches = [] 58 | for i in self.frontend.text_normalize(tts_text, split=True): 59 | model_input = self.frontend.frontend_zero_shot(i, prompt_text, prompt_speech_16k) 60 | model_output = self.model.inference(**model_input) 61 | tts_speeches.append(model_output['tts_speech']) 62 | return {'tts_speech': torch.concat(tts_speeches, dim=1)} 63 | 64 | def inference_cross_lingual(self, tts_text, prompt_speech_16k): 65 | if self.frontend.instruct is True: 66 | raise ValueError('{} do not support cross_lingual inference'.format(self.model_dir)) 67 | tts_speeches = [] 68 | for i in self.frontend.text_normalize(tts_text, split=True): 69 | model_input = self.frontend.frontend_cross_lingual(i, prompt_speech_16k) 70 | model_output = self.model.inference(**model_input) 71 | tts_speeches.append(model_output['tts_speech']) 72 | return {'tts_speech': torch.concat(tts_speeches, dim=1)} 73 | 74 | def inference_instruct(self, tts_text, spk_id, instruct_text): 75 | if self.frontend.instruct is False: 76 | raise ValueError('{} do not support instruct inference'.format(self.model_dir)) 77 | instruct_text = self.frontend.text_normalize(instruct_text, split=False) 78 | tts_speeches = [] 79 | for i in self.frontend.text_normalize(tts_text, split=True): 80 | model_input = self.frontend.frontend_instruct(i, spk_id, instruct_text) 81 | model_output = self.model.inference(**model_input) 82 | tts_speeches.append(model_output['tts_speech']) 83 | return {'tts_speech': torch.concat(tts_speeches, dim=1)} 84 | -------------------------------------------------------------------------------- /cosyvoice/dataset/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RainBowLuoCS/OpenOmni/2892efff5fb4155a2f0b775aaa1f316aa69e289a/cosyvoice/dataset/__init__.py 
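A minimal usage sketch for the CosyVoice wrapper above. The model directory name is a placeholder and the 22050 Hz sample rate is taken from the vocab_16K.yaml config later in this dump; both are assumptions rather than values fixed by cosyvoice.py itself:

import torchaudio
from cosyvoice.cli.cosyvoice import CosyVoice

# Loads the frontend, LLM, flow and HiFT modules from one model directory
# (fetched via modelscope snapshot_download when the path does not exist locally).
cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-SFT')  # hypothetical local path
spk_id = cosyvoice.list_avaliable_spks()[0]                    # speaker ids bundled in spk2info.pt

# inference_sft normalizes and splits the text, then concatenates per-sentence audio along dim=1.
out = cosyvoice.inference_sft('Hello, this is a quick smoke test.', spk_id)
torchaudio.save('sft_demo.wav', out['tts_speech'], 22050)      # sample rate assumed from the yaml config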
-------------------------------------------------------------------------------- /cosyvoice/flow/length_regulator.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from typing import Tuple 15 | import torch.nn as nn 16 | from torch.nn import functional as F 17 | from cosyvoice.utils.mask import make_pad_mask 18 | 19 | 20 | class InterpolateRegulator(nn.Module): 21 | def __init__( 22 | self, 23 | channels: int, 24 | sampling_ratios: Tuple, 25 | out_channels: int = None, 26 | groups: int = 1, 27 | ): 28 | super().__init__() 29 | self.sampling_ratios = sampling_ratios 30 | out_channels = out_channels or channels 31 | model = nn.ModuleList([]) 32 | if len(sampling_ratios) > 0: 33 | for _ in sampling_ratios: 34 | module = nn.Conv1d(channels, channels, 3, 1, 1) 35 | norm = nn.GroupNorm(groups, channels) 36 | act = nn.Mish() 37 | model.extend([module, norm, act]) 38 | model.append( 39 | nn.Conv1d(channels, out_channels, 1, 1) 40 | ) 41 | self.model = nn.Sequential(*model) 42 | 43 | def forward(self, x, ylens=None): 44 | # x in (B, T, D) 45 | mask = (~make_pad_mask(ylens)).to(x).unsqueeze(-1) 46 | x = F.interpolate(x.transpose(1, 2).contiguous(), size=ylens.max(), mode='nearest') 47 | out = self.model(x).transpose(1, 2).contiguous() 48 | olens = ylens 49 | return out * mask, olens 50 | -------------------------------------------------------------------------------- /cosyvoice/flow/stable/stable_diffusion.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn import functional as F 3 | from .dit import DiffusionTransformer 4 | from .adp import UNet1d 5 | from .sampling import sample 6 | import math 7 | from model.base import BaseModule 8 | import pdb 9 | 10 | target_length = 1536 11 | 12 | 13 | def pad_and_create_mask(matrix, target_length): 14 | T = matrix.shape[2] 15 | if T > target_length: 16 | raise ValueError("The third dimension length %s should not exceed %s" % (T, target_length)) 17 | 18 | padding_size = target_length - T 19 | 20 | padded_matrix = F.pad(matrix, (0, padding_size), "constant", 0) 21 | 22 | mask = torch.ones((1, target_length)) 23 | mask[:, T:] = 0 # Set the padding part to 0 24 | 25 | return padded_matrix.to(matrix.device), mask.to(matrix.device) 26 | 27 | 28 | class Stable_Diffusion(BaseModule): 29 | def __init__(self, io_channels, input_concat_dim=None, embed_dim=768, depth=24, num_heads=24, 30 | project_cond_tokens=False, transformer_type="continuous_transformer"): 31 | super(Stable_Diffusion, self).__init__() 32 | self.diffusion = DiffusionTransformer( 33 | io_channels=io_channels, 34 | input_concat_dim=input_concat_dim, 35 | embed_dim=embed_dim, 36 | # cond_token_dim=target_length, 37 | depth=depth, 38 | num_heads=num_heads, 39 | project_cond_tokens=project_cond_tokens, 40 | transformer_type=transformer_type, 41 | ) 
42 | # self.diffusion = UNet1d( 43 | # in_channels=80, 44 | # channels=256, 45 | # resnet_groups=16, 46 | # kernel_multiplier_downsample=2, 47 | # multipliers=[4, 4, 4, 5, 5], 48 | # factors=[1, 2, 2, 4], # 输入长度不一致卷积缩短 49 | # num_blocks=[2, 2, 2, 2], 50 | # attentions=[1, 3, 3, 3, 3], 51 | # attention_heads=16, 52 | # attention_multiplier=4, 53 | # use_nearest_upsample=False, 54 | # use_skip_scale=True, 55 | # use_context_time=True 56 | # ) 57 | self.rng = torch.quasirandom.SobolEngine(1, scramble=True) 58 | 59 | @torch.no_grad() 60 | def forward(self, mu, mask, n_timesteps): 61 | # pdb.set_trace() 62 | mask = mask.squeeze(1) 63 | noise = torch.randn_like(mu).to(mu.device) 64 | # mu_pad, mu_pad_mask = pad_and_create_mask(mu, target_length) 65 | # extra_args = {"cross_attn_cond": mu, "cross_attn_cond_mask": mask, "mask": mask} 66 | extra_args = {"input_concat_cond": mu, "mask": mask} 67 | fakes = sample(self.diffusion, noise, n_timesteps, 0, **extra_args) 68 | 69 | return fakes 70 | 71 | def compute_loss(self, x0, mask, mu): 72 | 73 | # pdb.set_trace() 74 | t = self.rng.draw(x0.shape[0])[:, 0].to(x0.device) 75 | alphas, sigmas = torch.cos(t * math.pi / 2), torch.sin(t * math.pi / 2) 76 | 77 | alphas = alphas[:, None, None] 78 | sigmas = sigmas[:, None, None] 79 | noise = torch.randn_like(x0) 80 | noised_inputs = x0 * alphas + noise * sigmas 81 | targets = noise * alphas - x0 * sigmas 82 | mask = mask.squeeze(1) 83 | # mu_pad, mu_pad_mask = pad_and_create_mask(mu, target_length) 84 | # output = self.diffusion(noised_inputs, t, cross_attn_cond=mu, 85 | # cross_attn_cond_mask=mask, mask=mask, cfg_dropout_prob=0.1) 86 | # pdb.set_trace() 87 | output = self.diffusion(noised_inputs, # [bs, 80, 229] 88 | t, # (bs,) 89 | input_concat_cond=mu, 90 | mask=mask, # [bs, 229] 91 | cfg_dropout_prob=0.1) 92 | 93 | return self.mse_loss(output, targets, mask), output 94 | 95 | def mse_loss(self, output, targets, mask): 96 | 97 | mse_loss = F.mse_loss(output, targets, reduction='none') 98 | 99 | if mask.ndim == 2 and mse_loss.ndim == 3: 100 | mask = mask.unsqueeze(1) 101 | 102 | if mask.shape[1] != mse_loss.shape[1]: 103 | mask = mask.repeat(1, mse_loss.shape[1], 1) 104 | 105 | mse_loss = mse_loss * mask 106 | 107 | mse_loss = mse_loss.mean() 108 | 109 | return mse_loss 110 | -------------------------------------------------------------------------------- /cosyvoice/flow/stable/stable_diffusion_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn import functional as F 3 | from .dit import DiffusionTransformer 4 | from .adp import UNet1d 5 | from .sampling import sample 6 | import math 7 | from model.base import BaseModule 8 | import pdb 9 | 10 | target_length = 1536 11 | def pad_and_create_mask(matrix, target_length): 12 | 13 | T = matrix.shape[2] 14 | if T > target_length: 15 | raise ValueError("The third dimension length %s should not exceed %s"%(T, target_length)) 16 | 17 | padding_size = target_length - T 18 | 19 | padded_matrix = F.pad(matrix, (0, padding_size), "constant", 0) 20 | 21 | mask = torch.ones((1, target_length)) 22 | mask[:, T:] = 0 # Set the padding part to 0 23 | 24 | return padded_matrix.to(matrix.device), mask.to(matrix.device) 25 | 26 | 27 | class Stable_Diffusion(BaseModule): 28 | def __init__(self): 29 | super(Stable_Diffusion, self).__init__() 30 | self.diffusion = DiffusionTransformer( 31 | io_channels=80, 32 | # input_concat_dim=80, 33 | embed_dim=768, 34 | # cond_token_dim=target_length, 35 | depth=24, 36 | 
num_heads=24, 37 | project_cond_tokens=False, 38 | transformer_type="continuous_transformer", 39 | ) 40 | # self.diffusion = UNet1d( 41 | # in_channels=80, 42 | # channels=256, 43 | # resnet_groups=16, 44 | # kernel_multiplier_downsample=2, 45 | # multipliers=[4, 4, 4, 5, 5], 46 | # factors=[1, 2, 2, 4], # 输入长度不一致卷积缩短 47 | # num_blocks=[2, 2, 2, 2], 48 | # attentions=[1, 3, 3, 3, 3], 49 | # attention_heads=16, 50 | # attention_multiplier=4, 51 | # use_nearest_upsample=False, 52 | # use_skip_scale=True, 53 | # use_context_time=True 54 | # ) 55 | self.rng = torch.quasirandom.SobolEngine(1, scramble=True) 56 | 57 | @torch.no_grad() 58 | def forward(self, mu, mask, n_timesteps): 59 | # pdb.set_trace() 60 | mask = mask.squeeze(1) 61 | # noise = torch.randn_like(mu).to(mu.device) 62 | # mu_pad, mu_pad_mask = pad_and_create_mask(mu, target_length) 63 | # extra_args = {"cross_attn_cond": mu, "cross_attn_cond_mask": mask, "mask": mask} 64 | extra_args = {"mask": mask} 65 | fakes = sample(self.diffusion, mu, n_timesteps, 0, **extra_args) 66 | 67 | return fakes 68 | 69 | 70 | def compute_loss(self, x0, mask, mu): 71 | 72 | # pdb.set_trace() 73 | t = self.rng.draw(x0.shape[0])[:, 0].to(x0.device) 74 | alphas, sigmas = torch.cos(t * math.pi / 2), torch.sin(t * math.pi / 2) 75 | 76 | alphas = alphas[:, None, None] 77 | sigmas = sigmas[:, None, None] 78 | noise = torch.randn_like(x0) 79 | noised_inputs = x0 * alphas + noise * sigmas 80 | targets = mu * alphas - x0 * sigmas 81 | mask = mask.squeeze(1) 82 | # mu_pad, mu_pad_mask = pad_and_create_mask(mu, target_length) 83 | # output = self.diffusion(noised_inputs, t, cross_attn_cond=mu, 84 | # cross_attn_cond_mask=mask, mask=mask, cfg_dropout_prob=0.1) 85 | output = self.diffusion(noised_inputs, t, mask=mask, cfg_dropout_prob=0.1) 86 | 87 | return self.mse_loss(output, targets, mask), output 88 | 89 | 90 | def mse_loss(self, output, targets, mask): 91 | 92 | mse_loss = F.mse_loss(output, targets, reduction='none') 93 | 94 | if mask.ndim == 2 and mse_loss.ndim == 3: 95 | mask = mask.unsqueeze(1) 96 | 97 | if mask.shape[1] != mse_loss.shape[1]: 98 | mask = mask.repeat(1, mse_loss.shape[1], 1) 99 | 100 | mse_loss = mse_loss[mask] 101 | 102 | mse_loss = mse_loss.mean() 103 | 104 | return mse_loss -------------------------------------------------------------------------------- /cosyvoice/hifigan/f0_predictor.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Kai Hu) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
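The compute_loss methods in the two Stable_Diffusion variants above share the same cosine noising schedule; stable_diffusion.py predicts noise * alphas - x0 * sigmas, while stable_diffusion_test.py substitutes mu for noise in the target. A dependency-free sketch of that noising step, with purely illustrative shapes:

import math
import torch

x0 = torch.randn(2, 80, 100)                    # clean mel features (B, C, T); shapes assumed
t = torch.rand(2)                               # diffusion time drawn in [0, 1)
alphas = torch.cos(t * math.pi / 2)[:, None, None]
sigmas = torch.sin(t * math.pi / 2)[:, None, None]
noise = torch.randn_like(x0)
noised_inputs = x0 * alphas + noise * sigmas    # what the DiffusionTransformer receives
targets = noise * alphas - x0 * sigmas          # v-style target used in stable_diffusion.py
print(noised_inputs.shape, targets.shape)       # both torch.Size([2, 80, 100])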
14 | import torch 15 | import torch.nn as nn 16 | from torch.nn.utils import weight_norm 17 | 18 | 19 | class ConvRNNF0Predictor(nn.Module): 20 | def __init__(self, 21 | num_class: int = 1, 22 | in_channels: int = 80, 23 | cond_channels: int = 512 24 | ): 25 | super().__init__() 26 | 27 | self.num_class = num_class 28 | self.condnet = nn.Sequential( 29 | weight_norm( 30 | nn.Conv1d(in_channels, cond_channels, kernel_size=3, padding=1) 31 | ), 32 | nn.ELU(), 33 | weight_norm( 34 | nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1) 35 | ), 36 | nn.ELU(), 37 | weight_norm( 38 | nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1) 39 | ), 40 | nn.ELU(), 41 | weight_norm( 42 | nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1) 43 | ), 44 | nn.ELU(), 45 | weight_norm( 46 | nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1) 47 | ), 48 | nn.ELU(), 49 | ) 50 | self.classifier = nn.Linear(in_features=cond_channels, out_features=self.num_class) 51 | 52 | def forward(self, x: torch.Tensor) -> torch.Tensor: 53 | x = self.condnet(x) 54 | x = x.transpose(1, 2) 55 | return torch.abs(self.classifier(x).squeeze(-1)) 56 | -------------------------------------------------------------------------------- /cosyvoice/transformer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RainBowLuoCS/OpenOmni/2892efff5fb4155a2f0b775aaa1f316aa69e289a/cosyvoice/transformer/__init__.py -------------------------------------------------------------------------------- /cosyvoice/transformer/activation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 Johns Hopkins University (Shinji Watanabe) 2 | # 2020 Northwestern Polytechnical University (Pengcheng Guo) 3 | # 2020 Mobvoi Inc (Binbin Zhang) 4 | # 2024 Alibaba Inc (Xiang Lyu) 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | """Swish() activation function for Conformer.""" 18 | 19 | import torch 20 | from torch import nn, sin, pow 21 | from torch.nn import Parameter 22 | 23 | 24 | class Swish(torch.nn.Module): 25 | """Construct an Swish object.""" 26 | 27 | def forward(self, x: torch.Tensor) -> torch.Tensor: 28 | """Return Swish activation function.""" 29 | return x * torch.sigmoid(x) 30 | 31 | 32 | # Implementation adapted from https://github.com/EdwardDixon/snake under the MIT license. 33 | # LICENSE is in incl_licenses directory. 
34 | class Snake(nn.Module): 35 | ''' 36 | Implementation of a sine-based periodic activation function 37 | Shape: 38 | - Input: (B, C, T) 39 | - Output: (B, C, T), same shape as the input 40 | Parameters: 41 | - alpha - trainable parameter 42 | References: 43 | - This activation function is from this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda: 44 | https://arxiv.org/abs/2006.08195 45 | Examples: 46 | >>> a1 = snake(256) 47 | >>> x = torch.randn(256) 48 | >>> x = a1(x) 49 | ''' 50 | def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False): 51 | ''' 52 | Initialization. 53 | INPUT: 54 | - in_features: shape of the input 55 | - alpha: trainable parameter 56 | alpha is initialized to 1 by default, higher values = higher-frequency. 57 | alpha will be trained along with the rest of your model. 58 | ''' 59 | super(Snake, self).__init__() 60 | self.in_features = in_features 61 | 62 | # initialize alpha 63 | self.alpha_logscale = alpha_logscale 64 | if self.alpha_logscale: # log scale alphas initialized to zeros 65 | self.alpha = Parameter(torch.zeros(in_features) * alpha) 66 | else: # linear scale alphas initialized to ones 67 | self.alpha = Parameter(torch.ones(in_features) * alpha) 68 | 69 | self.alpha.requires_grad = alpha_trainable 70 | 71 | self.no_div_by_zero = 0.000000001 72 | 73 | def forward(self, x): 74 | ''' 75 | Forward pass of the function. 76 | Applies the function to the input elementwise. 77 | Snake ∶= x + 1/a * sin^2 (xa) 78 | ''' 79 | alpha = self.alpha.unsqueeze(0).unsqueeze(-1) # line up with x to [B, C, T] 80 | if self.alpha_logscale: 81 | alpha = torch.exp(alpha) 82 | x = x + (1.0 / (alpha + self.no_div_by_zero)) * pow(sin(x * alpha), 2) 83 | 84 | return x 85 | -------------------------------------------------------------------------------- /cosyvoice/transformer/label_smoothing_loss.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Shigeki Karita 2 | # 2020 Mobvoi Inc (Binbin Zhang) 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Label smoothing module.""" 16 | 17 | import torch 18 | from torch import nn 19 | 20 | 21 | class LabelSmoothingLoss(nn.Module): 22 | """Label-smoothing loss. 23 | 24 | In a standard CE loss, the label's data distribution is: 25 | [0,1,2] -> 26 | [ 27 | [1.0, 0.0, 0.0], 28 | [0.0, 1.0, 0.0], 29 | [0.0, 0.0, 1.0], 30 | ] 31 | 32 | In the smoothing version CE Loss,some probabilities 33 | are taken from the true label prob (1.0) and are divided 34 | among other labels. 35 | 36 | e.g. 
37 | smoothing=0.1 38 | [0,1,2] -> 39 | [ 40 | [0.9, 0.05, 0.05], 41 | [0.05, 0.9, 0.05], 42 | [0.05, 0.05, 0.9], 43 | ] 44 | 45 | Args: 46 | size (int): the number of class 47 | padding_idx (int): padding class id which will be ignored for loss 48 | smoothing (float): smoothing rate (0.0 means the conventional CE) 49 | normalize_length (bool): 50 | normalize loss by sequence length if True 51 | normalize loss by batch size if False 52 | """ 53 | 54 | def __init__(self, 55 | size: int, 56 | padding_idx: int, 57 | smoothing: float, 58 | normalize_length: bool = False): 59 | """Construct an LabelSmoothingLoss object.""" 60 | super(LabelSmoothingLoss, self).__init__() 61 | self.criterion = nn.KLDivLoss(reduction="none") 62 | self.padding_idx = padding_idx 63 | self.confidence = 1.0 - smoothing 64 | self.smoothing = smoothing 65 | self.size = size 66 | self.normalize_length = normalize_length 67 | 68 | def forward(self, x: torch.Tensor, target: torch.Tensor) -> torch.Tensor: 69 | """Compute loss between x and target. 70 | 71 | The model outputs and data labels tensors are flatten to 72 | (batch*seqlen, class) shape and a mask is applied to the 73 | padding part which should not be calculated for loss. 74 | 75 | Args: 76 | x (torch.Tensor): prediction (batch, seqlen, class) 77 | target (torch.Tensor): 78 | target signal masked with self.padding_id (batch, seqlen) 79 | Returns: 80 | loss (torch.Tensor) : The KL loss, scalar float value 81 | """ 82 | assert x.size(2) == self.size 83 | batch_size = x.size(0) 84 | x = x.view(-1, self.size) 85 | target = target.view(-1) 86 | # use zeros_like instead of torch.no_grad() for true_dist, 87 | # since no_grad() can not be exported by JIT 88 | true_dist = torch.zeros_like(x) 89 | true_dist.fill_(self.smoothing / (self.size - 1)) 90 | ignore = target == self.padding_idx # (B,) 91 | total = len(target) - ignore.sum().item() 92 | target = target.masked_fill(ignore, 0) # avoid -1 index 93 | true_dist.scatter_(1, target.unsqueeze(1), self.confidence) 94 | kl = self.criterion(torch.log_softmax(x, dim=1), true_dist) 95 | denom = total if self.normalize_length else batch_size 96 | return kl.masked_fill(ignore.unsqueeze(1), 0).sum() / denom 97 | -------------------------------------------------------------------------------- /cosyvoice/transformer/positionwise_feed_forward.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Shigeki Karita 2 | # 2020 Mobvoi Inc (Binbin Zhang) 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Positionwise feed forward layer definition.""" 16 | 17 | import torch 18 | 19 | 20 | class PositionwiseFeedForward(torch.nn.Module): 21 | """Positionwise feed forward layer. 22 | 23 | FeedForward are appied on each position of the sequence. 24 | The output dim is same with the input dim. 25 | 26 | Args: 27 | idim (int): Input dimenstion. 28 | hidden_units (int): The number of hidden units. 29 | dropout_rate (float): Dropout rate. 
30 | activation (torch.nn.Module): Activation function 31 | """ 32 | 33 | def __init__( 34 | self, 35 | idim: int, 36 | hidden_units: int, 37 | dropout_rate: float, 38 | activation: torch.nn.Module = torch.nn.ReLU(), 39 | ): 40 | """Construct a PositionwiseFeedForward object.""" 41 | super(PositionwiseFeedForward, self).__init__() 42 | self.w_1 = torch.nn.Linear(idim, hidden_units) 43 | self.activation = activation 44 | self.dropout = torch.nn.Dropout(dropout_rate) 45 | self.w_2 = torch.nn.Linear(hidden_units, idim) 46 | 47 | def forward(self, xs: torch.Tensor) -> torch.Tensor: 48 | """Forward function. 49 | 50 | Args: 51 | xs: input tensor (B, L, D) 52 | Returns: 53 | output tensor, (B, L, D) 54 | """ 55 | return self.w_2(self.dropout(self.activation(self.w_1(xs)))) 56 | 57 | 58 | class MoEFFNLayer(torch.nn.Module): 59 | """ 60 | Mixture of expert with Positionwise feed forward layer 61 | See also figure 1 in https://arxiv.org/pdf/2305.15663.pdf 62 | The output dim is same with the input dim. 63 | 64 | Modified from https://github.com/Lightning-AI/lit-gpt/pull/823 65 | https://github.com/mistralai/mistral-src/blob/b46d6/moe_one_file_ref.py#L203-L219 66 | Args: 67 | n_expert: number of expert. 68 | n_expert_per_token: The actual number of experts used for each frame 69 | idim (int): Input dimenstion. 70 | hidden_units (int): The number of hidden units. 71 | dropout_rate (float): Dropout rate. 72 | activation (torch.nn.Module): Activation function 73 | """ 74 | 75 | def __init__( 76 | self, 77 | n_expert: int, 78 | n_expert_per_token: int, 79 | idim: int, 80 | hidden_units: int, 81 | dropout_rate: float, 82 | activation: torch.nn.Module = torch.nn.ReLU(), 83 | ): 84 | super(MoEFFNLayer, self).__init__() 85 | self.gate = torch.nn.Linear(idim, n_expert, bias=False) 86 | self.experts = torch.nn.ModuleList( 87 | PositionwiseFeedForward(idim, hidden_units, dropout_rate, 88 | activation) for _ in range(n_expert)) 89 | self.n_expert_per_token = n_expert_per_token 90 | 91 | def forward(self, xs: torch.Tensor) -> torch.Tensor: 92 | """Foward function. 
93 | Args: 94 | xs: input tensor (B, L, D) 95 | Returns: 96 | output tensor, (B, L, D) 97 | 98 | """ 99 | B, L, D = xs.size( 100 | ) # batch size, sequence length, embedding dimension (idim) 101 | xs = xs.view(-1, D) # (B*L, D) 102 | router = self.gate(xs) # (B*L, n_expert) 103 | logits, indices = torch.topk( 104 | router, self.n_expert_per_token 105 | ) # logits: (B*L, n_expert_per_token), indices: (B*L, n_expert_per_token) 106 | weights = torch.nn.functional.softmax( 107 | logits, dim=1, 108 | dtype=torch.float).to(dtype=xs.dtype) # (B*L, n_expert_per_token) 109 | output = torch.zeros_like(xs) # (B*L, D) 110 | for i, expert in enumerate(self.experts): 111 | mask = indices == i 112 | batch_idx, ith_expert = torch.where(mask) 113 | output[batch_idx] += weights[batch_idx, ith_expert, None] * expert( 114 | xs[batch_idx]) 115 | return output.view(B, L, D) 116 | -------------------------------------------------------------------------------- /cosyvoice/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RainBowLuoCS/OpenOmni/2892efff5fb4155a2f0b775aaa1f316aa69e289a/cosyvoice/utils/__init__.py -------------------------------------------------------------------------------- /cosyvoice/utils/block_mask_util.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def create_grid_mask(seq_length, trunck_length, fill_triangle): 5 | assert seq_length > 0 6 | 7 | # First build a grid mask without considering seen_length: 8 | if fill_triangle: 9 | mask = 1 - torch.triu(torch.ones(seq_length, seq_length), diagonal=1) 10 | # the lower triangle and the main diagonal are all 1 11 | else: 12 | mask = torch.zeros(seq_length, seq_length) 13 | 14 | for i in range(seq_length): 15 | trunck_idx = i // trunck_length 16 | trunck_start = trunck_idx * trunck_length 17 | trunck_end = trunck_length + trunck_start 18 | mask[i][trunck_start:trunck_end] = 1 19 | 20 | return mask 21 | 22 | 23 | if __name__ == "__main__": 24 | mask = create_grid_mask(seq_length=8, trunck_length=3, fill_triangle=True).int() 25 | print(mask) 26 | # tensor([[1, 1, 1, 0, 0, 0, 0, 0], 27 | # [1, 1, 1, 0, 0, 0, 0, 0], 28 | # [1, 1, 1, 0, 0, 0, 0, 0], 29 | # [1, 1, 1, 1, 1, 1, 0, 0], 30 | # [1, 1, 1, 1, 1, 1, 0, 0], 31 | # [1, 1, 1, 1, 1, 1, 0, 0], 32 | # [1, 1, 1, 1, 1, 1, 1, 1], 33 | # [1, 1, 1, 1, 1, 1, 1, 1]] 34 | 35 | -------------------------------------------------------------------------------- /cosyvoice/utils/class_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright [2023-11-28] 2 | # 2024 Alibaba Inc (authors: Xiang Lyu) 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
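A shape-check sketch for the MoEFFNLayer defined in positionwise_feed_forward.py above; the expert count and layer sizes below are illustrative assumptions, not values used elsewhere in this repo:

import torch
from cosyvoice.transformer.positionwise_feed_forward import MoEFFNLayer

# 8 experts with the top 2 routed per frame; idim must match the encoder width.
moe = MoEFFNLayer(n_expert=8, n_expert_per_token=2, idim=256, hidden_units=1024, dropout_rate=0.1)
xs = torch.randn(4, 50, 256)       # (batch, sequence length, idim)
out = moe(xs)                      # gate -> top-k experts -> weighted sum per frame
assert out.shape == xs.shape       # MoE keeps the (B, L, D) shape of the input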
15 | import torch 16 | 17 | from cosyvoice.transformer.activation import Swish 18 | from cosyvoice.transformer.subsampling import ( 19 | LinearNoSubsampling, 20 | EmbedinigNoSubsampling, 21 | Conv1dSubsampling2, 22 | Conv2dSubsampling4, 23 | Conv2dSubsampling6, 24 | Conv2dSubsampling8, 25 | ) 26 | from cosyvoice.transformer.embedding import (PositionalEncoding, 27 | RelPositionalEncoding, 28 | WhisperPositionalEncoding, 29 | LearnablePositionalEncoding, 30 | NoPositionalEncoding) 31 | from cosyvoice.transformer.attention import (MultiHeadedAttention, 32 | RelPositionMultiHeadedAttention, 33 | BlockRelPositionMultiHeadedAttention) 34 | from cosyvoice.transformer.embedding import EspnetRelPositionalEncoding 35 | from cosyvoice.transformer.subsampling import LegacyLinearNoSubsampling 36 | 37 | 38 | COSYVOICE_ACTIVATION_CLASSES = { 39 | "hardtanh": torch.nn.Hardtanh, 40 | "tanh": torch.nn.Tanh, 41 | "relu": torch.nn.ReLU, 42 | "selu": torch.nn.SELU, 43 | "swish": getattr(torch.nn, "SiLU", Swish), 44 | "gelu": torch.nn.GELU, 45 | } 46 | 47 | COSYVOICE_SUBSAMPLE_CLASSES = { 48 | "linear": LinearNoSubsampling, 49 | "linear_legacy": LegacyLinearNoSubsampling, 50 | "embed": EmbedinigNoSubsampling, 51 | "conv1d2": Conv1dSubsampling2, 52 | "conv2d": Conv2dSubsampling4, 53 | "conv2d6": Conv2dSubsampling6, 54 | "conv2d8": Conv2dSubsampling8, 55 | 'paraformer_dummy': torch.nn.Identity 56 | } 57 | 58 | COSYVOICE_EMB_CLASSES = { 59 | "embed": PositionalEncoding, 60 | "abs_pos": PositionalEncoding, 61 | "rel_pos": RelPositionalEncoding, 62 | "rel_pos_espnet": EspnetRelPositionalEncoding, 63 | "no_pos": NoPositionalEncoding, 64 | "abs_pos_whisper": WhisperPositionalEncoding, 65 | "embed_learnable_pe": LearnablePositionalEncoding, 66 | } 67 | 68 | COSYVOICE_ATTENTION_CLASSES = { 69 | "selfattn": MultiHeadedAttention, 70 | "rel_selfattn": RelPositionMultiHeadedAttention, 71 | "block_rel_selfattn": BlockRelPositionMultiHeadedAttention, 72 | } 73 | -------------------------------------------------------------------------------- /cosyvoice/utils/common.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) 2 | # 2024 Alibaba Inc (authors: Xiang Lyu) 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # Modified from ESPnet(https://github.com/espnet/espnet) 16 | """Unility functions for Transformer.""" 17 | 18 | from typing import List 19 | 20 | import torch 21 | 22 | IGNORE_ID = -1 23 | 24 | 25 | def pad_list(xs: List[torch.Tensor], pad_value: int): 26 | """Perform padding for the list of tensors. 27 | 28 | Args: 29 | xs (List): List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)]. 30 | pad_value (float): Value for padding. 31 | 32 | Returns: 33 | Tensor: Padded tensor (B, Tmax, `*`). 
34 | 35 | Examples: 36 | >>> x = [torch.ones(4), torch.ones(2), torch.ones(1)] 37 | >>> x 38 | [tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])] 39 | >>> pad_list(x, 0) 40 | tensor([[1., 1., 1., 1.], 41 | [1., 1., 0., 0.], 42 | [1., 0., 0., 0.]]) 43 | 44 | """ 45 | max_len = max([len(item) for item in xs]) 46 | batchs = len(xs) 47 | ndim = xs[0].ndim 48 | if ndim == 1: 49 | pad_res = torch.zeros(batchs, 50 | max_len, 51 | dtype=xs[0].dtype, 52 | device=xs[0].device) 53 | elif ndim == 2: 54 | pad_res = torch.zeros(batchs, 55 | max_len, 56 | xs[0].shape[1], 57 | dtype=xs[0].dtype, 58 | device=xs[0].device) 59 | elif ndim == 3: 60 | pad_res = torch.zeros(batchs, 61 | max_len, 62 | xs[0].shape[1], 63 | xs[0].shape[2], 64 | dtype=xs[0].dtype, 65 | device=xs[0].device) 66 | else: 67 | raise ValueError(f"Unsupported ndim: {ndim}") 68 | pad_res.fill_(pad_value) 69 | for i in range(batchs): 70 | pad_res[i, :len(xs[i])] = xs[i] 71 | return pad_res 72 | 73 | 74 | def th_accuracy(pad_outputs: torch.Tensor, pad_targets: torch.Tensor, 75 | ignore_label: int) -> torch.Tensor: 76 | """Calculate accuracy. 77 | 78 | Args: 79 | pad_outputs (Tensor): Prediction tensors (B * Lmax, D). 80 | pad_targets (LongTensor): Target label tensors (B, Lmax). 81 | ignore_label (int): Ignore label id. 82 | 83 | Returns: 84 | torch.Tensor: Accuracy value (0.0 - 1.0). 85 | 86 | """ 87 | pad_pred = pad_outputs.view(pad_targets.size(0), pad_targets.size(1), 88 | pad_outputs.size(1)).argmax(2) 89 | mask = pad_targets != ignore_label 90 | numerator = torch.sum( 91 | pad_pred.masked_select(mask) == pad_targets.masked_select(mask)) 92 | denominator = torch.sum(mask) 93 | return (numerator / denominator).detach() 94 | 95 | 96 | def get_padding(kernel_size, dilation=1): 97 | return int((kernel_size * dilation - dilation) / 2) 98 | 99 | 100 | def init_weights(m, mean=0.0, std=0.01): 101 | classname = m.__class__.__name__ 102 | if classname.find("Conv") != -1: 103 | m.weight.data.normal_(mean, std) 104 | -------------------------------------------------------------------------------- /cosyvoice/utils/file_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) 2 | # 2024 Alibaba Inc (authors: Xiang Lyu) 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
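A small worked example for th_accuracy from common.py above; the shapes and values are illustrative, with IGNORE_ID = -1 as defined in that file:

import torch
from cosyvoice.utils.common import IGNORE_ID, th_accuracy

# Two sequences of length 3 over a 4-class vocabulary; predictions arrive flattened as (B*Lmax, D).
pad_outputs = torch.eye(4)[[0, 1, 2, 3, 0, 1]]          # per-row argmax: 0, 1, 2, 3, 0, 1
pad_targets = torch.tensor([[0, 1, 2], [3, IGNORE_ID, IGNORE_ID]])
acc = th_accuracy(pad_outputs, pad_targets, ignore_label=IGNORE_ID)
print(acc)  # tensor(1.) -- every non-ignored position matches its target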
15 | 16 | import json 17 | import torchaudio 18 | 19 | 20 | def read_lists(list_file): 21 | lists = [] 22 | with open(list_file, 'r', encoding='utf8') as fin: 23 | for line in fin: 24 | lists.append(line.strip()) 25 | return lists 26 | 27 | def read_json_lists(list_file): 28 | lists = read_lists(list_file) 29 | results = {} 30 | for fn in lists: 31 | with open(fn, 'r', encoding='utf8') as fin: 32 | results.update(json.load(fin)) 33 | return results 34 | 35 | def load_wav(wav, target_sr): 36 | speech, sample_rate = torchaudio.load(wav) 37 | speech = speech.mean(dim=0, keepdim=True) 38 | if sample_rate != target_sr: 39 | assert sample_rate > target_sr, 'wav sample rate {} must be greater than {}'.format(sample_rate, target_sr) 40 | speech = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sr)(speech) 41 | return speech 42 | 43 | def speed_change(waveform, sample_rate, speed_factor: str): 44 | effects = [ 45 | ["tempo", speed_factor], # speed_factor 46 | ["rate", f"{sample_rate}"] 47 | ] 48 | augmented_waveform, new_sample_rate = torchaudio.sox_effects.apply_effects_tensor( 49 | waveform, 50 | sample_rate, 51 | effects 52 | ) 53 | return augmented_waveform, new_sample_rate 54 | -------------------------------------------------------------------------------- /cosyvoice/utils/frontend_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import re 16 | chinese_char_pattern = re.compile(r'[\u4e00-\u9fff]+') 17 | 18 | # whether contain chinese character 19 | def contains_chinese(text): 20 | return bool(chinese_char_pattern.search(text)) 21 | 22 | 23 | # replace special symbol 24 | def replace_corner_mark(text): 25 | text = text.replace('²', '平方') 26 | text = text.replace('³', '立方') 27 | return text 28 | 29 | 30 | # remove meaningless symbol 31 | def remove_bracket(text): 32 | text = text.replace('(', '').replace(')', '') 33 | text = text.replace('【', '').replace('】', '') 34 | text = text.replace('`', '').replace('`', '') 35 | text = text.replace("——", " ") 36 | return text 37 | 38 | 39 | # spell Arabic numerals 40 | def spell_out_number(text: str, inflect_parser): 41 | new_text = [] 42 | st = None 43 | for i, c in enumerate(text): 44 | if not c.isdigit(): 45 | if st is not None: 46 | num_str = inflect_parser.number_to_words(text[st: i]) 47 | new_text.append(num_str) 48 | st = None 49 | new_text.append(c) 50 | else: 51 | if st is None: 52 | st = i 53 | if st is not None and st < len(text): 54 | num_str = inflect_parser.number_to_words(text[st:]) 55 | new_text.append(num_str) 56 | return ''.join(new_text) 57 | 58 | 59 | # split paragrah logic: 60 | # 1. per sentence max len token_max_n, min len token_min_n, merge if last sentence len less than merge_len 61 | # 2. cal sentence len according to lang 62 | # 3. 
split sentence according to puncatation 63 | def split_paragraph(text: str, tokenize, lang="zh", token_max_n=80, token_min_n=60, merge_len=20, comma_split=False): 64 | def calc_utt_length(_text: str): 65 | if lang == "zh": 66 | return len(_text) 67 | else: 68 | return len(tokenize(_text)) 69 | 70 | def should_merge(_text: str): 71 | if lang == "zh": 72 | return len(_text) < merge_len 73 | else: 74 | return len(tokenize(_text)) < merge_len 75 | 76 | if lang == "zh": 77 | pounc = ['。', '?', '!', ';', ':', '、', '.', '?', '!', ';'] 78 | else: 79 | pounc = ['.', '?', '!', ';', ':'] 80 | if comma_split: 81 | pounc.extend([',', ',']) 82 | st = 0 83 | utts = [] 84 | for i, c in enumerate(text): 85 | if c in pounc: 86 | if len(text[st: i]) > 0: 87 | utts.append(text[st: i] + c) 88 | if i + 1 < len(text) and text[i + 1] in ['"', '”']: 89 | tmp = utts.pop(-1) 90 | utts.append(tmp + text[i + 1]) 91 | st = i + 2 92 | else: 93 | st = i + 1 94 | if len(utts) == 0: 95 | if lang == "zh": 96 | utts.append(text + '。') 97 | else: 98 | utts.append(text + '.') 99 | final_utts = [] 100 | cur_utt = "" 101 | for utt in utts: 102 | if calc_utt_length(cur_utt + utt) > token_max_n and calc_utt_length(cur_utt) > token_min_n: 103 | final_utts.append(cur_utt) 104 | cur_utt = "" 105 | cur_utt = cur_utt + utt 106 | if len(cur_utt) > 0: 107 | if should_merge(cur_utt) and len(final_utts) != 0: 108 | final_utts[-1] = final_utts[-1] + cur_utt 109 | else: 110 | final_utts.append(cur_utt) 111 | 112 | return final_utts 113 | 114 | 115 | # remove blank between chinese character 116 | def replace_blank(text: str): 117 | out_str = [] 118 | for i, c in enumerate(text): 119 | if c == " ": 120 | if ((text[i + 1].isascii() and text[i + 1] != " ") and 121 | (text[i - 1].isascii() and text[i - 1] != " ")): 122 | out_str.append(c) 123 | else: 124 | out_str.append(c) 125 | return "".join(out_str) 126 | -------------------------------------------------------------------------------- /cosyvoice/vocab_16K.yaml: -------------------------------------------------------------------------------- 1 | # set random seed, so that you may reproduce your result. 
2 | __set_seed1: !apply:random.seed [1986] 3 | __set_seed2: !apply:numpy.random.seed [1986] 4 | __set_seed3: !apply:torch.manual_seed [1986] 5 | __set_seed4: !apply:torch.cuda.manual_seed_all [1986] 6 | 7 | # fixed params 8 | sample_rate: 22050 9 | text_encoder_input_size: 512 10 | llm_input_size: 1024 11 | llm_output_size: 1024 12 | spk_embed_dim: 192 13 | 14 | 15 | flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec 16 | input_size: 512 17 | output_size: 80 18 | spk_embed_dim: !ref <spk_embed_dim> 19 | output_type: 'mel' 20 | vocab_size: 16384 21 | input_frame_rate: 12.5 22 | only_mask_loss: True 23 | encoder: !new:cosyvoice.transformer.encoder.BlockConformerEncoder 24 | output_size: 512 25 | attention_heads: 8 26 | linear_units: 2048 27 | num_blocks: 6 28 | dropout_rate: 0.1 29 | positional_dropout_rate: 0.1 30 | attention_dropout_rate: 0.1 31 | normalize_before: True 32 | input_layer: 'linear' 33 | pos_enc_layer_type: 'rel_pos_espnet' 34 | selfattention_layer_type: 'block_rel_selfattn' 35 | block_size: 10 36 | input_size: 512 37 | use_cnn_module: False 38 | macaron_style: False 39 | length_regulator: !new:cosyvoice.flow.length_regulator.InterpolateRegulator 40 | channels: 80 41 | sampling_ratios: [1, 1, 1, 1] 42 | decoder: !new:cosyvoice.flow.flow_matching.ConditionalCFM 43 | in_channels: 240 44 | n_spks: 1 45 | spk_emb_dim: 80 46 | cfm_params: !new:omegaconf.DictConfig 47 | content: 48 | sigma_min: 1e-06 49 | solver: 'euler' 50 | t_scheduler: 'cosine' 51 | training_cfg_rate: 0.2 52 | inference_cfg_rate: 0.7 53 | reg_loss_type: 'l1' 54 | estimator: !new:cosyvoice.flow.decoder.ConditionalDecoder 55 | in_channels: 320 56 | out_channels: 80 57 | channels: [256, 256] 58 | dropout: 0 59 | attention_head_dim: 64 60 | n_blocks: 4 61 | num_mid_blocks: 12 62 | num_heads: 8 63 | act_fn: 'gelu' 64 | 65 | hift: !new:cosyvoice.hifigan.generator.HiFTGenerator 66 | in_channels: 80 67 | base_channels: 512 68 | nb_harmonics: 8 69 | sampling_rate: !ref <sample_rate> 70 | nsf_alpha: 0.1 71 | nsf_sigma: 0.003 72 | nsf_voiced_threshold: 10 73 | upsample_rates: [8, 8] 74 | upsample_kernel_sizes: [16, 16] 75 | istft_params: 76 | n_fft: 16 77 | hop_len: 4 78 | resblock_kernel_sizes: [3, 7, 11] 79 | resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]] 80 | source_resblock_kernel_sizes: [7, 11] 81 | source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5]] 82 | lrelu_slope: 0.1 83 | audio_limit: 0.99 84 | f0_predictor: !new:cosyvoice.hifigan.f0_predictor.ConvRNNF0Predictor 85 | num_class: 1 86 | in_channels: 80 87 | cond_channels: 512 88 | -------------------------------------------------------------------------------- /openomni/__init__.py: -------------------------------------------------------------------------------- 1 | # from llava.model import LlavaLlamaForCausalLM 2 | -------------------------------------------------------------------------------- /openomni/constants.py: -------------------------------------------------------------------------------- 1 | CONTROLLER_HEART_BEAT_EXPIRATION = 30 2 | WORKER_HEART_BEAT_INTERVAL = 15 3 | 4 | LOGDIR = "."
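A hedged sketch of how a hyperpyyaml config such as vocab_16K.yaml above can be materialized, mirroring the load_hyperpyyaml call in cosyvoice/cli/cosyvoice.py; the relative path and the assumption that this particular YAML is loaded the same way are assumptions, not stated by the repo:

from hyperpyyaml import load_hyperpyyaml

with open('cosyvoice/vocab_16K.yaml', 'r') as f:
    configs = load_hyperpyyaml(f)   # resolves the !apply:, !new: and !ref tags at load time

flow = configs['flow']              # instantiated MaskedDiffWithXvec flow-matching module
hift = configs['hift']              # instantiated HiFTGenerator vocoder
print(configs['sample_rate'], type(flow).__name__, type(hift).__name__)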
5 | 6 | # Model Constants 7 | IGNORE_INDEX = -100 8 | IMAGE_TOKEN_INDEX = -200 9 | SPEECH_TOKEN_INDEX = -300 10 | DEFAULT_IMAGE_TOKEN = "<image>" 11 | DEFAULT_SPEECH_TOKEN = "<speech>" 12 | -------------------------------------------------------------------------------- /openomni/eval/eval_gpt_review.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | 5 | import openai 6 | import tqdm 7 | import ray 8 | import time 9 | 10 | NUM_SECONDS_TO_SLEEP = 3 11 | 12 | @ray.remote(num_cpus=4) 13 | def get_eval(content: str, max_tokens: int): 14 | while True: 15 | try: 16 | response = openai.ChatCompletion.create( 17 | model='gpt-4', 18 | messages=[{ 19 | 'role': 'system', 20 | 'content': 'You are a helpful and precise assistant for checking the quality of the answer.' 21 | }, { 22 | 'role': 'user', 23 | 'content': content, 24 | }], 25 | temperature=0.2, # TODO: figure out which temperature is best for evaluation 26 | max_tokens=max_tokens, 27 | ) 28 | break 29 | except openai.error.RateLimitError: 30 | pass 31 | except Exception as e: 32 | print(e) 33 | time.sleep(NUM_SECONDS_TO_SLEEP) 34 | 35 | print('success!') 36 | return response['choices'][0]['message']['content'] 37 | 38 | 39 | def parse_score(review): 40 | try: 41 | score_pair = review.split('\n')[0] 42 | score_pair = score_pair.replace(',', ' ') 43 | sp = score_pair.split(' ') 44 | if len(sp) == 2: 45 | return [float(sp[0]), float(sp[1])] 46 | else: 47 | print('error', review) 48 | return [-1, -1] 49 | except Exception as e: 50 | print(e) 51 | print('error', review) 52 | return [-1, -1] 53 | 54 | 55 | if __name__ == '__main__': 56 | parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.') 57 | parser.add_argument('-q', '--question') 58 | # parser.add_argument('-a', '--answer') 59 | parser.add_argument('-a', '--answer-list', nargs='+', default=[]) 60 | parser.add_argument('-r', '--rule') 61 | parser.add_argument('-o', '--output') 62 | parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output') 63 | args = parser.parse_args() 64 | 65 | ray.init() 66 | 67 | f_q = open(os.path.expanduser(args.question)) 68 | f_ans1 = open(os.path.expanduser(args.answer_list[0])) 69 | f_ans2 = open(os.path.expanduser(args.answer_list[1])) 70 | rule_dict = json.load(open(os.path.expanduser(args.rule), 'r')) 71 | 72 | review_file = open(f'{args.output}', 'w') 73 | 74 | js_list = [] 75 | handles = [] 76 | idx = 0 77 | for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2): 78 | # if idx == 1: 79 | # break 80 | 81 | ques = json.loads(ques_js) 82 | ans1 = json.loads(ans1_js) 83 | ans2 = json.loads(ans2_js) 84 | 85 | category = json.loads(ques_js)['category'] 86 | if category in rule_dict: 87 | rule = rule_dict[category] 88 | else: 89 | rule = rule_dict['default'] 90 | prompt = rule['prompt'] 91 | role = rule['role'] 92 | content = (f'[Question]\n{ques["text"]}\n\n' 93 | f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n' 94 | f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n' 95 | f'[System]\n{prompt}\n\n') 96 | js_list.append({ 97 | 'id': idx+1, 98 | 'question_id': ques['question_id'], 99 | 'answer1_id': ans1['answer_id'], 100 | 'answer2_id': ans2['answer_id'], 101 | 'category': category}) 102 | idx += 1 103 | handles.append(get_eval.remote(content, args.max_tokens)) 104 | # To avoid the rate limit set by OpenAI 105 | time.sleep(NUM_SECONDS_TO_SLEEP) 106 | 107 | reviews = ray.get(handles) 108 | for idx, review in 
enumerate(reviews): 109 | scores = parse_score(review) 110 | js_list[idx]['content'] = review 111 | js_list[idx]['tuple'] = scores 112 | review_file.write(json.dumps(js_list[idx]) + '\n') 113 | review_file.close() 114 | -------------------------------------------------------------------------------- /openomni/eval/eval_gpt_review_bench.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | 5 | import openai 6 | import time 7 | 8 | NUM_SECONDS_TO_SLEEP = 0.5 9 | 10 | 11 | def get_eval(content: str, max_tokens: int): 12 | while True: 13 | try: 14 | response = openai.ChatCompletion.create( 15 | model='gpt-4-0314', 16 | messages=[{ 17 | 'role': 'system', 18 | 'content': 'You are a helpful and precise assistant for checking the quality of the answer.' 19 | }, { 20 | 'role': 'user', 21 | 'content': content, 22 | }], 23 | temperature=0.2, # TODO: figure out which temperature is best for evaluation 24 | max_tokens=max_tokens, 25 | ) 26 | break 27 | except openai.error.RateLimitError: 28 | pass 29 | except Exception as e: 30 | print(e) 31 | time.sleep(NUM_SECONDS_TO_SLEEP) 32 | 33 | return response['choices'][0]['message']['content'] 34 | 35 | 36 | def parse_score(review): 37 | try: 38 | score_pair = review.split('\n')[0] 39 | score_pair = score_pair.replace(',', ' ') 40 | sp = score_pair.split(' ') 41 | if len(sp) == 2: 42 | return [float(sp[0]), float(sp[1])] 43 | else: 44 | print('error', review) 45 | return [-1, -1] 46 | except Exception as e: 47 | print(e) 48 | print('error', review) 49 | return [-1, -1] 50 | 51 | 52 | if __name__ == '__main__': 53 | parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.') 54 | parser.add_argument('-q', '--question') 55 | parser.add_argument('-c', '--context') 56 | parser.add_argument('-a', '--answer-list', nargs='+', default=[]) 57 | parser.add_argument('-r', '--rule') 58 | parser.add_argument('-o', '--output') 59 | parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output') 60 | args = parser.parse_args() 61 | 62 | f_q = open(os.path.expanduser(args.question)) 63 | f_ans1 = open(os.path.expanduser(args.answer_list[0])) 64 | f_ans2 = open(os.path.expanduser(args.answer_list[1])) 65 | rule_dict = json.load(open(os.path.expanduser(args.rule), 'r')) 66 | 67 | if os.path.isfile(os.path.expanduser(args.output)): 68 | cur_reviews = [json.loads(line) for line in open(os.path.expanduser(args.output))] 69 | else: 70 | cur_reviews = [] 71 | 72 | review_file = open(f'{args.output}', 'a') 73 | 74 | context_list = [json.loads(line) for line in open(os.path.expanduser(args.context))] 75 | image_to_context = {context['image']: context for context in context_list} 76 | 77 | handles = [] 78 | idx = 0 79 | for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2): 80 | ques = json.loads(ques_js) 81 | ans1 = json.loads(ans1_js) 82 | ans2 = json.loads(ans2_js) 83 | 84 | inst = image_to_context[ques['image']] 85 | 86 | if isinstance(inst['caption'], list): 87 | cap_str = '\n'.join(inst['caption']) 88 | else: 89 | cap_str = inst['caption'] 90 | 91 | category = 'llava_bench_' + json.loads(ques_js)['category'] 92 | if category in rule_dict: 93 | rule = rule_dict[category] 94 | else: 95 | assert False, f"Visual QA category not found in rule file: {category}." 
96 | prompt = rule['prompt'] 97 | role = rule['role'] 98 | content = (f'[Context]\n{cap_str}\n\n' 99 | f'[Question]\n{ques["text"]}\n\n' 100 | f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n' 101 | f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n' 102 | f'[System]\n{prompt}\n\n') 103 | cur_js = { 104 | 'id': idx+1, 105 | 'question_id': ques['question_id'], 106 | 'answer1_id': ans1.get('answer_id', ans1['question_id']), 107 | 'answer2_id': ans2.get('answer_id', ans2['answer_id']), 108 | 'category': category 109 | } 110 | if idx >= len(cur_reviews): 111 | review = get_eval(content, args.max_tokens) 112 | scores = parse_score(review) 113 | cur_js['content'] = review 114 | cur_js['tuple'] = scores 115 | review_file.write(json.dumps(cur_js) + '\n') 116 | review_file.flush() 117 | else: 118 | print(f'Skipping {idx} as we already have it.') 119 | idx += 1 120 | print(idx) 121 | review_file.close() 122 | -------------------------------------------------------------------------------- /openomni/eval/eval_gpt_review_visual.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | 5 | import openai 6 | import time 7 | 8 | NUM_SECONDS_TO_SLEEP = 0.5 9 | 10 | 11 | def get_eval(content: str, max_tokens: int): 12 | while True: 13 | try: 14 | response = openai.ChatCompletion.create( 15 | model='gpt-4-0314', 16 | messages=[{ 17 | 'role': 'system', 18 | 'content': 'You are a helpful and precise assistant for checking the quality of the answer.' 19 | }, { 20 | 'role': 'user', 21 | 'content': content, 22 | }], 23 | temperature=0.2, # TODO: figure out which temperature is best for evaluation 24 | max_tokens=max_tokens, 25 | ) 26 | break 27 | except openai.error.RateLimitError: 28 | pass 29 | except Exception as e: 30 | print(e) 31 | time.sleep(NUM_SECONDS_TO_SLEEP) 32 | 33 | return response['choices'][0]['message']['content'] 34 | 35 | 36 | def parse_score(review): 37 | try: 38 | score_pair = review.split('\n')[0] 39 | score_pair = score_pair.replace(',', ' ') 40 | sp = score_pair.split(' ') 41 | if len(sp) == 2: 42 | return [float(sp[0]), float(sp[1])] 43 | else: 44 | print('error', review) 45 | return [-1, -1] 46 | except Exception as e: 47 | print(e) 48 | print('error', review) 49 | return [-1, -1] 50 | 51 | 52 | if __name__ == '__main__': 53 | parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.') 54 | parser.add_argument('-q', '--question') 55 | parser.add_argument('-c', '--context') 56 | parser.add_argument('-a', '--answer-list', nargs='+', default=[]) 57 | parser.add_argument('-r', '--rule') 58 | parser.add_argument('-o', '--output') 59 | parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output') 60 | args = parser.parse_args() 61 | 62 | f_q = open(os.path.expanduser(args.question)) 63 | f_ans1 = open(os.path.expanduser(args.answer_list[0])) 64 | f_ans2 = open(os.path.expanduser(args.answer_list[1])) 65 | rule_dict = json.load(open(os.path.expanduser(args.rule), 'r')) 66 | 67 | if os.path.isfile(os.path.expanduser(args.output)): 68 | cur_reviews = [json.loads(line) for line in open(os.path.expanduser(args.output))] 69 | else: 70 | cur_reviews = [] 71 | 72 | review_file = open(f'{args.output}', 'a') 73 | 74 | context_list = [json.loads(line) for line in open(os.path.expanduser(args.context))] 75 | image_to_context = {context['image']: context for context in context_list} 76 | 77 | handles = [] 78 | idx = 0 79 | for ques_js, ans1_js, 
ans2_js in zip(f_q, f_ans1, f_ans2): 80 | ques = json.loads(ques_js) 81 | ans1 = json.loads(ans1_js) 82 | ans2 = json.loads(ans2_js) 83 | 84 | inst = image_to_context[ques['image']] 85 | cap_str = '\n'.join(inst['captions']) 86 | box_str = '\n'.join([f'{instance["category"]}: {instance["bbox"]}' for instance in inst['instances']]) 87 | 88 | category = json.loads(ques_js)['category'] 89 | if category in rule_dict: 90 | rule = rule_dict[category] 91 | else: 92 | assert False, f"Visual QA category not found in rule file: {category}." 93 | prompt = rule['prompt'] 94 | role = rule['role'] 95 | content = (f'[Context]\n{cap_str}\n\n{box_str}\n\n' 96 | f'[Question]\n{ques["text"]}\n\n' 97 | f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n' 98 | f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n' 99 | f'[System]\n{prompt}\n\n') 100 | cur_js = { 101 | 'id': idx+1, 102 | 'question_id': ques['question_id'], 103 | 'answer1_id': ans1.get('answer_id', ans1['question_id']), 104 | 'answer2_id': ans2.get('answer_id', ans2['answer_id']), 105 | 'category': category 106 | } 107 | if idx >= len(cur_reviews): 108 | review = get_eval(content, args.max_tokens) 109 | scores = parse_score(review) 110 | cur_js['content'] = review 111 | cur_js['tuple'] = scores 112 | review_file.write(json.dumps(cur_js) + '\n') 113 | review_file.flush() 114 | else: 115 | print(f'Skipping {idx} as we already have it.') 116 | idx += 1 117 | print(idx) 118 | review_file.close() 119 | -------------------------------------------------------------------------------- /openomni/eval/eval_pope.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | def eval_pope(answers, label_file): 6 | label_list = [json.loads(q)['label'] for q in open(label_file, 'r')] 7 | 8 | for answer in answers: 9 | text = answer['text'] 10 | 11 | # Only keep the first sentence 12 | if text.find('.') != -1: 13 | text = text.split('.')[0] 14 | 15 | text = text.replace(',', '') 16 | words = text.split(' ') 17 | if 'No' in words or 'not' in words or 'no' in words: 18 | answer['text'] = 'no' 19 | else: 20 | answer['text'] = 'yes' 21 | 22 | for i in range(len(label_list)): 23 | if label_list[i] == 'no': 24 | label_list[i] = 0 25 | else: 26 | label_list[i] = 1 27 | 28 | pred_list = [] 29 | for answer in answers: 30 | if answer['text'] == 'no': 31 | pred_list.append(0) 32 | else: 33 | pred_list.append(1) 34 | 35 | pos = 1 36 | neg = 0 37 | yes_ratio = pred_list.count(1) / len(pred_list) 38 | 39 | TP, TN, FP, FN = 0, 0, 0, 0 40 | for pred, label in zip(pred_list, label_list): 41 | if pred == pos and label == pos: 42 | TP += 1 43 | elif pred == pos and label == neg: 44 | FP += 1 45 | elif pred == neg and label == neg: 46 | TN += 1 47 | elif pred == neg and label == pos: 48 | FN += 1 49 | 50 | print('TP\tFP\tTN\tFN\t') 51 | print('{}\t{}\t{}\t{}'.format(TP, FP, TN, FN)) 52 | 53 | precision = float(TP) / float(TP + FP) 54 | recall = float(TP) / float(TP + FN) 55 | f1 = 2*precision*recall / (precision + recall) 56 | acc = (TP + TN) / (TP + TN + FP + FN) 57 | print('Accuracy: {}'.format(acc)) 58 | print('Precision: {}'.format(precision)) 59 | print('Recall: {}'.format(recall)) 60 | print('F1 score: {}'.format(f1)) 61 | print('Yes ratio: {}'.format(yes_ratio)) 62 | print('%.3f, %.3f, %.3f, %.3f, %.3f' % (f1, acc, precision, recall, yes_ratio) ) 63 | 64 | if __name__ == "__main__": 65 | parser = argparse.ArgumentParser() 66 | parser.add_argument("--annotation-dir", type=str) 67 | 
parser.add_argument("--question-file", type=str) 68 | parser.add_argument("--result-file", type=str) 69 | args = parser.parse_args() 70 | 71 | questions = [json.loads(line) for line in open(args.question_file)] 72 | questions = {question['question_id']: question for question in questions} 73 | answers = [json.loads(q) for q in open(args.result_file)] 74 | for file in os.listdir(args.annotation_dir): 75 | assert file.startswith('coco_pope_') 76 | assert file.endswith('.json') 77 | category = file[10:-5] 78 | cur_answers = [x for x in answers if questions[x['question_id']]['category'] == category] 79 | print('Category: {}, # samples: {}'.format(category, len(cur_answers))) 80 | eval_pope(cur_answers, os.path.join(args.annotation_dir, file)) 81 | print("====================================") 82 | -------------------------------------------------------------------------------- /openomni/eval/eval_science_qa.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | import re 5 | import random 6 | 7 | 8 | def get_args(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--base-dir', type=str) 11 | parser.add_argument('--result-file', type=str) 12 | parser.add_argument('--output-file', type=str) 13 | parser.add_argument('--output-result', type=str) 14 | parser.add_argument('--split', type=str, default='test') 15 | parser.add_argument('--options', type=list, default=["A", "B", "C", "D", "E"]) 16 | return parser.parse_args() 17 | 18 | 19 | def convert_caps(results): 20 | fakecaps = [] 21 | for result in results: 22 | image_id = result['question_id'] 23 | caption = result['text'] 24 | fakecaps.append({"image_id": int(image_id), "caption": caption}) 25 | return fakecaps 26 | 27 | 28 | def get_pred_idx(prediction, choices, options): 29 | """ 30 | Get the index (e.g. 2) from the prediction (e.g. 'C') 31 | """ 32 | if prediction in options[:len(choices)]: 33 | return options.index(prediction) 34 | else: 35 | return -1 36 | return random.choice(range(len(choices))) 37 | 38 | 39 | if __name__ == "__main__": 40 | args = get_args() 41 | 42 | base_dir = args.base_dir 43 | split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[args.split] 44 | problems = json.load(open(os.path.join(base_dir, "problems.json"))) 45 | predictions = [json.loads(line) for line in open(args.result_file)] 46 | predictions = {pred['question_id']: pred for pred in predictions} 47 | split_problems = {idx: problems[idx] for idx in split_indices} 48 | 49 | results = {'correct': [], 'incorrect': []} 50 | sqa_results = {} 51 | sqa_results['acc'] = None 52 | sqa_results['correct'] = None 53 | sqa_results['count'] = None 54 | sqa_results['results'] = {} 55 | sqa_results['outputs'] = {} 56 | 57 | for prob_id, prob in split_problems.items(): 58 | if prob_id not in predictions: 59 | pred = {'text': 'FAILED', 'prompt': 'Unknown'} 60 | pred_text = 'FAILED' 61 | else: 62 | pred = predictions[prob_id] 63 | pred_text = pred['text'] 64 | 65 | if pred_text in args.options: 66 | answer = pred_text 67 | elif len(pred_text) >= 3 and pred_text[0] in args.options and pred_text[1:3] == ". ": 68 | answer = pred_text[0] 69 | else: 70 | pattern = re.compile(r'The answer is ([A-Z]).') 71 | res = pattern.findall(pred_text) 72 | if len(res) == 1: 73 | answer = res[0] # 'A', 'B', ... 
74 | else: 75 | answer = "FAILED" 76 | 77 | pred_idx = get_pred_idx(answer, prob['choices'], args.options) 78 | 79 | analysis = { 80 | 'question_id': prob_id, 81 | 'parsed_ans': answer, 82 | 'ground_truth': args.options[prob['answer']], 83 | 'question': pred['prompt'], 84 | 'pred': pred_text, 85 | 'is_multimodal': '' in pred['prompt'], 86 | } 87 | 88 | sqa_results['results'][prob_id] = get_pred_idx(answer, prob['choices'], args.options) 89 | sqa_results['outputs'][prob_id] = pred_text 90 | 91 | if pred_idx == prob['answer']: 92 | results['correct'].append(analysis) 93 | else: 94 | results['incorrect'].append(analysis) 95 | 96 | correct = len(results['correct']) 97 | total = len(results['correct']) + len(results['incorrect']) 98 | 99 | ###### IMG ###### 100 | multimodal_correct = len([x for x in results['correct'] if x['is_multimodal']]) 101 | multimodal_incorrect = len([x for x in results['incorrect'] if x['is_multimodal']]) 102 | multimodal_total = multimodal_correct + multimodal_incorrect 103 | ###### IMG ###### 104 | 105 | print(f'Total: {total}, Correct: {correct}, Accuracy: {correct / total * 100:.2f}%, IMG-Accuracy: {multimodal_correct / multimodal_total * 100:.2f}%') 106 | 107 | sqa_results['acc'] = correct / total * 100 108 | sqa_results['correct'] = correct 109 | sqa_results['count'] = total 110 | 111 | with open(args.output_file, 'w') as f: 112 | json.dump(results, f, indent=2) 113 | with open(args.output_result, 'w') as f: 114 | json.dump(sqa_results, f, indent=2) 115 | -------------------------------------------------------------------------------- /openomni/eval/eval_science_qa_gpt4.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | import re 5 | import random 6 | from collections import defaultdict 7 | 8 | 9 | def get_args(): 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--base-dir', type=str) 12 | parser.add_argument('--gpt4-result', type=str) 13 | parser.add_argument('--our-result', type=str) 14 | parser.add_argument('--split', type=str, default='test') 15 | parser.add_argument('--options', type=list, default=["A", "B", "C", "D", "E"]) 16 | return parser.parse_args() 17 | 18 | 19 | def convert_caps(results): 20 | fakecaps = [] 21 | for result in results: 22 | image_id = result['question_id'] 23 | caption = result['text'] 24 | fakecaps.append({"image_id": int(image_id), "caption": caption}) 25 | return fakecaps 26 | 27 | 28 | def get_pred_idx(prediction, choices, options): 29 | """ 30 | Get the index (e.g. 2) from the prediction (e.g. 
'C') 31 | """ 32 | if prediction in options[:len(choices)]: 33 | return options.index(prediction) 34 | else: 35 | return random.choice(range(len(choices))) 36 | 37 | 38 | if __name__ == "__main__": 39 | args = get_args() 40 | 41 | base_dir = args.base_dir 42 | split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[args.split] 43 | problems = json.load(open(os.path.join(base_dir, "problems.json"))) 44 | our_predictions = [json.loads(line) for line in open(args.our_result)] 45 | our_predictions = {pred['question_id']: pred for pred in our_predictions} 46 | split_problems = {idx: problems[idx] for idx in split_indices} 47 | 48 | gpt4_predictions = json.load(open(args.gpt4_result))['outputs'] 49 | 50 | results = defaultdict(lambda: 0) 51 | 52 | for prob_id, prob in split_problems.items(): 53 | if prob_id not in our_predictions: 54 | continue 55 | if prob_id not in gpt4_predictions: 56 | continue 57 | our_pred = our_predictions[prob_id]['text'] 58 | gpt4_pred = gpt4_predictions[prob_id] 59 | 60 | pattern = re.compile(r'The answer is ([A-Z]).') 61 | our_res = pattern.findall(our_pred) 62 | if len(our_res) == 1: 63 | our_answer = our_res[0] # 'A', 'B', ... 64 | else: 65 | our_answer = "FAILED" 66 | gpt4_res = pattern.findall(gpt4_pred) 67 | if len(gpt4_res) == 1: 68 | gpt4_answer = gpt4_res[0] # 'A', 'B', ... 69 | else: 70 | gpt4_answer = "FAILED" 71 | 72 | our_pred_idx = get_pred_idx(our_answer, prob['choices'], args.options) 73 | gpt4_pred_idx = get_pred_idx(gpt4_answer, prob['choices'], args.options) 74 | 75 | if gpt4_answer == 'FAILED': 76 | results['gpt4_failed'] += 1 77 | # continue 78 | gpt4_pred_idx = our_pred_idx 79 | # if our_pred_idx != prob['answer']: 80 | # print(our_predictions[prob_id]['prompt']) 81 | # print('-----------------') 82 | # print(f'LECTURE: {prob["lecture"]}') 83 | # print(f'SOLUTION: {prob["solution"]}') 84 | # print('=====================') 85 | else: 86 | # continue 87 | pass 88 | # gpt4_pred_idx = our_pred_idx 89 | 90 | if gpt4_pred_idx == prob['answer']: 91 | results['correct'] += 1 92 | else: 93 | results['incorrect'] += 1 94 | 95 | 96 | if gpt4_pred_idx == prob['answer'] or our_pred_idx == prob['answer']: 97 | results['correct_upperbound'] += 1 98 | 99 | correct = results['correct'] 100 | total = results['correct'] + results['incorrect'] 101 | print(f'Total: {total}, Correct: {correct}, Accuracy: {correct / total * 100:.2f}%') 102 | print(f'Total: {total}, Correct (upper): {results["correct_upperbound"]}, Accuracy: {results["correct_upperbound"] / total * 100:.2f}%') 103 | print(f'Total: {total}, GPT-4 NO-ANS (RANDOM): {results["gpt4_failed"]}, Percentage: {results["gpt4_failed"] / total * 100:.2f}%') 104 | 105 | -------------------------------------------------------------------------------- /openomni/eval/eval_textvqa.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import json 4 | import re 5 | 6 | from llava.eval.m4c_evaluator import TextVQAAccuracyEvaluator 7 | 8 | 9 | def get_args(): 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--annotation-file', type=str,default="datasets/textvqa/TextVQA_0.5.1_val.json") 12 | parser.add_argument('--result-file', type=str,default="answers/qwen2_evol_textvqa_prediction.jsonl") 13 | parser.add_argument('--result-dir', type=str) 14 | return parser.parse_args() 15 | 16 | 17 | def prompt_processor(prompt): 18 | if prompt.startswith('OCR tokens: '): 19 | pattern = r"Question: (.*?) 
Short answer:" 20 | match = re.search(pattern, prompt, re.DOTALL) 21 | question = match.group(1) 22 | elif 'Reference OCR token: ' in prompt and len(prompt.split('\n')) == 3: 23 | if prompt.startswith('Reference OCR token:'): 24 | question = prompt.split('\n')[1] 25 | else: 26 | question = prompt.split('\n')[0] 27 | elif len(prompt.split('\n')) == 2: 28 | question = prompt.split('\n')[0] 29 | else: 30 | assert False 31 | 32 | return question.lower() 33 | 34 | 35 | def eval_single(annotation_file, result_file): 36 | experiment_name = os.path.splitext(os.path.basename(result_file))[0] 37 | print(experiment_name) 38 | annotations = json.load(open(annotation_file))['data'] 39 | annotations = {(annotation['image_id'], annotation['question'].lower()): annotation for annotation in annotations} 40 | results = [json.loads(line) for line in open(result_file)] 41 | 42 | pred_list = [] 43 | for result in results: 44 | annotation = annotations[(result['question_id'], prompt_processor(result['prompt']))] 45 | pred_list.append({ 46 | "pred_answer": result['text'], 47 | "gt_answers": annotation['answers'], 48 | }) 49 | 50 | evaluator = TextVQAAccuracyEvaluator() 51 | print('Samples: {}\nAccuracy: {:.2f}%\n'.format(len(pred_list), 100. * evaluator.eval_pred_list(pred_list))) 52 | 53 | 54 | if __name__ == "__main__": 55 | args = get_args() 56 | 57 | if args.result_file is not None: 58 | eval_single(args.annotation_file, args.result_file) 59 | 60 | if args.result_dir is not None: 61 | for result_file in sorted(os.listdir(args.result_dir)): 62 | if not result_file.endswith('.jsonl'): 63 | print(f'Skipping {result_file}') 64 | continue 65 | eval_single(args.annotation_file, os.path.join(args.result_dir, result_file)) 66 | -------------------------------------------------------------------------------- /openomni/eval/generate_webpage_data_from_table.py: -------------------------------------------------------------------------------- 1 | """Generate json file for webpage.""" 2 | import json 3 | import os 4 | import re 5 | 6 | # models = ['llama', 'alpaca', 'gpt35', 'bard'] 7 | models = ['vicuna'] 8 | 9 | 10 | def read_jsonl(path: str, key: str=None): 11 | data = [] 12 | with open(os.path.expanduser(path)) as f: 13 | for line in f: 14 | if not line: 15 | continue 16 | data.append(json.loads(line)) 17 | if key is not None: 18 | data.sort(key=lambda x: x[key]) 19 | data = {item[key]: item for item in data} 20 | return data 21 | 22 | 23 | def trim_hanging_lines(s: str, n: int) -> str: 24 | s = s.strip() 25 | for _ in range(n): 26 | s = s.split('\n', 1)[1].strip() 27 | return s 28 | 29 | 30 | if __name__ == '__main__': 31 | questions = read_jsonl('table/question.jsonl', key='question_id') 32 | 33 | # alpaca_answers = read_jsonl('table/answer/answer_alpaca-13b.jsonl', key='question_id') 34 | # bard_answers = read_jsonl('table/answer/answer_bard.jsonl', key='question_id') 35 | # gpt35_answers = read_jsonl('table/answer/answer_gpt35.jsonl', key='question_id') 36 | # llama_answers = read_jsonl('table/answer/answer_llama-13b.jsonl', key='question_id') 37 | vicuna_answers = read_jsonl('table/answer/answer_vicuna-13b.jsonl', key='question_id') 38 | ours_answers = read_jsonl('table/results/llama-13b-hf-alpaca.jsonl', key='question_id') 39 | 40 | review_vicuna = read_jsonl('table/review/review_vicuna-13b_llama-13b-hf-alpaca.jsonl', key='question_id') 41 | # review_alpaca = read_jsonl('table/review/review_alpaca-13b_vicuna-13b.jsonl', key='question_id') 42 | # review_bard = 
read_jsonl('table/review/review_bard_vicuna-13b.jsonl', key='question_id') 43 | # review_gpt35 = read_jsonl('table/review/review_gpt35_vicuna-13b.jsonl', key='question_id') 44 | # review_llama = read_jsonl('table/review/review_llama-13b_vicuna-13b.jsonl', key='question_id') 45 | 46 | records = [] 47 | for qid in questions.keys(): 48 | r = { 49 | 'id': qid, 50 | 'category': questions[qid]['category'], 51 | 'question': questions[qid]['text'], 52 | 'answers': { 53 | # 'alpaca': alpaca_answers[qid]['text'], 54 | # 'llama': llama_answers[qid]['text'], 55 | # 'bard': bard_answers[qid]['text'], 56 | # 'gpt35': gpt35_answers[qid]['text'], 57 | 'vicuna': vicuna_answers[qid]['text'], 58 | 'ours': ours_answers[qid]['text'], 59 | }, 60 | 'evaluations': { 61 | # 'alpaca': review_alpaca[qid]['text'], 62 | # 'llama': review_llama[qid]['text'], 63 | # 'bard': review_bard[qid]['text'], 64 | 'vicuna': review_vicuna[qid]['content'], 65 | # 'gpt35': review_gpt35[qid]['text'], 66 | }, 67 | 'scores': { 68 | 'vicuna': review_vicuna[qid]['tuple'], 69 | # 'alpaca': review_alpaca[qid]['score'], 70 | # 'llama': review_llama[qid]['score'], 71 | # 'bard': review_bard[qid]['score'], 72 | # 'gpt35': review_gpt35[qid]['score'], 73 | }, 74 | } 75 | 76 | # cleanup data 77 | cleaned_evals = {} 78 | for k, v in r['evaluations'].items(): 79 | v = v.strip() 80 | lines = v.split('\n') 81 | # trim the first line if it's a pair of numbers 82 | if re.match(r'\d+[, ]+\d+', lines[0]): 83 | lines = lines[1:] 84 | v = '\n'.join(lines) 85 | cleaned_evals[k] = v.replace('Assistant 1', "**Assistant 1**").replace('Assistant 2', '**Assistant 2**') 86 | 87 | r['evaluations'] = cleaned_evals 88 | records.append(r) 89 | 90 | # Reorder the records, this is optional 91 | for r in records: 92 | if r['id'] <= 20: 93 | r['id'] += 60 94 | else: 95 | r['id'] -= 20 96 | for r in records: 97 | if r['id'] <= 50: 98 | r['id'] += 10 99 | elif 50 < r['id'] <= 60: 100 | r['id'] -= 50 101 | for r in records: 102 | if r['id'] == 7: 103 | r['id'] = 1 104 | elif r['id'] < 7: 105 | r['id'] += 1 106 | 107 | records.sort(key=lambda x: x['id']) 108 | 109 | # Write to file 110 | with open('webpage/data.json', 'w') as f: 111 | json.dump({'questions': records, 'models': models}, f, indent=2) 112 | -------------------------------------------------------------------------------- /openomni/eval/model_qa.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | 5 | import shortuuid 6 | import torch 7 | from tqdm import tqdm 8 | from transformers import AutoModelForCausalLM, AutoTokenizer 9 | 10 | from llava.conversation import default_conversation 11 | from llava.utils import disable_torch_init 12 | 13 | 14 | @torch.inference_mode() 15 | def eval_model(model_name, questions_file, answers_file): 16 | # Model 17 | disable_torch_init() 18 | model_name = os.path.expanduser(model_name) 19 | tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False) 20 | model = AutoModelForCausalLM.from_pretrained(model_name, 21 | torch_dtype=torch.float16).cuda() 22 | 23 | ques_file = open(os.path.expanduser(questions_file), "r") 24 | ans_file = open(os.path.expanduser(answers_file), "w") 25 | for i, line in enumerate(tqdm(ques_file)): 26 | idx = json.loads(line)["question_id"] 27 | qs = json.loads(line)["text"] 28 | cat = json.loads(line)["category"] 29 | conv = default_conversation.copy() 30 | conv.append_message(conv.roles[0], qs) 31 | prompt = conv.get_prompt() 32 | inputs = tokenizer([prompt]) 33 
| input_ids = torch.as_tensor(inputs.input_ids).cuda() 34 | output_ids = model.generate( 35 | input_ids, 36 | do_sample=True, 37 | use_cache=True, 38 | temperature=0.7, 39 | max_new_tokens=1024,) 40 | outputs = tokenizer.batch_decode( 41 | output_ids, skip_special_tokens=True)[0] 42 | try: 43 | index = outputs.index(conv.sep, len(prompt)) 44 | except ValueError: 45 | outputs += conv.sep 46 | index = outputs.index(conv.sep, len(prompt)) 47 | 48 | outputs = outputs[len(prompt) + len(conv.roles[1]) + 2:index].strip() 49 | ans_id = shortuuid.uuid() 50 | ans_file.write(json.dumps({"question_id": idx, 51 | "text": outputs, 52 | "answer_id": ans_id, 53 | "model_id": model_name, 54 | "metadata": {}}) + "\n") 55 | ans_file.flush() 56 | ans_file.close() 57 | 58 | 59 | if __name__ == "__main__": 60 | parser = argparse.ArgumentParser() 61 | parser.add_argument("--model-name", type=str, default="facebook/opt-350m") 62 | parser.add_argument("--question-file", type=str, 63 | default="tables/question.jsonl") 64 | parser.add_argument("--answers-file", type=str, default="answer.jsonl") 65 | args = parser.parse_args() 66 | 67 | eval_model(args.model_name, args.question_file, args.answers_file) 68 | -------------------------------------------------------------------------------- /openomni/eval/qa_baseline_gpt35.py: -------------------------------------------------------------------------------- 1 | """Generate answers with GPT-3.5""" 2 | # Note: you need to be using OpenAI Python v0.27.0 for the code below to work 3 | import argparse 4 | import json 5 | import os 6 | import time 7 | import concurrent.futures 8 | 9 | import openai 10 | import tqdm 11 | import shortuuid 12 | 13 | MODEL = 'gpt-3.5-turbo' 14 | MODEL_ID = 'gpt-3.5-turbo:20230327' 15 | 16 | def get_answer(question_id: int, question: str, max_tokens: int): 17 | ans = { 18 | 'answer_id': shortuuid.uuid(), 19 | 'question_id': question_id, 20 | 'model_id': MODEL_ID, 21 | } 22 | for _ in range(3): 23 | try: 24 | response = openai.ChatCompletion.create( 25 | model=MODEL, 26 | messages=[{ 27 | 'role': 'system', 28 | 'content': 'You are a helpful assistant.' 
29 | }, { 30 | 'role': 'user', 31 | 'content': question, 32 | }], 33 | max_tokens=max_tokens, 34 | ) 35 | ans['text'] = response['choices'][0]['message']['content'] 36 | return ans 37 | except Exception as e: 38 | print('[ERROR]', e) 39 | ans['text'] = '#ERROR#' 40 | time.sleep(1) 41 | return ans 42 | 43 | 44 | if __name__ == '__main__': 45 | parser = argparse.ArgumentParser(description='ChatGPT answer generation.') 46 | parser.add_argument('-q', '--question') 47 | parser.add_argument('-o', '--output') 48 | parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output') 49 | args = parser.parse_args() 50 | 51 | questions_dict = {} 52 | with open(os.path.expanduser(args.question)) as f: 53 | for line in f: 54 | if not line: 55 | continue 56 | q = json.loads(line) 57 | questions_dict[q['question_id']] = q['text'] 58 | 59 | answers = [] 60 | 61 | with concurrent.futures.ThreadPoolExecutor(max_workers=32) as executor: 62 | futures = [] 63 | for qid, question in questions_dict.items(): 64 | future = executor.submit(get_answer, qid, question, args.max_tokens) 65 | futures.append(future) 66 | 67 | for future in tqdm.tqdm(concurrent.futures.as_completed(futures), total=len(futures)): 68 | answers.append(future.result()) 69 | 70 | answers.sort(key=lambda x: x['question_id']) 71 | 72 | with open(os.path.expanduser(args.output), 'w') as f: 73 | table = [json.dumps(ans) for ans in answers] 74 | f.write('\n'.join(table)) 75 | -------------------------------------------------------------------------------- /openomni/eval/summarize_gpt_review.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from collections import defaultdict 4 | 5 | import numpy as np 6 | 7 | import argparse 8 | 9 | def parse_args(): 10 | parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.') 11 | parser.add_argument('-d', '--dir', default=None) 12 | parser.add_argument('-v', '--version', default=None) 13 | parser.add_argument('-s', '--select', nargs='*', default=None) 14 | parser.add_argument('-f', '--files', nargs='*', default=[]) 15 | parser.add_argument('-i', '--ignore', nargs='*', default=[]) 16 | return parser.parse_args() 17 | 18 | 19 | if __name__ == '__main__': 20 | args = parse_args() 21 | 22 | if args.ignore is not None: 23 | args.ignore = [int(x) for x in args.ignore] 24 | 25 | if len(args.files) > 0: 26 | review_files = args.files 27 | else: 28 | review_files = [x for x in os.listdir(args.dir) if x.endswith('.jsonl') and (x.startswith('gpt4_text') or x.startswith('reviews_') or x.startswith('review_') or 'review' in args.dir)] 29 | 30 | for review_file in sorted(review_files): 31 | config = os.path.basename(review_file).replace('gpt4_text_', '').replace('.jsonl', '') 32 | if args.select is not None and any(x not in config for x in args.select): 33 | continue 34 | if '0613' in config: 35 | version = '0613' 36 | else: 37 | version = '0314' 38 | if args.version is not None and args.version != version: 39 | continue 40 | scores = defaultdict(list) 41 | print(config) 42 | with open(os.path.join(args.dir, review_file) if args.dir is not None else review_file) as f: 43 | for review_str in f: 44 | review = json.loads(review_str) 45 | if review['question_id'] in args.ignore: 46 | continue 47 | if 'category' in review: 48 | scores[review['category']].append(review['tuple']) 49 | scores['all'].append(review['tuple']) 50 | else: 51 | if 'tuple' in review: 52 | 
scores['all'].append(review['tuple']) 53 | else: 54 | scores['all'].append(review['score']) 55 | for k, v in sorted(scores.items()): 56 | stats = np.asarray(v).mean(0).tolist() 57 | stats = [round(x, 3) for x in stats] 58 | # print(k, stats, round(stats[1]/stats[0]*100, 1)) 59 | print(k, round(stats[1]/stats[0]*100, 1), round(stats[0] * 10, 1), round(stats[1] * 10, 1)) 60 | print('=================================') 61 | -------------------------------------------------------------------------------- /openomni/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .language_model.llava_her_llama import LlavaHerLlamaForCausalLM, LlavaHerConfig 2 | from .language_model.llava_her_qwen import LlavaHerQwen2ForCausalLM, LlavaHerQwenConfig 3 | # from .language_model.llava_mpt import LlavaMptForCausalLM, LlavaMptConfig 4 | # from .language_model.llava_mistral import LlavaMistralForCausalLM, LlavaMistralConfig 5 | 6 | -------------------------------------------------------------------------------- /openomni/model/apply_delta.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m fastchat.model.apply_delta --base ~/model_weights/llama-7b --target ~/model_weights/vicuna-7b --delta lmsys/vicuna-7b-delta 4 | """ 5 | import argparse 6 | 7 | import torch 8 | from tqdm import tqdm 9 | from transformers import AutoTokenizer, AutoModelForCausalLM 10 | from llava import LlavaLlamaForCausalLM 11 | 12 | 13 | def apply_delta(base_model_path, target_model_path, delta_path): 14 | print("Loading base model") 15 | base = AutoModelForCausalLM.from_pretrained( 16 | base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 17 | 18 | print("Loading delta") 19 | delta = LlavaLlamaForCausalLM.from_pretrained(delta_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 20 | delta_tokenizer = AutoTokenizer.from_pretrained(delta_path) 21 | 22 | print("Applying delta") 23 | for name, param in tqdm(delta.state_dict().items(), desc="Applying delta"): 24 | if name not in base.state_dict(): 25 | assert name in ['model.mm_projector.weight', 'model.mm_projector.bias'], f'{name} not in base model' 26 | continue 27 | if param.data.shape == base.state_dict()[name].shape: 28 | param.data += base.state_dict()[name] 29 | else: 30 | assert name in ['model.embed_tokens.weight', 'lm_head.weight'], \ 31 | f'{name} dimension mismatch: {param.data.shape} vs {base.state_dict()[name].shape}' 32 | bparam = base.state_dict()[name] 33 | param.data[:bparam.shape[0], :bparam.shape[1]] += bparam 34 | 35 | print("Saving target model") 36 | delta.save_pretrained(target_model_path) 37 | delta_tokenizer.save_pretrained(target_model_path) 38 | 39 | 40 | if __name__ == "__main__": 41 | parser = argparse.ArgumentParser() 42 | parser.add_argument("--base-model-path", type=str, required=True) 43 | parser.add_argument("--target-model-path", type=str, required=True) 44 | parser.add_argument("--delta-path", type=str, required=True) 45 | 46 | args = parser.parse_args() 47 | 48 | apply_delta(args.base_model_path, args.target_model_path, args.delta_path) 49 | -------------------------------------------------------------------------------- /openomni/model/consolidate.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m llava.model.consolidate --src ~/model_weights/llava-7b --dst ~/model_weights/llava-7b_consolidate 4 | """ 5 | import argparse 6 | 7 | import torch 8 | from 
transformers import AutoTokenizer, AutoModelForCausalLM 9 | from llava.model import * 10 | from llava.model.utils import auto_upgrade 11 | 12 | 13 | def consolidate_ckpt(src_path, dst_path): 14 | print("Loading model") 15 | auto_upgrade(src_path) 16 | src_model = AutoModelForCausalLM.from_pretrained(src_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 17 | src_tokenizer = AutoTokenizer.from_pretrained(src_path, use_fast=False) 18 | src_model.save_pretrained(dst_path) 19 | src_tokenizer.save_pretrained(dst_path) 20 | 21 | 22 | if __name__ == "__main__": 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument("--src", type=str, required=True) 25 | parser.add_argument("--dst", type=str, required=True) 26 | 27 | args = parser.parse_args() 28 | 29 | consolidate_ckpt(args.src, args.dst) 30 | -------------------------------------------------------------------------------- /openomni/model/language_model/llava_mpt.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Haotian Liu 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | from typing import Optional, Tuple 17 | 18 | import torch 19 | 20 | from transformers import AutoConfig, AutoModelForCausalLM, \ 21 | MptConfig, MptForCausalLM, MptModel 22 | from llava.model.llava_arch import LlavaMetaModel, LlavaMetaForCausalLM 23 | 24 | 25 | class LlavaMptConfig(MptConfig): 26 | model_type = "llava_mpt" 27 | 28 | 29 | class LlavaMptModel(LlavaMetaModel, MptModel): 30 | config_class = LlavaMptConfig 31 | 32 | def __init__(self, config: MptConfig): 33 | config.hidden_size = config.d_model 34 | super(LlavaMptModel, self).__init__(config) 35 | 36 | def embed_tokens(self, x): 37 | return self.wte(x) 38 | 39 | 40 | class LlavaMptForCausalLM(MptForCausalLM, LlavaMetaForCausalLM): 41 | config_class = LlavaMptConfig 42 | supports_gradient_checkpointing = True 43 | 44 | def __init__(self, config): 45 | super(MptForCausalLM, self).__init__(config) 46 | 47 | self.transformer = LlavaMptModel(config) 48 | self.lm_head = torch.nn.Linear(config.hidden_size, config.vocab_size, bias=False) 49 | 50 | # Initialize weights and apply final processing 51 | self.post_init() 52 | 53 | def get_model(self): 54 | return self.transformer 55 | 56 | def _set_gradient_checkpointing(self, module, value=False): 57 | if isinstance(module, LlavaMptModel): 58 | module.gradient_checkpointing = value 59 | 60 | def forward( 61 | self, 62 | input_ids: Optional[torch.LongTensor] = None, 63 | past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, 64 | attention_mask: Optional[torch.Tensor] = None, 65 | inputs_embeds: Optional[torch.Tensor] = None, 66 | labels: Optional[torch.Tensor] = None, 67 | use_cache: Optional[bool] = None, 68 | output_attentions: Optional[bool] = None, 69 | output_hidden_states: Optional[bool] = None, 70 | return_dict: Optional[bool] = None, 71 | images=None): 72 | 73 | input_ids, attention_mask, past_key_values, inputs_embeds, 
labels = self.prepare_inputs_labels_for_multimodal(input_ids, attention_mask, past_key_values, labels, images) 74 | 75 | return super().forward( 76 | input_ids, 77 | past_key_values=past_key_values, 78 | attention_mask=attention_mask, 79 | inputs_embeds=inputs_embeds, 80 | labels=labels, 81 | use_cache=use_cache, 82 | output_attentions=output_attentions, 83 | output_hidden_states=output_hidden_states, 84 | return_dict=return_dict, 85 | ) 86 | 87 | def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs): 88 | images = kwargs.pop("images", None) 89 | _inputs = super().prepare_inputs_for_generation( 90 | input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs 91 | ) 92 | _inputs['images'] = images 93 | return _inputs 94 | 95 | 96 | AutoConfig.register("llava_mpt", LlavaMptConfig) 97 | AutoModelForCausalLM.register(LlavaMptConfig, LlavaMptForCausalLM) 98 | -------------------------------------------------------------------------------- /openomni/model/make_delta.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m llava.model.make_delta --base ~/model_weights/llama-7b --target ~/model_weights/llava-7b --delta ~/model_weights/llava-7b-delta --hub-repo-id liuhaotian/llava-7b-delta 4 | """ 5 | import argparse 6 | 7 | import torch 8 | from tqdm import tqdm 9 | from transformers import AutoTokenizer, AutoModelForCausalLM 10 | from llava.model.utils import auto_upgrade 11 | 12 | 13 | def make_delta(base_model_path, target_model_path, delta_path, hub_repo_id): 14 | print("Loading base model") 15 | base = AutoModelForCausalLM.from_pretrained( 16 | base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 17 | 18 | print("Loading target model") 19 | auto_upgrade(target_model_path) 20 | target = AutoModelForCausalLM.from_pretrained(target_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 21 | 22 | print("Calculating delta") 23 | for name, param in tqdm(target.state_dict().items(), desc="Calculating delta"): 24 | if name not in base.state_dict(): 25 | assert name in ['model.mm_projector.weight', 'model.mm_projector.bias'], f'{name} not in base model' 26 | continue 27 | if param.data.shape == base.state_dict()[name].shape: 28 | param.data -= base.state_dict()[name] 29 | else: 30 | assert name in ['model.embed_tokens.weight', 'lm_head.weight'], f'{name} dimension mismatch: {param.data.shape} vs {base.state_dict()[name].shape}' 31 | bparam = base.state_dict()[name] 32 | param.data[:bparam.shape[0], :bparam.shape[1]] -= bparam 33 | 34 | print("Saving delta") 35 | if hub_repo_id: 36 | kwargs = {"push_to_hub": True, "repo_id": hub_repo_id} 37 | else: 38 | kwargs = {} 39 | target.save_pretrained(delta_path, **kwargs) 40 | target_tokenizer = AutoTokenizer.from_pretrained(target_model_path) 41 | target_tokenizer.save_pretrained(delta_path, **kwargs) 42 | 43 | 44 | if __name__ == "__main__": 45 | parser = argparse.ArgumentParser() 46 | parser.add_argument("--base-model-path", type=str, required=True) 47 | parser.add_argument("--target-model-path", type=str, required=True) 48 | parser.add_argument("--delta-path", type=str, required=True) 49 | parser.add_argument("--hub-repo-id", type=str, default=None) 50 | args = parser.parse_args() 51 | 52 | make_delta(args.base_model_path, args.target_model_path, args.delta_path, args.hub_repo_id) 53 | -------------------------------------------------------------------------------- 
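The two weight-conversion scripts above are inverses of each other: make_delta subtracts the base weights from a fine-tuned target (handling the enlarged embedding and LM-head matrices row/column-wise), while apply_delta adds the base back into a published delta and saves the result as the target model. A minimal sketch of driving them from Python is shown below; the checkpoint paths are placeholders, the openomni.model.* import paths are inferred from the file locations in this repository, and the sketch assumes the llava.* imports inside those scripts resolve in your environment (they look inherited from LLaVA).

    from openomni.model.make_delta import make_delta     # path inferred from openomni/model/make_delta.py
    from openomni.model.apply_delta import apply_delta   # path inferred from openomni/model/apply_delta.py

    base   = "/path/to/llama-7b"          # placeholder checkpoints, not shipped with the repo
    target = "/path/to/openomni-7b"
    delta  = "/path/to/openomni-7b-delta"

    # Producer side: compute target - base and save the delta (optionally pushed when hub_repo_id is set).
    make_delta(base, target, delta, hub_repo_id=None)

    # Consumer side: reconstruct the full weights from base + delta and save them under `target`.
    apply_delta(base, target, delta)

Note that apply_delta mutates the loaded delta weights in place (param.data += base) and then saves that model and its tokenizer to the target path, so the base checkpoint itself is never modified.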
/openomni/model/multimodal_encoder/builder.py: -------------------------------------------------------------------------------- 1 | import os 2 | from .clip_encoder import CLIPVisionTower, CLIPVisionTowerS2 3 | 4 | 5 | def build_vision_tower(vision_tower_cfg, **kwargs): 6 | vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None)) 7 | is_absolute_path_exists = os.path.exists(vision_tower) 8 | use_s2 = getattr(vision_tower_cfg, 's2', False) 9 | if is_absolute_path_exists or vision_tower.startswith("openai") or vision_tower.startswith("laion") or "ShareGPT4V" in vision_tower: 10 | if use_s2: 11 | return CLIPVisionTowerS2(vision_tower, args=vision_tower_cfg, **kwargs) 12 | else: 13 | return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 14 | 15 | raise ValueError(f'Unknown vision tower: {vision_tower}') 16 | -------------------------------------------------------------------------------- /openomni/model/multimodal_projector/builder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import re 4 | 5 | 6 | class IdentityMap(nn.Module): 7 | def __init__(self): 8 | super().__init__() 9 | 10 | def forward(self, x, *args, **kwargs): 11 | return x 12 | 13 | @property 14 | def config(self): 15 | return {"mm_projector_type": 'identity'} 16 | 17 | 18 | class SimpleResBlock(nn.Module): 19 | def __init__(self, channels): 20 | super().__init__() 21 | self.pre_norm = nn.LayerNorm(channels) 22 | 23 | self.proj = nn.Sequential( 24 | nn.Linear(channels, channels), 25 | nn.GELU(), 26 | nn.Linear(channels, channels) 27 | ) 28 | def forward(self, x): 29 | x = self.pre_norm(x) 30 | return x + self.proj(x) 31 | 32 | 33 | def build_vision_projector(config, delay_load=False, **kwargs): 34 | projector_type = getattr(config, 'mm_projector_type', 'linear') 35 | 36 | if projector_type == 'linear': 37 | return nn.Linear(config.mm_hidden_size, config.hidden_size) 38 | 39 | mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type) 40 | if mlp_gelu_match: 41 | mlp_depth = int(mlp_gelu_match.group(1)) 42 | modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)] 43 | for _ in range(1, mlp_depth): 44 | modules.append(nn.GELU()) 45 | modules.append(nn.Linear(config.hidden_size, config.hidden_size)) 46 | return nn.Sequential(*modules) 47 | 48 | if projector_type == 'identity': 49 | return IdentityMap() 50 | 51 | raise ValueError(f'Unknown projector type: {projector_type}') 52 | -------------------------------------------------------------------------------- /openomni/model/speech_encoder/builder.py: -------------------------------------------------------------------------------- 1 | from .speech_encoder import WhisperWrappedEncoder 2 | 3 | 4 | def build_speech_encoder(config): 5 | speech_encoder_type = getattr(config, 'speech_encoder_type', None) 6 | if "whisper" in speech_encoder_type.lower(): 7 | return WhisperWrappedEncoder.load(config) 8 | 9 | raise ValueError(f'Unknown speech encoder: {speech_encoder_type}') 10 | -------------------------------------------------------------------------------- /openomni/model/speech_encoder/speech_encoder.py: -------------------------------------------------------------------------------- 1 | # Adopted from https://github.com/ddlBoJack/SLAM-LLM/blob/main/src/slam_llm/models/encoder.py 2 | 3 | import types 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | import whisper 8 | 9 | class 
WhisperWrappedEncoder: 10 | 11 | @classmethod 12 | def load(cls, model_config): 13 | 14 | def replace_layer_norm(module): 15 | from whisper.model import LayerNorm 16 | for name, child in module.named_children(): 17 | if isinstance(child, LayerNorm): 18 | old_params = child.state_dict() 19 | new_layer_norm = nn.LayerNorm(child.normalized_shape, eps=child.eps, elementwise_affine=child.elementwise_affine) 20 | new_layer_norm.load_state_dict(old_params) 21 | setattr(module, name, new_layer_norm) 22 | else: 23 | replace_layer_norm(child) 24 | encoder = whisper.load_model(name=model_config.speech_encoder, device='cpu').encoder 25 | # encoder = whisper.load_model(name="/mnt/workspace/lr/datasets/checkpoints/llava_her_pretrained/large-v3.pt",device='cpu').encoder 26 | 27 | replace_layer_norm(encoder) 28 | return encoder 29 | 30 | # return None -------------------------------------------------------------------------------- /openomni/model/speech_generator_ar/builder.py: -------------------------------------------------------------------------------- 1 | from .speech_generator import SpeechGeneratorCTC 2 | 3 | 4 | def build_speech_generator(config): 5 | generator_type = getattr(config, 'speech_generator_type', 'ctc') 6 | if generator_type == 'ctc' or generator_type == 'ar': 7 | return SpeechGeneratorCTC(config) 8 | 9 | raise ValueError(f'Unknown generator type: {generator_type}') 10 | -------------------------------------------------------------------------------- /openomni/model/speech_generator_ctc/builder.py: -------------------------------------------------------------------------------- 1 | from .speech_generator import SpeechGeneratorCTC 2 | 3 | 4 | def build_speech_generator(config): 5 | generator_type = getattr(config, 'speech_generator_type', 'ctc') 6 | if generator_type == 'ctc': 7 | return SpeechGeneratorCTC(config) 8 | 9 | raise ValueError(f'Unknown generator type: {generator_type}') 10 | -------------------------------------------------------------------------------- /openomni/model/speech_projector/builder.py: -------------------------------------------------------------------------------- 1 | from .speech_projector import EncoderProjectorConcat 2 | 3 | 4 | def build_speech_projector(config): 5 | projector_type = getattr(config, 'speech_projector_type', 'linear') 6 | if projector_type == 'linear': 7 | return EncoderProjectorConcat(config) 8 | 9 | raise ValueError(f'Unknown projector type: {projector_type}') 10 | -------------------------------------------------------------------------------- /openomni/model/speech_projector/speech_projector.py: -------------------------------------------------------------------------------- 1 | # Adopted from https://github.com/ddlBoJack/SLAM-LLM/blob/main/src/slam_llm/models/projector.py 2 | 3 | 4 | import torch 5 | import torch.nn as nn 6 | 7 | 8 | class EncoderProjectorConcat(nn.Module): 9 | def __init__(self, config): 10 | super().__init__() 11 | self.k = config.speech_encoder_ds_rate 12 | self.encoder_dim = config.speech_encoder_hidden_size 13 | self.llm_dim = config.hidden_size 14 | self.linear1 = nn.Linear(self.encoder_dim * self.k, 2048) 15 | self.relu = nn.ReLU() 16 | self.linear2 = nn.Linear(2048, config.hidden_size) 17 | 18 | def forward(self, x): 19 | batch_size, seq_len, dim = x.size() 20 | num_frames_to_discard = seq_len % self.k 21 | if num_frames_to_discard > 0: 22 | x = x[:, :-num_frames_to_discard, :] 23 | seq_len = x.size(1) 24 | 25 | x = x.contiguous() 26 | x = x.view(batch_size, seq_len // self.k, dim * self.k) 27 | x = 
self.linear1(x) 28 | x = self.relu(x) 29 | x = self.linear2(x) 30 | return x -------------------------------------------------------------------------------- /openomni/model/utils.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoConfig 2 | 3 | 4 | def auto_upgrade(config): 5 | cfg = AutoConfig.from_pretrained(config) 6 | if 'llava' in config and 'llava' not in cfg.model_type: 7 | assert cfg.model_type == 'llama' 8 | print("You are using newer LLaVA code base, while the checkpoint of v0 is from older code base.") 9 | print("You must upgrade the checkpoint to the new code base (this can be done automatically).") 10 | confirm = input("Please confirm that you want to upgrade the checkpoint. [Y/N]") 11 | if confirm.lower() in ["y", "yes"]: 12 | print("Upgrading checkpoint...") 13 | assert len(cfg.architectures) == 1 14 | setattr(cfg.__class__, "model_type", "llava") 15 | cfg.architectures[0] = 'LlavaLlamaForCausalLM' 16 | cfg.save_pretrained(config) 17 | print("Checkpoint upgraded.") 18 | else: 19 | print("Checkpoint upgrade aborted.") 20 | exit(1) 21 | -------------------------------------------------------------------------------- /openomni/model/visual_encoder/builder.py: -------------------------------------------------------------------------------- 1 | import os 2 | from .clip_encoder import CLIPVisionTower, CLIPVisionTowerS2 3 | 4 | 5 | def build_vision_tower(vision_tower_cfg, **kwargs): 6 | vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None)) 7 | is_absolute_path_exists = os.path.exists(vision_tower) 8 | use_s2 = getattr(vision_tower_cfg, 's2', False) 9 | if is_absolute_path_exists or vision_tower.startswith("openai") or vision_tower.startswith("laion") or "ShareGPT4V" in vision_tower: 10 | if use_s2: 11 | return CLIPVisionTowerS2(vision_tower, args=vision_tower_cfg, **kwargs) 12 | else: 13 | return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 14 | 15 | raise ValueError(f'Unknown vision tower: {vision_tower}') 16 | -------------------------------------------------------------------------------- /openomni/model/visual_projector/builder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import re 4 | 5 | 6 | class IdentityMap(nn.Module): 7 | def __init__(self): 8 | super().__init__() 9 | 10 | def forward(self, x, *args, **kwargs): 11 | return x 12 | 13 | @property 14 | def config(self): 15 | return {"mm_projector_type": 'identity'} 16 | 17 | 18 | class SimpleResBlock(nn.Module): 19 | def __init__(self, channels): 20 | super().__init__() 21 | self.pre_norm = nn.LayerNorm(channels) 22 | 23 | self.proj = nn.Sequential( 24 | nn.Linear(channels, channels), 25 | nn.GELU(), 26 | nn.Linear(channels, channels) 27 | ) 28 | def forward(self, x): 29 | x = self.pre_norm(x) 30 | return x + self.proj(x) 31 | 32 | 33 | def build_vision_projector(config, delay_load=False, **kwargs): 34 | projector_type = getattr(config, 'mm_projector_type', 'linear') 35 | 36 | if projector_type == 'linear': 37 | return nn.Linear(config.mm_hidden_size, config.hidden_size) 38 | 39 | mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type) 40 | if mlp_gelu_match: 41 | mlp_depth = int(mlp_gelu_match.group(1)) 42 | modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)] 43 | for _ in range(1, mlp_depth): 44 | modules.append(nn.GELU()) 45 | modules.append(nn.Linear(config.hidden_size, 
config.hidden_size)) 46 | return nn.Sequential(*modules) 47 | 48 | if projector_type == 'identity': 49 | return IdentityMap() 50 | 51 | raise ValueError(f'Unknown projector type: {projector_type}') 52 | -------------------------------------------------------------------------------- /openomni/serve/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RainBowLuoCS/OpenOmni/2892efff5fb4155a2f0b775aaa1f316aa69e289a/openomni/serve/__init__.py -------------------------------------------------------------------------------- /openomni/serve/examples/extreme_ironing.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RainBowLuoCS/OpenOmni/2892efff5fb4155a2f0b775aaa1f316aa69e289a/openomni/serve/examples/extreme_ironing.jpg -------------------------------------------------------------------------------- /openomni/serve/examples/waterview.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RainBowLuoCS/OpenOmni/2892efff5fb4155a2f0b775aaa1f316aa69e289a/openomni/serve/examples/waterview.jpg -------------------------------------------------------------------------------- /openomni/serve/register_worker.py: -------------------------------------------------------------------------------- 1 | """ 2 | Manually register workers. 3 | 4 | Usage: 5 | python3 -m fastchat.serve.register_worker --controller http://localhost:21001 --worker-name http://localhost:21002 6 | """ 7 | 8 | import argparse 9 | 10 | import requests 11 | 12 | if __name__ == "__main__": 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("--controller-address", type=str) 15 | parser.add_argument("--worker-name", type=str) 16 | parser.add_argument("--check-heart-beat", action="store_true") 17 | args = parser.parse_args() 18 | 19 | url = args.controller_address + "/register_worker" 20 | data = { 21 | "worker_name": args.worker_name, 22 | "check_heart_beat": args.check_heart_beat, 23 | "worker_status": None, 24 | } 25 | r = requests.post(url, json=data) 26 | assert r.status_code == 200 27 | -------------------------------------------------------------------------------- /openomni/serve/test_message.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | import requests 5 | 6 | from llava.conversation import default_conversation 7 | 8 | 9 | def main(): 10 | if args.worker_address: 11 | worker_addr = args.worker_address 12 | else: 13 | controller_addr = args.controller_address 14 | ret = requests.post(controller_addr + "/refresh_all_workers") 15 | ret = requests.post(controller_addr + "/list_models") 16 | models = ret.json()["models"] 17 | models.sort() 18 | print(f"Models: {models}") 19 | 20 | ret = requests.post(controller_addr + "/get_worker_address", 21 | json={"model": args.model_name}) 22 | worker_addr = ret.json()["address"] 23 | print(f"worker_addr: {worker_addr}") 24 | 25 | if worker_addr == "": 26 | return 27 | 28 | conv = default_conversation.copy() 29 | conv.append_message(conv.roles[0], args.message) 30 | prompt = conv.get_prompt() 31 | 32 | headers = {"User-Agent": "LLaVA Client"} 33 | pload = { 34 | "model": args.model_name, 35 | "prompt": prompt, 36 | "max_new_tokens": args.max_new_tokens, 37 | "temperature": 0.7, 38 | "stop": conv.sep, 39 | } 40 | response = requests.post(worker_addr + "/worker_generate_stream", headers=headers, 41 | 
json=pload, stream=True) 42 | 43 | print(prompt.replace(conv.sep, "\n"), end="") 44 | for chunk in response.iter_lines(chunk_size=8192, decode_unicode=False, delimiter=b"\0"): 45 | if chunk: 46 | data = json.loads(chunk.decode("utf-8")) 47 | output = data["text"].split(conv.sep)[-1] 48 | print(output, end="\r") 49 | print("") 50 | 51 | 52 | if __name__ == "__main__": 53 | parser = argparse.ArgumentParser() 54 | parser.add_argument("--controller-address", type=str, default="http://localhost:21001") 55 | parser.add_argument("--worker-address", type=str) 56 | parser.add_argument("--model-name", type=str, default="facebook/opt-350m") 57 | parser.add_argument("--max-new-tokens", type=int, default=32) 58 | parser.add_argument("--message", type=str, default= 59 | "Tell me a story with more than 1000 words.") 60 | args = parser.parse_args() 61 | 62 | main() 63 | -------------------------------------------------------------------------------- /openomni/train/train_mem.py: -------------------------------------------------------------------------------- 1 | from openomni.train.train import train 2 | 3 | if __name__ == "__main__": 4 | train(attn_implementation="flash_attention_2") 5 | -------------------------------------------------------------------------------- /openomni/train/train_xformers.py: -------------------------------------------------------------------------------- 1 | # Make it more memory efficient by monkey patching the LLaMA model with xformers attention. 2 | # Need to call this before importing transformers. 3 | from llava.train.llama_xformers_attn_monkey_patch import ( 4 | replace_llama_attn_with_xformers_attn, 5 | ) 6 | 7 | replace_llama_attn_with_xformers_attn() 8 | 9 | from openomni.train.train import train 10 | 11 | if __name__ == "__main__": 12 | train() 13 | -------------------------------------------------------------------------------- /openomni/utils.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import logging 3 | import logging.handlers 4 | import os 5 | import sys 6 | 7 | import requests 8 | 9 | from openomni.constants import LOGDIR 10 | 11 | server_error_msg = "**NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.**" 12 | moderation_msg = "YOUR INPUT VIOLATES OUR CONTENT MODERATION GUIDELINES. PLEASE TRY AGAIN." 
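# Annotation, not part of the original source: the helpers defined below (build_logger,
# StreamToLogger, disable_torch_init, violates_moderation) appear adapted from the LLaVA
# serving utilities. A hypothetical call site for the logger, with illustrative names, is:
#
#   logger = build_logger("controller", "controller.log")  # rotating file handler under LOGDIR
#   logger.info("server started")                          # stdout/stderr are also redirected here
#
# Only the function signatures come from this file; the logger name and message are assumptions.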
13 | 14 | handler = None 15 | 16 | 17 | def build_logger(logger_name, logger_filename): 18 | global handler 19 | 20 | formatter = logging.Formatter( 21 | fmt="%(asctime)s | %(levelname)s | %(name)s | %(message)s", 22 | datefmt="%Y-%m-%d %H:%M:%S", 23 | ) 24 | 25 | # Set the format of root handlers 26 | if not logging.getLogger().handlers: 27 | logging.basicConfig(level=logging.INFO) 28 | logging.getLogger().handlers[0].setFormatter(formatter) 29 | 30 | # Redirect stdout and stderr to loggers 31 | stdout_logger = logging.getLogger("stdout") 32 | stdout_logger.setLevel(logging.INFO) 33 | sl = StreamToLogger(stdout_logger, logging.INFO) 34 | sys.stdout = sl 35 | 36 | stderr_logger = logging.getLogger("stderr") 37 | stderr_logger.setLevel(logging.ERROR) 38 | sl = StreamToLogger(stderr_logger, logging.ERROR) 39 | sys.stderr = sl 40 | 41 | # Get logger 42 | logger = logging.getLogger(logger_name) 43 | logger.setLevel(logging.INFO) 44 | 45 | # Add a file handler for all loggers 46 | if handler is None: 47 | os.makedirs(LOGDIR, exist_ok=True) 48 | filename = os.path.join(LOGDIR, logger_filename) 49 | handler = logging.handlers.TimedRotatingFileHandler( 50 | filename, when='D', utc=True, encoding='UTF-8') 51 | handler.setFormatter(formatter) 52 | 53 | for name, item in logging.root.manager.loggerDict.items(): 54 | if isinstance(item, logging.Logger): 55 | item.addHandler(handler) 56 | 57 | return logger 58 | 59 | 60 | class StreamToLogger(object): 61 | """ 62 | Fake file-like stream object that redirects writes to a logger instance. 63 | """ 64 | def __init__(self, logger, log_level=logging.INFO): 65 | self.terminal = sys.stdout 66 | self.logger = logger 67 | self.log_level = log_level 68 | self.linebuf = '' 69 | 70 | def __getattr__(self, attr): 71 | return getattr(self.terminal, attr) 72 | 73 | def write(self, buf): 74 | temp_linebuf = self.linebuf + buf 75 | self.linebuf = '' 76 | for line in temp_linebuf.splitlines(True): 77 | # From the io.TextIOWrapper docs: 78 | # On output, if newline is None, any '\n' characters written 79 | # are translated to the system default line separator. 80 | # By default sys.stdout.write() expects '\n' newlines and then 81 | # translates them so this is still cross platform. 82 | if line[-1] == '\n': 83 | self.logger.log(self.log_level, line.rstrip()) 84 | else: 85 | self.linebuf += line 86 | 87 | def flush(self): 88 | if self.linebuf != '': 89 | self.logger.log(self.log_level, self.linebuf.rstrip()) 90 | self.linebuf = '' 91 | 92 | 93 | def disable_torch_init(): 94 | """ 95 | Disable the redundant torch default initialization to accelerate model creation. 96 | """ 97 | import torch 98 | setattr(torch.nn.Linear, "reset_parameters", lambda self: None) 99 | setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None) 100 | 101 | 102 | def violates_moderation(text): 103 | """ 104 | Check whether the text violates OpenAI moderation API. 
105 | """ 106 | url = "https://api.openai.com/v1/moderations" 107 | headers = {"Content-Type": "application/json", 108 | "Authorization": "Bearer " + os.environ["OPENAI_API_KEY"]} 109 | text = text.replace("\n", "") 110 | data = "{" + '"input": ' + f'"{text}"' + "}" 111 | data = data.encode("utf-8") 112 | try: 113 | ret = requests.post(url, headers=headers, data=data, timeout=5) 114 | flagged = ret.json()["results"][0]["flagged"] 115 | except requests.exceptions.RequestException as e: 116 | flagged = False 117 | except KeyError as e: 118 | flagged = False 119 | 120 | return flagged 121 | 122 | 123 | def pretty_print_semaphore(semaphore): 124 | if semaphore is None: 125 | return "None" 126 | return f"Semaphore(value={semaphore._value}, locked={semaphore.locked()})" 127 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "openomni" 7 | version = "0.0.1" 8 | description = "OpenOmni" 9 | readme = "README.md" 10 | requires-python = ">=3.8" 11 | classifiers = [ 12 | "Programming Language :: Python :: 3", 13 | "License :: OSI Approved :: Apache Software License", 14 | ] 15 | dependencies = [ 16 | "torch==2.1.2", "torchvision==0.16.2", 17 | "transformers==4.43.4", "tokenizers==0.19.0", "sentencepiece==0.1.99", "shortuuid", 18 | "accelerate==0.21.0", "peft", "bitsandbytes", 19 | "pydantic", "markdown2[all]", "numpy", "scikit-learn==1.2.2", 20 | "gradio==4.16.0", "gradio_client==0.8.1","torchaudio==2.1.2", 21 | "requests", "httpx==0.24.0", "uvicorn", "fastapi", "prettytable", 22 | "einops==0.6.1", "einops-exts==0.0.4", "timm==0.6.13", "openpyxl" 23 | ] 24 | 25 | [project.optional-dependencies] 26 | train = ["deepspeed==0.12.6", "ninja", "wandb"] 27 | build = ["build", "twine"] 28 | 29 | [project.urls] 30 | "Homepage" = "https://github.com/RainBowLuoCS/OpenOmni" 31 | "Bug Tracker" = "https://github.com/RainBowLuoCS/OpenOmniissues" 32 | 33 | [tool.setuptools.packages.find] 34 | exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"] 35 | 36 | [tool.wheel] 37 | exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"] 38 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pycocoevalcap 2 | validators 3 | visual_genome 4 | xlsxwriter 5 | sty 6 | transformers==4.43.4 7 | openai-whisper 8 | git+https://github.com/shivammehta25/Matcha-TTS.git 9 | 10 | 11 | -------------------------------------------------------------------------------- /scripts/clear.sh: -------------------------------------------------------------------------------- 1 | 2 | for pid in $(nvidia-smi --query-compute-apps=pid --format=csv,noheader); do 3 | kill -9 $pid 4 | done 5 | -------------------------------------------------------------------------------- /scripts/convert_gqa_for_eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("--src", type=str) 7 | parser.add_argument("--dst", type=str) 8 | args = parser.parse_args() 9 | 10 | all_answers = [] 11 | for line_idx, line in enumerate(open(args.src)): 12 | res = json.loads(line) 13 | 
question_id = res['question_id'] 14 | text = res['text'].rstrip('.').lower() 15 | all_answers.append({"questionId": question_id, "prediction": text}) 16 | 17 | with open(args.dst, 'w') as f: 18 | json.dump(all_answers, f) 19 | -------------------------------------------------------------------------------- /scripts/convert_mmbench_for_submission.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | import pandas as pd 5 | 6 | def get_args(): 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument("--annotation-file", type=str, required=True) 9 | parser.add_argument("--result-dir", type=str, required=True) 10 | parser.add_argument("--upload-dir", type=str, required=True) 11 | parser.add_argument("--experiment", type=str, required=True) 12 | 13 | return parser.parse_args() 14 | 15 | if __name__ == "__main__": 16 | args = get_args() 17 | 18 | df = pd.read_table(args.annotation_file) 19 | 20 | cur_df = df.copy() 21 | cur_df = cur_df.drop(columns=['hint', 'category', 'source', 'image', 'comment', 'l2-category']) 22 | cur_df.insert(6, 'prediction', None) 23 | for pred in open(os.path.join(args.result_dir, f"{args.experiment}.jsonl")): 24 | pred = json.loads(pred) 25 | cur_df.loc[df['index'] == pred['question_id'], 'prediction'] = pred['text'] 26 | 27 | cur_df.to_excel(os.path.join(args.upload_dir, f"{args.experiment}.xlsx"), index=False, engine='openpyxl') 28 | -------------------------------------------------------------------------------- /scripts/convert_mmvet_for_eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("--src", type=str) 7 | parser.add_argument("--dst", type=str) 8 | args = parser.parse_args() 9 | 10 | cur_result = {} 11 | 12 | for line in open(args.src): 13 | data = json.loads(line) 14 | qid = data['question_id'] 15 | cur_result[f'v1_{qid}'] = data['text'] 16 | 17 | with open(args.dst, 'w') as f: 18 | json.dump(cur_result, f, indent=2) 19 | -------------------------------------------------------------------------------- /scripts/convert_seed_for_submission.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | 6 | def get_args(): 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument("--annotation-file", type=str) 9 | parser.add_argument("--result-file", type=str) 10 | parser.add_argument("--result-upload-file", type=str) 11 | return parser.parse_args() 12 | 13 | 14 | def eval_single(result_file, eval_only_type=None): 15 | results = {} 16 | for line in open(result_file): 17 | row = json.loads(line) 18 | results[row['question_id']] = row 19 | 20 | type_counts = {} 21 | correct_counts = {} 22 | for question_data in data['questions']: 23 | if eval_only_type is not None and question_data['data_type'] != eval_only_type: continue 24 | data_type = question_data['question_type_id'] 25 | type_counts[data_type] = type_counts.get(data_type, 0) + 1 26 | try: 27 | question_id = int(question_data['question_id']) 28 | except: 29 | question_id = question_data['question_id'] 30 | if question_id not in results: 31 | correct_counts[data_type] = correct_counts.get(data_type, 0) 32 | continue 33 | row = results[question_id] 34 | if row['text'] == question_data['answer']: 35 | correct_counts[data_type] = correct_counts.get(data_type, 0) + 1 36 | 37 | total_count = 
0 38 | total_correct = 0 39 | for data_type in sorted(type_counts.keys()): 40 | accuracy = correct_counts[data_type] / type_counts[data_type] * 100 41 | if eval_only_type is None: 42 | print(f"{ques_type_id_to_name[data_type]}: {accuracy:.2f}%") 43 | 44 | total_count += type_counts[data_type] 45 | total_correct += correct_counts[data_type] 46 | 47 | total_accuracy = total_correct / total_count * 100 48 | if eval_only_type is None: 49 | print(f"Total accuracy: {total_accuracy:.2f}%") 50 | else: 51 | print(f"{eval_only_type} accuracy: {total_accuracy:.2f}%") 52 | 53 | return results 54 | 55 | if __name__ == "__main__": 56 | args = get_args() 57 | data = json.load(open(args.annotation_file)) 58 | ques_type_id_to_name = {id:n for n,id in data['question_type'].items()} 59 | 60 | results = eval_single(args.result_file) 61 | eval_single(args.result_file, eval_only_type='image') 62 | -------------------------------------------------------------------------------- /scripts/convert_sqa_to_llava.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import fire 4 | import re 5 | from convert_sqa_to_llava_base_prompt import build_prompt_chatbot 6 | 7 | 8 | def convert_to_llava(base_dir, split, prompt_format="QCM-LEA"): 9 | split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[split] 10 | problems = json.load(open(os.path.join(base_dir, "problems.json"))) 11 | 12 | split_problems = build_prompt_chatbot( 13 | problems, split_indices, prompt_format, 14 | use_caption=False, is_test=False) 15 | 16 | target_format = [] 17 | for prob_id, (input, output) in split_problems.items(): 18 | if input.startswith('Question: '): 19 | input = input.replace('Question: ', '') 20 | if output.startswith('Answer: '): 21 | output = output.replace('Answer: ', '') 22 | 23 | raw_prob_data = problems[prob_id] 24 | if raw_prob_data['image'] is None: 25 | target_format.append({ 26 | "id": prob_id, 27 | "conversations": [ 28 | {'from': 'human', 'value': f"{input}"}, 29 | {'from': 'gpt', 'value': f"{output}"}, 30 | ], 31 | }) 32 | 33 | else: 34 | target_format.append({ 35 | "id": prob_id, 36 | "image": os.path.join(prob_id, raw_prob_data['image']), 37 | "conversations": [ 38 | {'from': 'human', 'value': f"{input}\n"}, 39 | {'from': 'gpt', 'value': f"{output}"}, 40 | ], 41 | }) 42 | 43 | print(f'Number of samples: {len(target_format)}') 44 | 45 | with open(os.path.join(base_dir, f"llava_{split}_{prompt_format}.json"), "w") as f: 46 | json.dump(target_format, f, indent=2) 47 | 48 | 49 | def convert_to_jsonl(base_dir, split, prompt_format="QCM-LEPA"): 50 | split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[split] 51 | problems = json.load(open(os.path.join(base_dir, "problems.json"))) 52 | 53 | split_problems = build_prompt_chatbot( 54 | problems, split_indices, prompt_format, 55 | use_caption=False, is_test=False) 56 | 57 | writer = open(os.path.join(base_dir, f"scienceqa_{split}_{prompt_format}.jsonl"), "w") 58 | for prob_id, (input, output) in split_problems.items(): 59 | if input.startswith('Question: '): 60 | input = input.replace('Question: ', '') 61 | if output.startswith('Answer: '): 62 | output = output.replace('Answer: ', '') 63 | 64 | raw_prob_data = problems[prob_id] 65 | if raw_prob_data['image'] is None: 66 | data = { 67 | "id": prob_id, 68 | "instruction": f"{input}", 69 | "output": f"{output}", 70 | } 71 | 72 | else: 73 | data = { 74 | "id": prob_id, 75 | "image": os.path.join(prob_id, raw_prob_data['image']), 76 
| "instruction": f"{input}\n", 77 | "output": f"{output}", 78 | } 79 | writer.write(json.dumps(data) + '\n') 80 | writer.close() 81 | 82 | 83 | def main(task, **kwargs): 84 | globals()[task](**kwargs) 85 | 86 | 87 | if __name__ == "__main__": 88 | fire.Fire(main) 89 | -------------------------------------------------------------------------------- /scripts/convert_vizwiz_for_submission.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import json 4 | 5 | from llava.eval.m4c_evaluator import EvalAIAnswerProcessor 6 | 7 | 8 | def parse_args(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--annotation-file', type=str, required=True) 11 | parser.add_argument('--result-file', type=str, required=True) 12 | parser.add_argument('--result-upload-file', type=str, required=True) 13 | return parser.parse_args() 14 | 15 | 16 | if __name__ == '__main__': 17 | 18 | args = parse_args() 19 | 20 | os.makedirs(os.path.dirname(args.result_upload_file), exist_ok=True) 21 | 22 | results = [] 23 | error_line = 0 24 | for line_idx, line in enumerate(open(args.result_file)): 25 | try: 26 | results.append(json.loads(line)) 27 | except: 28 | error_line += 1 29 | results = {x['question_id']: x['text'] for x in results} 30 | test_split = [json.loads(line) for line in open(args.annotation_file)] 31 | split_ids = set([x['question_id'] for x in test_split]) 32 | 33 | print(f'total results: {len(results)}, total split: {len(test_split)}, error_line: {error_line}') 34 | 35 | all_answers = [] 36 | 37 | answer_processor = EvalAIAnswerProcessor() 38 | 39 | for x in test_split: 40 | assert x['question_id'] in results 41 | all_answers.append({ 42 | 'image': x['image'], 43 | 'answer': answer_processor(results[x['question_id']]) 44 | }) 45 | 46 | with open(args.result_upload_file, 'w') as f: 47 | json.dump(all_answers, f) 48 | -------------------------------------------------------------------------------- /scripts/convert_vqav2_for_submission.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import json 4 | 5 | from llava.eval.m4c_evaluator import EvalAIAnswerProcessor 6 | 7 | 8 | def parse_args(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--dir', type=str, default="./playground/data/eval/vqav2") 11 | parser.add_argument('--ckpt', type=str, required=True) 12 | parser.add_argument('--split', type=str, required=True) 13 | return parser.parse_args() 14 | 15 | 16 | if __name__ == '__main__': 17 | 18 | args = parse_args() 19 | 20 | src = os.path.join(args.dir, 'answers', args.split, args.ckpt, 'merge.jsonl') 21 | test_split = os.path.join(args.dir, 'llava_vqav2_mscoco_test2015.jsonl') 22 | dst = os.path.join(args.dir, 'answers_upload', args.split, f'{args.ckpt}.json') 23 | os.makedirs(os.path.dirname(dst), exist_ok=True) 24 | 25 | results = [] 26 | error_line = 0 27 | for line_idx, line in enumerate(open(src)): 28 | try: 29 | results.append(json.loads(line)) 30 | except: 31 | error_line += 1 32 | 33 | results = {x['question_id']: x['text'] for x in results} 34 | test_split = [json.loads(line) for line in open(test_split)] 35 | split_ids = set([x['question_id'] for x in test_split]) 36 | 37 | print(f'total results: {len(results)}, total split: {len(test_split)}, error_line: {error_line}') 38 | 39 | all_answers = [] 40 | 41 | answer_processor = EvalAIAnswerProcessor() 42 | 43 | for x in test_split: 44 | if x['question_id'] not in results: 45 | 
all_answers.append({
46 |                 'question_id': x['question_id'],
47 |                 'answer': ''
48 |             })
49 |         else:
50 |             all_answers.append({
51 |                 'question_id': x['question_id'],
52 |                 'answer': answer_processor(results[x['question_id']])
53 |             })
54 | 
55 |     with open(dst, 'w') as f:
56 |         json.dump(all_answers, f)
57 | 
-------------------------------------------------------------------------------- /scripts/mmevol/eval/gqa.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # srun -p mllm --gres gpu:8 bash scripts/v1_6/eval/gqa.sh
3 | 
4 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
5 | IFS=',' read -ra GPULIST <<< "$gpu_list"
6 | CHUNKS=${#GPULIST[@]}
7 | 
8 | CONV_MODE=llava_llama_3
9 | CKPT=$1
10 | CKPT_DIR=${2-"checkpoints"}
11 | 
12 | SPLIT="llava_gqa_testdev_balanced"
13 | GQADIR="./playground/data/eval/gqa/data"
14 | 
15 | for IDX in $(seq 0 $((CHUNKS-1))); do
16 |     CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_loader \
17 |         --model-path ${CKPT_DIR}/${CKPT} \
18 |         --question-file ./playground/data/eval/gqa/$SPLIT.jsonl \
19 |         --image-folder /mnt/hwfile/mllm/xinglong/llava/llava_1.5/playground/data/eval/gqa/images \
20 |         --answers-file ./playground/data/eval/gqa/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl \
21 |         --num-chunks $CHUNKS \
22 |         --chunk-idx $IDX \
23 |         --temperature 0 \
24 |         --square_eval True \
25 |         --conv-mode ${CONV_MODE} &
26 | done
27 | 
28 | wait
29 | 
30 | output_file=./playground/data/eval/gqa/answers/$SPLIT/$CKPT/merge.jsonl
31 | 
32 | # Clear out the output file if it exists.
33 | > "$output_file"
34 | 
35 | # Loop through the indices and concatenate each file.
36 | for IDX in $(seq 0 $((CHUNKS-1))); do
37 |     cat ./playground/data/eval/gqa/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file"
38 | done
39 | 
40 | python scripts/convert_gqa_for_eval.py --src $output_file --dst $GQADIR/testdev_balanced_predictions.json
41 | 
42 | cd $GQADIR
43 | python eval/eval.py --tier testdev_balanced
-------------------------------------------------------------------------------- /scripts/mmevol/eval/mmbench.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # srun -p mllm --gres gpu:8 bash scripts/v1_6/eval/mmbench.sh
3 | 
4 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
5 | IFS=',' read -ra GPULIST <<< "$gpu_list"
6 | CHUNKS=${#GPULIST[@]}
7 | 
8 | CONV_MODE=llava_llama_3
9 | CKPT=$1
10 | CKPT_DIR=${2-"checkpoints"}
11 | SPLIT="mmbench_dev_20230712"
12 | LANG="en"
13 | 
14 | 
15 | for IDX in $(seq 0 $((CHUNKS-1))); do
16 |     CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_mmbench \
17 |         --model-path ${CKPT_DIR}/${CKPT} \
18 |         --question-file ./playground/data/eval/mmbench/$SPLIT.tsv \
19 |         --answers-file ./playground/data/eval/mmbench/answers/$SPLIT/${CKPT}/${CHUNKS}_${IDX}.jsonl \
20 |         --num-chunks $CHUNKS \
21 |         --chunk-idx $IDX \
22 |         --lang ${LANG} \
23 |         --single-pred-prompt \
24 |         --square_eval True \
25 |         --temperature 0 \
26 |         --conv-mode ${CONV_MODE} &
27 | done
28 | 
29 | wait
30 | 
31 | output_file=./playground/data/eval/mmbench/answers/$SPLIT/${CKPT}/merge.jsonl
32 | 
33 | # Clear out the output file if it exists.
34 | > "$output_file"
35 | 
36 | # Loop through the indices and concatenate each file.
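# For example, with 8 visible GPUs (CHUNKS=8) this gathers 8_0.jsonl ... 8_7.jsonl
# into merge.jsonl, which the submission converter below reads via --experiment merge.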
37 | for IDX in $(seq 0 $((CHUNKS-1))); do
38 |     cat ./playground/data/eval/mmbench/answers/$SPLIT/${CKPT}/${CHUNKS}_${IDX}.jsonl >> "$output_file"
39 | done
40 | 
41 | wait
42 | 
43 | mkdir -p playground/data/eval/mmbench/answers_upload/$SPLIT
44 | mkdir -p playground/data/eval/mmbench/answers_upload/$SPLIT/${CKPT}
45 | 
46 | python scripts/convert_mmbench_for_submission.py \
47 |     --annotation-file ./playground/data/eval/mmbench/$SPLIT.tsv \
48 |     --result-dir ./playground/data/eval/mmbench/answers/$SPLIT/${CKPT} \
49 |     --upload-dir ./playground/data/eval/mmbench/answers_upload/$SPLIT/${CKPT} \
50 |     --experiment merge
51 | 
-------------------------------------------------------------------------------- /scripts/mmevol/eval/mmbench_cn.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # srun -p mllm --gres gpu:8 bash scripts/v1_6/eval/mmbench_cn.sh
3 | 
4 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
5 | IFS=',' read -ra GPULIST <<< "$gpu_list"
6 | CHUNKS=${#GPULIST[@]}
7 | 
8 | CONV_MODE=llava_llama_3
9 | CKPT=$1
10 | CKPT_DIR=${2-"checkpoints"}
11 | LANG="cn"
12 | SPLIT="mmbench_dev_cn_20231003"
13 | 
14 | 
15 | for IDX in $(seq 0 $((CHUNKS-1))); do
16 |     CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_mmbench \
17 |         --model-path ${CKPT_DIR}/${CKPT} \
18 |         --question-file ./playground/data/eval/mmbench/$SPLIT.tsv \
19 |         --answers-file ./playground/data/eval/mmbench/answers/$SPLIT/${CKPT}/${CHUNKS}_${IDX}.jsonl \
20 |         --num-chunks $CHUNKS \
21 |         --chunk-idx $IDX \
22 |         --lang ${LANG} \
23 |         --single-pred-prompt \
24 |         --square_eval True \
25 |         --temperature 0 \
26 |         --conv-mode ${CONV_MODE} &
27 | done
28 | 
29 | wait
30 | 
31 | output_file=./playground/data/eval/mmbench/answers/$SPLIT/${CKPT}/merge.jsonl
32 | 
33 | # Clear out the output file if it exists.
34 | > "$output_file"
35 | 
36 | # Loop through the indices and concatenate each file.
37 | for IDX in $(seq 0 $((CHUNKS-1))); do 38 | cat ./playground/data/eval/mmbench/answers/$SPLIT/${CKPT}/${CHUNKS}_${IDX}.jsonl >> "$output_file" 39 | done 40 | 41 | wait 42 | 43 | mkdir -p playground/data/eval/mmbench/answers_upload/$SPLIT 44 | mkdir -p playground/data/eval/mmbench/answers_upload/$SPLIT/${CKPT} 45 | 46 | python scripts/convert_mmbench_for_submission.py \ 47 | --annotation-file ./playground/data/eval/mmbench/$SPLIT.tsv \ 48 | --result-dir ./playground/data/eval/mmbench/answers/$SPLIT/${CKPT} \ 49 | --upload-dir ./playground/data/eval/mmbench/answers_upload/$SPLIT/${CKPT} \ 50 | --experiment merge 51 | -------------------------------------------------------------------------------- /scripts/mmevol/eval/mme.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # srun -p mllm --gres gpu:1 bash scripts/v1_6/eval/mme.sh 3 | 4 | CONV_MODE=llava_llama_3 5 | 6 | CUDA_VISIBLE_DEVICES=0 python -m llava.eval.model_vqa_loader \ 7 | --model-path /mnt/workspace/lr/datasets/checkpoints/Lin-Chen/open-llava-next-llama3-8b \ 8 | --question-file /mnt/workspace/lr/datasets/playground/playground/data/eval/MME/share4v_mme.jsonl \ 9 | --image-folder /mnt/workspace/lr/datasets/playground/playground/data/eval/MME/MME_Benchmark_release_version\ 10 | --answers-file ./playground/data/eval/MME/answers/std_topic.jsonl \ 11 | --temperature 0 \ 12 | --square_eval True \ 13 | --conv-mode $CONV_MODE 14 | 15 | # cd ./playground/data/eval/MME 16 | 17 | # python convert_answer_to_mme.py --experiment ${CKPT} 18 | 19 | # cd eval_tool 20 | 21 | # python calculation.py --results_dir answers/${CKPT} 22 | -------------------------------------------------------------------------------- /scripts/mmevol/eval/seed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # srun -p mllm --gres gpu:8 bash scripts/v1_6/eval/seed.sh 3 | 4 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 5 | IFS=',' read -ra GPULIST <<< "$gpu_list" 6 | 7 | CHUNKS=${#GPULIST[@]} 8 | 9 | CONV_MODE=llava_llama_3 10 | CKPT=$1 11 | CKPT_DIR=${2-'checkpoints'} 12 | 13 | for IDX in $(seq 0 $((CHUNKS-1))); do 14 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_loader \ 15 | --model-path ${CKPT_DIR}/${CKPT} \ 16 | --question-file ./playground/data/eval/seed_bench/llava-seed-bench-image.jsonl \ 17 | --image-folder ./playground/data/eval/seed_bench \ 18 | --answers-file ./playground/data/eval/seed_bench/answers/$CKPT/${CHUNKS}_${IDX}.jsonl \ 19 | --num-chunks $CHUNKS \ 20 | --chunk-idx $IDX \ 21 | --temperature 0 \ 22 | --square_eval True \ 23 | --conv-mode $CONV_MODE & 24 | done 25 | 26 | wait 27 | 28 | output_file=./playground/data/eval/seed_bench/answers/$CKPT/merge.jsonl 29 | 30 | # Clear out the output file if it exists. 31 | > "$output_file" 32 | 33 | # Loop through the indices and concatenate each file. 
34 | for IDX in $(seq 0 $((CHUNKS-1))); do 35 | cat ./playground/data/eval/seed_bench/answers/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 36 | done 37 | 38 | # Evaluate 39 | python scripts/convert_seed_for_submission.py \ 40 | --annotation-file ./playground/data/eval/seed_bench/SEED-Bench.json \ 41 | --result-file $output_file \ 42 | --result-upload-file ./playground/data/eval/seed_bench/answers_upload/llava-v1.6-7b.jsonl 43 | 44 | -------------------------------------------------------------------------------- /scripts/mmevol/eval/sqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # srun -p mllm --gres gpu:8 bash scripts/v1_6/eval/sqa.sh 3 | 4 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 5 | IFS=',' read -ra GPULIST <<< "$gpu_list" 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | CONV_MODE=llava_llama_3 9 | CKPT=$1 10 | CKPT_DIR=${2-"checkpoints"} 11 | 12 | for IDX in $(seq 0 $((CHUNKS-1))); do 13 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_science \ 14 | --model-path ${CKPT_DIR}/${CKPT} \ 15 | --question-file ./playground/data/eval/scienceqa/llava_test_CQM-A.json \ 16 | --image-folder ./playground/data/eval/scienceqa/images/test \ 17 | --answers-file ./playground/data/eval/scienceqa/answers/$CKPT/${CHUNKS}_${IDX}.jsonl \ 18 | --single-pred-prompt \ 19 | --num-chunks $CHUNKS \ 20 | --chunk-idx $IDX \ 21 | --temperature 0 \ 22 | --square_eval True \ 23 | --conv-mode ${CONV_MODE} & 24 | done 25 | 26 | wait 27 | 28 | output_file=./playground/data/eval/scienceqa/answers/${CKPT}.jsonl 29 | 30 | # Clear out the output file if it exists. 31 | > "$output_file" 32 | 33 | # Loop through the indices and concatenate each file. 34 | for IDX in $(seq 0 $((CHUNKS-1))); do 35 | cat ./playground/data/eval/scienceqa/answers/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 36 | done 37 | 38 | # Evaluate 39 | python llava/eval/eval_science_qa.py \ 40 | --base-dir ./playground/data/eval/scienceqa \ 41 | --result-file ./playground/data/eval/scienceqa/answers/${CKPT}.jsonl \ 42 | --output-file ./playground/data/eval/scienceqa/answers/${CKPT}_output.jsonl \ 43 | --output-result ./playground/data/eval/scienceqa/answers/${CKPT}_result.json -------------------------------------------------------------------------------- /scripts/mmevol/eval/textvqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # srun -p mllm --gres gpu:8 bash scripts/v1_6/eval/textvqa.sh 3 | 4 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 5 | IFS=',' read -ra GPULIST <<< "$gpu_list" 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | CONV_MODE=llava_llama_3 9 | CKPT=$1 10 | CKPT_DIR=${2-"checkpoints"} 11 | 12 | for IDX in $(seq 0 $((CHUNKS-1))); do 13 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_loader \ 14 | --model-path ${CKPT_DIR}/${CKPT} \ 15 | --question-file ./playground/data/eval/textvqa/llava_textvqa_val_v051_ocr.jsonl \ 16 | --image-folder ./playground/data/eval/textvqa/train_images \ 17 | --answers-file ./playground/data/eval/textvqa/answers/$CKPT/${CHUNKS}_${IDX}.jsonl \ 18 | --num-chunks $CHUNKS \ 19 | --chunk-idx $IDX \ 20 | --temperature 0 \ 21 | --square_eval True \ 22 | --conv-mode ${CONV_MODE} & 23 | done 24 | 25 | wait 26 | 27 | output_file=./playground/data/eval/textvqa/answers/$CKPT/merge.jsonl 28 | 29 | # Clear out the output file if it exists. 30 | > "$output_file" 31 | 32 | # Loop through the indices and concatenate each file. 
33 | for IDX in $(seq 0 $((CHUNKS-1))); do 34 | cat ./playground/data/eval/textvqa/answers/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 35 | done 36 | 37 | # Evaluate 38 | python -m llava.eval.eval_textvqa \ 39 | --annotation-file ./playground/data/eval/textvqa/TextVQA_0.5.1_val.json \ 40 | --result-file $output_file -------------------------------------------------------------------------------- /scripts/mmevol/train/llama3/finetune.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | 4 | # wandb login 5 | 6 | export CUDA_DEVICE_MAX_CONNECTIONS=1 7 | export GPUS_PER_NODE=8 8 | export NNODES=4 9 | export BATCH_SIZE=4 10 | export GRADIENT_ACCU_STEPS=4 11 | export MASTER_PORT=29588 12 | export CPUS_PER_TASK=16 13 | export QUOTA=reserved 14 | 15 | export DATA_PATH=datasets/json/mix_evol_sft.json 16 | export SAVE_PATH=llava-v1.6-8b_llama3-8b_clip-large-336_mmevol_sft 17 | export BASE_LR=2e-5 18 | export VIT_LR=2e-6 19 | 20 | 21 | bash -c 'torchrun --nproc_per_node $GPUS_PER_NODE llava/train/train_mem.py \ 22 | --deepspeed ./scripts/zero2.json \ 23 | --model_name_or_path checkpoints/llama-3-8b-Instruct \ 24 | --version llava_llama_3 \ 25 | --data_path ${DATA_PATH} \ 26 | --image_folder datasets \ 27 | --vision_tower checkpoints/openai/clip-vit-large-patch14-336 \ 28 | --mm_projector_type mlp2x_gelu \ 29 | --pretrain_mm_mlp_adapter checkpoints/llava-v1.6-8b_llama3-8b_clip-large-336_pretrain/mm_projector.bin \ 30 | --unfreeze_mm_vision_tower True \ 31 | --mm_vision_tower_lr ${VIT_LR} \ 32 | --image_aspect_ratio anyres \ 33 | --group_by_modality_length True \ 34 | --mm_vision_select_layer -2 \ 35 | --mm_vision_select_feature patch \ 36 | --mm_patch_merge_type spatial_unpad \ 37 | --mm_use_im_start_end False \ 38 | --mm_use_im_patch_token False \ 39 | --bf16 True \ 40 | --output_dir checkpoints/${SAVE_PATH} \ 41 | --num_train_epochs 1 \ 42 | --per_device_train_batch_size ${BATCH_SIZE} \ 43 | --per_device_eval_batch_size 4 \ 44 | --gradient_accumulation_steps ${GRADIENT_ACCU_STEPS} \ 45 | --evaluation_strategy "no" \ 46 | --save_strategy "steps" \ 47 | --save_steps 500 \ 48 | --save_total_limit 20 \ 49 | --learning_rate ${BASE_LR} \ 50 | --weight_decay 0. 
\ 51 | --warmup_ratio 0.03 \ 52 | --lr_scheduler_type "cosine" \ 53 | --logging_steps 1 \ 54 | --tf32 True \ 55 | --model_max_length 4096 \ 56 | --gradient_checkpointing True \ 57 | --dataloader_num_workers 4 \ 58 | --lazy_preprocess True \ 59 | --run_name ${SAVE_PATH} \ 60 | --dataloader_drop_last True \ 61 | --report_to tensorboard' 62 | -------------------------------------------------------------------------------- /scripts/mmevol/train/llama3/pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | 4 | # wandb login 5 | 6 | export CUDA_DEVICE_MAX_CONNECTIONS=1 7 | export GPUS_PER_NODE=8 8 | export NNODES=4 9 | export BATCH_SIZE=4 10 | export GRADIENT_ACCU_STEPS=4 11 | export MASTER_PORT=29504 12 | export CPUS_PER_TASK=16 13 | export QUOTA=reserved 14 | 15 | export DATA_PATH=datasets/llava/llava_pretrain/blip_laion_cc_sbu_558k.json 16 | export SAVE_PATH=llava-v1.6-8b_llama3-8b_clip-large-336_pretrain_blip_laion_cc_sbu_558k 17 | export BASE_LR=1e-3 18 | 19 | 20 | bash -c 'torchrun --nproc_per_node $GPUS_PER_NODE llava/train/train_mem.py \ 21 | --deepspeed ./scripts/zero2.json \ 22 | --model_name_or_path checkpoints/LLM-Research/Meta-Llama-3-8B-Instruct \ 23 | --version plain \ 24 | --data_path ${DATA_PATH} \ 25 | --image_folder datasets/ \ 26 | --vision_tower checkpoints/openai/clip-vit-large-patch14-336 \ 27 | --mm_projector_type mlp2x_gelu \ 28 | --tune_mm_mlp_adapter True \ 29 | --unfreeze_mm_vision_tower False \ 30 | --image_aspect_ratio anyres \ 31 | --mm_vision_select_layer -2 \ 32 | --mm_vision_select_feature patch \ 33 | --mm_patch_merge_type spatial_unpad \ 34 | --mm_use_im_start_end False \ 35 | --mm_use_im_patch_token False \ 36 | --bf16 True \ 37 | --output_dir checkpoints/${SAVE_PATH} \ 38 | --num_train_epochs 1 \ 39 | --per_device_train_batch_size ${BATCH_SIZE} \ 40 | --per_device_eval_batch_size 4 \ 41 | --gradient_accumulation_steps ${GRADIENT_ACCU_STEPS} \ 42 | --evaluation_strategy "no" \ 43 | --save_strategy "steps" \ 44 | --save_steps 500 \ 45 | --save_total_limit 2 \ 46 | --learning_rate ${BASE_LR} \ 47 | --weight_decay 0. 
\ 48 | --warmup_ratio 0.03 \ 49 | --lr_scheduler_type "cosine" \ 50 | --logging_steps 1 \ 51 | --tf32 True \ 52 | --model_max_length 6144 \ 53 | --gradient_checkpointing True \ 54 | --dataloader_num_workers 4 \ 55 | --lazy_preprocess True \ 56 | --run_name ${SAVE_PATH} \ 57 | --dataloader_drop_last True \ 58 | --report_to tensorboard' 59 | -------------------------------------------------------------------------------- /scripts/mmevol/train/qwen2/finetune.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | 4 | # wandb login 5 | 6 | export CUDA_DEVICE_MAX_CONNECTIONS=1 7 | export GPUS_PER_NODE=8 8 | export NNODES=4 9 | export BATCH_SIZE=4 10 | export GRADIENT_ACCU_STEPS=4 11 | export MASTER_PORT=29588 12 | export CPUS_PER_TASK=16 13 | export QUOTA=reserved 14 | 15 | export DATA_PATH=datasets/json/mix_evol_sft.json 16 | export SAVE_PATH=llava-v1.6-8b_qwen2-7b_clip-large-336_mmevol_sft 17 | export BASE_LR=2e-5 18 | export VIT_LR=2e-6 19 | 20 | 21 | bash -c 'torchrun --nproc_per_node $GPUS_PER_NODE llava/train/train_mem.py \ 22 | --deepspeed ./scripts/zero2.json \ 23 | --model_name_or_path checkpoints/qwen/Qwen2-7B-Instruct \ 24 | --version qwen_2 \ 25 | --data_path ${DATA_PATH} \ 26 | --image_folder datasets \ 27 | --vision_tower checkpoints/openai/clip-vit-large-patch14-336 \ 28 | --mm_projector_type mlp2x_gelu \ 29 | --pretrain_mm_mlp_adapter checkpoints/llava-v1.6-7b_qwen2-7b_clip-large-336_pretrain/mm_projector.bin \ 30 | --unfreeze_mm_vision_tower True \ 31 | --mm_vision_tower_lr ${VIT_LR} \ 32 | --image_aspect_ratio anyres \ 33 | --group_by_modality_length True \ 34 | --mm_vision_select_layer -2 \ 35 | --mm_vision_select_feature patch \ 36 | --mm_patch_merge_type spatial_unpad \ 37 | --mm_use_im_start_end False \ 38 | --mm_use_im_patch_token False \ 39 | --bf16 True \ 40 | --output_dir checkpoints/${SAVE_PATH} \ 41 | --num_train_epochs 1 \ 42 | --per_device_train_batch_size ${BATCH_SIZE} \ 43 | --per_device_eval_batch_size 4 \ 44 | --gradient_accumulation_steps ${GRADIENT_ACCU_STEPS} \ 45 | --evaluation_strategy "no" \ 46 | --save_strategy "steps" \ 47 | --save_steps 500 \ 48 | --save_total_limit 20 \ 49 | --learning_rate ${BASE_LR} \ 50 | --weight_decay 0. 
\ 51 | --warmup_ratio 0.03 \ 52 | --lr_scheduler_type "cosine" \ 53 | --logging_steps 1 \ 54 | --tf32 True \ 55 | --model_max_length 4096 \ 56 | --gradient_checkpointing True \ 57 | --dataloader_num_workers 4 \ 58 | --lazy_preprocess True \ 59 | --run_name ${SAVE_PATH} \ 60 | --dataloader_drop_last True \ 61 | --report_to tensorboard' 62 | -------------------------------------------------------------------------------- /scripts/mmevol/train/qwen2/pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | 4 | # wandb login 5 | 6 | export CUDA_DEVICE_MAX_CONNECTIONS=1 7 | export GPUS_PER_NODE=4 8 | export NNODES=4 9 | export BATCH_SIZE=4 10 | export GRADIENT_ACCU_STEPS=8 11 | export MASTER_PORT=29504 12 | export CPUS_PER_TASK=16 13 | export QUOTA=reserved 14 | 15 | export DATA_PATH=datasets/llava/llava_pretrain/blip_laion_cc_sbu_558k.json 16 | export SAVE_PATH=llava-v1.6-8b_qwen2-7b_clip-large-336_pretrain_blip_laion_cc_sbu_558k 17 | export BASE_LR=1e-3 18 | 19 | 20 | bash -c 'torchrun --nproc_per_node $GPUS_PER_NODE llava/train/train_mem.py \ 21 | --deepspeed ./scripts/zero2.json \ 22 | --model_name_or_path checkpoints/qwen/Qwen2-7B-Instruct \ 23 | --version plain \ 24 | --data_path ${DATA_PATH} \ 25 | --image_folder datasets/llava/llava_pretrain/images \ 26 | --vision_tower checkpoints/openai/clip-vit-large-patch14-336 \ 27 | --mm_projector_type mlp2x_gelu \ 28 | --tune_mm_mlp_adapter True \ 29 | --unfreeze_mm_vision_tower False \ 30 | --image_aspect_ratio anyres \ 31 | --mm_vision_select_layer -2 \ 32 | --mm_vision_select_feature patch \ 33 | --mm_patch_merge_type spatial_unpad \ 34 | --mm_use_im_start_end False \ 35 | --mm_use_im_patch_token False \ 36 | --bf16 True \ 37 | --output_dir checkpoints/${SAVE_PATH} \ 38 | --num_train_epochs 1 \ 39 | --per_device_train_batch_size ${BATCH_SIZE} \ 40 | --per_device_eval_batch_size 4 \ 41 | --gradient_accumulation_steps ${GRADIENT_ACCU_STEPS} \ 42 | --evaluation_strategy "no" \ 43 | --save_strategy "steps" \ 44 | --save_steps 500 \ 45 | --save_total_limit 2 \ 46 | --learning_rate ${BASE_LR} \ 47 | --weight_decay 0. 
\ 48 | --warmup_ratio 0.03 \ 49 | --lr_scheduler_type "cosine" \ 50 | --logging_steps 1 \ 51 | --tf32 True \ 52 | --model_max_length 4096 \ 53 | --gradient_checkpointing True \ 54 | --dataloader_num_workers 4 \ 55 | --lazy_preprocess True \ 56 | --run_name ${SAVE_PATH} \ 57 | --dataloader_drop_last True \ 58 | --report_to tensorboard' 59 | -------------------------------------------------------------------------------- /scripts/train/llama3/asr_finetune.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | 4 | # wandb login 5 | echo "DLC seemed world size(actually nodes): ${WORLD_SIZE}" 6 | NNODES=${WORLD_SIZE} 7 | NODE_RANK=${RANK} 8 | 9 | export CUDA_DEVICE_MAX_CONNECTIONS=1 10 | export GPUS_PER_NODE=8 11 | # export NNODES=2 12 | export BATCH_SIZE=8 13 | export GRADIENT_ACCU_STEPS=4 14 | export MASTER_PORT=29588 15 | export CPUS_PER_TASK=16 16 | export QUOTA=reserved 17 | 18 | export DATA_PATH=./datasets/openomni/json/openomni_asr.json 19 | export SAVE_PATH=openomni_asr_qwen_2 20 | export BASE_LR=2e-5 21 | export VIT_LR=2e-6 22 | 23 | DISTRIBUTED_ARGS=" 24 | --nproc_per_node $GPUS_PER_NODE \ 25 | --nnodes $NNODES \ 26 | --node_rank $NODE_RANK \ 27 | --master_addr $MASTER_ADDR \ 28 | --master_port $MASTER_PORT 29 | " 30 | 31 | # 定义重试的最大次数 32 | MAX_RETRIES=5 33 | 34 | # 每次重试之间的等待时间,单位为秒 35 | WAIT_TIME=200 36 | 37 | # 当前的重试次数 38 | retry_count=0 39 | 40 | # 要执行的命令 41 | command_to_run="torchrun --nproc_per_node $GPUS_PER_NODE openomni/train/train_mem.py \ 42 | --deepspeed ./scripts/zero2.json \ 43 | --model_name_or_path ./checkpoints/openomni_stage3_qwen_3/checkpoint-last \ 44 | --version llava_llama_3 \ 45 | --data_path ${DATA_PATH} \ 46 | --image_folder ./datasets \ 47 | --speech_folder ./datasets \ 48 | --vision_tower ./checkpoints/openai/clip-vit-large-patch14-336 \ 49 | --mm_projector_type mlp2x_gelu \ 50 | --freeze_backbone False \ 51 | --tune_speech_adapter True \ 52 | --speech_encoder ./checkpoints/openai-whisper/large-v3.pt \ 53 | --unfreeze_mm_vision_tower True \ 54 | --mm_vision_tower_lr ${VIT_LR} \ 55 | --image_aspect_ratio anyres \ 56 | --group_by_modality_length True \ 57 | --mm_vision_select_layer -2 \ 58 | --mm_vision_select_feature patch \ 59 | --mm_patch_merge_type spatial_unpad \ 60 | --mm_use_im_start_end False \ 61 | --mm_use_im_patch_token False \ 62 | --bf16 True \ 63 | --output_dir checkpoints/${SAVE_PATH} \ 64 | --num_train_epochs 1 \ 65 | --per_device_train_batch_size ${BATCH_SIZE} \ 66 | --per_device_eval_batch_size 4 \ 67 | --gradient_accumulation_steps ${GRADIENT_ACCU_STEPS} \ 68 | --evaluation_strategy "no" \ 69 | --save_strategy "steps" \ 70 | --save_steps 500 \ 71 | --save_total_limit 20 \ 72 | --learning_rate ${BASE_LR} \ 73 | --weight_decay 0. \ 74 | --warmup_ratio 0.03 \ 75 | --lr_scheduler_type "cosine" \ 76 | --logging_steps 1 \ 77 | --tf32 True \ 78 | --model_max_length 4096 \ 79 | --gradient_checkpointing True \ 80 | --dataloader_num_workers 8 \ 81 | --lazy_preprocess True \ 82 | --run_name ${SAVE_PATH} \ 83 | --dataloader_drop_last True \ 84 | --report_to tensorboard | tee train_${SAVE_PATH}.log" 85 | 86 | while (( retry_count < MAX_RETRIES )); do 87 | # 执行命令 88 | eval $command_to_run 89 | echo "命令失败,重试中..." 
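    # Note: the echo above reports "command failed, retrying..."; as written, this loop
    # always reruns the command up to MAX_RETRIES times because the exit status of
    # `eval $command_to_run` is never checked. A hedged sketch of the usual guard,
    # placed immediately after the eval, would be:
    #     if [[ $? -eq 0 ]]; then echo "command succeeded"; break; fi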
90 | ((retry_count++)) 91 | 92 | # 等待一段时间后再重试 93 | sleep $WAIT_TIME 94 | # fi 95 | done 96 | 97 | # 检查是否超过最大重试次数 98 | if (( retry_count == MAX_RETRIES )); then 99 | echo "命令在达到最大重试次数后仍然失败。" 100 | exit 1 101 | fi 102 | 103 | -------------------------------------------------------------------------------- /scripts/train/llama3/image2text_finetune.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | 4 | # wandb login 5 | echo "DLC seemed world size(actually nodes): ${WORLD_SIZE}" 6 | NNODES=${WORLD_SIZE} 7 | NODE_RANK=${RANK} 8 | 9 | export CUDA_DEVICE_MAX_CONNECTIONS=1 10 | export GPUS_PER_NODE=8 11 | # export NNODES=2 12 | export BATCH_SIZE=4 13 | export GRADIENT_ACCU_STEPS=4 14 | export MASTER_PORT=29588 15 | export CPUS_PER_TASK=16 16 | export QUOTA=reserved 17 | 18 | export DATA_PATH=./datasets/openomni/json/openomni_stage2-2.json 19 | export SAVE_PATH=openomni_stage2-2_qwen_2 20 | export BASE_LR=2e-5 21 | export VIT_LR=2e-6 22 | 23 | DISTRIBUTED_ARGS=" 24 | --nproc_per_node $GPUS_PER_NODE \ 25 | --nnodes $NNODES \ 26 | --node_rank $NODE_RANK \ 27 | --master_addr $MASTER_ADDR \ 28 | --master_port $MASTER_PORT 29 | " 30 | 31 | # 定义重试的最大次数 32 | MAX_RETRIES=5 33 | 34 | # 每次重试之间的等待时间,单位为秒 35 | WAIT_TIME=200 36 | 37 | # 当前的重试次数 38 | retry_count=0 39 | 40 | # 要执行的命令 41 | command_to_run="torchrun --nproc_per_node $GPUS_PER_NODE openomni/train/train_mem.py \ 42 | --deepspeed ./scripts/zero2.json \ 43 | --model_name_or_path ./datasets/checkpoints/qwen/Qwen2-7B-Instruct \ 44 | --version llava_llama_3 \ 45 | --data_path ${DATA_PATH} \ 46 | --image_folder ./datasets \ 47 | --speech_folder ./datasets \ 48 | --vision_tower ./checkpoints/openai/clip-vit-large-patch14-336 \ 49 | --mm_projector_type mlp2x_gelu \ 50 | --freeze_backbone False \ 51 | --tune_speech_adapter False \ 52 | --pretrain_mm_mlp_adapter ./checkpoints/openomni_stage2_qwen_2/mm_projector.bin \ 53 | --speech_encoder ./checkpoints/openai-whisper/large-v3.pt \ 54 | --unfreeze_mm_vision_tower True \ 55 | --mm_vision_tower_lr ${VIT_LR} \ 56 | --image_aspect_ratio anyres \ 57 | --group_by_modality_length True \ 58 | --mm_vision_select_layer -2 \ 59 | --mm_vision_select_feature patch \ 60 | --mm_patch_merge_type spatial_unpad \ 61 | --mm_use_im_start_end False \ 62 | --mm_use_im_patch_token False \ 63 | --bf16 True \ 64 | --output_dir ./checkpoints/${SAVE_PATH} \ 65 | --num_train_epochs 1 \ 66 | --per_device_train_batch_size ${BATCH_SIZE} \ 67 | --per_device_eval_batch_size 4 \ 68 | --gradient_accumulation_steps ${GRADIENT_ACCU_STEPS} \ 69 | --evaluation_strategy "no" \ 70 | --save_strategy "steps" \ 71 | --save_steps 100 \ 72 | --save_total_limit 20 \ 73 | --learning_rate ${BASE_LR} \ 74 | --weight_decay 0. \ 75 | --warmup_ratio 0.03 \ 76 | --lr_scheduler_type "cosine" \ 77 | --logging_steps 1 \ 78 | --tf32 True \ 79 | --model_max_length 4096 \ 80 | --gradient_checkpointing True \ 81 | --dataloader_num_workers 16 \ 82 | --lazy_preprocess True \ 83 | --run_name ${SAVE_PATH} \ 84 | --dataloader_drop_last True \ 85 | --report_to tensorboard | tee train_${SAVE_PATH}.log" 86 | 87 | while (( retry_count < MAX_RETRIES )); do 88 | # 执行命令 89 | eval $command_to_run 90 | 91 | # # 检查命令的退出状态 92 | # if [[ $? -eq 0 ]]; then 93 | # # 命令成功,退出循环 94 | # echo "命令成功执行。" 95 | # break 96 | # else 97 | # 命令失败,增加重试计数 98 | echo "命令失败,重试中..." 
99 | ((retry_count++)) 100 | 101 | # 等待一段时间后再重试 102 | sleep $WAIT_TIME 103 | # fi 104 | done 105 | 106 | # 检查是否超过最大重试次数 107 | if (( retry_count == MAX_RETRIES )); then 108 | echo "命令在达到最大重试次数后仍然失败。" 109 | exit 1 110 | fi 111 | 112 | -------------------------------------------------------------------------------- /scripts/train/llama3/image2text_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | 4 | # wandb login 5 | export CUDA_DEVICE_MAX_CONNECTIONS=1 6 | export GPUS_PER_NODE=8 7 | export NNODES=4 8 | export BATCH_SIZE=8 9 | export GRADIENT_ACCU_STEPS=4 10 | export MASTER_PORT=29504 11 | export CPUS_PER_TASK=16 12 | export QUOTA=reserved 13 | 14 | export DATA_PATH=./datasets/openomni/json/openomni_stage2-1.json 15 | export SAVE_PATH=openomni_stage2-1_qwen_2 16 | export BASE_LR=1e-3 17 | 18 | 19 | bash -c "torchrun --nproc_per_node $GPUS_PER_NODE openomni/train/train_mem.py \ 20 | --deepspeed ./scripts/zero2.json \ 21 | --model_name_or_path ./checkpoints/Meta-Llama-3.1-8B-Instruct \ 22 | --version plain \ 23 | --data_path ${DATA_PATH} \ 24 | --image_folder ./datasets/llava/llava_pretrain/images \ 25 | --vision_tower ./checkpoints/openai/clip-vit-large-patch14-336 \ 26 | --speech_encoder ./checkpoints/openai-whisper/large-v3.pt \ 27 | --mm_projector_type mlp2x_gelu \ 28 | --tune_mm_mlp_adapter True \ 29 | --unfreeze_mm_vision_tower False \ 30 | --image_aspect_ratio anyres \ 31 | --mm_vision_select_layer -2 \ 32 | --mm_vision_select_feature patch \ 33 | --mm_patch_merge_type spatial_unpad \ 34 | --mm_use_im_start_end False \ 35 | --mm_use_im_patch_token False \ 36 | --bf16 True \ 37 | --output_dir ./checkpoints/${SAVE_PATH} \ 38 | --num_train_epochs 1 \ 39 | --per_device_train_batch_size ${BATCH_SIZE} \ 40 | --per_device_eval_batch_size 4 \ 41 | --gradient_accumulation_steps ${GRADIENT_ACCU_STEPS} \ 42 | --evaluation_strategy "no" \ 43 | --save_strategy "steps" \ 44 | --save_steps 500 \ 45 | --save_total_limit 2 \ 46 | --learning_rate ${BASE_LR} \ 47 | --weight_decay 0. 
\ 48 | --warmup_ratio 0.03 \ 49 | --lr_scheduler_type "cosine" \ 50 | --logging_steps 1 \ 51 | --tf32 True \ 52 | --model_max_length 8096 \ 53 | --gradient_checkpointing True \ 54 | --dataloader_num_workers 4 \ 55 | --lazy_preprocess True \ 56 | --run_name ${SAVE_PATH} \ 57 | --dataloader_drop_last True \ 58 | --report_to tensorboard | tee train_${SAVE_PATH}.log" 59 | -------------------------------------------------------------------------------- /scripts/train/llama3/speech2text_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | 4 | # wandb login 5 | export CUDA_DEVICE_MAX_CONNECTIONS=1 6 | export GPUS_PER_NODE=8 7 | export NNODES=4 8 | export BATCH_SIZE=8 9 | export GRADIENT_ACCU_STEPS=4 10 | export MASTER_PORT=29504 11 | export CPUS_PER_TASK=16 12 | export QUOTA=reserved 13 | 14 | export DATA_PATH=./datasets/openomni/json/openomni_stage1-1.json 15 | export SAVE_PATH=openomni_stage1-1_qwen_2 16 | export BASE_LR=1e-4 17 | 18 | 19 | bash -c "torchrun --nproc_per_node $GPUS_PER_NODE openomni/train/train_mem.py \ 20 | --deepspeed ./scripts/zero2.json \ 21 | --model_name_or_path ./checkpoints/Meta-Llama-3.1-8B-Instruct \ 22 | --version plain \ 23 | --data_path ${DATA_PATH} \ 24 | --image_folder ./datasets/llava/llava_pretrain/images \ 25 | --vision_tower ./checkpoints/openai/clip-vit-large-patch14-336 \ 26 | --speech_encoder ./checkpoints/openai-whisper/large-v3.pt \ 27 | --mm_projector_type mlp2x_gelu \ 28 | --freeze_backbone True \ 29 | --tune_mm_mlp_adapter False \ 30 | --tune_speech_adapter True \ 31 | --freeze_mm_mlp_adapter True \ 32 | --unfreeze_mm_vision_tower False \ 33 | --image_aspect_ratio anyres \ 34 | --mm_vision_select_layer -2 \ 35 | --mm_vision_select_feature patch \ 36 | --mm_patch_merge_type spatial_unpad \ 37 | --mm_use_im_start_end False \ 38 | --mm_use_im_patch_token False \ 39 | --bf16 True \ 40 | --group_by_modality_length True \ 41 | --output_dir ./checkpoints/${SAVE_PATH} \ 42 | --num_train_epochs 1 \ 43 | --per_device_train_batch_size ${BATCH_SIZE} \ 44 | --per_device_eval_batch_size 4 \ 45 | --gradient_accumulation_steps ${GRADIENT_ACCU_STEPS} \ 46 | --evaluation_strategy "no" \ 47 | --save_strategy "steps" \ 48 | --save_steps 500 \ 49 | --save_total_limit 2 \ 50 | --learning_rate ${BASE_LR} \ 51 | --weight_decay 0. 
\ 52 | --warmup_ratio 0.03 \ 53 | --lr_scheduler_type "cosine" \ 54 | --logging_steps 1 \ 55 | --tf32 True \ 56 | --model_max_length 8096 \ 57 | --gradient_checkpointing True \ 58 | --dataloader_num_workers 8 \ 59 | --lazy_preprocess True \ 60 | --run_name ${SAVE_PATH} \ 61 | --dataloader_drop_last True \ 62 | --report_to tensorboard | tee train_${SAVE_PATH}.log" 63 | -------------------------------------------------------------------------------- /scripts/train/llama3/text2speech_dpo.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | 4 | # wandb login 5 | echo "DLC seemed world size(actually nodes): ${WORLD_SIZE}" 6 | NNODES=${WORLD_SIZE} 7 | NODE_RANK=${RANK} 8 | 9 | export CUDA_DEVICE_MAX_CONNECTIONS=1 10 | export GPUS_PER_NODE=8 11 | # export NNODES=2 12 | export BATCH_SIZE=1 13 | export GRADIENT_ACCU_STEPS=4 14 | export MASTER_PORT=29588 15 | export CPUS_PER_TASK=16 16 | export QUOTA=reserved 17 | 18 | export DATA_PATH=./datasets/openomni/json/openomni_stage3-2.json 19 | export SAVE_PATH=openomni_stage3-2_qwen_2 20 | export BASE_LR=2e-5 21 | export VIT_LR=2e-6 22 | 23 | DISTRIBUTED_ARGS=" 24 | --nproc_per_node $GPUS_PER_NODE \ 25 | --nnodes $NNODES \ 26 | --node_rank $NODE_RANK \ 27 | --master_addr $MASTER_ADDR \ 28 | --master_port $MASTER_PORT 29 | " 30 | 31 | # 定义重试的最大次数 32 | MAX_RETRIES=3 33 | 34 | # 每次重试之间的等待时间,单位为秒 35 | WAIT_TIME=20 36 | 37 | # 当前的重试次数 38 | retry_count=0 39 | 40 | command_to_run="torchrun --nproc_per_node $GPUS_PER_NODE openomni/train/train_mem.py \ 41 | --deepspeed ./scripts/zero2.json \ 42 | --model_name_or_path ./checkpoints/openomni_stage3_qwen_3/checkpoint-last \ 43 | --version llava_llama_3 \ 44 | --data_path ${DATA_PATH} \ 45 | --image_folder ./datasets \ 46 | --speech_folder ./datasets \ 47 | --vision_tower ./datasets/checkpoints/openai/clip-vit-large-patch14-336 \ 48 | --mm_projector_type mlp2x_gelu \ 49 | --freeze_backbone True \ 50 | --tune_mm_mlp_adapter False \ 51 | --freeze_mm_mlp_adapter True \ 52 | --tune_speech_generator_only True \ 53 | --tune_speech_generator_dpo True \ 54 | --pretrain_mm_mlp_adapter ./checkpoints/openomni_stage2_qwen_2/mm_projector.bin \ 55 | --speech_encoder ./checkpoints/openai-whisper/large-v3.pt \ 56 | --speech_generator ./checkpoints/speech_generator/generator_16k.pt \ 57 | --unfreeze_mm_vision_tower False \ 58 | --mm_vision_tower_lr ${VIT_LR} \ 59 | --speech_generator_lr ${VIT_LR} \ 60 | --mm_projector_lr ${VIT_LR} \ 61 | --image_aspect_ratio anyres \ 62 | --group_by_modality_length True \ 63 | --mm_vision_select_layer -2 \ 64 | --mm_vision_select_feature patch \ 65 | --mm_patch_merge_type spatial_unpad \ 66 | --mm_use_im_start_end False \ 67 | --mm_use_im_patch_token False \ 68 | --bf16 True \ 69 | --output_dir ./checkpoints/${SAVE_PATH} \ 70 | --num_train_epochs 3 \ 71 | --per_device_train_batch_size ${BATCH_SIZE} \ 72 | --per_device_eval_batch_size 4 \ 73 | --gradient_accumulation_steps ${GRADIENT_ACCU_STEPS} \ 74 | --evaluation_strategy "no" \ 75 | --save_strategy "steps" \ 76 | --save_steps 1000 \ 77 | --save_total_limit 20 \ 78 | --learning_rate ${BASE_LR} \ 79 | --weight_decay 0. 
\ 80 | --warmup_ratio 0.03 \ 81 | --lr_scheduler_type "cosine" \ 82 | --logging_steps 1 \ 83 | --tf32 True \ 84 | --model_max_length 8196 \ 85 | --gradient_checkpointing True \ 86 | --dataloader_num_workers 8 \ 87 | --lazy_preprocess True \ 88 | --speech_generator_type "ar" \ 89 | --run_name ${SAVE_PATH} \ 90 | --dataloader_drop_last True \ 91 | --report_to tensorboard | tee train_${SAVE_PATH}.log" 92 | 93 | eval $command_to_run 94 | 95 | -------------------------------------------------------------------------------- /scripts/train/llama3/text2speech_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | 4 | # wandb login 5 | echo "DLC seemed world size(actually nodes): ${WORLD_SIZE}" 6 | NNODES=${WORLD_SIZE} 7 | NODE_RANK=${RANK} 8 | 9 | export CUDA_DEVICE_MAX_CONNECTIONS=1 10 | export GPUS_PER_NODE=8 11 | # export NNODES=2 12 | export BATCH_SIZE=1 13 | export GRADIENT_ACCU_STEPS=4 14 | export MASTER_PORT=29588 15 | export CPUS_PER_TASK=16 16 | export QUOTA=reserved 17 | 18 | export DATA_PATH=./datasets/openomni/json/openomni_stage3-1.json 19 | export SAVE_PATH=openomni_stage3-1_qwen_2 20 | export BASE_LR=2e-5 21 | export VIT_LR=2e-6 22 | 23 | DISTRIBUTED_ARGS=" 24 | --nproc_per_node $GPUS_PER_NODE \ 25 | --nnodes $NNODES \ 26 | --node_rank $NODE_RANK \ 27 | --master_addr $MASTER_ADDR \ 28 | --master_port $MASTER_PORT 29 | " 30 | 31 | # 定义重试的最大次数 32 | MAX_RETRIES=3 33 | 34 | # 每次重试之间的等待时间,单位为秒 35 | WAIT_TIME=20 36 | 37 | # 当前的重试次数 38 | retry_count=0 39 | 40 | command_to_run="torchrun --nproc_per_node $GPUS_PER_NODE openomni/train/train_mem.py \ 41 | --deepspeed ./scripts/zero2.json \ 42 | --model_name_or_path ./checkpoints/openomni_stage3_qwen_3/checkpoint-20900 \ 43 | --version llava_llama_3 \ 44 | --data_path ${DATA_PATH} \ 45 | --image_folder ./datasets \ 46 | --speech_folder ./datasets \ 47 | --vision_tower ./checkpoints/openai/clip-vit-large-patch14-336 \ 48 | --mm_projector_type mlp2x_gelu \ 49 | --freeze_backbone True \ 50 | --tune_mm_mlp_adapter False \ 51 | --freeze_mm_mlp_adapter True \ 52 | --tune_speech_generator_only True \ 53 | --pretrain_mm_mlp_adapter ./checkpoints/openomni_stage2_qwen_2/mm_projector.bin \ 54 | --speech_encoder ./checkpoints/openai-whisper/large-v3.pt \ 55 | --speech_generator ./checkpoints/speech_generator/generator_16K.pt \ 56 | --unfreeze_mm_vision_tower False \ 57 | --mm_vision_tower_lr ${VIT_LR} \ 58 | --speech_generator_lr ${VIT_LR} \ 59 | --mm_projector_lr ${VIT_LR} \ 60 | --image_aspect_ratio anyres \ 61 | --group_by_modality_length True \ 62 | --mm_vision_select_layer -2 \ 63 | --mm_vision_select_feature patch \ 64 | --mm_patch_merge_type spatial_unpad \ 65 | --mm_use_im_start_end False \ 66 | --mm_use_im_patch_token False \ 67 | --bf16 True \ 68 | --output_dir checkpoints/${SAVE_PATH} \ 69 | --num_train_epochs 1 \ 70 | --per_device_train_batch_size ${BATCH_SIZE} \ 71 | --per_device_eval_batch_size 4 \ 72 | --gradient_accumulation_steps ${GRADIENT_ACCU_STEPS} \ 73 | --evaluation_strategy "no" \ 74 | --save_strategy "steps" \ 75 | --save_steps 1000 \ 76 | --save_total_limit 20 \ 77 | --learning_rate ${BASE_LR} \ 78 | --weight_decay 0. 
\ 79 | --warmup_ratio 0.03 \ 80 | --lr_scheduler_type "cosine" \ 81 | --logging_steps 1 \ 82 | --tf32 True \ 83 | --model_max_length 8196 \ 84 | --gradient_checkpointing True \ 85 | --dataloader_num_workers 8 \ 86 | --lazy_preprocess True \ 87 | --speech_generator_type "ar" \ 88 | --run_name ${SAVE_PATH} \ 89 | --dataloader_drop_last True \ 90 | --report_to tensorboard | tee train_${SAVE_PATH}.log" 91 | 92 | eval $command_to_run 93 | -------------------------------------------------------------------------------- /scripts/train/llama3/text2speech_pretrain_ctc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | 4 | # wandb login 5 | echo "DLC seemed world size(actually nodes): ${WORLD_SIZE}" 6 | NNODES=${WORLD_SIZE} 7 | NODE_RANK=${RANK} 8 | 9 | export CUDA_DEVICE_MAX_CONNECTIONS=1 10 | export GPUS_PER_NODE=8 11 | # export NNODES=2 12 | export BATCH_SIZE=1 13 | export GRADIENT_ACCU_STEPS=4 14 | export MASTER_PORT=29588 15 | export CPUS_PER_TASK=16 16 | export QUOTA=reserved 17 | 18 | export DATA_PATH=./datasets/openomni/json/openomni_stage3-1.json 19 | export SAVE_PATH=openomni_stage3-1_qwen_2_ctc 20 | export BASE_LR=2e-4 21 | export VIT_LR=2e-4 22 | 23 | DISTRIBUTED_ARGS=" 24 | --nproc_per_node $GPUS_PER_NODE \ 25 | --nnodes $NNODES \ 26 | --node_rank $NODE_RANK \ 27 | --master_addr $MASTER_ADDR \ 28 | --master_port $MASTER_PORT 29 | " 30 | 31 | # 定义重试的最大次数 32 | MAX_RETRIES=3 33 | 34 | # 每次重试之间的等待时间,单位为秒 35 | WAIT_TIME=20 36 | 37 | # 当前的重试次数 38 | retry_count=0 39 | 40 | command_to_run="torchrun --nproc_per_node $GPUS_PER_NODE openomni/train/train_mem.py \ 41 | --deepspeed ./scripts/zero2.json \ 42 | --model_name_or_path ./checkpoints/openomni_stage3_qwen_3/checkpoint-last \ 43 | --version llava_qwen_2 \ 44 | --data_path ${DATA_PATH} \ 45 | --image_folder ./datasets \ 46 | --speech_folder ./datasets \ 47 | --vision_tower ./checkpoints/openai/clip-vit-large-patch14-336 \ 48 | --mm_projector_type mlp2x_gelu \ 49 | --freeze_backbone True \ 50 | --tune_mm_mlp_adapter False \ 51 | --freeze_mm_mlp_adapter True \ 52 | --tune_speech_generator_only True \ 53 | --pretrain_mm_mlp_adapter ./checkpoints/openomni_stage2_qwen_2/mm_projector.bin \ 54 | --speech_encoder ./checkpoints/openai-whipser/large-v3.pt \ 55 | --speech_generator ./checkpoints/speech_generator/generator_16k.pt \ 56 | --unfreeze_mm_vision_tower False \ 57 | --mm_vision_tower_lr ${VIT_LR} \ 58 | --speech_generator_lr ${VIT_LR} \ 59 | --mm_projector_lr ${VIT_LR} \ 60 | --image_aspect_ratio anyres \ 61 | --group_by_modality_length True \ 62 | --mm_vision_select_layer -2 \ 63 | --mm_vision_select_feature patch \ 64 | --mm_patch_merge_type spatial_unpad \ 65 | --mm_use_im_start_end False \ 66 | --mm_use_im_patch_token False \ 67 | --bf16 True \ 68 | --output_dir ./checkpoints/${SAVE_PATH} \ 69 | --num_train_epochs 3 \ 70 | --per_device_train_batch_size ${BATCH_SIZE} \ 71 | --per_device_eval_batch_size 4 \ 72 | --gradient_accumulation_steps ${GRADIENT_ACCU_STEPS} \ 73 | --evaluation_strategy "no" \ 74 | --save_strategy "steps" \ 75 | --save_steps 1000 \ 76 | --save_total_limit 20 \ 77 | --learning_rate ${BASE_LR} \ 78 | --weight_decay 0. 
\ 79 | --warmup_ratio 0.03 \ 80 | --lr_scheduler_type "cosine" \ 81 | --logging_steps 1 \ 82 | --tf32 True \ 83 | --model_max_length 8196 \ 84 | --gradient_checkpointing True \ 85 | --dataloader_num_workers 8 \ 86 | --lazy_preprocess True \ 87 | --run_name ${SAVE_PATH} \ 88 | --dataloader_drop_last True \ 89 | --speech_generator_type "ctc" \ 90 | --unit_vocab_size 6561 \ 91 | --report_to tensorboard | tee train_${SAVE_PATH}.log" 92 | 93 | eval $command_to_run 94 | 95 | 96 | -------------------------------------------------------------------------------- /scripts/train/qwen2/asr_finetune.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | 4 | # wandb login 5 | echo "DLC seemed world size(actually nodes): ${WORLD_SIZE}" 6 | NNODES=${WORLD_SIZE} 7 | NODE_RANK=${RANK} 8 | 9 | export CUDA_DEVICE_MAX_CONNECTIONS=1 10 | export GPUS_PER_NODE=8 11 | # export NNODES=2 12 | export BATCH_SIZE=8 13 | export GRADIENT_ACCU_STEPS=4 14 | export MASTER_PORT=29588 15 | export CPUS_PER_TASK=16 16 | export QUOTA=reserved 17 | 18 | export DATA_PATH=./datasets/openomni/json/openomni_stage1-1.json 19 | export SAVE_PATH=openomni_asr_qwen_2 20 | export BASE_LR=2e-5 21 | export VIT_LR=2e-6 22 | 23 | DISTRIBUTED_ARGS=" 24 | --nproc_per_node $GPUS_PER_NODE \ 25 | --nnodes $NNODES \ 26 | --node_rank $NODE_RANK \ 27 | --master_addr $MASTER_ADDR \ 28 | --master_port $MASTER_PORT 29 | " 30 | 31 | # 定义重试的最大次数 32 | MAX_RETRIES=5 33 | 34 | # 每次重试之间的等待时间,单位为秒 35 | WAIT_TIME=200 36 | 37 | # 当前的重试次数 38 | retry_count=0 39 | 40 | # 要执行的命令 41 | command_to_run="torchrun --nproc_per_node $GPUS_PER_NODE openomni/train/train_mem.py \ 42 | --deepspeed ./scripts/zero2.json \ 43 | --model_name_or_path ./checkpoints/openomni_stage3_qwen_3/checkpoint-last \ 44 | --version llava_qwen_2 \ 45 | --data_path ${DATA_PATH} \ 46 | --image_folder ./datasets \ 47 | --speech_folder ./datasets \ 48 | --vision_tower ./checkpoints/openai/clip-vit-large-patch14-336 \ 49 | --mm_projector_type mlp2x_gelu \ 50 | --freeze_backbone False \ 51 | --tune_speech_adapter True \ 52 | --speech_encoder ./checkpoints/openai-whisper/large-v3.pt \ 53 | --unfreeze_mm_vision_tower True \ 54 | --mm_vision_tower_lr ${VIT_LR} \ 55 | --image_aspect_ratio anyres \ 56 | --group_by_modality_length True \ 57 | --mm_vision_select_layer -2 \ 58 | --mm_vision_select_feature patch \ 59 | --mm_patch_merge_type spatial_unpad \ 60 | --mm_use_im_start_end False \ 61 | --mm_use_im_patch_token False \ 62 | --bf16 True \ 63 | --output_dir ./checkpoints/${SAVE_PATH} \ 64 | --num_train_epochs 1 \ 65 | --per_device_train_batch_size ${BATCH_SIZE} \ 66 | --per_device_eval_batch_size 4 \ 67 | --gradient_accumulation_steps ${GRADIENT_ACCU_STEPS} \ 68 | --evaluation_strategy "no" \ 69 | --save_strategy "steps" \ 70 | --save_steps 500 \ 71 | --save_total_limit 20 \ 72 | --learning_rate ${BASE_LR} \ 73 | --weight_decay 0. \ 74 | --warmup_ratio 0.03 \ 75 | --lr_scheduler_type "cosine" \ 76 | --logging_steps 1 \ 77 | --tf32 True \ 78 | --model_max_length 4096 \ 79 | --gradient_checkpointing True \ 80 | --dataloader_num_workers 8 \ 81 | --lazy_preprocess True \ 82 | --run_name ${SAVE_PATH} \ 83 | --dataloader_drop_last True \ 84 | --report_to tensorboard | tee train_${SAVE_PATH}.log" 85 | 86 | while (( retry_count < MAX_RETRIES )); do 87 | # 执行命令 88 | eval $command_to_run 89 | echo "命令失败,重试中..." 
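    # Note: as in the llama3 variant, this retry loop reruns the command unconditionally
    # up to MAX_RETRIES times; the exit-status check is omitted. A hedged sketch of the
    # usual guard (right after `eval $command_to_run`):
    #     if [[ $? -eq 0 ]]; then break; fi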
90 | ((retry_count++)) 91 | 92 | # 等待一段时间后再重试 93 | sleep $WAIT_TIME 94 | # fi 95 | done 96 | 97 | # 检查是否超过最大重试次数 98 | if (( retry_count == MAX_RETRIES )); then 99 | echo "命令在达到最大重试次数后仍然失败。" 100 | exit 1 101 | fi 102 | 103 | -------------------------------------------------------------------------------- /scripts/train/qwen2/image2text_finetune.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | 4 | # wandb login 5 | echo "DLC seemed world size(actually nodes): ${WORLD_SIZE}" 6 | NNODES=${WORLD_SIZE} 7 | NODE_RANK=${RANK} 8 | 9 | export CUDA_DEVICE_MAX_CONNECTIONS=1 10 | export GPUS_PER_NODE=8 11 | # export NNODES=2 12 | export BATCH_SIZE=4 13 | export GRADIENT_ACCU_STEPS=4 14 | export MASTER_PORT=29588 15 | export CPUS_PER_TASK=16 16 | export QUOTA=reserved 17 | 18 | export DATA_PATH=./datasets/openomni/json/openomni_stage2-2.json 19 | export SAVE_PATH=openomni_stage2-2_qwen_2 20 | export BASE_LR=2e-5 21 | export VIT_LR=2e-6 22 | 23 | DISTRIBUTED_ARGS=" 24 | --nproc_per_node $GPUS_PER_NODE \ 25 | --nnodes $NNODES \ 26 | --node_rank $NODE_RANK \ 27 | --master_addr $MASTER_ADDR \ 28 | --master_port $MASTER_PORT 29 | " 30 | 31 | # 定义重试的最大次数 32 | MAX_RETRIES=5 33 | 34 | # 每次重试之间的等待时间,单位为秒 35 | WAIT_TIME=200 36 | 37 | # 当前的重试次数 38 | retry_count=0 39 | 40 | # 要执行的命令 41 | command_to_run="torchrun --nproc_per_node $GPUS_PER_NODE openomni/train/train_mem.py \ 42 | --deepspeed ./scripts/zero2.json \ 43 | --model_name_or_path ./checkpoints/qwen/Qwen2-7B-Instruct \ 44 | --version llava_qwen_2 \ 45 | --data_path ${DATA_PATH} \ 46 | --image_folder ./datasets \ 47 | --speech_folder ./datasets \ 48 | --vision_tower ./datasets/checkpoints/openai/clip-vit-large-patch14-336 \ 49 | --mm_projector_type mlp2x_gelu \ 50 | --freeze_backbone False \ 51 | --tune_speech_adapter False \ 52 | --pretrain_mm_mlp_adapter ./checkpoints/openomni_stage2_qwen_2/mm_projector.bin \ 53 | --speech_encoder ./checkpoints/openai-whisper/large-v3.pt \ 54 | --unfreeze_mm_vision_tower True \ 55 | --mm_vision_tower_lr ${VIT_LR} \ 56 | --image_aspect_ratio anyres \ 57 | --group_by_modality_length True \ 58 | --mm_vision_select_layer -2 \ 59 | --mm_vision_select_feature patch \ 60 | --mm_patch_merge_type spatial_unpad \ 61 | --mm_use_im_start_end False \ 62 | --mm_use_im_patch_token False \ 63 | --bf16 True \ 64 | --output_dir ./checkpoints/${SAVE_PATH} \ 65 | --num_train_epochs 1 \ 66 | --per_device_train_batch_size ${BATCH_SIZE} \ 67 | --per_device_eval_batch_size 4 \ 68 | --gradient_accumulation_steps ${GRADIENT_ACCU_STEPS} \ 69 | --evaluation_strategy "no" \ 70 | --save_strategy "steps" \ 71 | --save_steps 100 \ 72 | --save_total_limit 20 \ 73 | --learning_rate ${BASE_LR} \ 74 | --weight_decay 0. \ 75 | --warmup_ratio 0.03 \ 76 | --lr_scheduler_type "cosine" \ 77 | --logging_steps 1 \ 78 | --tf32 True \ 79 | --model_max_length 4096 \ 80 | --gradient_checkpointing True \ 81 | --dataloader_num_workers 16 \ 82 | --lazy_preprocess True \ 83 | --run_name ${SAVE_PATH} \ 84 | --dataloader_drop_last True \ 85 | --report_to tensorboard | tee train_${SAVE_PATH}.log" 86 | 87 | while (( retry_count < MAX_RETRIES )); do 88 | # 执行命令 89 | eval $command_to_run 90 | 91 | # # 检查命令的退出状态 92 | # if [[ $? -eq 0 ]]; then 93 | # # 命令成功,退出循环 94 | # echo "命令成功执行。" 95 | # break 96 | # else 97 | # 命令失败,增加重试计数 98 | echo "命令失败,重试中..." 
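# Note: the exit-status check above ("if [[ $? -eq 0 ]] ... break ... else") is commented out,
# and its closing "fi" below is commented as well, so as written this loop re-runs the command
# up to MAX_RETRIES times even when it succeeds.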
99 | ((retry_count++)) 100 | 101 | # 等待一段时间后再重试 102 | sleep $WAIT_TIME 103 | # fi 104 | done 105 | 106 | # 检查是否超过最大重试次数 107 | if (( retry_count == MAX_RETRIES )); then 108 | echo "命令在达到最大重试次数后仍然失败。" 109 | exit 1 110 | fi 111 | 112 | -------------------------------------------------------------------------------- /scripts/train/qwen2/image2text_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | 4 | # wandb login 5 | export CUDA_DEVICE_MAX_CONNECTIONS=1 6 | export GPUS_PER_NODE=8 7 | export NNODES=4 8 | export BATCH_SIZE=4 9 | export GRADIENT_ACCU_STEPS=8 10 | export MASTER_PORT=29504 11 | export CPUS_PER_TASK=16 12 | export QUOTA=reserved 13 | 14 | export DATA_PATH=./datasets/openomni/json/openomni_stage2-1.json 15 | export SAVE_PATH=openomni_stage2-1_qwen_2 16 | export BASE_LR=1e-3 17 | 18 | 19 | bash -c "torchrun --nproc_per_node $GPUS_PER_NODE openomni/train/train_mem.py \ 20 | --deepspeed ./scripts/zero2.json \ 21 | --model_name_or_path ./checkpoints/qwen/Qwen2-7B-Instruct \ 22 | --version plain \ 23 | --data_path ${DATA_PATH} \ 24 | --image_folder ./datasets/llava/llava_pretrain/images \ 25 | --vision_tower ./checkpoints/openai/clip-vit-large-patch14-336 \ 26 | --speech_encoder ./checkpoints/openai-whisper/large-v3.pt \ 27 | --mm_projector_type mlp2x_gelu \ 28 | --tune_mm_mlp_adapter True \ 29 | --unfreeze_mm_vision_tower False \ 30 | --image_aspect_ratio anyres \ 31 | --mm_vision_select_layer -2 \ 32 | --mm_vision_select_feature patch \ 33 | --mm_patch_merge_type spatial_unpad \ 34 | --mm_use_im_start_end False \ 35 | --mm_use_im_patch_token False \ 36 | --bf16 True \ 37 | --output_dir ./checkpoints/${SAVE_PATH} \ 38 | --num_train_epochs 1 \ 39 | --per_device_train_batch_size ${BATCH_SIZE} \ 40 | --per_device_eval_batch_size 4 \ 41 | --gradient_accumulation_steps ${GRADIENT_ACCU_STEPS} \ 42 | --evaluation_strategy "no" \ 43 | --save_strategy "steps" \ 44 | --save_steps 500 \ 45 | --save_total_limit 2 \ 46 | --learning_rate ${BASE_LR} \ 47 | --weight_decay 0. 
\ 48 | --warmup_ratio 0.03 \ 49 | --lr_scheduler_type "cosine" \ 50 | --logging_steps 1 \ 51 | --tf32 True \ 52 | --model_max_length 8096 \ 53 | --gradient_checkpointing True \ 54 | --dataloader_num_workers 4 \ 55 | --lazy_preprocess True \ 56 | --run_name ${SAVE_PATH} \ 57 | --dataloader_drop_last True \ 58 | --report_to tensorboard | tee train_${SAVE_PATH}.log" 59 | -------------------------------------------------------------------------------- /scripts/train/qwen2/speech2text_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | 4 | # wandb login 5 | export CUDA_DEVICE_MAX_CONNECTIONS=1 6 | export GPUS_PER_NODE=8 7 | export NNODES=4 8 | export BATCH_SIZE=8 9 | export GRADIENT_ACCU_STEPS=4 10 | export MASTER_PORT=29504 11 | export CPUS_PER_TASK=16 12 | export QUOTA=reserved 13 | 14 | export DATA_PATH=./datasets/openomni/json/openomni_stage1-1.json 15 | export SAVE_PATH=openomni_stage1-1_qwen_2 16 | export BASE_LR=1e-4 17 | 18 | 19 | bash -c "torchrun --nproc_per_node $GPUS_PER_NODE openomni/train/train_mem.py \ 20 | --deepspeed ./scripts/zero2.json \ 21 | --model_name_or_path ./checkpoints/qwen/Qwen2-7B-Instruct \ 22 | --version plain \ 23 | --data_path ${DATA_PATH} \ 24 | --image_folder ./datasets/llava/llava_pretrain/images \ 25 | --vision_tower ./checkpoints/openai/clip-vit-large-patch14-336 \ 26 | --speech_encoder ./checkpoints/openai-whisper/large-v3.pt \ 27 | --mm_projector_type mlp2x_gelu \ 28 | --freeze_backbone True \ 29 | --tune_mm_mlp_adapter False \ 30 | --tune_speech_adapter True \ 31 | --freeze_mm_mlp_adapter True \ 32 | --unfreeze_mm_vision_tower False \ 33 | --image_aspect_ratio anyres \ 34 | --mm_vision_select_layer -2 \ 35 | --mm_vision_select_feature patch \ 36 | --mm_patch_merge_type spatial_unpad \ 37 | --mm_use_im_start_end False \ 38 | --mm_use_im_patch_token False \ 39 | --bf16 True \ 40 | --group_by_modality_length True \ 41 | --output_dir checkpoints/${SAVE_PATH} \ 42 | --num_train_epochs 1 \ 43 | --per_device_train_batch_size ${BATCH_SIZE} \ 44 | --per_device_eval_batch_size 4 \ 45 | --gradient_accumulation_steps ${GRADIENT_ACCU_STEPS} \ 46 | --evaluation_strategy "no" \ 47 | --save_strategy "steps" \ 48 | --save_steps 500 \ 49 | --save_total_limit 2 \ 50 | --learning_rate ${BASE_LR} \ 51 | --weight_decay 0. 
\ 52 | --warmup_ratio 0.03 \ 53 | --lr_scheduler_type "cosine" \ 54 | --logging_steps 1 \ 55 | --tf32 True \ 56 | --model_max_length 8096 \ 57 | --gradient_checkpointing True \ 58 | --dataloader_num_workers 8 \ 59 | --lazy_preprocess True \ 60 | --run_name ${SAVE_PATH} \ 61 | --dataloader_drop_last True \ 62 | --report_to tensorboard | tee train_${SAVE_PATH}.log" 63 | -------------------------------------------------------------------------------- /scripts/train/qwen2/text2speech_dpo.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | 4 | # wandb login 5 | echo "DLC seemed world size(actually nodes): ${WORLD_SIZE}" 6 | NNODES=${WORLD_SIZE} 7 | NODE_RANK=${RANK} 8 | 9 | export CUDA_DEVICE_MAX_CONNECTIONS=1 10 | export GPUS_PER_NODE=8 11 | # export NNODES=2 12 | export BATCH_SIZE=1 13 | export GRADIENT_ACCU_STEPS=4 14 | export MASTER_PORT=29588 15 | export CPUS_PER_TASK=16 16 | export QUOTA=reserved 17 | 18 | export DATA_PATH=./datasets/openomni/json/openomni_stage3-2.json 19 | export SAVE_PATH=openomni_stage3-2_qwen_2 20 | export BASE_LR=2e-5 21 | export VIT_LR=2e-6 22 | 23 | DISTRIBUTED_ARGS=" 24 | --nproc_per_node $GPUS_PER_NODE \ 25 | --nnodes $NNODES \ 26 | --node_rank $NODE_RANK \ 27 | --master_addr $MASTER_ADDR \ 28 | --master_port $MASTER_PORT 29 | " 30 | 31 | # 定义重试的最大次数 32 | MAX_RETRIES=3 33 | 34 | # 每次重试之间的等待时间,单位为秒 35 | WAIT_TIME=20 36 | 37 | # 当前的重试次数 38 | retry_count=0 39 | 40 | command_to_run="torchrun --nproc_per_node $GPUS_PER_NODE openomni/train/train_mem.py \ 41 | --deepspeed ./scripts/zero2.json \ 42 | --model_name_or_path ./checkpoints/openomni_stage3_qwen_3/checkpoint-last \ 43 | --version llava_qwen_2 \ 44 | --data_path ${DATA_PATH} \ 45 | --image_folder ./datasets \ 46 | --speech_folder ./datasets \ 47 | --vision_tower ./checkpoints/openai/clip-vit-large-patch14-336 \ 48 | --mm_projector_type mlp2x_gelu \ 49 | --freeze_backbone True \ 50 | --tune_mm_mlp_adapter False \ 51 | --freeze_mm_mlp_adapter True \ 52 | --tune_speech_generator_only True \ 53 | --tune_speech_generator_dpo True \ 54 | --pretrain_mm_mlp_adapter ./checkpoints/openomni_stage2_qwen_2/mm_projector.bin \ 55 | --speech_encoder ./checkpoints/openai-whisper/large-v3.pt \ 56 | --speech_generator ./checkpoints/speech_generator/generator_16k.pt \ 57 | --unfreeze_mm_vision_tower False \ 58 | --mm_vision_tower_lr ${VIT_LR} \ 59 | --speech_generator_lr ${VIT_LR} \ 60 | --mm_projector_lr ${VIT_LR} \ 61 | --image_aspect_ratio anyres \ 62 | --group_by_modality_length True \ 63 | --mm_vision_select_layer -2 \ 64 | --mm_vision_select_feature patch \ 65 | --mm_patch_merge_type spatial_unpad \ 66 | --mm_use_im_start_end False \ 67 | --mm_use_im_patch_token False \ 68 | --bf16 True \ 69 | --output_dir ./checkpoints/${SAVE_PATH} \ 70 | --num_train_epochs 3 \ 71 | --per_device_train_batch_size ${BATCH_SIZE} \ 72 | --per_device_eval_batch_size 4 \ 73 | --gradient_accumulation_steps ${GRADIENT_ACCU_STEPS} \ 74 | --evaluation_strategy "no" \ 75 | --save_strategy "steps" \ 76 | --save_steps 1000 \ 77 | --save_total_limit 20 \ 78 | --learning_rate ${BASE_LR} \ 79 | --weight_decay 0. 
\ 80 | --warmup_ratio 0.03 \ 81 | --lr_scheduler_type "cosine" \ 82 | --logging_steps 1 \ 83 | --tf32 True \ 84 | --model_max_length 8196 \ 85 | --gradient_checkpointing True \ 86 | --dataloader_num_workers 8 \ 87 | --lazy_preprocess True \ 88 | --speech_generator_type "ar" \ 89 | --run_name ${SAVE_PATH} \ 90 | --dataloader_drop_last True \ 91 | --report_to tensorboard | tee train_${SAVE_PATH}.log" 92 | 93 | eval $command_to_run 94 | 95 | -------------------------------------------------------------------------------- /scripts/train/qwen2/text2speech_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | 4 | # wandb login 5 | echo "DLC seemed world size(actually nodes): ${WORLD_SIZE}" 6 | NNODES=${WORLD_SIZE} 7 | NODE_RANK=${RANK} 8 | 9 | export CUDA_DEVICE_MAX_CONNECTIONS=1 10 | export GPUS_PER_NODE=8 11 | # export NNODES=2 12 | export BATCH_SIZE=1 13 | export GRADIENT_ACCU_STEPS=4 14 | export MASTER_PORT=29588 15 | export CPUS_PER_TASK=16 16 | export QUOTA=reserved 17 | 18 | export DATA_PATH=./datasets/openomni/json/openomni_stage3-1.json 19 | export SAVE_PATH=openomni_stage3-1_qwen_2 20 | export BASE_LR=2e-5 21 | export VIT_LR=2e-6 22 | 23 | DISTRIBUTED_ARGS=" 24 | --nproc_per_node $GPUS_PER_NODE \ 25 | --nnodes $NNODES \ 26 | --node_rank $NODE_RANK \ 27 | --master_addr $MASTER_ADDR \ 28 | --master_port $MASTER_PORT 29 | " 30 | 31 | # 定义重试的最大次数 32 | MAX_RETRIES=3 33 | 34 | # 每次重试之间的等待时间,单位为秒 35 | WAIT_TIME=20 36 | 37 | # 当前的重试次数 38 | retry_count=0 39 | 40 | command_to_run="torchrun --nproc_per_node $GPUS_PER_NODE openomni/train/train_mem.py \ 41 | --deepspeed ./scripts/zero2.json \ 42 | --model_name_or_path ./checkpoints/openomni_stage3_qwen_3/checkpoint-last \ 43 | --version llava_qwen_2 \ 44 | --data_path ${DATA_PATH} \ 45 | --image_folder ./datasets \ 46 | --speech_folder ./datasets \ 47 | --vision_tower ./checkpoints/openai/clip-vit-large-patch14-336 \ 48 | --mm_projector_type mlp2x_gelu \ 49 | --freeze_backbone True \ 50 | --tune_mm_mlp_adapter False \ 51 | --freeze_mm_mlp_adapter True \ 52 | --tune_speech_generator_only True \ 53 | --pretrain_mm_mlp_adapter ./checkpoints/openomni_stage2_qwen_2/mm_projector.bin \ 54 | --speech_encoder ./checkpoints/openai-whisper/large-v3.pt \ 55 | --speech_generator ./checkpoints/speech_generator/generator_16K.pt \ 56 | --unfreeze_mm_vision_tower False \ 57 | --mm_vision_tower_lr ${VIT_LR} \ 58 | --speech_generator_lr ${VIT_LR} \ 59 | --mm_projector_lr ${VIT_LR} \ 60 | --image_aspect_ratio anyres \ 61 | --group_by_modality_length True \ 62 | --mm_vision_select_layer -2 \ 63 | --mm_vision_select_feature patch \ 64 | --mm_patch_merge_type spatial_unpad \ 65 | --mm_use_im_start_end False \ 66 | --mm_use_im_patch_token False \ 67 | --bf16 True \ 68 | --output_dir ./checkpoints/${SAVE_PATH} \ 69 | --num_train_epochs 1 \ 70 | --per_device_train_batch_size ${BATCH_SIZE} \ 71 | --per_device_eval_batch_size 4 \ 72 | --gradient_accumulation_steps ${GRADIENT_ACCU_STEPS} \ 73 | --evaluation_strategy "no" \ 74 | --save_strategy "steps" \ 75 | --save_steps 1000 \ 76 | --save_total_limit 20 \ 77 | --learning_rate ${BASE_LR} \ 78 | --weight_decay 0. 
\ 79 | --warmup_ratio 0.03 \ 80 | --lr_scheduler_type "cosine" \ 81 | --logging_steps 1 \ 82 | --tf32 True \ 83 | --model_max_length 8196 \ 84 | --gradient_checkpointing True \ 85 | --dataloader_num_workers 8 \ 86 | --lazy_preprocess True \ 87 | --speech_generator_type "ar" \ 88 | --run_name ${SAVE_PATH} \ 89 | --dataloader_drop_last True \ 90 | --report_to tensorboard | tee train_${SAVE_PATH}.log" 91 | 92 | eval $command_to_run 93 | -------------------------------------------------------------------------------- /scripts/train/qwen2/text2speech_pretrain_6k.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | 4 | # wandb login 5 | echo "DLC seemed world size(actually nodes): ${WORLD_SIZE}" 6 | NNODES=${WORLD_SIZE} 7 | NODE_RANK=${RANK} 8 | 9 | export CUDA_DEVICE_MAX_CONNECTIONS=1 10 | export GPUS_PER_NODE=8 11 | # export NNODES=2 12 | export BATCH_SIZE=1 13 | export GRADIENT_ACCU_STEPS=4 14 | export MASTER_PORT=29588 15 | export CPUS_PER_TASK=16 16 | export QUOTA=reserved 17 | 18 | export DATA_PATH=./datasets/openomni/json/openomni_stage3-1-6k.json 19 | export SAVE_PATH=openomni_stage3-1_qwen_2-6k 20 | export BASE_LR=2e-5 21 | export VIT_LR=2e-6 22 | 23 | DISTRIBUTED_ARGS=" 24 | --nproc_per_node $GPUS_PER_NODE \ 25 | --nnodes $NNODES \ 26 | --node_rank $NODE_RANK \ 27 | --master_addr $MASTER_ADDR \ 28 | --master_port $MASTER_PORT 29 | " 30 | 31 | # 定义重试的最大次数 32 | MAX_RETRIES=3 33 | 34 | # 每次重试之间的等待时间,单位为秒 35 | WAIT_TIME=20 36 | 37 | # 当前的重试次数 38 | retry_count=0 39 | 40 | command_to_run="torchrun --nproc_per_node $GPUS_PER_NODE openomni/train/train_mem.py \ 41 | --deepspeed ./scripts/zero2.json \ 42 | --model_name_or_path ./checkpoints/openomni_stage3_qwen_3/checkpoint-last \ 43 | --version llava_qwen_2 \ 44 | --data_path ${DATA_PATH} \ 45 | --image_folder ./datasets \ 46 | --speech_folder ./datasets \ 47 | --vision_tower ./checkpoints/openai/clip-vit-large-patch14-336 \ 48 | --mm_projector_type mlp2x_gelu \ 49 | --freeze_backbone True \ 50 | --tune_mm_mlp_adapter False \ 51 | --freeze_mm_mlp_adapter True \ 52 | --tune_speech_generator_only True \ 53 | --pretrain_mm_mlp_adapter ./checkpoints/openomni_stage2_qwen_2/mm_projector.bin \ 54 | --speech_encoder ./checkpoints/openai-whisper/large-v3.pt \ 55 | --speech_generator ./checkpoints/speech_generator/generator_16k.pt \ 56 | --unfreeze_mm_vision_tower False \ 57 | --mm_vision_tower_lr ${VIT_LR} \ 58 | --speech_generator_lr ${VIT_LR} \ 59 | --mm_projector_lr ${VIT_LR} \ 60 | --image_aspect_ratio anyres \ 61 | --group_by_modality_length True \ 62 | --mm_vision_select_layer -2 \ 63 | --mm_vision_select_feature patch \ 64 | --mm_patch_merge_type spatial_unpad \ 65 | --mm_use_im_start_end False \ 66 | --mm_use_im_patch_token False \ 67 | --bf16 True \ 68 | --output_dir checkpoints/${SAVE_PATH} \ 69 | --num_train_epochs 1 \ 70 | --per_device_train_batch_size ${BATCH_SIZE} \ 71 | --per_device_eval_batch_size 4 \ 72 | --gradient_accumulation_steps ${GRADIENT_ACCU_STEPS} \ 73 | --evaluation_strategy "no" \ 74 | --save_strategy "steps" \ 75 | --save_steps 1000 \ 76 | --save_total_limit 20 \ 77 | --learning_rate ${BASE_LR} \ 78 | --weight_decay 0. 
\ 79 | --warmup_ratio 0.03 \ 80 | --lr_scheduler_type "cosine" \ 81 | --logging_steps 1 \ 82 | --tf32 True \ 83 | --model_max_length 8196 \ 84 | --gradient_checkpointing True \ 85 | --dataloader_num_workers 8 \ 86 | --lazy_preprocess True \ 87 | --speech_generator_type "ar" \ 88 | --run_name ${SAVE_PATH} \ 89 | --dataloader_drop_last True \ 90 | --unit_vocab_size 6561 \ 91 | --report_to tensorboard | tee train_${SAVE_PATH}.log" 92 | 93 | eval $command_to_run 94 | -------------------------------------------------------------------------------- /scripts/train/qwen2/text2speech_pretrain_ctc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | 4 | # wandb login 5 | echo "DLC seemed world size(actually nodes): ${WORLD_SIZE}" 6 | NNODES=${WORLD_SIZE} 7 | NODE_RANK=${RANK} 8 | 9 | export CUDA_DEVICE_MAX_CONNECTIONS=1 10 | export GPUS_PER_NODE=8 11 | # export NNODES=2 12 | export BATCH_SIZE=1 13 | export GRADIENT_ACCU_STEPS=4 14 | export MASTER_PORT=29588 15 | export CPUS_PER_TASK=16 16 | export QUOTA=reserved 17 | 18 | export DATA_PATH=./datasets/openomni/json/openomni_stage3-1-6k.json 19 | export SAVE_PATH=openomni_stage3-1_qwen_2_ctc 20 | export BASE_LR=2e-4 21 | export VIT_LR=2e-4 22 | 23 | DISTRIBUTED_ARGS=" 24 | --nproc_per_node $GPUS_PER_NODE \ 25 | --nnodes $NNODES \ 26 | --node_rank $NODE_RANK \ 27 | --master_addr $MASTER_ADDR \ 28 | --master_port $MASTER_PORT 29 | " 30 | 31 | # 定义重试的最大次数 32 | MAX_RETRIES=3 33 | 34 | # 每次重试之间的等待时间,单位为秒 35 | WAIT_TIME=20 36 | 37 | # 当前的重试次数 38 | retry_count=0 39 | 40 | command_to_run="torchrun --nproc_per_node $GPUS_PER_NODE openomni/train/train_mem.py \ 41 | --deepspeed ./scripts/zero2.json \ 42 | --model_name_or_path ./checkpoints/openomni_stage3_qwen_3/checkpoint-last \ 43 | --version llava_qwen_2 \ 44 | --data_path ${DATA_PATH} \ 45 | --image_folder ./datasets \ 46 | --speech_folder ./datasets \ 47 | --vision_tower ./checkpoints/openai/clip-vit-large-patch14-336 \ 48 | --mm_projector_type mlp2x_gelu \ 49 | --freeze_backbone True \ 50 | --tune_mm_mlp_adapter False \ 51 | --freeze_mm_mlp_adapter True \ 52 | --tune_speech_generator_only True \ 53 | --pretrain_mm_mlp_adapter ./checkpoints/openomni_stage2_qwen_2/mm_projector.bin \ 54 | --speech_encoder ./checkpoints/openai-whisper/large-v3.pt \ 55 | --speech_generator ./checkpoints/speech_generator/generator_16k.pt \ 56 | --unfreeze_mm_vision_tower False \ 57 | --mm_vision_tower_lr ${VIT_LR} \ 58 | --speech_generator_lr ${VIT_LR} \ 59 | --mm_projector_lr ${VIT_LR} \ 60 | --image_aspect_ratio anyres \ 61 | --group_by_modality_length True \ 62 | --mm_vision_select_layer -2 \ 63 | --mm_vision_select_feature patch \ 64 | --mm_patch_merge_type spatial_unpad \ 65 | --mm_use_im_start_end False \ 66 | --mm_use_im_patch_token False \ 67 | --bf16 True \ 68 | --output_dir ./checkpoints/${SAVE_PATH} \ 69 | --num_train_epochs 2 \ 70 | --per_device_train_batch_size ${BATCH_SIZE} \ 71 | --per_device_eval_batch_size 4 \ 72 | --gradient_accumulation_steps ${GRADIENT_ACCU_STEPS} \ 73 | --evaluation_strategy "no" \ 74 | --save_strategy "steps" \ 75 | --save_steps 1000 \ 76 | --save_total_limit 20 \ 77 | --learning_rate ${BASE_LR} \ 78 | --weight_decay 0. 
\ 79 | --warmup_ratio 0.03 \ 80 | --lr_scheduler_type "cosine" \ 81 | --logging_steps 1 \ 82 | --tf32 True \ 83 | --model_max_length 8196 \ 84 | --gradient_checkpointing True \ 85 | --dataloader_num_workers 8 \ 86 | --lazy_preprocess True \ 87 | --run_name ${SAVE_PATH} \ 88 | --dataloader_drop_last True \ 89 | --speech_generator_type "ctc" \ 90 | --unit_vocab_size 6561 \ 91 | --report_to tensorboard | tee train_${SAVE_PATH}.log" 92 | 93 | eval $command_to_run 94 | 95 | 96 | -------------------------------------------------------------------------------- /scripts/zero2.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "train_micro_batch_size_per_gpu": "auto", 14 | "train_batch_size": "auto", 15 | "gradient_accumulation_steps": "auto", 16 | "zero_optimization": { 17 | "stage": 2, 18 | "overlap_comm": true, 19 | "contiguous_gradients": true, 20 | "sub_group_size": 1e9, 21 | "reduce_bucket_size": "auto" 22 | } 23 | } -------------------------------------------------------------------------------- /scripts/zero3.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "train_micro_batch_size_per_gpu": "auto", 14 | "train_batch_size": "auto", 15 | "gradient_accumulation_steps": "auto", 16 | "zero_optimization": { 17 | "stage": 3, 18 | "overlap_comm": true, 19 | "contiguous_gradients": true, 20 | "sub_group_size": 1e9, 21 | "reduce_bucket_size": "auto", 22 | "stage3_prefetch_bucket_size": "auto", 23 | "stage3_param_persistence_threshold": "auto", 24 | "stage3_max_live_parameters": 1e9, 25 | "stage3_max_reuse_distance": 1e9, 26 | "stage3_gather_16bit_weights_on_model_save": true 27 | } 28 | } -------------------------------------------------------------------------------- /scripts/zero3_offload.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "optimizer": { 14 | "type": "AdamW", 15 | "params": { 16 | "lr": "auto", 17 | "betas": "auto", 18 | "eps": "auto", 19 | "weight_decay": "auto" 20 | } 21 | }, 22 | "scheduler": { 23 | "type": "WarmupLR", 24 | "params": { 25 | "warmup_min_lr": "auto", 26 | "warmup_max_lr": "auto", 27 | "warmup_num_steps": "auto" 28 | } 29 | }, 30 | "zero_optimization": { 31 | "stage": 3, 32 | "offload_optimizer": { 33 | "device": "cpu", 34 | "pin_memory": true 35 | }, 36 | "offload_param": { 37 | "device": "cpu", 38 | "pin_memory": true 39 | }, 40 | "overlap_comm": true, 41 | "contiguous_gradients": true, 42 | "sub_group_size": 1e9, 43 | "reduce_bucket_size": "auto", 44 | "stage3_prefetch_bucket_size": "auto", 45 | "stage3_param_persistence_threshold": "auto", 46 | "stage3_max_live_parameters": 1e9, 47 | "stage3_max_reuse_distance": 1e9, 48 | "gather_16bit_weights_on_model_save": true 49 | }, 50 | "gradient_accumulation_steps": "auto", 51 | "gradient_clipping": "auto", 52 | "train_batch_size": "auto", 
53 | "train_micro_batch_size_per_gpu": "auto", 54 | "steps_per_print": 1e5, 55 | "wall_clock_breakdown": false 56 | } -------------------------------------------------------------------------------- /vlmevalkit/script/run_inference_2.sh: -------------------------------------------------------------------------------- 1 | export PATH=/usr/local/cuda/bin:$PATH 2 | 3 | export HF_ENDPOINT=https://hf-mirror.com 4 | export OMP_NUM_THREADS=1 5 | export timestamp=`date +"%Y%m%d%H%M%S"` 6 | export OLD_VERSION='False' 7 | export PYTHONPATH=$(dirname $SELF_DIR):$PYTHONPATH 8 | 9 | # gpu consumed 10 | # fp16 17-18G 11 | # int4 7-8G 12 | 13 | # model to be used 14 | # Example: MODELNAME=OpenOmni_Qwen 15 | MODELNAME=$1 16 | # datasets to be tested 17 | # Example: DATALIST="POPE ScienceQA_TEST ChartQA_TEST" 18 | DATALIST=$2 19 | # test mode, all or infer 20 | MODE=$3 21 | 22 | echo "Starting inference with model $MODELNAME on datasets $DATALIST" 23 | # run on multi gpus with torchrun command 24 | # remember to run twice, the first run may fail 25 | torchrun --nproc_per_node=1 --master_port=28881 run.py --data $DATALIST --model $MODELNAME --mode $MODE --rerun 26 | # torchrun --nproc_per_node=4 run.py --data $DATALIST --model $MODELNAME --mode $MODE --rerun 27 | # run on single gpu with python command 28 | # python run.py --data $DATALIST --model $MODELNAME --verbose --mode $MODE 29 | # python run.py --data $DATALIST --model $MODELNAME --verbose --mode $MODE 30 | # CUDA_VISIBLE_DEVICES=0,1,2,3 ./script/run_inference.sh OpenOmni "MME MMMU_DEV_VAL MathVista_MINI LLaVABench RealWorldQA MMStar MMVet AI2D_TEST OCRBench HallusionBench" all 31 | -------------------------------------------------------------------------------- /vlmevalkit/vlmeval/__init__.py: -------------------------------------------------------------------------------- 1 | try: 2 | import torch 3 | except ImportError: 4 | pass 5 | 6 | from .smp import * 7 | from .api import * 8 | from .evaluate import * 9 | from .utils import * 10 | from .vlm import * 11 | from .config import * 12 | 13 | load_env() 14 | -------------------------------------------------------------------------------- /vlmevalkit/vlmeval/api/__init__.py: -------------------------------------------------------------------------------- 1 | from .gpt import OpenAIWrapper, GPT4V 2 | from .gpt_int import OpenAIWrapperInternal, GPT4V_Internal 3 | 4 | __all__ = [ 5 | 'OpenAIWrapper', 'OpenAIWrapperInternal', 'GPT4V', 'GPT4V_Internal' 6 | ] 7 | -------------------------------------------------------------------------------- /vlmevalkit/vlmeval/api/gpt_int.py: -------------------------------------------------------------------------------- 1 | import json 2 | import warnings 3 | import requests 4 | from ..smp import * 5 | from .gpt import GPT_context_window, OpenAIWrapper 6 | 7 | url = 'http://ecs.sv.us.alles-apin.openxlab.org.cn/v1/openai/v2/text/chat' 8 | headers = { 9 | 'Content-Type': 'application/json' 10 | } 11 | 12 | 13 | class OpenAIWrapperInternal(OpenAIWrapper): 14 | 15 | is_api: bool = True 16 | 17 | def __init__(self, 18 | model: str = 'gpt-3.5-turbo-0613', 19 | retry: int = 5, 20 | wait: int = 3, 21 | verbose: bool = True, 22 | system_prompt: str = None, 23 | temperature: float = 0, 24 | timeout: int = 60, 25 | max_tokens: int = 2000, 26 | img_size: int = 512, 27 | img_detail: str = 'low', 28 | **kwargs): 29 | 30 | self.model = model 31 | if 'KEYS' in os.environ and osp.exists(os.environ['KEYS']): 32 | keys = load(os.environ['KEYS']) 33 | headers['alles-apin-token'] = 
keys.get('alles-apin-token', '') 34 | elif 'ALLES' in os.environ: 35 | headers['alles-apin-token'] = os.environ['ALLES'] 36 | self.headers = headers 37 | self.temperature = temperature 38 | self.timeout = timeout 39 | self.max_tokens = max_tokens 40 | 41 | assert img_size > 0 or img_size == -1 42 | self.img_size = img_size 43 | assert img_detail in ['high', 'low'] 44 | self.img_detail = img_detail 45 | 46 | super(OpenAIWrapper, self).__init__( 47 | wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs) 48 | 49 | def generate_inner(self, inputs, **kwargs) -> str: 50 | input_msgs = self.prepare_inputs(inputs) 51 | 52 | temperature = kwargs.pop('temperature', self.temperature) 53 | max_tokens = kwargs.pop('max_tokens', self.max_tokens) 54 | 55 | # Held out 100 tokens as buffer 56 | context_window = GPT_context_window(self.model) 57 | max_tokens = min(max_tokens, context_window - self.get_token_len(inputs)) 58 | if 0 < max_tokens <= 100: 59 | print('Less than 100 tokens left, may exceed the context window with some additional meta symbols. ') 60 | if max_tokens <= 0: 61 | return 0, self.fail_msg + 'Input string longer than context window. ', 'Length Exceeded. ' 62 | 63 | payload = dict( 64 | model=self.model, 65 | messages=input_msgs, 66 | max_tokens=max_tokens, 67 | n=1, 68 | stop=None, 69 | timeout=self.timeout, 70 | temperature=temperature, 71 | **kwargs) 72 | 73 | response = requests.post(url, headers=headers, data=json.dumps(payload), timeout=self.timeout * 1.1) 74 | ret_code = response.status_code 75 | ret_code = 0 if (200 <= int(ret_code) < 300) else ret_code 76 | 77 | answer = self.fail_msg 78 | try: 79 | resp_struct = json.loads(response.text) 80 | assert resp_struct['msg'] == 'ok' and resp_struct['msgCode'] == '10000', resp_struct 81 | answer = resp_struct['data']['response']['choices'][0]['message']['content'].strip() 82 | except: 83 | pass 84 | return ret_code, answer, response 85 | 86 | 87 | class GPT4V_Internal(OpenAIWrapperInternal): 88 | 89 | def generate(self, message, dataset=None): 90 | return super(GPT4V_Internal, self).generate(message) 91 | -------------------------------------------------------------------------------- /vlmevalkit/vlmeval/config.py: -------------------------------------------------------------------------------- 1 | from vlmeval.vlm import * 2 | from vlmeval.api import * 3 | from functools import partial 4 | 5 | model_path="./checkpoints/openomni_stage3_llama_3/checkpoint-20000" 6 | model_path2="./checkpoints/openomni_stage3_qwen_2/checkpoint-20000" 7 | ungrouped = { 8 | 'OpenOmni-Llama3-V-1_6':partial(OpenOmni_Llama3, model_path=model_path), 9 | 'OpenOmni-Qwen2-V-1_6':partial(OpenOmni_Qwen2, model_path=model_path2), 10 | } 11 | 12 | # "oss://coaidatasets-intern/minzheng/luorun/data/seed_data_15k_mini.json " 13 | # bash ./script/run_inference_2.sh OpenOmni-Qwen2-V-1_6_4 "MME MMMU_DEV_VAL MathVista_MINI RealWorldQA MMStar MMVet AI2D_TEST OCRBench HallusionBench" all 14 | # bash ./script/run_inference_8.sh OpenOmni-Llama3-V-1_6 "MME MMMU_DEV_VAL MathVista_MINI LLaVABench RealWorldQA MMStar MMVet AI2D_TEST OCRBench HallusionBench" all 15 | # bash ./script/run_inference_8.sh OpenOmni-Qwen2-V-1_6_ablation_evol_final_evol_2 "MME MMMU_DEV_VAL MathVista_MINI LLaVABench RealWorldQA MMStar MMVet AI2D_TEST OCRBench HallusionBench POPE BLINK" all 16 | # bash ./script/run_inference_4.sh OpenOmni-Llama3-V-1_6_ablation_seed_11k_seed "MMBench_TEST_EN MMBench_TEST_CN" all 17 | # bash ./script/run_inference_2.sh OpenOmni-Qwen2-V-1_6 "MME 
MMMU_DEV_VAL MathVista_MINI LLaVABench RealWorldQA MMStar MMVet AI2D_TEST OCRBench HallusionBenc" all 18 | 19 | supported_VLM = {} 20 | 21 | model_groups = [ 22 | ungrouped 23 | ] 24 | 25 | for grp in model_groups: 26 | supported_VLM.update(grp) 27 | 28 | -------------------------------------------------------------------------------- /vlmevalkit/vlmeval/evaluate/OCRBench.py: -------------------------------------------------------------------------------- 1 | from vlmeval.smp import * 2 | 3 | 4 | def OCRBench_eval(eval_file): 5 | OCRBench_score = { 6 | 'Regular Text Recognition': 0, 7 | 'Irregular Text Recognition': 0, 8 | 'Artistic Text Recognition': 0, 9 | 'Handwriting Recognition': 0, 10 | 'Digit String Recognition': 0, 11 | 'Non-Semantic Text Recognition': 0, 12 | 'Scene Text-centric VQA': 0, 13 | 'Doc-oriented VQA': 0, 14 | 'Key Information Extraction': 0, 15 | 'Handwritten Mathematical Expression Recognition': 0 16 | } 17 | 18 | logger = get_logger('Evaluation') 19 | 20 | data = load(eval_file) 21 | lt = len(data) 22 | lines = [data.iloc[i] for i in range(lt)] 23 | for i in tqdm(range(len(lines))): 24 | line = lines[i] 25 | predict = str(line['prediction']) 26 | answers = eval(line['answer']) 27 | category = line['category'] 28 | if category == 'Handwritten Mathematical Expression Recognition': 29 | for j in range(len(answers)): 30 | answer = answers[j].strip().replace('\n', ' ').replace(' ', '') 31 | predict = predict.strip().replace('\n', ' ').replace(' ', '') 32 | if answer in predict: 33 | OCRBench_score[category] += 1 34 | break 35 | else: 36 | for j in range(len(answers)): 37 | answer = answers[j].lower().strip().replace('\n', ' ') 38 | predict = predict.lower().strip().replace('\n', ' ') 39 | if answer in predict: 40 | OCRBench_score[category] += 1 41 | break 42 | 43 | final_score_dict = {} 44 | final_score_dict['Text Recognition'] = ( 45 | OCRBench_score['Regular Text Recognition'] + OCRBench_score['Irregular Text Recognition'] 46 | + OCRBench_score['Artistic Text Recognition'] + OCRBench_score['Handwriting Recognition'] 47 | + OCRBench_score['Digit String Recognition'] + OCRBench_score['Non-Semantic Text Recognition'] 48 | ) 49 | final_score_dict['Scene Text-centric VQA'] = OCRBench_score['Scene Text-centric VQA'] 50 | final_score_dict['Doc-oriented VQA'] = OCRBench_score['Doc-oriented VQA'] 51 | final_score_dict['Key Information Extraction'] = OCRBench_score['Key Information Extraction'] 52 | final_score_dict['Handwritten Mathematical Expression Recognition'] = \ 53 | OCRBench_score['Handwritten Mathematical Expression Recognition'] 54 | final_score_dict['Final Score'] = ( 55 | final_score_dict['Text Recognition'] + final_score_dict['Scene Text-centric VQA'] 56 | + final_score_dict['Doc-oriented VQA'] + final_score_dict['Key Information Extraction'] 57 | + final_score_dict['Handwritten Mathematical Expression Recognition'] 58 | ) 59 | final_score_dict['Final Score Norm'] = float(final_score_dict['Final Score']) / 10 60 | score_pth = eval_file.replace('.xlsx', '_score.json') 61 | dump(final_score_dict, score_pth) 62 | logger.info(f'OCRBench_eval successfully finished evaluating {eval_file}, results saved in {score_pth}') 63 | logger.info('Score: ') 64 | for key, value in final_score_dict.items(): 65 | logger.info('{}:{}'.format(key, value)) 66 | -------------------------------------------------------------------------------- /vlmevalkit/vlmeval/evaluate/__init__.py: -------------------------------------------------------------------------------- 1 | from .yes_or_no import 
default_rating, MME_rating, YOrN_eval 2 | from .mmvet_eval import MMVet_eval 3 | from .multiple_choice import multiple_choice_eval 4 | from .coco_eval import COCO_eval 5 | from .vqa_eval import VQAEval 6 | from .mathvista_eval import MathVista_eval 7 | from .llavabench import LLaVABench_eval 8 | from .misc import build_judge 9 | from .OCRBench import OCRBench_eval 10 | -------------------------------------------------------------------------------- /vlmevalkit/vlmeval/evaluate/coco_eval.py: -------------------------------------------------------------------------------- 1 | from vlmeval.smp import * 2 | from pycocoevalcap.bleu.bleu import Bleu 3 | from pycocoevalcap.rouge.rouge import Rouge 4 | from pycocoevalcap.cider.cider import Cider 5 | 6 | 7 | class COCO_Caption_Scorer(): 8 | def __init__(self, ref, gt): 9 | self.ref = ref 10 | self.gt = gt 11 | print('setting up scorers...') 12 | self.scorers = [ 13 | (Bleu(4), ['Bleu_1', 'Bleu_2', 'Bleu_3', 'Bleu_4']), 14 | # (Meteor(), "METEOR"), # need java version 11.0.16+ 15 | (Rouge(), 'ROUGE_L'), 16 | (Cider(), 'CIDEr'), 17 | # (Spice(), "SPICE"), # need java version 11.0.16+ 18 | ] 19 | 20 | def compute_scores(self): 21 | total_scores = {} 22 | for scorer, method in self.scorers: 23 | print('computing %s score...' % (scorer.method())) 24 | score, scores = scorer.compute_score(self.gt, self.ref) 25 | if type(method) == list: 26 | for sc, scs, m in zip(score, scores, method): 27 | print('%s: %0.3f' % (m, sc * 100)) 28 | total_scores['Bleu'] = [x * 100 for x in score] 29 | else: 30 | print('%s: %0.3f' % (method, score * 100)) 31 | total_scores[method] = score * 100 32 | 33 | print('*****DONE*****') 34 | for key, value in total_scores.items(): 35 | print('{}:{}'.format(key, value)) 36 | return total_scores 37 | 38 | 39 | def COCO_eval(eval_file, nproc=4, verbose=False): 40 | logger = get_logger('Evaluation') 41 | 42 | data = load(eval_file) 43 | 44 | lt = len(data) 45 | lines = [data.iloc[i] for i in range(lt)] 46 | ref = {} 47 | gt = {} 48 | for i, line in enumerate(lines): 49 | ref[str(i)] = [str(line['prediction'])] 50 | gt[str(i)] = eval(line['answer']) 51 | 52 | scorer = COCO_Caption_Scorer(ref, gt) 53 | coco_caption_score_dict = scorer.compute_scores() 54 | 55 | score_pth = eval_file.replace('.xlsx', '_score.json') 56 | dump(coco_caption_score_dict, score_pth) 57 | logger.info(f'COCO_eval successfully finished evaluating {eval_file}, results saved in {score_pth}') 58 | logger.info('Score: ') 59 | for key, value in coco_caption_score_dict.items(): 60 | logger.info('{}:{}'.format(key, value)) 61 | 62 | 63 | def parse_args(): 64 | parser = argparse.ArgumentParser(description='Inference LLM Answers. ') 65 | parser.add_argument('--data', type=str, help='The question set for inference, in excel / tsv / json format. 
') 66 | parser.add_argument('--nproc', type=int, default=4) 67 | parser.add_argument('--verbose', action='store_true') 68 | args = parser.parse_args() 69 | return args 70 | 71 | 72 | if __name__ == '__main__': 73 | args = parse_args() 74 | COCO_eval(eval_file=args.data, nproc=args.nproc, verbose=args.verbose) 75 | -------------------------------------------------------------------------------- /vlmevalkit/vlmeval/evaluate/misc.py: -------------------------------------------------------------------------------- 1 | import os 2 | from vlmeval.api import OpenAIWrapper, OpenAIWrapperInternal 3 | from vlmeval.smp import load_env 4 | 5 | INTERNAL = os.environ.get('INTERNAL', 0) 6 | 7 | 8 | def build_judge(**kwargs): 9 | model = kwargs.pop('model', None) 10 | load_env() 11 | LOCAL_LLM = os.environ.get('LOCAL_LLM', None) 12 | if LOCAL_LLM is None: 13 | model_map = { 14 | 'gpt-4-turbo': 'gpt-4-1106-preview', 15 | 'gpt-4-0613': 'gpt-4-0613', 16 | 'gpt-4-0314': 'gpt-4-0314', 17 | 'gpt-4-0125': 'gpt-4-0125-preview', 18 | 'chatgpt-1106': 'gpt-3.5-turbo-1106', 19 | 'chatgpt-0613': 'gpt-3.5-turbo-0613', 20 | 'chatgpt-0125': 'gpt-3.5-turbo-0125' 21 | } 22 | model_version = model_map[model] 23 | else: 24 | model_version = LOCAL_LLM 25 | if INTERNAL: 26 | model = OpenAIWrapperInternal(model_version, **kwargs) 27 | else: 28 | model = OpenAIWrapper(model_version, **kwargs) 29 | return model 30 | -------------------------------------------------------------------------------- /vlmevalkit/vlmeval/smp/__init__.py: -------------------------------------------------------------------------------- 1 | from .file import * 2 | from .vlm import * 3 | from .misc import * 4 | from .log import * 5 | -------------------------------------------------------------------------------- /vlmevalkit/vlmeval/smp/log.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | logger_initialized = {} 4 | 5 | 6 | def get_logger(name, log_file=None, log_level=logging.INFO, file_mode='w'): 7 | logger = logging.getLogger(name) 8 | if name in logger_initialized: 9 | return logger 10 | 11 | for logger_name in logger_initialized: 12 | if name.startswith(logger_name): 13 | return logger 14 | 15 | stream_handler = logging.StreamHandler() 16 | handlers = [stream_handler] 17 | 18 | try: 19 | import torch.distributed as dist 20 | if dist.is_available() and dist.is_initialized(): 21 | rank = dist.get_rank() 22 | else: 23 | rank = 0 24 | except ImportError: 25 | rank = 0 26 | 27 | if rank == 0 and log_file is not None: 28 | file_handler = logging.FileHandler(log_file, file_mode) 29 | handlers.append(file_handler) 30 | 31 | formatter = logging.Formatter( 32 | '%(asctime)s - %(name)s - %(levelname)s - %(message)s') 33 | for handler in handlers: 34 | handler.setFormatter(formatter) 35 | handler.setLevel(log_level) 36 | logger.addHandler(handler) 37 | 38 | if rank == 0: 39 | logger.setLevel(log_level) 40 | else: 41 | logger.setLevel(logging.ERROR) 42 | 43 | logger_initialized[name] = True 44 | return logger 45 | -------------------------------------------------------------------------------- /vlmevalkit/vlmeval/smp/vlm.py: -------------------------------------------------------------------------------- 1 | import os 2 | import io 3 | import pandas as pd 4 | import numpy as np 5 | import string 6 | from uuid import uuid4 7 | import os.path as osp 8 | import base64 9 | from PIL import Image 10 | from .file import load, dump 11 | Image.MAX_IMAGE_PIXELS = 1e9 12 | 13 | 14 | def mmqa_display(question, 
target_size=512): 15 | question = {k.lower(): v for k, v in question.items()} 16 | keys = list(question.keys()) 17 | keys = [k for k in keys if k not in ['index', 'image']] 18 | 19 | images = question['image'] 20 | if isinstance(images, str): 21 | images = [images] 22 | 23 | idx = question.pop('index', 'XXX') 24 | print(f'INDEX: {idx}') 25 | 26 | for im in images: 27 | image = decode_base64_to_image(im, target_size=target_size) 28 | display(image) # noqa: F821 29 | 30 | for k in keys: 31 | try: 32 | if not pd.isna(question[k]): 33 | print(f'{k.upper()}. {question[k]}') 34 | except ValueError: 35 | if False in pd.isna(question[k]): 36 | print(f'{k.upper()}. {question[k]}') 37 | 38 | 39 | def encode_image_to_base64(img, target_size=-1): 40 | # if target_size == -1, will not do resizing 41 | # else, will set the max_size ot (target_size, target_size) 42 | if img.mode in ('RGBA', 'P'): 43 | img = img.convert('RGB') 44 | tmp = osp.join('/tmp', str(uuid4()) + '.jpg') 45 | if target_size > 0: 46 | img.thumbnail((target_size, target_size)) 47 | img.save(tmp) 48 | with open(tmp, 'rb') as image_file: 49 | image_data = image_file.read() 50 | ret = base64.b64encode(image_data).decode('utf-8') 51 | os.remove(tmp) 52 | return ret 53 | 54 | 55 | def encode_image_file_to_base64(image_path, target_size=-1): 56 | image = Image.open(image_path) 57 | return encode_image_to_base64(image, target_size=target_size) 58 | 59 | 60 | def decode_base64_to_image(base64_string, target_size=-1): 61 | image_data = base64.b64decode(base64_string) 62 | image = Image.open(io.BytesIO(image_data)) 63 | if image.mode in ('RGBA', 'P'): 64 | image = image.convert('RGB') 65 | if target_size > 0: 66 | image.thumbnail((target_size, target_size)) 67 | return image 68 | 69 | 70 | def decode_base64_to_image_file(base64_string, image_path, target_size=-1): 71 | image = decode_base64_to_image(base64_string, target_size=target_size) 72 | image.save(image_path) 73 | 74 | 75 | def build_option_str(option_dict): 76 | s = 'There are several options: \n' 77 | for c, content in option_dict.items(): 78 | if not pd.isna(content): 79 | s += f'{c}. 
{content}\n' 80 | return s 81 | 82 | 83 | def isimg(s): 84 | return osp.exists(s) or s.startswith('http') 85 | 86 | 87 | def read_ok(img_path): 88 | if not osp.exists(img_path): 89 | return False 90 | try: 91 | im = Image.open(img_path) 92 | assert im.size[0] > 0 and im.size[1] > 0 93 | return True 94 | except: 95 | return False 96 | 97 | 98 | def gpt_key_set(): 99 | openai_key = os.environ.get('OPENAI_API_KEY', None) 100 | return isinstance(openai_key, str) and openai_key.startswith('sk-') 101 | 102 | 103 | def apiok(wrapper): 104 | s = wrapper.generate('Hello!') 105 | return wrapper.fail_msg not in s 106 | 107 | 108 | def circular_pred(df, extract_func=None): 109 | if extract_func is None: 110 | extract_func = lambda x: x # noqa: E731 111 | df = df.sort_values('index') 112 | from vlmeval.utils import can_infer_option 113 | shift = int(1e6) 114 | 115 | choices = [extract_func(x) for x in df['prediction']] 116 | pred_map = {i: c for i, c in zip(df['index'], choices)} 117 | flag_map = {i: True for i in pred_map if i < 1e6} 118 | valid_map = {i: True for i in pred_map if i < 1e6} 119 | for i in df['index']: 120 | if i >= shift and pred_map[i] and pred_map[i - shift]: 121 | if ( 122 | pred_map[i] not in list(string.ascii_uppercase) or # noqa: W504 123 | pred_map[i - shift] not in list(string.ascii_uppercase) 124 | ): 125 | 126 | valid_map[i % shift] = False 127 | continue 128 | if (ord(pred_map[i]) - ord(pred_map[i - shift])) % 4 == 1: 129 | continue 130 | else: 131 | flag_map[i % shift] = False 132 | flag_map = {k: v for k, v in flag_map.items() if valid_map[k]} 133 | flags = list(flag_map.values()) 134 | return np.mean(flags) 135 | -------------------------------------------------------------------------------- /vlmevalkit/vlmeval/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .matching_util import can_infer, can_infer_option, can_infer_text 2 | from .mp_util import track_progress_rich 3 | from .custom_prompt import CustomPrompt 4 | from .dataset_config import dataset_URLs, img_root_map, DATASET_TYPE, abbr2full 5 | from .dataset import TSVDataset, split_MMMU, MMMU_result_transfer 6 | 7 | 8 | __all__ = [ 9 | 'can_infer', 'can_infer_option', 'can_infer_text', 'track_progress_rich', 10 | 'TSVDataset', 'dataset_URLs', 'img_root_map', 'DATASET_TYPE', 'CustomPrompt', 11 | 'split_MMMU', 'abbr2full' 12 | ] 13 | -------------------------------------------------------------------------------- /vlmevalkit/vlmeval/utils/custom_prompt.py: -------------------------------------------------------------------------------- 1 | from ..smp import * 2 | from .dataset_config import img_root_map 3 | from abc import abstractmethod 4 | 5 | 6 | class CustomPrompt: 7 | 8 | @abstractmethod 9 | def use_custom_prompt(self, dataset): 10 | raise NotImplementedError 11 | 12 | @abstractmethod 13 | def build_prompt(self, line, dataset): 14 | raise NotImplementedError 15 | 16 | def dump_image(self, line, dataset): 17 | ROOT = LMUDataRoot() 18 | assert isinstance(dataset, str) 19 | img_root = osp.join(ROOT, 'images', img_root_map[dataset] if dataset in img_root_map else dataset) 20 | os.makedirs(img_root, exist_ok=True) 21 | if isinstance(line['image'], list): 22 | tgt_path = [] 23 | assert 'image_path' in line 24 | for img, im_name in zip(line['image'], line['image_path']): 25 | path = osp.join(img_root, im_name) 26 | if not read_ok(path): 27 | decode_base64_to_image_file(img, path) 28 | tgt_path.append(path) 29 | else: 30 | tgt_path = osp.join(img_root, 
f"{line['index']}.jpg") 31 | if not read_ok(tgt_path): 32 | decode_base64_to_image_file(line['image'], tgt_path) 33 | return tgt_path 34 | -------------------------------------------------------------------------------- /vlmevalkit/vlmeval/utils/matching_util.py: -------------------------------------------------------------------------------- 1 | import string 2 | import copy as cp 3 | import os 4 | from ..smp import * 5 | 6 | 7 | def can_infer_option(answer, choices): 8 | verbose = os.environ.get('VERBOSE', 0) 9 | # Choices is a dictionary 10 | if 'Failed to obtain answer via API' in answer: 11 | return False 12 | 13 | reject_to_answer = [ 14 | "Sorry, I can't help with images of people yet.", 15 | "I can't process this file.", 16 | "I'm sorry, but without the image provided", 17 | 'Cannot determine the answer' 18 | ] 19 | for err in reject_to_answer: 20 | if err in answer: 21 | return 'Z' 22 | 23 | def count_choice(splits, choices, prefix='', suffix=''): 24 | cnt = 0 25 | for c in choices: 26 | if prefix + c + suffix in splits: 27 | cnt += 1 28 | return cnt 29 | 30 | answer_mod = cp.copy(answer) 31 | chars = '.()[],:;!*#{}' 32 | for c in chars: 33 | answer_mod = answer_mod.replace(c, ' ') 34 | 35 | splits = [x.strip() for x in answer_mod.split()] 36 | count = count_choice(splits, choices) 37 | 38 | if count == 1: 39 | for ch in choices: 40 | if 'A' in splits and len(splits) > 3 and verbose: 41 | logger = get_logger('Evaluation') 42 | logger.info(f'A might be a quantifier in the string: {answer}.') 43 | return False 44 | if ch in splits: 45 | return ch 46 | elif count == 0 and count_choice(splits, {'Z', ''}) == 1: 47 | return 'Z' 48 | return False 49 | 50 | 51 | def can_infer_text(answer, choices): 52 | answer = answer.lower() 53 | assert isinstance(choices, dict) 54 | for k in choices: 55 | assert k in string.ascii_uppercase 56 | choices[k] = str(choices[k]).lower() 57 | cands = [] 58 | for k in choices: 59 | if choices[k] in answer: 60 | cands.append(k) 61 | if len(cands) == 1: 62 | return cands[0] 63 | return False 64 | 65 | 66 | def can_infer(answer, choices): 67 | answer = str(answer) 68 | copt = can_infer_option(answer, choices) 69 | return copt if copt else can_infer_text(answer, choices) 70 | -------------------------------------------------------------------------------- /vlmevalkit/vlmeval/vlm/__init__.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | torch.set_grad_enabled(False) 4 | torch.manual_seed(1234) 5 | from .base import BaseModel 6 | from .openomni_llama import OpenOmni_Llama3 7 | from .openomni_qwen import OpenOmni_Qwen2 --------------------------------------------------------------------------------