├── README.md ├── assets ├── emotion_temp.wav ├── example.png ├── framework.png ├── librispeech_temp.wav ├── logo.png ├── loss_plot_outliers_replaced.pdf ├── question.wav └── temp.wav ├── cosyvoice ├── __init__.py ├── bin │ ├── inference.py │ └── train.py ├── cli │ ├── __init__.py │ ├── cosyvoice.py │ ├── frontend.py │ └── model.py ├── dataset │ ├── __init__.py │ ├── dataset.py │ └── processor.py ├── flow │ ├── decoder.py │ ├── flow.py │ ├── flow_gradtts.py │ ├── flow_matching.py │ ├── flow_matching_dit.py │ ├── length_regulator.py │ └── stable │ │ ├── adp.py │ │ ├── blocks.py │ │ ├── dit.py │ │ ├── dit_v2.py │ │ ├── sampling.py │ │ ├── stable_diffusion.py │ │ ├── stable_diffusion_test.py │ │ ├── transformer.py │ │ └── transformer_use_mask.py ├── hifigan │ ├── f0_predictor.py │ └── generator.py ├── llm │ └── llm.py ├── transformer │ ├── __init__.py │ ├── activation.py │ ├── attention.py │ ├── convolution.py │ ├── decoder.py │ ├── decoder_layer.py │ ├── embedding.py │ ├── encoder.py │ ├── encoder_layer.py │ ├── label_smoothing_loss.py │ ├── positionwise_feed_forward.py │ └── subsampling.py ├── utils │ ├── __init__.py │ ├── block_mask_util.py │ ├── class_utils.py │ ├── common.py │ ├── executor.py │ ├── file_utils.py │ ├── frontend_utils.py │ ├── mask.py │ ├── scheduler.py │ └── train_utils.py ├── vocab_16K.yaml └── vocab_6K.yaml ├── demo.py ├── inference.py ├── openomni ├── __init__.py ├── constants.py ├── conversation.py ├── eval │ ├── cvbench_eval.py │ ├── eval_gpt_review.py │ ├── eval_gpt_review_bench.py │ ├── eval_gpt_review_visual.py │ ├── eval_pope.py │ ├── eval_science_qa.py │ ├── eval_science_qa_gpt4.py │ ├── eval_science_qa_gpt4_requery.py │ ├── eval_textvqa.py │ ├── generate_webpage_data_from_table.py │ ├── m4c_evaluator.py │ ├── mm_vet_eval.py │ ├── mminst_eval.py │ ├── mmvp_eval.py │ ├── model_qa.py │ ├── model_vqa_blink.py │ ├── model_vqa_cvbench.py │ ├── model_vqa_gqa.py │ ├── model_vqa_loader.py │ ├── model_vqa_mia.py │ ├── model_vqa_mmbench.py │ ├── model_vqa_mminst.py │ ├── model_vqa_mminst2.py │ ├── model_vqa_mmvp.py │ ├── model_vqa_science.py │ ├── model_vqa_test.py │ ├── model_vqa_textvqa.py │ ├── model_vqa_vqa2.py │ ├── omni_eval.py │ ├── ov_odssey_eval.py │ ├── qa_baseline_gpt35.py │ ├── qwen2 │ │ ├── aishell2_eval.jsonl │ │ ├── asr_eval.py │ │ ├── et2s_eval.py │ │ ├── librispeech_eval.jsonl │ │ ├── omni_eval.py │ │ ├── openomni_emotion_val.json │ │ ├── ov_ossey_eval.py │ │ ├── t2s_eval.py │ │ └── wenetspeech_eval.json │ ├── run_llava.py │ └── summarize_gpt_review.py ├── flow_inference.py ├── mm_utils.py ├── model │ ├── __init__.py │ ├── apply_delta.py │ ├── builder.py │ ├── consolidate.py │ ├── language_model │ │ ├── llava_her_llama.py │ │ ├── llava_her_qwen.py │ │ ├── llava_llama.py │ │ ├── llava_mistral.py │ │ ├── llava_mpt.py │ │ └── llava_qwen.py │ ├── llava_arch.py │ ├── llava_her_arch.py │ ├── make_delta.py │ ├── multimodal_encoder │ │ ├── builder.py │ │ └── clip_encoder.py │ ├── multimodal_projector │ │ └── builder.py │ ├── speech_encoder │ │ ├── builder.py │ │ └── speech_encoder.py │ ├── speech_generator_ar │ │ ├── builder.py │ │ ├── generation.py │ │ └── speech_generator.py │ ├── speech_generator_ctc │ │ ├── builder.py │ │ ├── generation.py │ │ └── speech_generator.py │ ├── speech_projector │ │ ├── builder.py │ │ └── speech_projector.py │ ├── utils.py │ ├── visual_encoder │ │ ├── builder.py │ │ └── clip_encoder.py │ └── visual_projector │ │ └── builder.py ├── serve │ ├── __init__.py │ ├── cli.py │ ├── controller.py │ ├── examples │ │ ├── extreme_ironing.jpg │ 
│ └── waterview.jpg │ ├── gradio_web_server.py │ ├── model_worker.py │ ├── register_worker.py │ ├── sglang_worker.py │ └── test_message.py ├── train │ ├── llama_flash_attn_monkey_patch.py │ ├── llama_xformers_attn_monkey_patch.py │ ├── llava_trainer.py │ ├── train.py │ ├── train_mem.py │ └── train_xformers.py └── utils.py ├── pyproject.toml ├── requirements.txt ├── scripts ├── clear.sh ├── convert_gqa_for_eval.py ├── convert_mmbench_for_submission.py ├── convert_mmvet_for_eval.py ├── convert_seed_for_submission.py ├── convert_sqa_to_llava.py ├── convert_sqa_to_llava_base_prompt.py ├── convert_vizwiz_for_submission.py ├── convert_vqav2_for_submission.py ├── mmevol │ ├── eval │ │ ├── gqa.sh │ │ ├── mmbench.sh │ │ ├── mmbench_cn.sh │ │ ├── mme.sh │ │ ├── seed.sh │ │ ├── sqa.sh │ │ └── textvqa.sh │ └── train │ │ ├── llama3 │ │ ├── finetune.sh │ │ └── pretrain.sh │ │ └── qwen2 │ │ ├── finetune.sh │ │ └── pretrain.sh ├── train │ ├── llama3 │ │ ├── asr_finetune.sh │ │ ├── image2text_finetune.sh │ │ ├── image2text_pretrain.sh │ │ ├── speech2text_pretrain.sh │ │ ├── text2speech_dpo.sh │ │ ├── text2speech_pretrain.sh │ │ └── text2speech_pretrain_ctc.sh │ └── qwen2 │ │ ├── asr_finetune.sh │ │ ├── image2text_finetune.sh │ │ ├── image2text_pretrain.sh │ │ ├── speech2text_pretrain.sh │ │ ├── text2speech_dpo.sh │ │ ├── text2speech_pretrain.sh │ │ ├── text2speech_pretrain_6k.sh │ │ └── text2speech_pretrain_ctc.sh ├── zero2.json ├── zero3.json └── zero3_offload.json └── vlmevalkit ├── run.py ├── script └── run_inference_2.sh └── vlmeval ├── __init__.py ├── api ├── __init__.py ├── base.py ├── gpt.py └── gpt_int.py ├── config.py ├── evaluate ├── OCRBench.py ├── __init__.py ├── coco_eval.py ├── llavabench.py ├── mathvista_eval.py ├── misc.py ├── mmvet_eval.py ├── multiple_choice.py ├── vqa_eval.py └── yes_or_no.py ├── inference.py ├── smp ├── __init__.py ├── file.py ├── log.py ├── misc.py └── vlm.py ├── utils ├── __init__.py ├── custom_prompt.py ├── dataset.py ├── dataset_config.py ├── matching_util.py └── mp_util.py └── vlm ├── __init__.py ├── base.py ├── openomni_llama.py └── openomni_qwen.py /assets/emotion_temp.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RainBowLuoCS/OpenOmni/2892efff5fb4155a2f0b775aaa1f316aa69e289a/assets/emotion_temp.wav -------------------------------------------------------------------------------- /assets/example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RainBowLuoCS/OpenOmni/2892efff5fb4155a2f0b775aaa1f316aa69e289a/assets/example.png -------------------------------------------------------------------------------- /assets/framework.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RainBowLuoCS/OpenOmni/2892efff5fb4155a2f0b775aaa1f316aa69e289a/assets/framework.png -------------------------------------------------------------------------------- /assets/librispeech_temp.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RainBowLuoCS/OpenOmni/2892efff5fb4155a2f0b775aaa1f316aa69e289a/assets/librispeech_temp.wav -------------------------------------------------------------------------------- /assets/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RainBowLuoCS/OpenOmni/2892efff5fb4155a2f0b775aaa1f316aa69e289a/assets/logo.png 
-------------------------------------------------------------------------------- /assets/loss_plot_outliers_replaced.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RainBowLuoCS/OpenOmni/2892efff5fb4155a2f0b775aaa1f316aa69e289a/assets/loss_plot_outliers_replaced.pdf -------------------------------------------------------------------------------- /assets/question.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RainBowLuoCS/OpenOmni/2892efff5fb4155a2f0b775aaa1f316aa69e289a/assets/question.wav -------------------------------------------------------------------------------- /assets/temp.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RainBowLuoCS/OpenOmni/2892efff5fb4155a2f0b775aaa1f316aa69e289a/assets/temp.wav -------------------------------------------------------------------------------- /cosyvoice/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RainBowLuoCS/OpenOmni/2892efff5fb4155a2f0b775aaa1f316aa69e289a/cosyvoice/__init__.py -------------------------------------------------------------------------------- /cosyvoice/cli/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RainBowLuoCS/OpenOmni/2892efff5fb4155a2f0b775aaa1f316aa69e289a/cosyvoice/cli/__init__.py -------------------------------------------------------------------------------- /cosyvoice/cli/cosyvoice.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | import os 15 | import torch 16 | from hyperpyyaml import load_hyperpyyaml 17 | from modelscope import snapshot_download 18 | from cosyvoice.cli.frontend import CosyVoiceFrontEnd 19 | from cosyvoice.cli.model import CosyVoiceModel 20 | 21 | class CosyVoice: 22 | 23 | def __init__(self, model_dir): 24 | instruct = True if '-Instruct' in model_dir else False 25 | self.model_dir = model_dir 26 | if not os.path.exists(model_dir): 27 | model_dir = snapshot_download(model_dir) 28 | with open('{}/cosyvoice.yaml'.format(model_dir), 'r') as f: 29 | configs = load_hyperpyyaml(f) 30 | self.frontend = CosyVoiceFrontEnd(configs['get_tokenizer'], 31 | configs['feat_extractor'], 32 | '{}/campplus.onnx'.format(model_dir), 33 | '{}/speech_tokenizer_v1.onnx'.format(model_dir), 34 | '{}/spk2info.pt'.format(model_dir), 35 | instruct, 36 | configs['allowed_special']) 37 | self.model = CosyVoiceModel(configs['llm'], configs['flow'], configs['hift']) 38 | self.model.load('{}/llm.pt'.format(model_dir), 39 | '{}/flow.pt'.format(model_dir), 40 | '{}/hift.pt'.format(model_dir)) 41 | del configs 42 | 43 | def list_avaliable_spks(self): 44 | spks = list(self.frontend.spk2info.keys()) 45 | return spks 46 | 47 | def inference_sft(self, tts_text, spk_id): 48 | tts_speeches = [] 49 | for i in self.frontend.text_normalize(tts_text, split=True): 50 | model_input = self.frontend.frontend_sft(i, spk_id) 51 | model_output = self.model.inference(**model_input) 52 | tts_speeches.append(model_output['tts_speech']) 53 | return {'tts_speech': torch.concat(tts_speeches, dim=1)} 54 | 55 | def inference_zero_shot(self, tts_text, prompt_text, prompt_speech_16k): 56 | prompt_text = self.frontend.text_normalize(prompt_text, split=False) 57 | tts_speeches = [] 58 | for i in self.frontend.text_normalize(tts_text, split=True): 59 | model_input = self.frontend.frontend_zero_shot(i, prompt_text, prompt_speech_16k) 60 | model_output = self.model.inference(**model_input) 61 | tts_speeches.append(model_output['tts_speech']) 62 | return {'tts_speech': torch.concat(tts_speeches, dim=1)} 63 | 64 | def inference_cross_lingual(self, tts_text, prompt_speech_16k): 65 | if self.frontend.instruct is True: 66 | raise ValueError('{} do not support cross_lingual inference'.format(self.model_dir)) 67 | tts_speeches = [] 68 | for i in self.frontend.text_normalize(tts_text, split=True): 69 | model_input = self.frontend.frontend_cross_lingual(i, prompt_speech_16k) 70 | model_output = self.model.inference(**model_input) 71 | tts_speeches.append(model_output['tts_speech']) 72 | return {'tts_speech': torch.concat(tts_speeches, dim=1)} 73 | 74 | def inference_instruct(self, tts_text, spk_id, instruct_text): 75 | if self.frontend.instruct is False: 76 | raise ValueError('{} do not support instruct inference'.format(self.model_dir)) 77 | instruct_text = self.frontend.text_normalize(instruct_text, split=False) 78 | tts_speeches = [] 79 | for i in self.frontend.text_normalize(tts_text, split=True): 80 | model_input = self.frontend.frontend_instruct(i, spk_id, instruct_text) 81 | model_output = self.model.inference(**model_input) 82 | tts_speeches.append(model_output['tts_speech']) 83 | return {'tts_speech': torch.concat(tts_speeches, dim=1)} 84 | -------------------------------------------------------------------------------- /cosyvoice/dataset/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RainBowLuoCS/OpenOmni/2892efff5fb4155a2f0b775aaa1f316aa69e289a/cosyvoice/dataset/__init__.py 
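A minimal usage sketch for the CosyVoice wrapper above. The model directory name is a placeholder and the 22050 Hz sample rate is taken from the vocab_16K.yaml config later in this dump; both are assumptions rather than values fixed by cosyvoice.py itself:

import torchaudio
from cosyvoice.cli.cosyvoice import CosyVoice

# Loads the frontend, LLM, flow and HiFT modules from one model directory
# (fetched via modelscope snapshot_download when the path does not exist locally).
cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-SFT')  # hypothetical local path
spk_id = cosyvoice.list_avaliable_spks()[0]                    # speaker ids bundled in spk2info.pt

# inference_sft normalizes and splits the text, then concatenates per-sentence audio along dim=1.
out = cosyvoice.inference_sft('Hello, this is a quick smoke test.', spk_id)
torchaudio.save('sft_demo.wav', out['tts_speech'], 22050)      # sample rate assumed from the yaml config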
-------------------------------------------------------------------------------- /cosyvoice/flow/length_regulator.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from typing import Tuple 15 | import torch.nn as nn 16 | from torch.nn import functional as F 17 | from cosyvoice.utils.mask import make_pad_mask 18 | 19 | 20 | class InterpolateRegulator(nn.Module): 21 | def __init__( 22 | self, 23 | channels: int, 24 | sampling_ratios: Tuple, 25 | out_channels: int = None, 26 | groups: int = 1, 27 | ): 28 | super().__init__() 29 | self.sampling_ratios = sampling_ratios 30 | out_channels = out_channels or channels 31 | model = nn.ModuleList([]) 32 | if len(sampling_ratios) > 0: 33 | for _ in sampling_ratios: 34 | module = nn.Conv1d(channels, channels, 3, 1, 1) 35 | norm = nn.GroupNorm(groups, channels) 36 | act = nn.Mish() 37 | model.extend([module, norm, act]) 38 | model.append( 39 | nn.Conv1d(channels, out_channels, 1, 1) 40 | ) 41 | self.model = nn.Sequential(*model) 42 | 43 | def forward(self, x, ylens=None): 44 | # x in (B, T, D) 45 | mask = (~make_pad_mask(ylens)).to(x).unsqueeze(-1) 46 | x = F.interpolate(x.transpose(1, 2).contiguous(), size=ylens.max(), mode='nearest') 47 | out = self.model(x).transpose(1, 2).contiguous() 48 | olens = ylens 49 | return out * mask, olens 50 | -------------------------------------------------------------------------------- /cosyvoice/flow/stable/stable_diffusion.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn import functional as F 3 | from .dit import DiffusionTransformer 4 | from .adp import UNet1d 5 | from .sampling import sample 6 | import math 7 | from model.base import BaseModule 8 | import pdb 9 | 10 | target_length = 1536 11 | 12 | 13 | def pad_and_create_mask(matrix, target_length): 14 | T = matrix.shape[2] 15 | if T > target_length: 16 | raise ValueError("The third dimension length %s should not exceed %s" % (T, target_length)) 17 | 18 | padding_size = target_length - T 19 | 20 | padded_matrix = F.pad(matrix, (0, padding_size), "constant", 0) 21 | 22 | mask = torch.ones((1, target_length)) 23 | mask[:, T:] = 0 # Set the padding part to 0 24 | 25 | return padded_matrix.to(matrix.device), mask.to(matrix.device) 26 | 27 | 28 | class Stable_Diffusion(BaseModule): 29 | def __init__(self, io_channels, input_concat_dim=None, embed_dim=768, depth=24, num_heads=24, 30 | project_cond_tokens=False, transformer_type="continuous_transformer"): 31 | super(Stable_Diffusion, self).__init__() 32 | self.diffusion = DiffusionTransformer( 33 | io_channels=io_channels, 34 | input_concat_dim=input_concat_dim, 35 | embed_dim=embed_dim, 36 | # cond_token_dim=target_length, 37 | depth=depth, 38 | num_heads=num_heads, 39 | project_cond_tokens=project_cond_tokens, 40 | transformer_type=transformer_type, 41 | ) 
42 | # self.diffusion = UNet1d( 43 | # in_channels=80, 44 | # channels=256, 45 | # resnet_groups=16, 46 | # kernel_multiplier_downsample=2, 47 | # multipliers=[4, 4, 4, 5, 5], 48 | # factors=[1, 2, 2, 4], # 输入长度不一致卷积缩短 49 | # num_blocks=[2, 2, 2, 2], 50 | # attentions=[1, 3, 3, 3, 3], 51 | # attention_heads=16, 52 | # attention_multiplier=4, 53 | # use_nearest_upsample=False, 54 | # use_skip_scale=True, 55 | # use_context_time=True 56 | # ) 57 | self.rng = torch.quasirandom.SobolEngine(1, scramble=True) 58 | 59 | @torch.no_grad() 60 | def forward(self, mu, mask, n_timesteps): 61 | # pdb.set_trace() 62 | mask = mask.squeeze(1) 63 | noise = torch.randn_like(mu).to(mu.device) 64 | # mu_pad, mu_pad_mask = pad_and_create_mask(mu, target_length) 65 | # extra_args = {"cross_attn_cond": mu, "cross_attn_cond_mask": mask, "mask": mask} 66 | extra_args = {"input_concat_cond": mu, "mask": mask} 67 | fakes = sample(self.diffusion, noise, n_timesteps, 0, **extra_args) 68 | 69 | return fakes 70 | 71 | def compute_loss(self, x0, mask, mu): 72 | 73 | # pdb.set_trace() 74 | t = self.rng.draw(x0.shape[0])[:, 0].to(x0.device) 75 | alphas, sigmas = torch.cos(t * math.pi / 2), torch.sin(t * math.pi / 2) 76 | 77 | alphas = alphas[:, None, None] 78 | sigmas = sigmas[:, None, None] 79 | noise = torch.randn_like(x0) 80 | noised_inputs = x0 * alphas + noise * sigmas 81 | targets = noise * alphas - x0 * sigmas 82 | mask = mask.squeeze(1) 83 | # mu_pad, mu_pad_mask = pad_and_create_mask(mu, target_length) 84 | # output = self.diffusion(noised_inputs, t, cross_attn_cond=mu, 85 | # cross_attn_cond_mask=mask, mask=mask, cfg_dropout_prob=0.1) 86 | # pdb.set_trace() 87 | output = self.diffusion(noised_inputs, # [bs, 80, 229] 88 | t, # (bs,) 89 | input_concat_cond=mu, 90 | mask=mask, # [bs, 229] 91 | cfg_dropout_prob=0.1) 92 | 93 | return self.mse_loss(output, targets, mask), output 94 | 95 | def mse_loss(self, output, targets, mask): 96 | 97 | mse_loss = F.mse_loss(output, targets, reduction='none') 98 | 99 | if mask.ndim == 2 and mse_loss.ndim == 3: 100 | mask = mask.unsqueeze(1) 101 | 102 | if mask.shape[1] != mse_loss.shape[1]: 103 | mask = mask.repeat(1, mse_loss.shape[1], 1) 104 | 105 | mse_loss = mse_loss * mask 106 | 107 | mse_loss = mse_loss.mean() 108 | 109 | return mse_loss 110 | -------------------------------------------------------------------------------- /cosyvoice/flow/stable/stable_diffusion_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn import functional as F 3 | from .dit import DiffusionTransformer 4 | from .adp import UNet1d 5 | from .sampling import sample 6 | import math 7 | from model.base import BaseModule 8 | import pdb 9 | 10 | target_length = 1536 11 | def pad_and_create_mask(matrix, target_length): 12 | 13 | T = matrix.shape[2] 14 | if T > target_length: 15 | raise ValueError("The third dimension length %s should not exceed %s"%(T, target_length)) 16 | 17 | padding_size = target_length - T 18 | 19 | padded_matrix = F.pad(matrix, (0, padding_size), "constant", 0) 20 | 21 | mask = torch.ones((1, target_length)) 22 | mask[:, T:] = 0 # Set the padding part to 0 23 | 24 | return padded_matrix.to(matrix.device), mask.to(matrix.device) 25 | 26 | 27 | class Stable_Diffusion(BaseModule): 28 | def __init__(self): 29 | super(Stable_Diffusion, self).__init__() 30 | self.diffusion = DiffusionTransformer( 31 | io_channels=80, 32 | # input_concat_dim=80, 33 | embed_dim=768, 34 | # cond_token_dim=target_length, 35 | depth=24, 36 | 
num_heads=24, 37 | project_cond_tokens=False, 38 | transformer_type="continuous_transformer", 39 | ) 40 | # self.diffusion = UNet1d( 41 | # in_channels=80, 42 | # channels=256, 43 | # resnet_groups=16, 44 | # kernel_multiplier_downsample=2, 45 | # multipliers=[4, 4, 4, 5, 5], 46 | # factors=[1, 2, 2, 4], # 输入长度不一致卷积缩短 47 | # num_blocks=[2, 2, 2, 2], 48 | # attentions=[1, 3, 3, 3, 3], 49 | # attention_heads=16, 50 | # attention_multiplier=4, 51 | # use_nearest_upsample=False, 52 | # use_skip_scale=True, 53 | # use_context_time=True 54 | # ) 55 | self.rng = torch.quasirandom.SobolEngine(1, scramble=True) 56 | 57 | @torch.no_grad() 58 | def forward(self, mu, mask, n_timesteps): 59 | # pdb.set_trace() 60 | mask = mask.squeeze(1) 61 | # noise = torch.randn_like(mu).to(mu.device) 62 | # mu_pad, mu_pad_mask = pad_and_create_mask(mu, target_length) 63 | # extra_args = {"cross_attn_cond": mu, "cross_attn_cond_mask": mask, "mask": mask} 64 | extra_args = {"mask": mask} 65 | fakes = sample(self.diffusion, mu, n_timesteps, 0, **extra_args) 66 | 67 | return fakes 68 | 69 | 70 | def compute_loss(self, x0, mask, mu): 71 | 72 | # pdb.set_trace() 73 | t = self.rng.draw(x0.shape[0])[:, 0].to(x0.device) 74 | alphas, sigmas = torch.cos(t * math.pi / 2), torch.sin(t * math.pi / 2) 75 | 76 | alphas = alphas[:, None, None] 77 | sigmas = sigmas[:, None, None] 78 | noise = torch.randn_like(x0) 79 | noised_inputs = x0 * alphas + noise * sigmas 80 | targets = mu * alphas - x0 * sigmas 81 | mask = mask.squeeze(1) 82 | # mu_pad, mu_pad_mask = pad_and_create_mask(mu, target_length) 83 | # output = self.diffusion(noised_inputs, t, cross_attn_cond=mu, 84 | # cross_attn_cond_mask=mask, mask=mask, cfg_dropout_prob=0.1) 85 | output = self.diffusion(noised_inputs, t, mask=mask, cfg_dropout_prob=0.1) 86 | 87 | return self.mse_loss(output, targets, mask), output 88 | 89 | 90 | def mse_loss(self, output, targets, mask): 91 | 92 | mse_loss = F.mse_loss(output, targets, reduction='none') 93 | 94 | if mask.ndim == 2 and mse_loss.ndim == 3: 95 | mask = mask.unsqueeze(1) 96 | 97 | if mask.shape[1] != mse_loss.shape[1]: 98 | mask = mask.repeat(1, mse_loss.shape[1], 1) 99 | 100 | mse_loss = mse_loss[mask] 101 | 102 | mse_loss = mse_loss.mean() 103 | 104 | return mse_loss -------------------------------------------------------------------------------- /cosyvoice/hifigan/f0_predictor.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Kai Hu) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
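The compute_loss methods in the two Stable_Diffusion variants above share the same cosine noising schedule; stable_diffusion.py predicts noise * alphas - x0 * sigmas, while stable_diffusion_test.py substitutes mu for noise in the target. A dependency-free sketch of that noising step, with purely illustrative shapes:

import math
import torch

x0 = torch.randn(2, 80, 100)                    # clean mel features (B, C, T); shapes assumed
t = torch.rand(2)                               # diffusion time drawn in [0, 1)
alphas = torch.cos(t * math.pi / 2)[:, None, None]
sigmas = torch.sin(t * math.pi / 2)[:, None, None]
noise = torch.randn_like(x0)
noised_inputs = x0 * alphas + noise * sigmas    # what the DiffusionTransformer receives
targets = noise * alphas - x0 * sigmas          # v-style target used in stable_diffusion.py
print(noised_inputs.shape, targets.shape)       # both torch.Size([2, 80, 100])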
14 | import torch 15 | import torch.nn as nn 16 | from torch.nn.utils import weight_norm 17 | 18 | 19 | class ConvRNNF0Predictor(nn.Module): 20 | def __init__(self, 21 | num_class: int = 1, 22 | in_channels: int = 80, 23 | cond_channels: int = 512 24 | ): 25 | super().__init__() 26 | 27 | self.num_class = num_class 28 | self.condnet = nn.Sequential( 29 | weight_norm( 30 | nn.Conv1d(in_channels, cond_channels, kernel_size=3, padding=1) 31 | ), 32 | nn.ELU(), 33 | weight_norm( 34 | nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1) 35 | ), 36 | nn.ELU(), 37 | weight_norm( 38 | nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1) 39 | ), 40 | nn.ELU(), 41 | weight_norm( 42 | nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1) 43 | ), 44 | nn.ELU(), 45 | weight_norm( 46 | nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1) 47 | ), 48 | nn.ELU(), 49 | ) 50 | self.classifier = nn.Linear(in_features=cond_channels, out_features=self.num_class) 51 | 52 | def forward(self, x: torch.Tensor) -> torch.Tensor: 53 | x = self.condnet(x) 54 | x = x.transpose(1, 2) 55 | return torch.abs(self.classifier(x).squeeze(-1)) 56 | -------------------------------------------------------------------------------- /cosyvoice/transformer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RainBowLuoCS/OpenOmni/2892efff5fb4155a2f0b775aaa1f316aa69e289a/cosyvoice/transformer/__init__.py -------------------------------------------------------------------------------- /cosyvoice/transformer/activation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 Johns Hopkins University (Shinji Watanabe) 2 | # 2020 Northwestern Polytechnical University (Pengcheng Guo) 3 | # 2020 Mobvoi Inc (Binbin Zhang) 4 | # 2024 Alibaba Inc (Xiang Lyu) 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | """Swish() activation function for Conformer.""" 18 | 19 | import torch 20 | from torch import nn, sin, pow 21 | from torch.nn import Parameter 22 | 23 | 24 | class Swish(torch.nn.Module): 25 | """Construct an Swish object.""" 26 | 27 | def forward(self, x: torch.Tensor) -> torch.Tensor: 28 | """Return Swish activation function.""" 29 | return x * torch.sigmoid(x) 30 | 31 | 32 | # Implementation adapted from https://github.com/EdwardDixon/snake under the MIT license. 33 | # LICENSE is in incl_licenses directory. 
34 | class Snake(nn.Module): 35 | ''' 36 | Implementation of a sine-based periodic activation function 37 | Shape: 38 | - Input: (B, C, T) 39 | - Output: (B, C, T), same shape as the input 40 | Parameters: 41 | - alpha - trainable parameter 42 | References: 43 | - This activation function is from this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda: 44 | https://arxiv.org/abs/2006.08195 45 | Examples: 46 | >>> a1 = snake(256) 47 | >>> x = torch.randn(256) 48 | >>> x = a1(x) 49 | ''' 50 | def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False): 51 | ''' 52 | Initialization. 53 | INPUT: 54 | - in_features: shape of the input 55 | - alpha: trainable parameter 56 | alpha is initialized to 1 by default, higher values = higher-frequency. 57 | alpha will be trained along with the rest of your model. 58 | ''' 59 | super(Snake, self).__init__() 60 | self.in_features = in_features 61 | 62 | # initialize alpha 63 | self.alpha_logscale = alpha_logscale 64 | if self.alpha_logscale: # log scale alphas initialized to zeros 65 | self.alpha = Parameter(torch.zeros(in_features) * alpha) 66 | else: # linear scale alphas initialized to ones 67 | self.alpha = Parameter(torch.ones(in_features) * alpha) 68 | 69 | self.alpha.requires_grad = alpha_trainable 70 | 71 | self.no_div_by_zero = 0.000000001 72 | 73 | def forward(self, x): 74 | ''' 75 | Forward pass of the function. 76 | Applies the function to the input elementwise. 77 | Snake ∶= x + 1/a * sin^2 (xa) 78 | ''' 79 | alpha = self.alpha.unsqueeze(0).unsqueeze(-1) # line up with x to [B, C, T] 80 | if self.alpha_logscale: 81 | alpha = torch.exp(alpha) 82 | x = x + (1.0 / (alpha + self.no_div_by_zero)) * pow(sin(x * alpha), 2) 83 | 84 | return x 85 | -------------------------------------------------------------------------------- /cosyvoice/transformer/label_smoothing_loss.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Shigeki Karita 2 | # 2020 Mobvoi Inc (Binbin Zhang) 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Label smoothing module.""" 16 | 17 | import torch 18 | from torch import nn 19 | 20 | 21 | class LabelSmoothingLoss(nn.Module): 22 | """Label-smoothing loss. 23 | 24 | In a standard CE loss, the label's data distribution is: 25 | [0,1,2] -> 26 | [ 27 | [1.0, 0.0, 0.0], 28 | [0.0, 1.0, 0.0], 29 | [0.0, 0.0, 1.0], 30 | ] 31 | 32 | In the smoothing version CE Loss,some probabilities 33 | are taken from the true label prob (1.0) and are divided 34 | among other labels. 35 | 36 | e.g. 
37 | smoothing=0.1 38 | [0,1,2] -> 39 | [ 40 | [0.9, 0.05, 0.05], 41 | [0.05, 0.9, 0.05], 42 | [0.05, 0.05, 0.9], 43 | ] 44 | 45 | Args: 46 | size (int): the number of class 47 | padding_idx (int): padding class id which will be ignored for loss 48 | smoothing (float): smoothing rate (0.0 means the conventional CE) 49 | normalize_length (bool): 50 | normalize loss by sequence length if True 51 | normalize loss by batch size if False 52 | """ 53 | 54 | def __init__(self, 55 | size: int, 56 | padding_idx: int, 57 | smoothing: float, 58 | normalize_length: bool = False): 59 | """Construct an LabelSmoothingLoss object.""" 60 | super(LabelSmoothingLoss, self).__init__() 61 | self.criterion = nn.KLDivLoss(reduction="none") 62 | self.padding_idx = padding_idx 63 | self.confidence = 1.0 - smoothing 64 | self.smoothing = smoothing 65 | self.size = size 66 | self.normalize_length = normalize_length 67 | 68 | def forward(self, x: torch.Tensor, target: torch.Tensor) -> torch.Tensor: 69 | """Compute loss between x and target. 70 | 71 | The model outputs and data labels tensors are flatten to 72 | (batch*seqlen, class) shape and a mask is applied to the 73 | padding part which should not be calculated for loss. 74 | 75 | Args: 76 | x (torch.Tensor): prediction (batch, seqlen, class) 77 | target (torch.Tensor): 78 | target signal masked with self.padding_id (batch, seqlen) 79 | Returns: 80 | loss (torch.Tensor) : The KL loss, scalar float value 81 | """ 82 | assert x.size(2) == self.size 83 | batch_size = x.size(0) 84 | x = x.view(-1, self.size) 85 | target = target.view(-1) 86 | # use zeros_like instead of torch.no_grad() for true_dist, 87 | # since no_grad() can not be exported by JIT 88 | true_dist = torch.zeros_like(x) 89 | true_dist.fill_(self.smoothing / (self.size - 1)) 90 | ignore = target == self.padding_idx # (B,) 91 | total = len(target) - ignore.sum().item() 92 | target = target.masked_fill(ignore, 0) # avoid -1 index 93 | true_dist.scatter_(1, target.unsqueeze(1), self.confidence) 94 | kl = self.criterion(torch.log_softmax(x, dim=1), true_dist) 95 | denom = total if self.normalize_length else batch_size 96 | return kl.masked_fill(ignore.unsqueeze(1), 0).sum() / denom 97 | -------------------------------------------------------------------------------- /cosyvoice/transformer/positionwise_feed_forward.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Shigeki Karita 2 | # 2020 Mobvoi Inc (Binbin Zhang) 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Positionwise feed forward layer definition.""" 16 | 17 | import torch 18 | 19 | 20 | class PositionwiseFeedForward(torch.nn.Module): 21 | """Positionwise feed forward layer. 22 | 23 | FeedForward are appied on each position of the sequence. 24 | The output dim is same with the input dim. 25 | 26 | Args: 27 | idim (int): Input dimenstion. 28 | hidden_units (int): The number of hidden units. 29 | dropout_rate (float): Dropout rate. 
30 | activation (torch.nn.Module): Activation function 31 | """ 32 | 33 | def __init__( 34 | self, 35 | idim: int, 36 | hidden_units: int, 37 | dropout_rate: float, 38 | activation: torch.nn.Module = torch.nn.ReLU(), 39 | ): 40 | """Construct a PositionwiseFeedForward object.""" 41 | super(PositionwiseFeedForward, self).__init__() 42 | self.w_1 = torch.nn.Linear(idim, hidden_units) 43 | self.activation = activation 44 | self.dropout = torch.nn.Dropout(dropout_rate) 45 | self.w_2 = torch.nn.Linear(hidden_units, idim) 46 | 47 | def forward(self, xs: torch.Tensor) -> torch.Tensor: 48 | """Forward function. 49 | 50 | Args: 51 | xs: input tensor (B, L, D) 52 | Returns: 53 | output tensor, (B, L, D) 54 | """ 55 | return self.w_2(self.dropout(self.activation(self.w_1(xs)))) 56 | 57 | 58 | class MoEFFNLayer(torch.nn.Module): 59 | """ 60 | Mixture of expert with Positionwise feed forward layer 61 | See also figure 1 in https://arxiv.org/pdf/2305.15663.pdf 62 | The output dim is same with the input dim. 63 | 64 | Modified from https://github.com/Lightning-AI/lit-gpt/pull/823 65 | https://github.com/mistralai/mistral-src/blob/b46d6/moe_one_file_ref.py#L203-L219 66 | Args: 67 | n_expert: number of expert. 68 | n_expert_per_token: The actual number of experts used for each frame 69 | idim (int): Input dimenstion. 70 | hidden_units (int): The number of hidden units. 71 | dropout_rate (float): Dropout rate. 72 | activation (torch.nn.Module): Activation function 73 | """ 74 | 75 | def __init__( 76 | self, 77 | n_expert: int, 78 | n_expert_per_token: int, 79 | idim: int, 80 | hidden_units: int, 81 | dropout_rate: float, 82 | activation: torch.nn.Module = torch.nn.ReLU(), 83 | ): 84 | super(MoEFFNLayer, self).__init__() 85 | self.gate = torch.nn.Linear(idim, n_expert, bias=False) 86 | self.experts = torch.nn.ModuleList( 87 | PositionwiseFeedForward(idim, hidden_units, dropout_rate, 88 | activation) for _ in range(n_expert)) 89 | self.n_expert_per_token = n_expert_per_token 90 | 91 | def forward(self, xs: torch.Tensor) -> torch.Tensor: 92 | """Foward function. 
93 | Args: 94 | xs: input tensor (B, L, D) 95 | Returns: 96 | output tensor, (B, L, D) 97 | 98 | """ 99 | B, L, D = xs.size( 100 | ) # batch size, sequence length, embedding dimension (idim) 101 | xs = xs.view(-1, D) # (B*L, D) 102 | router = self.gate(xs) # (B*L, n_expert) 103 | logits, indices = torch.topk( 104 | router, self.n_expert_per_token 105 | ) # logits: (B*L, n_expert_per_token), indices: (B*L, n_expert_per_token) 106 | weights = torch.nn.functional.softmax( 107 | logits, dim=1, 108 | dtype=torch.float).to(dtype=xs.dtype) # (B*L, n_expert_per_token) 109 | output = torch.zeros_like(xs) # (B*L, D) 110 | for i, expert in enumerate(self.experts): 111 | mask = indices == i 112 | batch_idx, ith_expert = torch.where(mask) 113 | output[batch_idx] += weights[batch_idx, ith_expert, None] * expert( 114 | xs[batch_idx]) 115 | return output.view(B, L, D) 116 | -------------------------------------------------------------------------------- /cosyvoice/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RainBowLuoCS/OpenOmni/2892efff5fb4155a2f0b775aaa1f316aa69e289a/cosyvoice/utils/__init__.py -------------------------------------------------------------------------------- /cosyvoice/utils/block_mask_util.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def create_grid_mask(seq_length, trunck_length, fill_triangle): 5 | assert seq_length > 0 6 | 7 | # First build a grid mask without considering seen_length: 8 | if fill_triangle: 9 | mask = 1 - torch.triu(torch.ones(seq_length, seq_length), diagonal=1) 10 | # the lower triangle and the main diagonal are all 1 11 | else: 12 | mask = torch.zeros(seq_length, seq_length) 13 | 14 | for i in range(seq_length): 15 | trunck_idx = i // trunck_length 16 | trunck_start = trunck_idx * trunck_length 17 | trunck_end = trunck_length + trunck_start 18 | mask[i][trunck_start:trunck_end] = 1 19 | 20 | return mask 21 | 22 | 23 | if __name__ == "__main__": 24 | mask = create_grid_mask(seq_length=8, trunck_length=3, fill_triangle=True).int() 25 | print(mask) 26 | # tensor([[1, 1, 1, 0, 0, 0, 0, 0], 27 | # [1, 1, 1, 0, 0, 0, 0, 0], 28 | # [1, 1, 1, 0, 0, 0, 0, 0], 29 | # [1, 1, 1, 1, 1, 1, 0, 0], 30 | # [1, 1, 1, 1, 1, 1, 0, 0], 31 | # [1, 1, 1, 1, 1, 1, 0, 0], 32 | # [1, 1, 1, 1, 1, 1, 1, 1], 33 | # [1, 1, 1, 1, 1, 1, 1, 1]] 34 | 35 | -------------------------------------------------------------------------------- /cosyvoice/utils/class_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright [2023-11-28] 2 | # 2024 Alibaba Inc (authors: Xiang Lyu) 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
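A shape-check sketch for the MoEFFNLayer defined in positionwise_feed_forward.py above; the expert count and layer sizes below are illustrative assumptions, not values used elsewhere in this repo:

import torch
from cosyvoice.transformer.positionwise_feed_forward import MoEFFNLayer

# 8 experts with the top 2 routed per frame; idim must match the encoder width.
moe = MoEFFNLayer(n_expert=8, n_expert_per_token=2, idim=256, hidden_units=1024, dropout_rate=0.1)
xs = torch.randn(4, 50, 256)       # (batch, sequence length, idim)
out = moe(xs)                      # gate -> top-k experts -> weighted sum per frame
assert out.shape == xs.shape       # MoE keeps the (B, L, D) shape of the input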
15 | import torch 16 | 17 | from cosyvoice.transformer.activation import Swish 18 | from cosyvoice.transformer.subsampling import ( 19 | LinearNoSubsampling, 20 | EmbedinigNoSubsampling, 21 | Conv1dSubsampling2, 22 | Conv2dSubsampling4, 23 | Conv2dSubsampling6, 24 | Conv2dSubsampling8, 25 | ) 26 | from cosyvoice.transformer.embedding import (PositionalEncoding, 27 | RelPositionalEncoding, 28 | WhisperPositionalEncoding, 29 | LearnablePositionalEncoding, 30 | NoPositionalEncoding) 31 | from cosyvoice.transformer.attention import (MultiHeadedAttention, 32 | RelPositionMultiHeadedAttention, 33 | BlockRelPositionMultiHeadedAttention) 34 | from cosyvoice.transformer.embedding import EspnetRelPositionalEncoding 35 | from cosyvoice.transformer.subsampling import LegacyLinearNoSubsampling 36 | 37 | 38 | COSYVOICE_ACTIVATION_CLASSES = { 39 | "hardtanh": torch.nn.Hardtanh, 40 | "tanh": torch.nn.Tanh, 41 | "relu": torch.nn.ReLU, 42 | "selu": torch.nn.SELU, 43 | "swish": getattr(torch.nn, "SiLU", Swish), 44 | "gelu": torch.nn.GELU, 45 | } 46 | 47 | COSYVOICE_SUBSAMPLE_CLASSES = { 48 | "linear": LinearNoSubsampling, 49 | "linear_legacy": LegacyLinearNoSubsampling, 50 | "embed": EmbedinigNoSubsampling, 51 | "conv1d2": Conv1dSubsampling2, 52 | "conv2d": Conv2dSubsampling4, 53 | "conv2d6": Conv2dSubsampling6, 54 | "conv2d8": Conv2dSubsampling8, 55 | 'paraformer_dummy': torch.nn.Identity 56 | } 57 | 58 | COSYVOICE_EMB_CLASSES = { 59 | "embed": PositionalEncoding, 60 | "abs_pos": PositionalEncoding, 61 | "rel_pos": RelPositionalEncoding, 62 | "rel_pos_espnet": EspnetRelPositionalEncoding, 63 | "no_pos": NoPositionalEncoding, 64 | "abs_pos_whisper": WhisperPositionalEncoding, 65 | "embed_learnable_pe": LearnablePositionalEncoding, 66 | } 67 | 68 | COSYVOICE_ATTENTION_CLASSES = { 69 | "selfattn": MultiHeadedAttention, 70 | "rel_selfattn": RelPositionMultiHeadedAttention, 71 | "block_rel_selfattn": BlockRelPositionMultiHeadedAttention, 72 | } 73 | -------------------------------------------------------------------------------- /cosyvoice/utils/common.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) 2 | # 2024 Alibaba Inc (authors: Xiang Lyu) 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # Modified from ESPnet(https://github.com/espnet/espnet) 16 | """Unility functions for Transformer.""" 17 | 18 | from typing import List 19 | 20 | import torch 21 | 22 | IGNORE_ID = -1 23 | 24 | 25 | def pad_list(xs: List[torch.Tensor], pad_value: int): 26 | """Perform padding for the list of tensors. 27 | 28 | Args: 29 | xs (List): List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)]. 30 | pad_value (float): Value for padding. 31 | 32 | Returns: 33 | Tensor: Padded tensor (B, Tmax, `*`). 
34 | 35 | Examples: 36 | >>> x = [torch.ones(4), torch.ones(2), torch.ones(1)] 37 | >>> x 38 | [tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])] 39 | >>> pad_list(x, 0) 40 | tensor([[1., 1., 1., 1.], 41 | [1., 1., 0., 0.], 42 | [1., 0., 0., 0.]]) 43 | 44 | """ 45 | max_len = max([len(item) for item in xs]) 46 | batchs = len(xs) 47 | ndim = xs[0].ndim 48 | if ndim == 1: 49 | pad_res = torch.zeros(batchs, 50 | max_len, 51 | dtype=xs[0].dtype, 52 | device=xs[0].device) 53 | elif ndim == 2: 54 | pad_res = torch.zeros(batchs, 55 | max_len, 56 | xs[0].shape[1], 57 | dtype=xs[0].dtype, 58 | device=xs[0].device) 59 | elif ndim == 3: 60 | pad_res = torch.zeros(batchs, 61 | max_len, 62 | xs[0].shape[1], 63 | xs[0].shape[2], 64 | dtype=xs[0].dtype, 65 | device=xs[0].device) 66 | else: 67 | raise ValueError(f"Unsupported ndim: {ndim}") 68 | pad_res.fill_(pad_value) 69 | for i in range(batchs): 70 | pad_res[i, :len(xs[i])] = xs[i] 71 | return pad_res 72 | 73 | 74 | def th_accuracy(pad_outputs: torch.Tensor, pad_targets: torch.Tensor, 75 | ignore_label: int) -> torch.Tensor: 76 | """Calculate accuracy. 77 | 78 | Args: 79 | pad_outputs (Tensor): Prediction tensors (B * Lmax, D). 80 | pad_targets (LongTensor): Target label tensors (B, Lmax). 81 | ignore_label (int): Ignore label id. 82 | 83 | Returns: 84 | torch.Tensor: Accuracy value (0.0 - 1.0). 85 | 86 | """ 87 | pad_pred = pad_outputs.view(pad_targets.size(0), pad_targets.size(1), 88 | pad_outputs.size(1)).argmax(2) 89 | mask = pad_targets != ignore_label 90 | numerator = torch.sum( 91 | pad_pred.masked_select(mask) == pad_targets.masked_select(mask)) 92 | denominator = torch.sum(mask) 93 | return (numerator / denominator).detach() 94 | 95 | 96 | def get_padding(kernel_size, dilation=1): 97 | return int((kernel_size * dilation - dilation) / 2) 98 | 99 | 100 | def init_weights(m, mean=0.0, std=0.01): 101 | classname = m.__class__.__name__ 102 | if classname.find("Conv") != -1: 103 | m.weight.data.normal_(mean, std) 104 | -------------------------------------------------------------------------------- /cosyvoice/utils/file_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) 2 | # 2024 Alibaba Inc (authors: Xiang Lyu) 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
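A small worked example for th_accuracy from common.py above; the shapes and values are illustrative, with IGNORE_ID = -1 as defined in that file:

import torch
from cosyvoice.utils.common import IGNORE_ID, th_accuracy

# Two sequences of length 3 over a 4-class vocabulary; predictions arrive flattened as (B*Lmax, D).
pad_outputs = torch.eye(4)[[0, 1, 2, 3, 0, 1]]          # per-row argmax: 0, 1, 2, 3, 0, 1
pad_targets = torch.tensor([[0, 1, 2], [3, IGNORE_ID, IGNORE_ID]])
acc = th_accuracy(pad_outputs, pad_targets, ignore_label=IGNORE_ID)
print(acc)  # tensor(1.) -- every non-ignored position matches its target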
15 | 16 | import json 17 | import torchaudio 18 | 19 | 20 | def read_lists(list_file): 21 | lists = [] 22 | with open(list_file, 'r', encoding='utf8') as fin: 23 | for line in fin: 24 | lists.append(line.strip()) 25 | return lists 26 | 27 | def read_json_lists(list_file): 28 | lists = read_lists(list_file) 29 | results = {} 30 | for fn in lists: 31 | with open(fn, 'r', encoding='utf8') as fin: 32 | results.update(json.load(fin)) 33 | return results 34 | 35 | def load_wav(wav, target_sr): 36 | speech, sample_rate = torchaudio.load(wav) 37 | speech = speech.mean(dim=0, keepdim=True) 38 | if sample_rate != target_sr: 39 | assert sample_rate > target_sr, 'wav sample rate {} must be greater than {}'.format(sample_rate, target_sr) 40 | speech = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sr)(speech) 41 | return speech 42 | 43 | def speed_change(waveform, sample_rate, speed_factor: str): 44 | effects = [ 45 | ["tempo", speed_factor], # speed_factor 46 | ["rate", f"{sample_rate}"] 47 | ] 48 | augmented_waveform, new_sample_rate = torchaudio.sox_effects.apply_effects_tensor( 49 | waveform, 50 | sample_rate, 51 | effects 52 | ) 53 | return augmented_waveform, new_sample_rate 54 | -------------------------------------------------------------------------------- /cosyvoice/utils/frontend_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import re 16 | chinese_char_pattern = re.compile(r'[\u4e00-\u9fff]+') 17 | 18 | # whether contain chinese character 19 | def contains_chinese(text): 20 | return bool(chinese_char_pattern.search(text)) 21 | 22 | 23 | # replace special symbol 24 | def replace_corner_mark(text): 25 | text = text.replace('²', '平方') 26 | text = text.replace('³', '立方') 27 | return text 28 | 29 | 30 | # remove meaningless symbol 31 | def remove_bracket(text): 32 | text = text.replace('(', '').replace(')', '') 33 | text = text.replace('【', '').replace('】', '') 34 | text = text.replace('`', '').replace('`', '') 35 | text = text.replace("——", " ") 36 | return text 37 | 38 | 39 | # spell Arabic numerals 40 | def spell_out_number(text: str, inflect_parser): 41 | new_text = [] 42 | st = None 43 | for i, c in enumerate(text): 44 | if not c.isdigit(): 45 | if st is not None: 46 | num_str = inflect_parser.number_to_words(text[st: i]) 47 | new_text.append(num_str) 48 | st = None 49 | new_text.append(c) 50 | else: 51 | if st is None: 52 | st = i 53 | if st is not None and st < len(text): 54 | num_str = inflect_parser.number_to_words(text[st:]) 55 | new_text.append(num_str) 56 | return ''.join(new_text) 57 | 58 | 59 | # split paragrah logic: 60 | # 1. per sentence max len token_max_n, min len token_min_n, merge if last sentence len less than merge_len 61 | # 2. cal sentence len according to lang 62 | # 3. 
split sentence according to puncatation 63 | def split_paragraph(text: str, tokenize, lang="zh", token_max_n=80, token_min_n=60, merge_len=20, comma_split=False): 64 | def calc_utt_length(_text: str): 65 | if lang == "zh": 66 | return len(_text) 67 | else: 68 | return len(tokenize(_text)) 69 | 70 | def should_merge(_text: str): 71 | if lang == "zh": 72 | return len(_text) < merge_len 73 | else: 74 | return len(tokenize(_text)) < merge_len 75 | 76 | if lang == "zh": 77 | pounc = ['。', '?', '!', ';', ':', '、', '.', '?', '!', ';'] 78 | else: 79 | pounc = ['.', '?', '!', ';', ':'] 80 | if comma_split: 81 | pounc.extend([',', ',']) 82 | st = 0 83 | utts = [] 84 | for i, c in enumerate(text): 85 | if c in pounc: 86 | if len(text[st: i]) > 0: 87 | utts.append(text[st: i] + c) 88 | if i + 1 < len(text) and text[i + 1] in ['"', '”']: 89 | tmp = utts.pop(-1) 90 | utts.append(tmp + text[i + 1]) 91 | st = i + 2 92 | else: 93 | st = i + 1 94 | if len(utts) == 0: 95 | if lang == "zh": 96 | utts.append(text + '。') 97 | else: 98 | utts.append(text + '.') 99 | final_utts = [] 100 | cur_utt = "" 101 | for utt in utts: 102 | if calc_utt_length(cur_utt + utt) > token_max_n and calc_utt_length(cur_utt) > token_min_n: 103 | final_utts.append(cur_utt) 104 | cur_utt = "" 105 | cur_utt = cur_utt + utt 106 | if len(cur_utt) > 0: 107 | if should_merge(cur_utt) and len(final_utts) != 0: 108 | final_utts[-1] = final_utts[-1] + cur_utt 109 | else: 110 | final_utts.append(cur_utt) 111 | 112 | return final_utts 113 | 114 | 115 | # remove blank between chinese character 116 | def replace_blank(text: str): 117 | out_str = [] 118 | for i, c in enumerate(text): 119 | if c == " ": 120 | if ((text[i + 1].isascii() and text[i + 1] != " ") and 121 | (text[i - 1].isascii() and text[i - 1] != " ")): 122 | out_str.append(c) 123 | else: 124 | out_str.append(c) 125 | return "".join(out_str) 126 | -------------------------------------------------------------------------------- /cosyvoice/vocab_16K.yaml: -------------------------------------------------------------------------------- 1 | # set random seed, so that you may reproduce your result. 
2 | __set_seed1: !apply:random.seed [1986] 3 | __set_seed2: !apply:numpy.random.seed [1986] 4 | __set_seed3: !apply:torch.manual_seed [1986] 5 | __set_seed4: !apply:torch.cuda.manual_seed_all [1986] 6 | 7 | # fixed params 8 | sample_rate: 22050 9 | text_encoder_input_size: 512 10 | llm_input_size: 1024 11 | llm_output_size: 1024 12 | spk_embed_dim: 192 13 | 14 | 15 | flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec 16 | input_size: 512 17 | output_size: 80 18 | spk_embed_dim: !ref <spk_embed_dim> 19 | output_type: 'mel' 20 | vocab_size: 16384 21 | input_frame_rate: 12.5 22 | only_mask_loss: True 23 | encoder: !new:cosyvoice.transformer.encoder.BlockConformerEncoder 24 | output_size: 512 25 | attention_heads: 8 26 | linear_units: 2048 27 | num_blocks: 6 28 | dropout_rate: 0.1 29 | positional_dropout_rate: 0.1 30 | attention_dropout_rate: 0.1 31 | normalize_before: True 32 | input_layer: 'linear' 33 | pos_enc_layer_type: 'rel_pos_espnet' 34 | selfattention_layer_type: 'block_rel_selfattn' 35 | block_size: 10 36 | input_size: 512 37 | use_cnn_module: False 38 | macaron_style: False 39 | length_regulator: !new:cosyvoice.flow.length_regulator.InterpolateRegulator 40 | channels: 80 41 | sampling_ratios: [1, 1, 1, 1] 42 | decoder: !new:cosyvoice.flow.flow_matching.ConditionalCFM 43 | in_channels: 240 44 | n_spks: 1 45 | spk_emb_dim: 80 46 | cfm_params: !new:omegaconf.DictConfig 47 | content: 48 | sigma_min: 1e-06 49 | solver: 'euler' 50 | t_scheduler: 'cosine' 51 | training_cfg_rate: 0.2 52 | inference_cfg_rate: 0.7 53 | reg_loss_type: 'l1' 54 | estimator: !new:cosyvoice.flow.decoder.ConditionalDecoder 55 | in_channels: 320 56 | out_channels: 80 57 | channels: [256, 256] 58 | dropout: 0 59 | attention_head_dim: 64 60 | n_blocks: 4 61 | num_mid_blocks: 12 62 | num_heads: 8 63 | act_fn: 'gelu' 64 | 65 | hift: !new:cosyvoice.hifigan.generator.HiFTGenerator 66 | in_channels: 80 67 | base_channels: 512 68 | nb_harmonics: 8 69 | sampling_rate: !ref <sample_rate> 70 | nsf_alpha: 0.1 71 | nsf_sigma: 0.003 72 | nsf_voiced_threshold: 10 73 | upsample_rates: [8, 8] 74 | upsample_kernel_sizes: [16, 16] 75 | istft_params: 76 | n_fft: 16 77 | hop_len: 4 78 | resblock_kernel_sizes: [3, 7, 11] 79 | resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]] 80 | source_resblock_kernel_sizes: [7, 11] 81 | source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5]] 82 | lrelu_slope: 0.1 83 | audio_limit: 0.99 84 | f0_predictor: !new:cosyvoice.hifigan.f0_predictor.ConvRNNF0Predictor 85 | num_class: 1 86 | in_channels: 80 87 | cond_channels: 512 88 | -------------------------------------------------------------------------------- /openomni/__init__.py: -------------------------------------------------------------------------------- 1 | # from llava.model import LlavaLlamaForCausalLM 2 | -------------------------------------------------------------------------------- /openomni/constants.py: -------------------------------------------------------------------------------- 1 | CONTROLLER_HEART_BEAT_EXPIRATION = 30 2 | WORKER_HEART_BEAT_INTERVAL = 15 3 | 4 | LOGDIR = "."
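A hedged sketch of how a hyperpyyaml config such as vocab_16K.yaml above can be materialized, mirroring the load_hyperpyyaml call in cosyvoice/cli/cosyvoice.py; the relative path and the assumption that this particular YAML is loaded the same way are assumptions, not stated by the repo:

from hyperpyyaml import load_hyperpyyaml

with open('cosyvoice/vocab_16K.yaml', 'r') as f:
    configs = load_hyperpyyaml(f)   # resolves the !apply:, !new: and !ref tags at load time

flow = configs['flow']              # instantiated MaskedDiffWithXvec flow-matching module
hift = configs['hift']              # instantiated HiFTGenerator vocoder
print(configs['sample_rate'], type(flow).__name__, type(hift).__name__)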
5 | 6 | # Model Constants 7 | IGNORE_INDEX = -100 8 | IMAGE_TOKEN_INDEX = -200 9 | SPEECH_TOKEN_INDEX = -300 10 | DEFAULT_IMAGE_TOKEN = "<image>" 11 | DEFAULT_SPEECH_TOKEN = "<speech>" 12 | -------------------------------------------------------------------------------- /openomni/eval/eval_gpt_review.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | 5 | import openai 6 | import tqdm 7 | import ray 8 | import time 9 | 10 | NUM_SECONDS_TO_SLEEP = 3 11 | 12 | @ray.remote(num_cpus=4) 13 | def get_eval(content: str, max_tokens: int): 14 | while True: 15 | try: 16 | response = openai.ChatCompletion.create( 17 | model='gpt-4', 18 | messages=[{ 19 | 'role': 'system', 20 | 'content': 'You are a helpful and precise assistant for checking the quality of the answer.' 21 | }, { 22 | 'role': 'user', 23 | 'content': content, 24 | }], 25 | temperature=0.2, # TODO: figure out which temperature is best for evaluation 26 | max_tokens=max_tokens, 27 | ) 28 | break 29 | except openai.error.RateLimitError: 30 | pass 31 | except Exception as e: 32 | print(e) 33 | time.sleep(NUM_SECONDS_TO_SLEEP) 34 | 35 | print('success!') 36 | return response['choices'][0]['message']['content'] 37 | 38 | 39 | def parse_score(review): 40 | try: 41 | score_pair = review.split('\n')[0] 42 | score_pair = score_pair.replace(',', ' ') 43 | sp = score_pair.split(' ') 44 | if len(sp) == 2: 45 | return [float(sp[0]), float(sp[1])] 46 | else: 47 | print('error', review) 48 | return [-1, -1] 49 | except Exception as e: 50 | print(e) 51 | print('error', review) 52 | return [-1, -1] 53 | 54 | 55 | if __name__ == '__main__': 56 | parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.') 57 | parser.add_argument('-q', '--question') 58 | # parser.add_argument('-a', '--answer') 59 | parser.add_argument('-a', '--answer-list', nargs='+', default=[]) 60 | parser.add_argument('-r', '--rule') 61 | parser.add_argument('-o', '--output') 62 | parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output') 63 | args = parser.parse_args() 64 | 65 | ray.init() 66 | 67 | f_q = open(os.path.expanduser(args.question)) 68 | f_ans1 = open(os.path.expanduser(args.answer_list[0])) 69 | f_ans2 = open(os.path.expanduser(args.answer_list[1])) 70 | rule_dict = json.load(open(os.path.expanduser(args.rule), 'r')) 71 | 72 | review_file = open(f'{args.output}', 'w') 73 | 74 | js_list = [] 75 | handles = [] 76 | idx = 0 77 | for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2): 78 | # if idx == 1: 79 | # break 80 | 81 | ques = json.loads(ques_js) 82 | ans1 = json.loads(ans1_js) 83 | ans2 = json.loads(ans2_js) 84 | 85 | category = json.loads(ques_js)['category'] 86 | if category in rule_dict: 87 | rule = rule_dict[category] 88 | else: 89 | rule = rule_dict['default'] 90 | prompt = rule['prompt'] 91 | role = rule['role'] 92 | content = (f'[Question]\n{ques["text"]}\n\n' 93 | f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n' 94 | f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n' 95 | f'[System]\n{prompt}\n\n') 96 | js_list.append({ 97 | 'id': idx+1, 98 | 'question_id': ques['question_id'], 99 | 'answer1_id': ans1['answer_id'], 100 | 'answer2_id': ans2['answer_id'], 101 | 'category': category}) 102 | idx += 1 103 | handles.append(get_eval.remote(content, args.max_tokens)) 104 | # To avoid the rate limit set by OpenAI 105 | time.sleep(NUM_SECONDS_TO_SLEEP) 106 | 107 | reviews = ray.get(handles) 108 | for idx, review in 
enumerate(reviews): 109 | scores = parse_score(review) 110 | js_list[idx]['content'] = review 111 | js_list[idx]['tuple'] = scores 112 | review_file.write(json.dumps(js_list[idx]) + '\n') 113 | review_file.close() 114 | -------------------------------------------------------------------------------- /openomni/eval/eval_gpt_review_bench.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | 5 | import openai 6 | import time 7 | 8 | NUM_SECONDS_TO_SLEEP = 0.5 9 | 10 | 11 | def get_eval(content: str, max_tokens: int): 12 | while True: 13 | try: 14 | response = openai.ChatCompletion.create( 15 | model='gpt-4-0314', 16 | messages=[{ 17 | 'role': 'system', 18 | 'content': 'You are a helpful and precise assistant for checking the quality of the answer.' 19 | }, { 20 | 'role': 'user', 21 | 'content': content, 22 | }], 23 | temperature=0.2, # TODO: figure out which temperature is best for evaluation 24 | max_tokens=max_tokens, 25 | ) 26 | break 27 | except openai.error.RateLimitError: 28 | pass 29 | except Exception as e: 30 | print(e) 31 | time.sleep(NUM_SECONDS_TO_SLEEP) 32 | 33 | return response['choices'][0]['message']['content'] 34 | 35 | 36 | def parse_score(review): 37 | try: 38 | score_pair = review.split('\n')[0] 39 | score_pair = score_pair.replace(',', ' ') 40 | sp = score_pair.split(' ') 41 | if len(sp) == 2: 42 | return [float(sp[0]), float(sp[1])] 43 | else: 44 | print('error', review) 45 | return [-1, -1] 46 | except Exception as e: 47 | print(e) 48 | print('error', review) 49 | return [-1, -1] 50 | 51 | 52 | if __name__ == '__main__': 53 | parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.') 54 | parser.add_argument('-q', '--question') 55 | parser.add_argument('-c', '--context') 56 | parser.add_argument('-a', '--answer-list', nargs='+', default=[]) 57 | parser.add_argument('-r', '--rule') 58 | parser.add_argument('-o', '--output') 59 | parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output') 60 | args = parser.parse_args() 61 | 62 | f_q = open(os.path.expanduser(args.question)) 63 | f_ans1 = open(os.path.expanduser(args.answer_list[0])) 64 | f_ans2 = open(os.path.expanduser(args.answer_list[1])) 65 | rule_dict = json.load(open(os.path.expanduser(args.rule), 'r')) 66 | 67 | if os.path.isfile(os.path.expanduser(args.output)): 68 | cur_reviews = [json.loads(line) for line in open(os.path.expanduser(args.output))] 69 | else: 70 | cur_reviews = [] 71 | 72 | review_file = open(f'{args.output}', 'a') 73 | 74 | context_list = [json.loads(line) for line in open(os.path.expanduser(args.context))] 75 | image_to_context = {context['image']: context for context in context_list} 76 | 77 | handles = [] 78 | idx = 0 79 | for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2): 80 | ques = json.loads(ques_js) 81 | ans1 = json.loads(ans1_js) 82 | ans2 = json.loads(ans2_js) 83 | 84 | inst = image_to_context[ques['image']] 85 | 86 | if isinstance(inst['caption'], list): 87 | cap_str = '\n'.join(inst['caption']) 88 | else: 89 | cap_str = inst['caption'] 90 | 91 | category = 'llava_bench_' + json.loads(ques_js)['category'] 92 | if category in rule_dict: 93 | rule = rule_dict[category] 94 | else: 95 | assert False, f"Visual QA category not found in rule file: {category}." 
96 | prompt = rule['prompt'] 97 | role = rule['role'] 98 | content = (f'[Context]\n{cap_str}\n\n' 99 | f'[Question]\n{ques["text"]}\n\n' 100 | f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n' 101 | f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n' 102 | f'[System]\n{prompt}\n\n') 103 | cur_js = { 104 | 'id': idx+1, 105 | 'question_id': ques['question_id'], 106 | 'answer1_id': ans1.get('answer_id', ans1['question_id']), 107 | 'answer2_id': ans2.get('answer_id', ans2['answer_id']), 108 | 'category': category 109 | } 110 | if idx >= len(cur_reviews): 111 | review = get_eval(content, args.max_tokens) 112 | scores = parse_score(review) 113 | cur_js['content'] = review 114 | cur_js['tuple'] = scores 115 | review_file.write(json.dumps(cur_js) + '\n') 116 | review_file.flush() 117 | else: 118 | print(f'Skipping {idx} as we already have it.') 119 | idx += 1 120 | print(idx) 121 | review_file.close() 122 | -------------------------------------------------------------------------------- /openomni/eval/eval_gpt_review_visual.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | 5 | import openai 6 | import time 7 | 8 | NUM_SECONDS_TO_SLEEP = 0.5 9 | 10 | 11 | def get_eval(content: str, max_tokens: int): 12 | while True: 13 | try: 14 | response = openai.ChatCompletion.create( 15 | model='gpt-4-0314', 16 | messages=[{ 17 | 'role': 'system', 18 | 'content': 'You are a helpful and precise assistant for checking the quality of the answer.' 19 | }, { 20 | 'role': 'user', 21 | 'content': content, 22 | }], 23 | temperature=0.2, # TODO: figure out which temperature is best for evaluation 24 | max_tokens=max_tokens, 25 | ) 26 | break 27 | except openai.error.RateLimitError: 28 | pass 29 | except Exception as e: 30 | print(e) 31 | time.sleep(NUM_SECONDS_TO_SLEEP) 32 | 33 | return response['choices'][0]['message']['content'] 34 | 35 | 36 | def parse_score(review): 37 | try: 38 | score_pair = review.split('\n')[0] 39 | score_pair = score_pair.replace(',', ' ') 40 | sp = score_pair.split(' ') 41 | if len(sp) == 2: 42 | return [float(sp[0]), float(sp[1])] 43 | else: 44 | print('error', review) 45 | return [-1, -1] 46 | except Exception as e: 47 | print(e) 48 | print('error', review) 49 | return [-1, -1] 50 | 51 | 52 | if __name__ == '__main__': 53 | parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.') 54 | parser.add_argument('-q', '--question') 55 | parser.add_argument('-c', '--context') 56 | parser.add_argument('-a', '--answer-list', nargs='+', default=[]) 57 | parser.add_argument('-r', '--rule') 58 | parser.add_argument('-o', '--output') 59 | parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output') 60 | args = parser.parse_args() 61 | 62 | f_q = open(os.path.expanduser(args.question)) 63 | f_ans1 = open(os.path.expanduser(args.answer_list[0])) 64 | f_ans2 = open(os.path.expanduser(args.answer_list[1])) 65 | rule_dict = json.load(open(os.path.expanduser(args.rule), 'r')) 66 | 67 | if os.path.isfile(os.path.expanduser(args.output)): 68 | cur_reviews = [json.loads(line) for line in open(os.path.expanduser(args.output))] 69 | else: 70 | cur_reviews = [] 71 | 72 | review_file = open(f'{args.output}', 'a') 73 | 74 | context_list = [json.loads(line) for line in open(os.path.expanduser(args.context))] 75 | image_to_context = {context['image']: context for context in context_list} 76 | 77 | handles = [] 78 | idx = 0 79 | for ques_js, ans1_js, 
ans2_js in zip(f_q, f_ans1, f_ans2): 80 | ques = json.loads(ques_js) 81 | ans1 = json.loads(ans1_js) 82 | ans2 = json.loads(ans2_js) 83 | 84 | inst = image_to_context[ques['image']] 85 | cap_str = '\n'.join(inst['captions']) 86 | box_str = '\n'.join([f'{instance["category"]}: {instance["bbox"]}' for instance in inst['instances']]) 87 | 88 | category = json.loads(ques_js)['category'] 89 | if category in rule_dict: 90 | rule = rule_dict[category] 91 | else: 92 | assert False, f"Visual QA category not found in rule file: {category}." 93 | prompt = rule['prompt'] 94 | role = rule['role'] 95 | content = (f'[Context]\n{cap_str}\n\n{box_str}\n\n' 96 | f'[Question]\n{ques["text"]}\n\n' 97 | f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n' 98 | f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n' 99 | f'[System]\n{prompt}\n\n') 100 | cur_js = { 101 | 'id': idx+1, 102 | 'question_id': ques['question_id'], 103 | 'answer1_id': ans1.get('answer_id', ans1['question_id']), 104 | 'answer2_id': ans2.get('answer_id', ans2['answer_id']), 105 | 'category': category 106 | } 107 | if idx >= len(cur_reviews): 108 | review = get_eval(content, args.max_tokens) 109 | scores = parse_score(review) 110 | cur_js['content'] = review 111 | cur_js['tuple'] = scores 112 | review_file.write(json.dumps(cur_js) + '\n') 113 | review_file.flush() 114 | else: 115 | print(f'Skipping {idx} as we already have it.') 116 | idx += 1 117 | print(idx) 118 | review_file.close() 119 | -------------------------------------------------------------------------------- /openomni/eval/eval_pope.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | def eval_pope(answers, label_file): 6 | label_list = [json.loads(q)['label'] for q in open(label_file, 'r')] 7 | 8 | for answer in answers: 9 | text = answer['text'] 10 | 11 | # Only keep the first sentence 12 | if text.find('.') != -1: 13 | text = text.split('.')[0] 14 | 15 | text = text.replace(',', '') 16 | words = text.split(' ') 17 | if 'No' in words or 'not' in words or 'no' in words: 18 | answer['text'] = 'no' 19 | else: 20 | answer['text'] = 'yes' 21 | 22 | for i in range(len(label_list)): 23 | if label_list[i] == 'no': 24 | label_list[i] = 0 25 | else: 26 | label_list[i] = 1 27 | 28 | pred_list = [] 29 | for answer in answers: 30 | if answer['text'] == 'no': 31 | pred_list.append(0) 32 | else: 33 | pred_list.append(1) 34 | 35 | pos = 1 36 | neg = 0 37 | yes_ratio = pred_list.count(1) / len(pred_list) 38 | 39 | TP, TN, FP, FN = 0, 0, 0, 0 40 | for pred, label in zip(pred_list, label_list): 41 | if pred == pos and label == pos: 42 | TP += 1 43 | elif pred == pos and label == neg: 44 | FP += 1 45 | elif pred == neg and label == neg: 46 | TN += 1 47 | elif pred == neg and label == pos: 48 | FN += 1 49 | 50 | print('TP\tFP\tTN\tFN\t') 51 | print('{}\t{}\t{}\t{}'.format(TP, FP, TN, FN)) 52 | 53 | precision = float(TP) / float(TP + FP) 54 | recall = float(TP) / float(TP + FN) 55 | f1 = 2*precision*recall / (precision + recall) 56 | acc = (TP + TN) / (TP + TN + FP + FN) 57 | print('Accuracy: {}'.format(acc)) 58 | print('Precision: {}'.format(precision)) 59 | print('Recall: {}'.format(recall)) 60 | print('F1 score: {}'.format(f1)) 61 | print('Yes ratio: {}'.format(yes_ratio)) 62 | print('%.3f, %.3f, %.3f, %.3f, %.3f' % (f1, acc, precision, recall, yes_ratio) ) 63 | 64 | if __name__ == "__main__": 65 | parser = argparse.ArgumentParser() 66 | parser.add_argument("--annotation-dir", type=str) 67 | 
parser.add_argument("--question-file", type=str) 68 | parser.add_argument("--result-file", type=str) 69 | args = parser.parse_args() 70 | 71 | questions = [json.loads(line) for line in open(args.question_file)] 72 | questions = {question['question_id']: question for question in questions} 73 | answers = [json.loads(q) for q in open(args.result_file)] 74 | for file in os.listdir(args.annotation_dir): 75 | assert file.startswith('coco_pope_') 76 | assert file.endswith('.json') 77 | category = file[10:-5] 78 | cur_answers = [x for x in answers if questions[x['question_id']]['category'] == category] 79 | print('Category: {}, # samples: {}'.format(category, len(cur_answers))) 80 | eval_pope(cur_answers, os.path.join(args.annotation_dir, file)) 81 | print("====================================") 82 | -------------------------------------------------------------------------------- /openomni/eval/eval_science_qa.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | import re 5 | import random 6 | 7 | 8 | def get_args(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--base-dir', type=str) 11 | parser.add_argument('--result-file', type=str) 12 | parser.add_argument('--output-file', type=str) 13 | parser.add_argument('--output-result', type=str) 14 | parser.add_argument('--split', type=str, default='test') 15 | parser.add_argument('--options', type=list, default=["A", "B", "C", "D", "E"]) 16 | return parser.parse_args() 17 | 18 | 19 | def convert_caps(results): 20 | fakecaps = [] 21 | for result in results: 22 | image_id = result['question_id'] 23 | caption = result['text'] 24 | fakecaps.append({"image_id": int(image_id), "caption": caption}) 25 | return fakecaps 26 | 27 | 28 | def get_pred_idx(prediction, choices, options): 29 | """ 30 | Get the index (e.g. 2) from the prediction (e.g. 'C') 31 | """ 32 | if prediction in options[:len(choices)]: 33 | return options.index(prediction) 34 | else: 35 | return -1 36 | return random.choice(range(len(choices))) 37 | 38 | 39 | if __name__ == "__main__": 40 | args = get_args() 41 | 42 | base_dir = args.base_dir 43 | split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[args.split] 44 | problems = json.load(open(os.path.join(base_dir, "problems.json"))) 45 | predictions = [json.loads(line) for line in open(args.result_file)] 46 | predictions = {pred['question_id']: pred for pred in predictions} 47 | split_problems = {idx: problems[idx] for idx in split_indices} 48 | 49 | results = {'correct': [], 'incorrect': []} 50 | sqa_results = {} 51 | sqa_results['acc'] = None 52 | sqa_results['correct'] = None 53 | sqa_results['count'] = None 54 | sqa_results['results'] = {} 55 | sqa_results['outputs'] = {} 56 | 57 | for prob_id, prob in split_problems.items(): 58 | if prob_id not in predictions: 59 | pred = {'text': 'FAILED', 'prompt': 'Unknown'} 60 | pred_text = 'FAILED' 61 | else: 62 | pred = predictions[prob_id] 63 | pred_text = pred['text'] 64 | 65 | if pred_text in args.options: 66 | answer = pred_text 67 | elif len(pred_text) >= 3 and pred_text[0] in args.options and pred_text[1:3] == ". ": 68 | answer = pred_text[0] 69 | else: 70 | pattern = re.compile(r'The answer is ([A-Z]).') 71 | res = pattern.findall(pred_text) 72 | if len(res) == 1: 73 | answer = res[0] # 'A', 'B', ... 
74 | else: 75 | answer = "FAILED" 76 | 77 | pred_idx = get_pred_idx(answer, prob['choices'], args.options) 78 | 79 | analysis = { 80 | 'question_id': prob_id, 81 | 'parsed_ans': answer, 82 | 'ground_truth': args.options[prob['answer']], 83 | 'question': pred['prompt'], 84 | 'pred': pred_text, 85 | 'is_multimodal': '' in pred['prompt'], 86 | } 87 | 88 | sqa_results['results'][prob_id] = get_pred_idx(answer, prob['choices'], args.options) 89 | sqa_results['outputs'][prob_id] = pred_text 90 | 91 | if pred_idx == prob['answer']: 92 | results['correct'].append(analysis) 93 | else: 94 | results['incorrect'].append(analysis) 95 | 96 | correct = len(results['correct']) 97 | total = len(results['correct']) + len(results['incorrect']) 98 | 99 | ###### IMG ###### 100 | multimodal_correct = len([x for x in results['correct'] if x['is_multimodal']]) 101 | multimodal_incorrect = len([x for x in results['incorrect'] if x['is_multimodal']]) 102 | multimodal_total = multimodal_correct + multimodal_incorrect 103 | ###### IMG ###### 104 | 105 | print(f'Total: {total}, Correct: {correct}, Accuracy: {correct / total * 100:.2f}%, IMG-Accuracy: {multimodal_correct / multimodal_total * 100:.2f}%') 106 | 107 | sqa_results['acc'] = correct / total * 100 108 | sqa_results['correct'] = correct 109 | sqa_results['count'] = total 110 | 111 | with open(args.output_file, 'w') as f: 112 | json.dump(results, f, indent=2) 113 | with open(args.output_result, 'w') as f: 114 | json.dump(sqa_results, f, indent=2) 115 | -------------------------------------------------------------------------------- /openomni/eval/eval_science_qa_gpt4.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | import re 5 | import random 6 | from collections import defaultdict 7 | 8 | 9 | def get_args(): 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--base-dir', type=str) 12 | parser.add_argument('--gpt4-result', type=str) 13 | parser.add_argument('--our-result', type=str) 14 | parser.add_argument('--split', type=str, default='test') 15 | parser.add_argument('--options', type=list, default=["A", "B", "C", "D", "E"]) 16 | return parser.parse_args() 17 | 18 | 19 | def convert_caps(results): 20 | fakecaps = [] 21 | for result in results: 22 | image_id = result['question_id'] 23 | caption = result['text'] 24 | fakecaps.append({"image_id": int(image_id), "caption": caption}) 25 | return fakecaps 26 | 27 | 28 | def get_pred_idx(prediction, choices, options): 29 | """ 30 | Get the index (e.g. 2) from the prediction (e.g. 
'C') 31 | """ 32 | if prediction in options[:len(choices)]: 33 | return options.index(prediction) 34 | else: 35 | return random.choice(range(len(choices))) 36 | 37 | 38 | if __name__ == "__main__": 39 | args = get_args() 40 | 41 | base_dir = args.base_dir 42 | split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[args.split] 43 | problems = json.load(open(os.path.join(base_dir, "problems.json"))) 44 | our_predictions = [json.loads(line) for line in open(args.our_result)] 45 | our_predictions = {pred['question_id']: pred for pred in our_predictions} 46 | split_problems = {idx: problems[idx] for idx in split_indices} 47 | 48 | gpt4_predictions = json.load(open(args.gpt4_result))['outputs'] 49 | 50 | results = defaultdict(lambda: 0) 51 | 52 | for prob_id, prob in split_problems.items(): 53 | if prob_id not in our_predictions: 54 | continue 55 | if prob_id not in gpt4_predictions: 56 | continue 57 | our_pred = our_predictions[prob_id]['text'] 58 | gpt4_pred = gpt4_predictions[prob_id] 59 | 60 | pattern = re.compile(r'The answer is ([A-Z]).') 61 | our_res = pattern.findall(our_pred) 62 | if len(our_res) == 1: 63 | our_answer = our_res[0] # 'A', 'B', ... 64 | else: 65 | our_answer = "FAILED" 66 | gpt4_res = pattern.findall(gpt4_pred) 67 | if len(gpt4_res) == 1: 68 | gpt4_answer = gpt4_res[0] # 'A', 'B', ... 69 | else: 70 | gpt4_answer = "FAILED" 71 | 72 | our_pred_idx = get_pred_idx(our_answer, prob['choices'], args.options) 73 | gpt4_pred_idx = get_pred_idx(gpt4_answer, prob['choices'], args.options) 74 | 75 | if gpt4_answer == 'FAILED': 76 | results['gpt4_failed'] += 1 77 | # continue 78 | gpt4_pred_idx = our_pred_idx 79 | # if our_pred_idx != prob['answer']: 80 | # print(our_predictions[prob_id]['prompt']) 81 | # print('-----------------') 82 | # print(f'LECTURE: {prob["lecture"]}') 83 | # print(f'SOLUTION: {prob["solution"]}') 84 | # print('=====================') 85 | else: 86 | # continue 87 | pass 88 | # gpt4_pred_idx = our_pred_idx 89 | 90 | if gpt4_pred_idx == prob['answer']: 91 | results['correct'] += 1 92 | else: 93 | results['incorrect'] += 1 94 | 95 | 96 | if gpt4_pred_idx == prob['answer'] or our_pred_idx == prob['answer']: 97 | results['correct_upperbound'] += 1 98 | 99 | correct = results['correct'] 100 | total = results['correct'] + results['incorrect'] 101 | print(f'Total: {total}, Correct: {correct}, Accuracy: {correct / total * 100:.2f}%') 102 | print(f'Total: {total}, Correct (upper): {results["correct_upperbound"]}, Accuracy: {results["correct_upperbound"] / total * 100:.2f}%') 103 | print(f'Total: {total}, GPT-4 NO-ANS (RANDOM): {results["gpt4_failed"]}, Percentage: {results["gpt4_failed"] / total * 100:.2f}%') 104 | 105 | -------------------------------------------------------------------------------- /openomni/eval/eval_textvqa.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import json 4 | import re 5 | 6 | from llava.eval.m4c_evaluator import TextVQAAccuracyEvaluator 7 | 8 | 9 | def get_args(): 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--annotation-file', type=str,default="datasets/textvqa/TextVQA_0.5.1_val.json") 12 | parser.add_argument('--result-file', type=str,default="answers/qwen2_evol_textvqa_prediction.jsonl") 13 | parser.add_argument('--result-dir', type=str) 14 | return parser.parse_args() 15 | 16 | 17 | def prompt_processor(prompt): 18 | if prompt.startswith('OCR tokens: '): 19 | pattern = r"Question: (.*?) 
Short answer:" 20 | match = re.search(pattern, prompt, re.DOTALL) 21 | question = match.group(1) 22 | elif 'Reference OCR token: ' in prompt and len(prompt.split('\n')) == 3: 23 | if prompt.startswith('Reference OCR token:'): 24 | question = prompt.split('\n')[1] 25 | else: 26 | question = prompt.split('\n')[0] 27 | elif len(prompt.split('\n')) == 2: 28 | question = prompt.split('\n')[0] 29 | else: 30 | assert False 31 | 32 | return question.lower() 33 | 34 | 35 | def eval_single(annotation_file, result_file): 36 | experiment_name = os.path.splitext(os.path.basename(result_file))[0] 37 | print(experiment_name) 38 | annotations = json.load(open(annotation_file))['data'] 39 | annotations = {(annotation['image_id'], annotation['question'].lower()): annotation for annotation in annotations} 40 | results = [json.loads(line) for line in open(result_file)] 41 | 42 | pred_list = [] 43 | for result in results: 44 | annotation = annotations[(result['question_id'], prompt_processor(result['prompt']))] 45 | pred_list.append({ 46 | "pred_answer": result['text'], 47 | "gt_answers": annotation['answers'], 48 | }) 49 | 50 | evaluator = TextVQAAccuracyEvaluator() 51 | print('Samples: {}\nAccuracy: {:.2f}%\n'.format(len(pred_list), 100. * evaluator.eval_pred_list(pred_list))) 52 | 53 | 54 | if __name__ == "__main__": 55 | args = get_args() 56 | 57 | if args.result_file is not None: 58 | eval_single(args.annotation_file, args.result_file) 59 | 60 | if args.result_dir is not None: 61 | for result_file in sorted(os.listdir(args.result_dir)): 62 | if not result_file.endswith('.jsonl'): 63 | print(f'Skipping {result_file}') 64 | continue 65 | eval_single(args.annotation_file, os.path.join(args.result_dir, result_file)) 66 | -------------------------------------------------------------------------------- /openomni/eval/generate_webpage_data_from_table.py: -------------------------------------------------------------------------------- 1 | """Generate json file for webpage.""" 2 | import json 3 | import os 4 | import re 5 | 6 | # models = ['llama', 'alpaca', 'gpt35', 'bard'] 7 | models = ['vicuna'] 8 | 9 | 10 | def read_jsonl(path: str, key: str=None): 11 | data = [] 12 | with open(os.path.expanduser(path)) as f: 13 | for line in f: 14 | if not line: 15 | continue 16 | data.append(json.loads(line)) 17 | if key is not None: 18 | data.sort(key=lambda x: x[key]) 19 | data = {item[key]: item for item in data} 20 | return data 21 | 22 | 23 | def trim_hanging_lines(s: str, n: int) -> str: 24 | s = s.strip() 25 | for _ in range(n): 26 | s = s.split('\n', 1)[1].strip() 27 | return s 28 | 29 | 30 | if __name__ == '__main__': 31 | questions = read_jsonl('table/question.jsonl', key='question_id') 32 | 33 | # alpaca_answers = read_jsonl('table/answer/answer_alpaca-13b.jsonl', key='question_id') 34 | # bard_answers = read_jsonl('table/answer/answer_bard.jsonl', key='question_id') 35 | # gpt35_answers = read_jsonl('table/answer/answer_gpt35.jsonl', key='question_id') 36 | # llama_answers = read_jsonl('table/answer/answer_llama-13b.jsonl', key='question_id') 37 | vicuna_answers = read_jsonl('table/answer/answer_vicuna-13b.jsonl', key='question_id') 38 | ours_answers = read_jsonl('table/results/llama-13b-hf-alpaca.jsonl', key='question_id') 39 | 40 | review_vicuna = read_jsonl('table/review/review_vicuna-13b_llama-13b-hf-alpaca.jsonl', key='question_id') 41 | # review_alpaca = read_jsonl('table/review/review_alpaca-13b_vicuna-13b.jsonl', key='question_id') 42 | # review_bard = 
read_jsonl('table/review/review_bard_vicuna-13b.jsonl', key='question_id') 43 | # review_gpt35 = read_jsonl('table/review/review_gpt35_vicuna-13b.jsonl', key='question_id') 44 | # review_llama = read_jsonl('table/review/review_llama-13b_vicuna-13b.jsonl', key='question_id') 45 | 46 | records = [] 47 | for qid in questions.keys(): 48 | r = { 49 | 'id': qid, 50 | 'category': questions[qid]['category'], 51 | 'question': questions[qid]['text'], 52 | 'answers': { 53 | # 'alpaca': alpaca_answers[qid]['text'], 54 | # 'llama': llama_answers[qid]['text'], 55 | # 'bard': bard_answers[qid]['text'], 56 | # 'gpt35': gpt35_answers[qid]['text'], 57 | 'vicuna': vicuna_answers[qid]['text'], 58 | 'ours': ours_answers[qid]['text'], 59 | }, 60 | 'evaluations': { 61 | # 'alpaca': review_alpaca[qid]['text'], 62 | # 'llama': review_llama[qid]['text'], 63 | # 'bard': review_bard[qid]['text'], 64 | 'vicuna': review_vicuna[qid]['content'], 65 | # 'gpt35': review_gpt35[qid]['text'], 66 | }, 67 | 'scores': { 68 | 'vicuna': review_vicuna[qid]['tuple'], 69 | # 'alpaca': review_alpaca[qid]['score'], 70 | # 'llama': review_llama[qid]['score'], 71 | # 'bard': review_bard[qid]['score'], 72 | # 'gpt35': review_gpt35[qid]['score'], 73 | }, 74 | } 75 | 76 | # cleanup data 77 | cleaned_evals = {} 78 | for k, v in r['evaluations'].items(): 79 | v = v.strip() 80 | lines = v.split('\n') 81 | # trim the first line if it's a pair of numbers 82 | if re.match(r'\d+[, ]+\d+', lines[0]): 83 | lines = lines[1:] 84 | v = '\n'.join(lines) 85 | cleaned_evals[k] = v.replace('Assistant 1', "**Assistant 1**").replace('Assistant 2', '**Assistant 2**') 86 | 87 | r['evaluations'] = cleaned_evals 88 | records.append(r) 89 | 90 | # Reorder the records, this is optional 91 | for r in records: 92 | if r['id'] <= 20: 93 | r['id'] += 60 94 | else: 95 | r['id'] -= 20 96 | for r in records: 97 | if r['id'] <= 50: 98 | r['id'] += 10 99 | elif 50 < r['id'] <= 60: 100 | r['id'] -= 50 101 | for r in records: 102 | if r['id'] == 7: 103 | r['id'] = 1 104 | elif r['id'] < 7: 105 | r['id'] += 1 106 | 107 | records.sort(key=lambda x: x['id']) 108 | 109 | # Write to file 110 | with open('webpage/data.json', 'w') as f: 111 | json.dump({'questions': records, 'models': models}, f, indent=2) 112 | -------------------------------------------------------------------------------- /openomni/eval/model_qa.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | 5 | import shortuuid 6 | import torch 7 | from tqdm import tqdm 8 | from transformers import AutoModelForCausalLM, AutoTokenizer 9 | 10 | from llava.conversation import default_conversation 11 | from llava.utils import disable_torch_init 12 | 13 | 14 | @torch.inference_mode() 15 | def eval_model(model_name, questions_file, answers_file): 16 | # Model 17 | disable_torch_init() 18 | model_name = os.path.expanduser(model_name) 19 | tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False) 20 | model = AutoModelForCausalLM.from_pretrained(model_name, 21 | torch_dtype=torch.float16).cuda() 22 | 23 | ques_file = open(os.path.expanduser(questions_file), "r") 24 | ans_file = open(os.path.expanduser(answers_file), "w") 25 | for i, line in enumerate(tqdm(ques_file)): 26 | idx = json.loads(line)["question_id"] 27 | qs = json.loads(line)["text"] 28 | cat = json.loads(line)["category"] 29 | conv = default_conversation.copy() 30 | conv.append_message(conv.roles[0], qs) 31 | prompt = conv.get_prompt() 32 | inputs = tokenizer([prompt]) 33 
| input_ids = torch.as_tensor(inputs.input_ids).cuda() 34 | output_ids = model.generate( 35 | input_ids, 36 | do_sample=True, 37 | use_cache=True, 38 | temperature=0.7, 39 | max_new_tokens=1024,) 40 | outputs = tokenizer.batch_decode( 41 | output_ids, skip_special_tokens=True)[0] 42 | try: 43 | index = outputs.index(conv.sep, len(prompt)) 44 | except ValueError: 45 | outputs += conv.sep 46 | index = outputs.index(conv.sep, len(prompt)) 47 | 48 | outputs = outputs[len(prompt) + len(conv.roles[1]) + 2:index].strip() 49 | ans_id = shortuuid.uuid() 50 | ans_file.write(json.dumps({"question_id": idx, 51 | "text": outputs, 52 | "answer_id": ans_id, 53 | "model_id": model_name, 54 | "metadata": {}}) + "\n") 55 | ans_file.flush() 56 | ans_file.close() 57 | 58 | 59 | if __name__ == "__main__": 60 | parser = argparse.ArgumentParser() 61 | parser.add_argument("--model-name", type=str, default="facebook/opt-350m") 62 | parser.add_argument("--question-file", type=str, 63 | default="tables/question.jsonl") 64 | parser.add_argument("--answers-file", type=str, default="answer.jsonl") 65 | args = parser.parse_args() 66 | 67 | eval_model(args.model_name, args.question_file, args.answers_file) 68 | -------------------------------------------------------------------------------- /openomni/eval/qa_baseline_gpt35.py: -------------------------------------------------------------------------------- 1 | """Generate answers with GPT-3.5""" 2 | # Note: you need to be using OpenAI Python v0.27.0 for the code below to work 3 | import argparse 4 | import json 5 | import os 6 | import time 7 | import concurrent.futures 8 | 9 | import openai 10 | import tqdm 11 | import shortuuid 12 | 13 | MODEL = 'gpt-3.5-turbo' 14 | MODEL_ID = 'gpt-3.5-turbo:20230327' 15 | 16 | def get_answer(question_id: int, question: str, max_tokens: int): 17 | ans = { 18 | 'answer_id': shortuuid.uuid(), 19 | 'question_id': question_id, 20 | 'model_id': MODEL_ID, 21 | } 22 | for _ in range(3): 23 | try: 24 | response = openai.ChatCompletion.create( 25 | model=MODEL, 26 | messages=[{ 27 | 'role': 'system', 28 | 'content': 'You are a helpful assistant.' 
29 | }, { 30 | 'role': 'user', 31 | 'content': question, 32 | }], 33 | max_tokens=max_tokens, 34 | ) 35 | ans['text'] = response['choices'][0]['message']['content'] 36 | return ans 37 | except Exception as e: 38 | print('[ERROR]', e) 39 | ans['text'] = '#ERROR#' 40 | time.sleep(1) 41 | return ans 42 | 43 | 44 | if __name__ == '__main__': 45 | parser = argparse.ArgumentParser(description='ChatGPT answer generation.') 46 | parser.add_argument('-q', '--question') 47 | parser.add_argument('-o', '--output') 48 | parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output') 49 | args = parser.parse_args() 50 | 51 | questions_dict = {} 52 | with open(os.path.expanduser(args.question)) as f: 53 | for line in f: 54 | if not line: 55 | continue 56 | q = json.loads(line) 57 | questions_dict[q['question_id']] = q['text'] 58 | 59 | answers = [] 60 | 61 | with concurrent.futures.ThreadPoolExecutor(max_workers=32) as executor: 62 | futures = [] 63 | for qid, question in questions_dict.items(): 64 | future = executor.submit(get_answer, qid, question, args.max_tokens) 65 | futures.append(future) 66 | 67 | for future in tqdm.tqdm(concurrent.futures.as_completed(futures), total=len(futures)): 68 | answers.append(future.result()) 69 | 70 | answers.sort(key=lambda x: x['question_id']) 71 | 72 | with open(os.path.expanduser(args.output), 'w') as f: 73 | table = [json.dumps(ans) for ans in answers] 74 | f.write('\n'.join(table)) 75 | -------------------------------------------------------------------------------- /openomni/eval/summarize_gpt_review.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from collections import defaultdict 4 | 5 | import numpy as np 6 | 7 | import argparse 8 | 9 | def parse_args(): 10 | parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.') 11 | parser.add_argument('-d', '--dir', default=None) 12 | parser.add_argument('-v', '--version', default=None) 13 | parser.add_argument('-s', '--select', nargs='*', default=None) 14 | parser.add_argument('-f', '--files', nargs='*', default=[]) 15 | parser.add_argument('-i', '--ignore', nargs='*', default=[]) 16 | return parser.parse_args() 17 | 18 | 19 | if __name__ == '__main__': 20 | args = parse_args() 21 | 22 | if args.ignore is not None: 23 | args.ignore = [int(x) for x in args.ignore] 24 | 25 | if len(args.files) > 0: 26 | review_files = args.files 27 | else: 28 | review_files = [x for x in os.listdir(args.dir) if x.endswith('.jsonl') and (x.startswith('gpt4_text') or x.startswith('reviews_') or x.startswith('review_') or 'review' in args.dir)] 29 | 30 | for review_file in sorted(review_files): 31 | config = os.path.basename(review_file).replace('gpt4_text_', '').replace('.jsonl', '') 32 | if args.select is not None and any(x not in config for x in args.select): 33 | continue 34 | if '0613' in config: 35 | version = '0613' 36 | else: 37 | version = '0314' 38 | if args.version is not None and args.version != version: 39 | continue 40 | scores = defaultdict(list) 41 | print(config) 42 | with open(os.path.join(args.dir, review_file) if args.dir is not None else review_file) as f: 43 | for review_str in f: 44 | review = json.loads(review_str) 45 | if review['question_id'] in args.ignore: 46 | continue 47 | if 'category' in review: 48 | scores[review['category']].append(review['tuple']) 49 | scores['all'].append(review['tuple']) 50 | else: 51 | if 'tuple' in review: 52 | 
scores['all'].append(review['tuple']) 53 | else: 54 | scores['all'].append(review['score']) 55 | for k, v in sorted(scores.items()): 56 | stats = np.asarray(v).mean(0).tolist() 57 | stats = [round(x, 3) for x in stats] 58 | # print(k, stats, round(stats[1]/stats[0]*100, 1)) 59 | print(k, round(stats[1]/stats[0]*100, 1), round(stats[0] * 10, 1), round(stats[1] * 10, 1)) 60 | print('=================================') 61 | -------------------------------------------------------------------------------- /openomni/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .language_model.llava_her_llama import LlavaHerLlamaForCausalLM, LlavaHerConfig 2 | from .language_model.llava_her_qwen import LlavaHerQwen2ForCausalLM, LlavaHerQwenConfig 3 | # from .language_model.llava_mpt import LlavaMptForCausalLM, LlavaMptConfig 4 | # from .language_model.llava_mistral import LlavaMistralForCausalLM, LlavaMistralConfig 5 | 6 | -------------------------------------------------------------------------------- /openomni/model/apply_delta.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m fastchat.model.apply_delta --base ~/model_weights/llama-7b --target ~/model_weights/vicuna-7b --delta lmsys/vicuna-7b-delta 4 | """ 5 | import argparse 6 | 7 | import torch 8 | from tqdm import tqdm 9 | from transformers import AutoTokenizer, AutoModelForCausalLM 10 | from llava import LlavaLlamaForCausalLM 11 | 12 | 13 | def apply_delta(base_model_path, target_model_path, delta_path): 14 | print("Loading base model") 15 | base = AutoModelForCausalLM.from_pretrained( 16 | base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 17 | 18 | print("Loading delta") 19 | delta = LlavaLlamaForCausalLM.from_pretrained(delta_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 20 | delta_tokenizer = AutoTokenizer.from_pretrained(delta_path) 21 | 22 | print("Applying delta") 23 | for name, param in tqdm(delta.state_dict().items(), desc="Applying delta"): 24 | if name not in base.state_dict(): 25 | assert name in ['model.mm_projector.weight', 'model.mm_projector.bias'], f'{name} not in base model' 26 | continue 27 | if param.data.shape == base.state_dict()[name].shape: 28 | param.data += base.state_dict()[name] 29 | else: 30 | assert name in ['model.embed_tokens.weight', 'lm_head.weight'], \ 31 | f'{name} dimension mismatch: {param.data.shape} vs {base.state_dict()[name].shape}' 32 | bparam = base.state_dict()[name] 33 | param.data[:bparam.shape[0], :bparam.shape[1]] += bparam 34 | 35 | print("Saving target model") 36 | delta.save_pretrained(target_model_path) 37 | delta_tokenizer.save_pretrained(target_model_path) 38 | 39 | 40 | if __name__ == "__main__": 41 | parser = argparse.ArgumentParser() 42 | parser.add_argument("--base-model-path", type=str, required=True) 43 | parser.add_argument("--target-model-path", type=str, required=True) 44 | parser.add_argument("--delta-path", type=str, required=True) 45 | 46 | args = parser.parse_args() 47 | 48 | apply_delta(args.base_model_path, args.target_model_path, args.delta_path) 49 | -------------------------------------------------------------------------------- /openomni/model/consolidate.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m llava.model.consolidate --src ~/model_weights/llava-7b --dst ~/model_weights/llava-7b_consolidate 4 | """ 5 | import argparse 6 | 7 | import torch 8 | from 
transformers import AutoTokenizer, AutoModelForCausalLM 9 | from llava.model import * 10 | from llava.model.utils import auto_upgrade 11 | 12 | 13 | def consolidate_ckpt(src_path, dst_path): 14 | print("Loading model") 15 | auto_upgrade(src_path) 16 | src_model = AutoModelForCausalLM.from_pretrained(src_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 17 | src_tokenizer = AutoTokenizer.from_pretrained(src_path, use_fast=False) 18 | src_model.save_pretrained(dst_path) 19 | src_tokenizer.save_pretrained(dst_path) 20 | 21 | 22 | if __name__ == "__main__": 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument("--src", type=str, required=True) 25 | parser.add_argument("--dst", type=str, required=True) 26 | 27 | args = parser.parse_args() 28 | 29 | consolidate_ckpt(args.src, args.dst) 30 | -------------------------------------------------------------------------------- /openomni/model/language_model/llava_mpt.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Haotian Liu 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | from typing import Optional, Tuple 17 | 18 | import torch 19 | 20 | from transformers import AutoConfig, AutoModelForCausalLM, \ 21 | MptConfig, MptForCausalLM, MptModel 22 | from llava.model.llava_arch import LlavaMetaModel, LlavaMetaForCausalLM 23 | 24 | 25 | class LlavaMptConfig(MptConfig): 26 | model_type = "llava_mpt" 27 | 28 | 29 | class LlavaMptModel(LlavaMetaModel, MptModel): 30 | config_class = LlavaMptConfig 31 | 32 | def __init__(self, config: MptConfig): 33 | config.hidden_size = config.d_model 34 | super(LlavaMptModel, self).__init__(config) 35 | 36 | def embed_tokens(self, x): 37 | return self.wte(x) 38 | 39 | 40 | class LlavaMptForCausalLM(MptForCausalLM, LlavaMetaForCausalLM): 41 | config_class = LlavaMptConfig 42 | supports_gradient_checkpointing = True 43 | 44 | def __init__(self, config): 45 | super(MptForCausalLM, self).__init__(config) 46 | 47 | self.transformer = LlavaMptModel(config) 48 | self.lm_head = torch.nn.Linear(config.hidden_size, config.vocab_size, bias=False) 49 | 50 | # Initialize weights and apply final processing 51 | self.post_init() 52 | 53 | def get_model(self): 54 | return self.transformer 55 | 56 | def _set_gradient_checkpointing(self, module, value=False): 57 | if isinstance(module, LlavaMptModel): 58 | module.gradient_checkpointing = value 59 | 60 | def forward( 61 | self, 62 | input_ids: Optional[torch.LongTensor] = None, 63 | past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, 64 | attention_mask: Optional[torch.Tensor] = None, 65 | inputs_embeds: Optional[torch.Tensor] = None, 66 | labels: Optional[torch.Tensor] = None, 67 | use_cache: Optional[bool] = None, 68 | output_attentions: Optional[bool] = None, 69 | output_hidden_states: Optional[bool] = None, 70 | return_dict: Optional[bool] = None, 71 | images=None): 72 | 73 | input_ids, attention_mask, past_key_values, inputs_embeds, 
labels = self.prepare_inputs_labels_for_multimodal(input_ids, attention_mask, past_key_values, labels, images) 74 | 75 | return super().forward( 76 | input_ids, 77 | past_key_values=past_key_values, 78 | attention_mask=attention_mask, 79 | inputs_embeds=inputs_embeds, 80 | labels=labels, 81 | use_cache=use_cache, 82 | output_attentions=output_attentions, 83 | output_hidden_states=output_hidden_states, 84 | return_dict=return_dict, 85 | ) 86 | 87 | def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs): 88 | images = kwargs.pop("images", None) 89 | _inputs = super().prepare_inputs_for_generation( 90 | input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs 91 | ) 92 | _inputs['images'] = images 93 | return _inputs 94 | 95 | 96 | AutoConfig.register("llava_mpt", LlavaMptConfig) 97 | AutoModelForCausalLM.register(LlavaMptConfig, LlavaMptForCausalLM) 98 | -------------------------------------------------------------------------------- /openomni/model/make_delta.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m llava.model.make_delta --base ~/model_weights/llama-7b --target ~/model_weights/llava-7b --delta ~/model_weights/llava-7b-delta --hub-repo-id liuhaotian/llava-7b-delta 4 | """ 5 | import argparse 6 | 7 | import torch 8 | from tqdm import tqdm 9 | from transformers import AutoTokenizer, AutoModelForCausalLM 10 | from llava.model.utils import auto_upgrade 11 | 12 | 13 | def make_delta(base_model_path, target_model_path, delta_path, hub_repo_id): 14 | print("Loading base model") 15 | base = AutoModelForCausalLM.from_pretrained( 16 | base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 17 | 18 | print("Loading target model") 19 | auto_upgrade(target_model_path) 20 | target = AutoModelForCausalLM.from_pretrained(target_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 21 | 22 | print("Calculating delta") 23 | for name, param in tqdm(target.state_dict().items(), desc="Calculating delta"): 24 | if name not in base.state_dict(): 25 | assert name in ['model.mm_projector.weight', 'model.mm_projector.bias'], f'{name} not in base model' 26 | continue 27 | if param.data.shape == base.state_dict()[name].shape: 28 | param.data -= base.state_dict()[name] 29 | else: 30 | assert name in ['model.embed_tokens.weight', 'lm_head.weight'], f'{name} dimension mismatch: {param.data.shape} vs {base.state_dict()[name].shape}' 31 | bparam = base.state_dict()[name] 32 | param.data[:bparam.shape[0], :bparam.shape[1]] -= bparam 33 | 34 | print("Saving delta") 35 | if hub_repo_id: 36 | kwargs = {"push_to_hub": True, "repo_id": hub_repo_id} 37 | else: 38 | kwargs = {} 39 | target.save_pretrained(delta_path, **kwargs) 40 | target_tokenizer = AutoTokenizer.from_pretrained(target_model_path) 41 | target_tokenizer.save_pretrained(delta_path, **kwargs) 42 | 43 | 44 | if __name__ == "__main__": 45 | parser = argparse.ArgumentParser() 46 | parser.add_argument("--base-model-path", type=str, required=True) 47 | parser.add_argument("--target-model-path", type=str, required=True) 48 | parser.add_argument("--delta-path", type=str, required=True) 49 | parser.add_argument("--hub-repo-id", type=str, default=None) 50 | args = parser.parse_args() 51 | 52 | make_delta(args.base_model_path, args.target_model_path, args.delta_path, args.hub_repo_id) 53 | -------------------------------------------------------------------------------- 
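The two weight-conversion scripts above are inverses of each other: make_delta subtracts the base weights from a fine-tuned target (handling the enlarged embedding and LM-head matrices row/column-wise), while apply_delta adds the base back into a published delta and saves the result as the target model. A minimal sketch of driving them from Python is shown below; the checkpoint paths are placeholders, the openomni.model.* import paths are inferred from the file locations in this repository, and the sketch assumes the llava.* imports inside those scripts resolve in your environment (they look inherited from LLaVA).

    from openomni.model.make_delta import make_delta     # path inferred from openomni/model/make_delta.py
    from openomni.model.apply_delta import apply_delta   # path inferred from openomni/model/apply_delta.py

    base   = "/path/to/llama-7b"          # placeholder checkpoints, not shipped with the repo
    target = "/path/to/openomni-7b"
    delta  = "/path/to/openomni-7b-delta"

    # Producer side: compute target - base and save the delta (optionally pushed when hub_repo_id is set).
    make_delta(base, target, delta, hub_repo_id=None)

    # Consumer side: reconstruct the full weights from base + delta and save them under `target`.
    apply_delta(base, target, delta)

Note that apply_delta mutates the loaded delta weights in place (param.data += base) and then saves that model and its tokenizer to the target path, so the base checkpoint itself is never modified.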
/openomni/model/multimodal_encoder/builder.py: -------------------------------------------------------------------------------- 1 | import os 2 | from .clip_encoder import CLIPVisionTower, CLIPVisionTowerS2 3 | 4 | 5 | def build_vision_tower(vision_tower_cfg, **kwargs): 6 | vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None)) 7 | is_absolute_path_exists = os.path.exists(vision_tower) 8 | use_s2 = getattr(vision_tower_cfg, 's2', False) 9 | if is_absolute_path_exists or vision_tower.startswith("openai") or vision_tower.startswith("laion") or "ShareGPT4V" in vision_tower: 10 | if use_s2: 11 | return CLIPVisionTowerS2(vision_tower, args=vision_tower_cfg, **kwargs) 12 | else: 13 | return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 14 | 15 | raise ValueError(f'Unknown vision tower: {vision_tower}') 16 | -------------------------------------------------------------------------------- /openomni/model/multimodal_projector/builder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import re 4 | 5 | 6 | class IdentityMap(nn.Module): 7 | def __init__(self): 8 | super().__init__() 9 | 10 | def forward(self, x, *args, **kwargs): 11 | return x 12 | 13 | @property 14 | def config(self): 15 | return {"mm_projector_type": 'identity'} 16 | 17 | 18 | class SimpleResBlock(nn.Module): 19 | def __init__(self, channels): 20 | super().__init__() 21 | self.pre_norm = nn.LayerNorm(channels) 22 | 23 | self.proj = nn.Sequential( 24 | nn.Linear(channels, channels), 25 | nn.GELU(), 26 | nn.Linear(channels, channels) 27 | ) 28 | def forward(self, x): 29 | x = self.pre_norm(x) 30 | return x + self.proj(x) 31 | 32 | 33 | def build_vision_projector(config, delay_load=False, **kwargs): 34 | projector_type = getattr(config, 'mm_projector_type', 'linear') 35 | 36 | if projector_type == 'linear': 37 | return nn.Linear(config.mm_hidden_size, config.hidden_size) 38 | 39 | mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type) 40 | if mlp_gelu_match: 41 | mlp_depth = int(mlp_gelu_match.group(1)) 42 | modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)] 43 | for _ in range(1, mlp_depth): 44 | modules.append(nn.GELU()) 45 | modules.append(nn.Linear(config.hidden_size, config.hidden_size)) 46 | return nn.Sequential(*modules) 47 | 48 | if projector_type == 'identity': 49 | return IdentityMap() 50 | 51 | raise ValueError(f'Unknown projector type: {projector_type}') 52 | -------------------------------------------------------------------------------- /openomni/model/speech_encoder/builder.py: -------------------------------------------------------------------------------- 1 | from .speech_encoder import WhisperWrappedEncoder 2 | 3 | 4 | def build_speech_encoder(config): 5 | speech_encoder_type = getattr(config, 'speech_encoder_type', None) 6 | if "whisper" in speech_encoder_type.lower(): 7 | return WhisperWrappedEncoder.load(config) 8 | 9 | raise ValueError(f'Unknown speech encoder: {speech_encoder_type}') 10 | -------------------------------------------------------------------------------- /openomni/model/speech_encoder/speech_encoder.py: -------------------------------------------------------------------------------- 1 | # Adopted from https://github.com/ddlBoJack/SLAM-LLM/blob/main/src/slam_llm/models/encoder.py 2 | 3 | import types 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | import whisper 8 | 9 | class 
WhisperWrappedEncoder: 10 | 11 | @classmethod 12 | def load(cls, model_config): 13 | 14 | def replace_layer_norm(module): 15 | from whisper.model import LayerNorm 16 | for name, child in module.named_children(): 17 | if isinstance(child, LayerNorm): 18 | old_params = child.state_dict() 19 | new_layer_norm = nn.LayerNorm(child.normalized_shape, eps=child.eps, elementwise_affine=child.elementwise_affine) 20 | new_layer_norm.load_state_dict(old_params) 21 | setattr(module, name, new_layer_norm) 22 | else: 23 | replace_layer_norm(child) 24 | encoder = whisper.load_model(name=model_config.speech_encoder, device='cpu').encoder 25 | # encoder = whisper.load_model(name="/mnt/workspace/lr/datasets/checkpoints/llava_her_pretrained/large-v3.pt",device='cpu').encoder 26 | 27 | replace_layer_norm(encoder) 28 | return encoder 29 | 30 | # return None -------------------------------------------------------------------------------- /openomni/model/speech_generator_ar/builder.py: -------------------------------------------------------------------------------- 1 | from .speech_generator import SpeechGeneratorCTC 2 | 3 | 4 | def build_speech_generator(config): 5 | generator_type = getattr(config, 'speech_generator_type', 'ctc') 6 | if generator_type == 'ctc' or generator_type == 'ar': 7 | return SpeechGeneratorCTC(config) 8 | 9 | raise ValueError(f'Unknown generator type: {generator_type}') 10 | -------------------------------------------------------------------------------- /openomni/model/speech_generator_ctc/builder.py: -------------------------------------------------------------------------------- 1 | from .speech_generator import SpeechGeneratorCTC 2 | 3 | 4 | def build_speech_generator(config): 5 | generator_type = getattr(config, 'speech_generator_type', 'ctc') 6 | if generator_type == 'ctc': 7 | return SpeechGeneratorCTC(config) 8 | 9 | raise ValueError(f'Unknown generator type: {generator_type}') 10 | -------------------------------------------------------------------------------- /openomni/model/speech_projector/builder.py: -------------------------------------------------------------------------------- 1 | from .speech_projector import EncoderProjectorConcat 2 | 3 | 4 | def build_speech_projector(config): 5 | projector_type = getattr(config, 'speech_projector_type', 'linear') 6 | if projector_type == 'linear': 7 | return EncoderProjectorConcat(config) 8 | 9 | raise ValueError(f'Unknown projector type: {projector_type}') 10 | -------------------------------------------------------------------------------- /openomni/model/speech_projector/speech_projector.py: -------------------------------------------------------------------------------- 1 | # Adopted from https://github.com/ddlBoJack/SLAM-LLM/blob/main/src/slam_llm/models/projector.py 2 | 3 | 4 | import torch 5 | import torch.nn as nn 6 | 7 | 8 | class EncoderProjectorConcat(nn.Module): 9 | def __init__(self, config): 10 | super().__init__() 11 | self.k = config.speech_encoder_ds_rate 12 | self.encoder_dim = config.speech_encoder_hidden_size 13 | self.llm_dim = config.hidden_size 14 | self.linear1 = nn.Linear(self.encoder_dim * self.k, 2048) 15 | self.relu = nn.ReLU() 16 | self.linear2 = nn.Linear(2048, config.hidden_size) 17 | 18 | def forward(self, x): 19 | batch_size, seq_len, dim = x.size() 20 | num_frames_to_discard = seq_len % self.k 21 | if num_frames_to_discard > 0: 22 | x = x[:, :-num_frames_to_discard, :] 23 | seq_len = x.size(1) 24 | 25 | x = x.contiguous() 26 | x = x.view(batch_size, seq_len // self.k, dim * self.k) 27 | x = 
self.linear1(x) 28 | x = self.relu(x) 29 | x = self.linear2(x) 30 | return x -------------------------------------------------------------------------------- /openomni/model/utils.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoConfig 2 | 3 | 4 | def auto_upgrade(config): 5 | cfg = AutoConfig.from_pretrained(config) 6 | if 'llava' in config and 'llava' not in cfg.model_type: 7 | assert cfg.model_type == 'llama' 8 | print("You are using newer LLaVA code base, while the checkpoint of v0 is from older code base.") 9 | print("You must upgrade the checkpoint to the new code base (this can be done automatically).") 10 | confirm = input("Please confirm that you want to upgrade the checkpoint. [Y/N]") 11 | if confirm.lower() in ["y", "yes"]: 12 | print("Upgrading checkpoint...") 13 | assert len(cfg.architectures) == 1 14 | setattr(cfg.__class__, "model_type", "llava") 15 | cfg.architectures[0] = 'LlavaLlamaForCausalLM' 16 | cfg.save_pretrained(config) 17 | print("Checkpoint upgraded.") 18 | else: 19 | print("Checkpoint upgrade aborted.") 20 | exit(1) 21 | -------------------------------------------------------------------------------- /openomni/model/visual_encoder/builder.py: -------------------------------------------------------------------------------- 1 | import os 2 | from .clip_encoder import CLIPVisionTower, CLIPVisionTowerS2 3 | 4 | 5 | def build_vision_tower(vision_tower_cfg, **kwargs): 6 | vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None)) 7 | is_absolute_path_exists = os.path.exists(vision_tower) 8 | use_s2 = getattr(vision_tower_cfg, 's2', False) 9 | if is_absolute_path_exists or vision_tower.startswith("openai") or vision_tower.startswith("laion") or "ShareGPT4V" in vision_tower: 10 | if use_s2: 11 | return CLIPVisionTowerS2(vision_tower, args=vision_tower_cfg, **kwargs) 12 | else: 13 | return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 14 | 15 | raise ValueError(f'Unknown vision tower: {vision_tower}') 16 | -------------------------------------------------------------------------------- /openomni/model/visual_projector/builder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import re 4 | 5 | 6 | class IdentityMap(nn.Module): 7 | def __init__(self): 8 | super().__init__() 9 | 10 | def forward(self, x, *args, **kwargs): 11 | return x 12 | 13 | @property 14 | def config(self): 15 | return {"mm_projector_type": 'identity'} 16 | 17 | 18 | class SimpleResBlock(nn.Module): 19 | def __init__(self, channels): 20 | super().__init__() 21 | self.pre_norm = nn.LayerNorm(channels) 22 | 23 | self.proj = nn.Sequential( 24 | nn.Linear(channels, channels), 25 | nn.GELU(), 26 | nn.Linear(channels, channels) 27 | ) 28 | def forward(self, x): 29 | x = self.pre_norm(x) 30 | return x + self.proj(x) 31 | 32 | 33 | def build_vision_projector(config, delay_load=False, **kwargs): 34 | projector_type = getattr(config, 'mm_projector_type', 'linear') 35 | 36 | if projector_type == 'linear': 37 | return nn.Linear(config.mm_hidden_size, config.hidden_size) 38 | 39 | mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type) 40 | if mlp_gelu_match: 41 | mlp_depth = int(mlp_gelu_match.group(1)) 42 | modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)] 43 | for _ in range(1, mlp_depth): 44 | modules.append(nn.GELU()) 45 | modules.append(nn.Linear(config.hidden_size, 
config.hidden_size)) 46 | return nn.Sequential(*modules) 47 | 48 | if projector_type == 'identity': 49 | return IdentityMap() 50 | 51 | raise ValueError(f'Unknown projector type: {projector_type}') 52 | -------------------------------------------------------------------------------- /openomni/serve/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RainBowLuoCS/OpenOmni/2892efff5fb4155a2f0b775aaa1f316aa69e289a/openomni/serve/__init__.py -------------------------------------------------------------------------------- /openomni/serve/examples/extreme_ironing.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RainBowLuoCS/OpenOmni/2892efff5fb4155a2f0b775aaa1f316aa69e289a/openomni/serve/examples/extreme_ironing.jpg -------------------------------------------------------------------------------- /openomni/serve/examples/waterview.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RainBowLuoCS/OpenOmni/2892efff5fb4155a2f0b775aaa1f316aa69e289a/openomni/serve/examples/waterview.jpg -------------------------------------------------------------------------------- /openomni/serve/register_worker.py: -------------------------------------------------------------------------------- 1 | """ 2 | Manually register workers. 3 | 4 | Usage: 5 | python3 -m fastchat.serve.register_worker --controller http://localhost:21001 --worker-name http://localhost:21002 6 | """ 7 | 8 | import argparse 9 | 10 | import requests 11 | 12 | if __name__ == "__main__": 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("--controller-address", type=str) 15 | parser.add_argument("--worker-name", type=str) 16 | parser.add_argument("--check-heart-beat", action="store_true") 17 | args = parser.parse_args() 18 | 19 | url = args.controller_address + "/register_worker" 20 | data = { 21 | "worker_name": args.worker_name, 22 | "check_heart_beat": args.check_heart_beat, 23 | "worker_status": None, 24 | } 25 | r = requests.post(url, json=data) 26 | assert r.status_code == 200 27 | -------------------------------------------------------------------------------- /openomni/serve/test_message.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | import requests 5 | 6 | from llava.conversation import default_conversation 7 | 8 | 9 | def main(): 10 | if args.worker_address: 11 | worker_addr = args.worker_address 12 | else: 13 | controller_addr = args.controller_address 14 | ret = requests.post(controller_addr + "/refresh_all_workers") 15 | ret = requests.post(controller_addr + "/list_models") 16 | models = ret.json()["models"] 17 | models.sort() 18 | print(f"Models: {models}") 19 | 20 | ret = requests.post(controller_addr + "/get_worker_address", 21 | json={"model": args.model_name}) 22 | worker_addr = ret.json()["address"] 23 | print(f"worker_addr: {worker_addr}") 24 | 25 | if worker_addr == "": 26 | return 27 | 28 | conv = default_conversation.copy() 29 | conv.append_message(conv.roles[0], args.message) 30 | prompt = conv.get_prompt() 31 | 32 | headers = {"User-Agent": "LLaVA Client"} 33 | pload = { 34 | "model": args.model_name, 35 | "prompt": prompt, 36 | "max_new_tokens": args.max_new_tokens, 37 | "temperature": 0.7, 38 | "stop": conv.sep, 39 | } 40 | response = requests.post(worker_addr + "/worker_generate_stream", headers=headers, 41 | 
json=pload, stream=True) 42 | 43 | print(prompt.replace(conv.sep, "\n"), end="") 44 | for chunk in response.iter_lines(chunk_size=8192, decode_unicode=False, delimiter=b"\0"): 45 | if chunk: 46 | data = json.loads(chunk.decode("utf-8")) 47 | output = data["text"].split(conv.sep)[-1] 48 | print(output, end="\r") 49 | print("") 50 | 51 | 52 | if __name__ == "__main__": 53 | parser = argparse.ArgumentParser() 54 | parser.add_argument("--controller-address", type=str, default="http://localhost:21001") 55 | parser.add_argument("--worker-address", type=str) 56 | parser.add_argument("--model-name", type=str, default="facebook/opt-350m") 57 | parser.add_argument("--max-new-tokens", type=int, default=32) 58 | parser.add_argument("--message", type=str, default= 59 | "Tell me a story with more than 1000 words.") 60 | args = parser.parse_args() 61 | 62 | main() 63 | -------------------------------------------------------------------------------- /openomni/train/train_mem.py: -------------------------------------------------------------------------------- 1 | from openomni.train.train import train 2 | 3 | if __name__ == "__main__": 4 | train(attn_implementation="flash_attention_2") 5 | -------------------------------------------------------------------------------- /openomni/train/train_xformers.py: -------------------------------------------------------------------------------- 1 | # Make it more memory efficient by monkey patching the LLaMA model with xformers attention. 2 | # Need to call this before importing transformers. 3 | from llava.train.llama_xformers_attn_monkey_patch import ( 4 | replace_llama_attn_with_xformers_attn, 5 | ) 6 | 7 | replace_llama_attn_with_xformers_attn() 8 | 9 | from openomni.train.train import train 10 | 11 | if __name__ == "__main__": 12 | train() 13 | -------------------------------------------------------------------------------- /openomni/utils.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import logging 3 | import logging.handlers 4 | import os 5 | import sys 6 | 7 | import requests 8 | 9 | from openomni.constants import LOGDIR 10 | 11 | server_error_msg = "**NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.**" 12 | moderation_msg = "YOUR INPUT VIOLATES OUR CONTENT MODERATION GUIDELINES. PLEASE TRY AGAIN." 
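# Annotation, not part of the original source: the helpers defined below (build_logger,
# StreamToLogger, disable_torch_init, violates_moderation) appear adapted from the LLaVA
# serving utilities. A hypothetical call site for the logger, with illustrative names, is:
#
#   logger = build_logger("controller", "controller.log")  # rotating file handler under LOGDIR
#   logger.info("server started")                          # stdout/stderr are also redirected here
#
# Only the function signatures come from this file; the logger name and message are assumptions.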
13 | 14 | handler = None 15 | 16 | 17 | def build_logger(logger_name, logger_filename): 18 | global handler 19 | 20 | formatter = logging.Formatter( 21 | fmt="%(asctime)s | %(levelname)s | %(name)s | %(message)s", 22 | datefmt="%Y-%m-%d %H:%M:%S", 23 | ) 24 | 25 | # Set the format of root handlers 26 | if not logging.getLogger().handlers: 27 | logging.basicConfig(level=logging.INFO) 28 | logging.getLogger().handlers[0].setFormatter(formatter) 29 | 30 | # Redirect stdout and stderr to loggers 31 | stdout_logger = logging.getLogger("stdout") 32 | stdout_logger.setLevel(logging.INFO) 33 | sl = StreamToLogger(stdout_logger, logging.INFO) 34 | sys.stdout = sl 35 | 36 | stderr_logger = logging.getLogger("stderr") 37 | stderr_logger.setLevel(logging.ERROR) 38 | sl = StreamToLogger(stderr_logger, logging.ERROR) 39 | sys.stderr = sl 40 | 41 | # Get logger 42 | logger = logging.getLogger(logger_name) 43 | logger.setLevel(logging.INFO) 44 | 45 | # Add a file handler for all loggers 46 | if handler is None: 47 | os.makedirs(LOGDIR, exist_ok=True) 48 | filename = os.path.join(LOGDIR, logger_filename) 49 | handler = logging.handlers.TimedRotatingFileHandler( 50 | filename, when='D', utc=True, encoding='UTF-8') 51 | handler.setFormatter(formatter) 52 | 53 | for name, item in logging.root.manager.loggerDict.items(): 54 | if isinstance(item, logging.Logger): 55 | item.addHandler(handler) 56 | 57 | return logger 58 | 59 | 60 | class StreamToLogger(object): 61 | """ 62 | Fake file-like stream object that redirects writes to a logger instance. 63 | """ 64 | def __init__(self, logger, log_level=logging.INFO): 65 | self.terminal = sys.stdout 66 | self.logger = logger 67 | self.log_level = log_level 68 | self.linebuf = '' 69 | 70 | def __getattr__(self, attr): 71 | return getattr(self.terminal, attr) 72 | 73 | def write(self, buf): 74 | temp_linebuf = self.linebuf + buf 75 | self.linebuf = '' 76 | for line in temp_linebuf.splitlines(True): 77 | # From the io.TextIOWrapper docs: 78 | # On output, if newline is None, any '\n' characters written 79 | # are translated to the system default line separator. 80 | # By default sys.stdout.write() expects '\n' newlines and then 81 | # translates them so this is still cross platform. 82 | if line[-1] == '\n': 83 | self.logger.log(self.log_level, line.rstrip()) 84 | else: 85 | self.linebuf += line 86 | 87 | def flush(self): 88 | if self.linebuf != '': 89 | self.logger.log(self.log_level, self.linebuf.rstrip()) 90 | self.linebuf = '' 91 | 92 | 93 | def disable_torch_init(): 94 | """ 95 | Disable the redundant torch default initialization to accelerate model creation. 96 | """ 97 | import torch 98 | setattr(torch.nn.Linear, "reset_parameters", lambda self: None) 99 | setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None) 100 | 101 | 102 | def violates_moderation(text): 103 | """ 104 | Check whether the text violates OpenAI moderation API. 
105 | """ 106 | url = "https://api.openai.com/v1/moderations" 107 | headers = {"Content-Type": "application/json", 108 | "Authorization": "Bearer " + os.environ["OPENAI_API_KEY"]} 109 | text = text.replace("\n", "") 110 | data = "{" + '"input": ' + f'"{text}"' + "}" 111 | data = data.encode("utf-8") 112 | try: 113 | ret = requests.post(url, headers=headers, data=data, timeout=5) 114 | flagged = ret.json()["results"][0]["flagged"] 115 | except requests.exceptions.RequestException as e: 116 | flagged = False 117 | except KeyError as e: 118 | flagged = False 119 | 120 | return flagged 121 | 122 | 123 | def pretty_print_semaphore(semaphore): 124 | if semaphore is None: 125 | return "None" 126 | return f"Semaphore(value={semaphore._value}, locked={semaphore.locked()})" 127 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "openomni" 7 | version = "0.0.1" 8 | description = "OpenOmni" 9 | readme = "README.md" 10 | requires-python = ">=3.8" 11 | classifiers = [ 12 | "Programming Language :: Python :: 3", 13 | "License :: OSI Approved :: Apache Software License", 14 | ] 15 | dependencies = [ 16 | "torch==2.1.2", "torchvision==0.16.2", 17 | "transformers==4.43.4", "tokenizers==0.19.0", "sentencepiece==0.1.99", "shortuuid", 18 | "accelerate==0.21.0", "peft", "bitsandbytes", 19 | "pydantic", "markdown2[all]", "numpy", "scikit-learn==1.2.2", 20 | "gradio==4.16.0", "gradio_client==0.8.1","torchaudio==2.1.2", 21 | "requests", "httpx==0.24.0", "uvicorn", "fastapi", "prettytable", 22 | "einops==0.6.1", "einops-exts==0.0.4", "timm==0.6.13", "openpyxl" 23 | ] 24 | 25 | [project.optional-dependencies] 26 | train = ["deepspeed==0.12.6", "ninja", "wandb"] 27 | build = ["build", "twine"] 28 | 29 | [project.urls] 30 | "Homepage" = "https://github.com/RainBowLuoCS/OpenOmni" 31 | "Bug Tracker" = "https://github.com/RainBowLuoCS/OpenOmniissues" 32 | 33 | [tool.setuptools.packages.find] 34 | exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"] 35 | 36 | [tool.wheel] 37 | exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"] 38 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pycocoevalcap 2 | validators 3 | visual_genome 4 | xlsxwriter 5 | sty 6 | transformers==4.43.4 7 | openai-whisper 8 | git+https://github.com/shivammehta25/Matcha-TTS.git 9 | 10 | 11 | -------------------------------------------------------------------------------- /scripts/clear.sh: -------------------------------------------------------------------------------- 1 | 2 | for pid in $(nvidia-smi --query-compute-apps=pid --format=csv,noheader); do 3 | kill -9 $pid 4 | done 5 | -------------------------------------------------------------------------------- /scripts/convert_gqa_for_eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("--src", type=str) 7 | parser.add_argument("--dst", type=str) 8 | args = parser.parse_args() 9 | 10 | all_answers = [] 11 | for line_idx, line in enumerate(open(args.src)): 12 | res = json.loads(line) 13 | 
question_id = res['question_id'] 14 | text = res['text'].rstrip('.').lower() 15 | all_answers.append({"questionId": question_id, "prediction": text}) 16 | 17 | with open(args.dst, 'w') as f: 18 | json.dump(all_answers, f) 19 | -------------------------------------------------------------------------------- /scripts/convert_mmbench_for_submission.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | import pandas as pd 5 | 6 | def get_args(): 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument("--annotation-file", type=str, required=True) 9 | parser.add_argument("--result-dir", type=str, required=True) 10 | parser.add_argument("--upload-dir", type=str, required=True) 11 | parser.add_argument("--experiment", type=str, required=True) 12 | 13 | return parser.parse_args() 14 | 15 | if __name__ == "__main__": 16 | args = get_args() 17 | 18 | df = pd.read_table(args.annotation_file) 19 | 20 | cur_df = df.copy() 21 | cur_df = cur_df.drop(columns=['hint', 'category', 'source', 'image', 'comment', 'l2-category']) 22 | cur_df.insert(6, 'prediction', None) 23 | for pred in open(os.path.join(args.result_dir, f"{args.experiment}.jsonl")): 24 | pred = json.loads(pred) 25 | cur_df.loc[df['index'] == pred['question_id'], 'prediction'] = pred['text'] 26 | 27 | cur_df.to_excel(os.path.join(args.upload_dir, f"{args.experiment}.xlsx"), index=False, engine='openpyxl') 28 | -------------------------------------------------------------------------------- /scripts/convert_mmvet_for_eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("--src", type=str) 7 | parser.add_argument("--dst", type=str) 8 | args = parser.parse_args() 9 | 10 | cur_result = {} 11 | 12 | for line in open(args.src): 13 | data = json.loads(line) 14 | qid = data['question_id'] 15 | cur_result[f'v1_{qid}'] = data['text'] 16 | 17 | with open(args.dst, 'w') as f: 18 | json.dump(cur_result, f, indent=2) 19 | -------------------------------------------------------------------------------- /scripts/convert_seed_for_submission.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | 6 | def get_args(): 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument("--annotation-file", type=str) 9 | parser.add_argument("--result-file", type=str) 10 | parser.add_argument("--result-upload-file", type=str) 11 | return parser.parse_args() 12 | 13 | 14 | def eval_single(result_file, eval_only_type=None): 15 | results = {} 16 | for line in open(result_file): 17 | row = json.loads(line) 18 | results[row['question_id']] = row 19 | 20 | type_counts = {} 21 | correct_counts = {} 22 | for question_data in data['questions']: 23 | if eval_only_type is not None and question_data['data_type'] != eval_only_type: continue 24 | data_type = question_data['question_type_id'] 25 | type_counts[data_type] = type_counts.get(data_type, 0) + 1 26 | try: 27 | question_id = int(question_data['question_id']) 28 | except: 29 | question_id = question_data['question_id'] 30 | if question_id not in results: 31 | correct_counts[data_type] = correct_counts.get(data_type, 0) 32 | continue 33 | row = results[question_id] 34 | if row['text'] == question_data['answer']: 35 | correct_counts[data_type] = correct_counts.get(data_type, 0) + 1 36 | 37 | total_count = 
0 38 | total_correct = 0 39 | for data_type in sorted(type_counts.keys()): 40 | accuracy = correct_counts[data_type] / type_counts[data_type] * 100 41 | if eval_only_type is None: 42 | print(f"{ques_type_id_to_name[data_type]}: {accuracy:.2f}%") 43 | 44 | total_count += type_counts[data_type] 45 | total_correct += correct_counts[data_type] 46 | 47 | total_accuracy = total_correct / total_count * 100 48 | if eval_only_type is None: 49 | print(f"Total accuracy: {total_accuracy:.2f}%") 50 | else: 51 | print(f"{eval_only_type} accuracy: {total_accuracy:.2f}%") 52 | 53 | return results 54 | 55 | if __name__ == "__main__": 56 | args = get_args() 57 | data = json.load(open(args.annotation_file)) 58 | ques_type_id_to_name = {id:n for n,id in data['question_type'].items()} 59 | 60 | results = eval_single(args.result_file) 61 | eval_single(args.result_file, eval_only_type='image') 62 | -------------------------------------------------------------------------------- /scripts/convert_sqa_to_llava.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import fire 4 | import re 5 | from convert_sqa_to_llava_base_prompt import build_prompt_chatbot 6 | 7 | 8 | def convert_to_llava(base_dir, split, prompt_format="QCM-LEA"): 9 | split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[split] 10 | problems = json.load(open(os.path.join(base_dir, "problems.json"))) 11 | 12 | split_problems = build_prompt_chatbot( 13 | problems, split_indices, prompt_format, 14 | use_caption=False, is_test=False) 15 | 16 | target_format = [] 17 | for prob_id, (input, output) in split_problems.items(): 18 | if input.startswith('Question: '): 19 | input = input.replace('Question: ', '') 20 | if output.startswith('Answer: '): 21 | output = output.replace('Answer: ', '') 22 | 23 | raw_prob_data = problems[prob_id] 24 | if raw_prob_data['image'] is None: 25 | target_format.append({ 26 | "id": prob_id, 27 | "conversations": [ 28 | {'from': 'human', 'value': f"{input}"}, 29 | {'from': 'gpt', 'value': f"{output}"}, 30 | ], 31 | }) 32 | 33 | else: 34 | target_format.append({ 35 | "id": prob_id, 36 | "image": os.path.join(prob_id, raw_prob_data['image']), 37 | "conversations": [ 38 | {'from': 'human', 'value': f"{input}\n"}, 39 | {'from': 'gpt', 'value': f"{output}"}, 40 | ], 41 | }) 42 | 43 | print(f'Number of samples: {len(target_format)}') 44 | 45 | with open(os.path.join(base_dir, f"llava_{split}_{prompt_format}.json"), "w") as f: 46 | json.dump(target_format, f, indent=2) 47 | 48 | 49 | def convert_to_jsonl(base_dir, split, prompt_format="QCM-LEPA"): 50 | split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[split] 51 | problems = json.load(open(os.path.join(base_dir, "problems.json"))) 52 | 53 | split_problems = build_prompt_chatbot( 54 | problems, split_indices, prompt_format, 55 | use_caption=False, is_test=False) 56 | 57 | writer = open(os.path.join(base_dir, f"scienceqa_{split}_{prompt_format}.jsonl"), "w") 58 | for prob_id, (input, output) in split_problems.items(): 59 | if input.startswith('Question: '): 60 | input = input.replace('Question: ', '') 61 | if output.startswith('Answer: '): 62 | output = output.replace('Answer: ', '') 63 | 64 | raw_prob_data = problems[prob_id] 65 | if raw_prob_data['image'] is None: 66 | data = { 67 | "id": prob_id, 68 | "instruction": f"{input}", 69 | "output": f"{output}", 70 | } 71 | 72 | else: 73 | data = { 74 | "id": prob_id, 75 | "image": os.path.join(prob_id, raw_prob_data['image']), 76 
| "instruction": f"{input}\n", 77 | "output": f"{output}", 78 | } 79 | writer.write(json.dumps(data) + '\n') 80 | writer.close() 81 | 82 | 83 | def main(task, **kwargs): 84 | globals()[task](**kwargs) 85 | 86 | 87 | if __name__ == "__main__": 88 | fire.Fire(main) 89 | -------------------------------------------------------------------------------- /scripts/convert_vizwiz_for_submission.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import json 4 | 5 | from llava.eval.m4c_evaluator import EvalAIAnswerProcessor 6 | 7 | 8 | def parse_args(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--annotation-file', type=str, required=True) 11 | parser.add_argument('--result-file', type=str, required=True) 12 | parser.add_argument('--result-upload-file', type=str, required=True) 13 | return parser.parse_args() 14 | 15 | 16 | if __name__ == '__main__': 17 | 18 | args = parse_args() 19 | 20 | os.makedirs(os.path.dirname(args.result_upload_file), exist_ok=True) 21 | 22 | results = [] 23 | error_line = 0 24 | for line_idx, line in enumerate(open(args.result_file)): 25 | try: 26 | results.append(json.loads(line)) 27 | except: 28 | error_line += 1 29 | results = {x['question_id']: x['text'] for x in results} 30 | test_split = [json.loads(line) for line in open(args.annotation_file)] 31 | split_ids = set([x['question_id'] for x in test_split]) 32 | 33 | print(f'total results: {len(results)}, total split: {len(test_split)}, error_line: {error_line}') 34 | 35 | all_answers = [] 36 | 37 | answer_processor = EvalAIAnswerProcessor() 38 | 39 | for x in test_split: 40 | assert x['question_id'] in results 41 | all_answers.append({ 42 | 'image': x['image'], 43 | 'answer': answer_processor(results[x['question_id']]) 44 | }) 45 | 46 | with open(args.result_upload_file, 'w') as f: 47 | json.dump(all_answers, f) 48 | -------------------------------------------------------------------------------- /scripts/convert_vqav2_for_submission.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import json 4 | 5 | from llava.eval.m4c_evaluator import EvalAIAnswerProcessor 6 | 7 | 8 | def parse_args(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--dir', type=str, default="./playground/data/eval/vqav2") 11 | parser.add_argument('--ckpt', type=str, required=True) 12 | parser.add_argument('--split', type=str, required=True) 13 | return parser.parse_args() 14 | 15 | 16 | if __name__ == '__main__': 17 | 18 | args = parse_args() 19 | 20 | src = os.path.join(args.dir, 'answers', args.split, args.ckpt, 'merge.jsonl') 21 | test_split = os.path.join(args.dir, 'llava_vqav2_mscoco_test2015.jsonl') 22 | dst = os.path.join(args.dir, 'answers_upload', args.split, f'{args.ckpt}.json') 23 | os.makedirs(os.path.dirname(dst), exist_ok=True) 24 | 25 | results = [] 26 | error_line = 0 27 | for line_idx, line in enumerate(open(src)): 28 | try: 29 | results.append(json.loads(line)) 30 | except: 31 | error_line += 1 32 | 33 | results = {x['question_id']: x['text'] for x in results} 34 | test_split = [json.loads(line) for line in open(test_split)] 35 | split_ids = set([x['question_id'] for x in test_split]) 36 | 37 | print(f'total results: {len(results)}, total split: {len(test_split)}, error_line: {error_line}') 38 | 39 | all_answers = [] 40 | 41 | answer_processor = EvalAIAnswerProcessor() 42 | 43 | for x in test_split: 44 | if x['question_id'] not in results: 45 | 
all_answers.append({
46 |                 'question_id': x['question_id'],
47 |                 'answer': ''
48 |             })
49 |         else:
50 |             all_answers.append({
51 |                 'question_id': x['question_id'],
52 |                 'answer': answer_processor(results[x['question_id']])
53 |             })
54 | 
55 |     with open(dst, 'w') as f:
56 |         json.dump(all_answers, f)
57 | 
-------------------------------------------------------------------------------- /scripts/mmevol/eval/gqa.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # srun -p mllm --gres gpu:8 bash scripts/v1_6/eval/gqa.sh
3 | 
4 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
5 | IFS=',' read -ra GPULIST <<< "$gpu_list"
6 | CHUNKS=${#GPULIST[@]}
7 | 
8 | CONV_MODE=llava_llama_3
9 | CKPT=$1
10 | CKPT_DIR=${2-"checkpoints"}
11 | 
12 | SPLIT="llava_gqa_testdev_balanced"
13 | GQADIR="./playground/data/eval/gqa/data"
14 | 
15 | for IDX in $(seq 0 $((CHUNKS-1))); do
16 |     CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_loader \
17 |         --model-path ${CKPT_DIR}/${CKPT} \
18 |         --question-file ./playground/data/eval/gqa/$SPLIT.jsonl \
19 |         --image-folder /mnt/hwfile/mllm/xinglong/llava/llava_1.5/playground/data/eval/gqa/images \
20 |         --answers-file ./playground/data/eval/gqa/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl \
21 |         --num-chunks $CHUNKS \
22 |         --chunk-idx $IDX \
23 |         --temperature 0 \
24 |         --square_eval True \
25 |         --conv-mode ${CONV_MODE} &
26 | done
27 | 
28 | wait
29 | 
30 | output_file=./playground/data/eval/gqa/answers/$SPLIT/$CKPT/merge.jsonl
31 | 
32 | # Clear out the output file if it exists.
33 | > "$output_file"
34 | 
35 | # Loop through the indices and concatenate each file.
36 | for IDX in $(seq 0 $((CHUNKS-1))); do
37 |     cat ./playground/data/eval/gqa/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file"
38 | done
39 | 
40 | python scripts/convert_gqa_for_eval.py --src $output_file --dst $GQADIR/testdev_balanced_predictions.json
41 | 
42 | cd $GQADIR
43 | python eval/eval.py --tier testdev_balanced
-------------------------------------------------------------------------------- /scripts/mmevol/eval/mmbench.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # srun -p mllm --gres gpu:8 bash scripts/v1_6/eval/mmbench.sh
3 | 
4 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
5 | IFS=',' read -ra GPULIST <<< "$gpu_list"
6 | CHUNKS=${#GPULIST[@]}
7 | 
8 | CONV_MODE=llava_llama_3
9 | CKPT=$1
10 | CKPT_DIR=${2-"checkpoints"}
11 | SPLIT="mmbench_dev_20230712"
12 | LANG="en"
13 | 
14 | 
15 | for IDX in $(seq 0 $((CHUNKS-1))); do
16 |     CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_mmbench \
17 |         --model-path ${CKPT_DIR}/${CKPT} \
18 |         --question-file ./playground/data/eval/mmbench/$SPLIT.tsv \
19 |         --answers-file ./playground/data/eval/mmbench/answers/$SPLIT/${CKPT}/${CHUNKS}_${IDX}.jsonl \
20 |         --num-chunks $CHUNKS \
21 |         --chunk-idx $IDX \
22 |         --lang ${LANG} \
23 |         --single-pred-prompt \
24 |         --square_eval True \
25 |         --temperature 0 \
26 |         --conv-mode ${CONV_MODE} &
27 | done
28 | 
29 | wait
30 | 
31 | output_file=./playground/data/eval/mmbench/answers/$SPLIT/${CKPT}/merge.jsonl
32 | 
33 | # Clear out the output file if it exists.
34 | > "$output_file"
35 | 
36 | # Loop through the indices and concatenate each file.
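# For example, with 8 visible GPUs (CHUNKS=8) this gathers 8_0.jsonl ... 8_7.jsonl
# into merge.jsonl, which the submission converter below reads via --experiment merge.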
37 | for IDX in $(seq 0 $((CHUNKS-1))); do
38 |     cat ./playground/data/eval/mmbench/answers/$SPLIT/${CKPT}/${CHUNKS}_${IDX}.jsonl >> "$output_file"
39 | done
40 | 
41 | wait
42 | 
43 | mkdir -p playground/data/eval/mmbench/answers_upload/$SPLIT
44 | mkdir -p playground/data/eval/mmbench/answers_upload/$SPLIT/${CKPT}
45 | 
46 | python scripts/convert_mmbench_for_submission.py \
47 |     --annotation-file ./playground/data/eval/mmbench/$SPLIT.tsv \
48 |     --result-dir ./playground/data/eval/mmbench/answers/$SPLIT/${CKPT} \
49 |     --upload-dir ./playground/data/eval/mmbench/answers_upload/$SPLIT/${CKPT} \
50 |     --experiment merge
51 | 
-------------------------------------------------------------------------------- /scripts/mmevol/eval/mmbench_cn.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # srun -p mllm --gres gpu:8 bash scripts/v1_6/eval/mmbench_cn.sh
3 | 
4 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
5 | IFS=',' read -ra GPULIST <<< "$gpu_list"
6 | CHUNKS=${#GPULIST[@]}
7 | 
8 | CONV_MODE=llava_llama_3
9 | CKPT=$1
10 | CKPT_DIR=${2-"checkpoints"}
11 | LANG="cn"
12 | SPLIT="mmbench_dev_cn_20231003"
13 | 
14 | 
15 | for IDX in $(seq 0 $((CHUNKS-1))); do
16 |     CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_mmbench \
17 |         --model-path ${CKPT_DIR}/${CKPT} \
18 |         --question-file ./playground/data/eval/mmbench/$SPLIT.tsv \
19 |         --answers-file ./playground/data/eval/mmbench/answers/$SPLIT/${CKPT}/${CHUNKS}_${IDX}.jsonl \
20 |         --num-chunks $CHUNKS \
21 |         --chunk-idx $IDX \
22 |         --lang ${LANG} \
23 |         --single-pred-prompt \
24 |         --square_eval True \
25 |         --temperature 0 \
26 |         --conv-mode ${CONV_MODE} &
27 | done
28 | 
29 | wait
30 | 
31 | output_file=./playground/data/eval/mmbench/answers/$SPLIT/${CKPT}/merge.jsonl
32 | 
33 | # Clear out the output file if it exists.
34 | > "$output_file"
35 | 
36 | # Loop through the indices and concatenate each file.
37 | for IDX in $(seq 0 $((CHUNKS-1))); do 38 | cat ./playground/data/eval/mmbench/answers/$SPLIT/${CKPT}/${CHUNKS}_${IDX}.jsonl >> "$output_file" 39 | done 40 | 41 | wait 42 | 43 | mkdir -p playground/data/eval/mmbench/answers_upload/$SPLIT 44 | mkdir -p playground/data/eval/mmbench/answers_upload/$SPLIT/${CKPT} 45 | 46 | python scripts/convert_mmbench_for_submission.py \ 47 | --annotation-file ./playground/data/eval/mmbench/$SPLIT.tsv \ 48 | --result-dir ./playground/data/eval/mmbench/answers/$SPLIT/${CKPT} \ 49 | --upload-dir ./playground/data/eval/mmbench/answers_upload/$SPLIT/${CKPT} \ 50 | --experiment merge 51 | -------------------------------------------------------------------------------- /scripts/mmevol/eval/mme.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # srun -p mllm --gres gpu:1 bash scripts/v1_6/eval/mme.sh 3 | 4 | CONV_MODE=llava_llama_3 5 | 6 | CUDA_VISIBLE_DEVICES=0 python -m llava.eval.model_vqa_loader \ 7 | --model-path /mnt/workspace/lr/datasets/checkpoints/Lin-Chen/open-llava-next-llama3-8b \ 8 | --question-file /mnt/workspace/lr/datasets/playground/playground/data/eval/MME/share4v_mme.jsonl \ 9 | --image-folder /mnt/workspace/lr/datasets/playground/playground/data/eval/MME/MME_Benchmark_release_version\ 10 | --answers-file ./playground/data/eval/MME/answers/std_topic.jsonl \ 11 | --temperature 0 \ 12 | --square_eval True \ 13 | --conv-mode $CONV_MODE 14 | 15 | # cd ./playground/data/eval/MME 16 | 17 | # python convert_answer_to_mme.py --experiment ${CKPT} 18 | 19 | # cd eval_tool 20 | 21 | # python calculation.py --results_dir answers/${CKPT} 22 | -------------------------------------------------------------------------------- /scripts/mmevol/eval/seed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # srun -p mllm --gres gpu:8 bash scripts/v1_6/eval/seed.sh 3 | 4 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 5 | IFS=',' read -ra GPULIST <<< "$gpu_list" 6 | 7 | CHUNKS=${#GPULIST[@]} 8 | 9 | CONV_MODE=llava_llama_3 10 | CKPT=$1 11 | CKPT_DIR=${2-'checkpoints'} 12 | 13 | for IDX in $(seq 0 $((CHUNKS-1))); do 14 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_loader \ 15 | --model-path ${CKPT_DIR}/${CKPT} \ 16 | --question-file ./playground/data/eval/seed_bench/llava-seed-bench-image.jsonl \ 17 | --image-folder ./playground/data/eval/seed_bench \ 18 | --answers-file ./playground/data/eval/seed_bench/answers/$CKPT/${CHUNKS}_${IDX}.jsonl \ 19 | --num-chunks $CHUNKS \ 20 | --chunk-idx $IDX \ 21 | --temperature 0 \ 22 | --square_eval True \ 23 | --conv-mode $CONV_MODE & 24 | done 25 | 26 | wait 27 | 28 | output_file=./playground/data/eval/seed_bench/answers/$CKPT/merge.jsonl 29 | 30 | # Clear out the output file if it exists. 31 | > "$output_file" 32 | 33 | # Loop through the indices and concatenate each file. 
34 | for IDX in $(seq 0 $((CHUNKS-1))); do 35 | cat ./playground/data/eval/seed_bench/answers/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 36 | done 37 | 38 | # Evaluate 39 | python scripts/convert_seed_for_submission.py \ 40 | --annotation-file ./playground/data/eval/seed_bench/SEED-Bench.json \ 41 | --result-file $output_file \ 42 | --result-upload-file ./playground/data/eval/seed_bench/answers_upload/llava-v1.6-7b.jsonl 43 | 44 | -------------------------------------------------------------------------------- /scripts/mmevol/eval/sqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # srun -p mllm --gres gpu:8 bash scripts/v1_6/eval/sqa.sh 3 | 4 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 5 | IFS=',' read -ra GPULIST <<< "$gpu_list" 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | CONV_MODE=llava_llama_3 9 | CKPT=$1 10 | CKPT_DIR=${2-"checkpoints"} 11 | 12 | for IDX in $(seq 0 $((CHUNKS-1))); do 13 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_science \ 14 | --model-path ${CKPT_DIR}/${CKPT} \ 15 | --question-file ./playground/data/eval/scienceqa/llava_test_CQM-A.json \ 16 | --image-folder ./playground/data/eval/scienceqa/images/test \ 17 | --answers-file ./playground/data/eval/scienceqa/answers/$CKPT/${CHUNKS}_${IDX}.jsonl \ 18 | --single-pred-prompt \ 19 | --num-chunks $CHUNKS \ 20 | --chunk-idx $IDX \ 21 | --temperature 0 \ 22 | --square_eval True \ 23 | --conv-mode ${CONV_MODE} & 24 | done 25 | 26 | wait 27 | 28 | output_file=./playground/data/eval/scienceqa/answers/${CKPT}.jsonl 29 | 30 | # Clear out the output file if it exists. 31 | > "$output_file" 32 | 33 | # Loop through the indices and concatenate each file. 34 | for IDX in $(seq 0 $((CHUNKS-1))); do 35 | cat ./playground/data/eval/scienceqa/answers/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 36 | done 37 | 38 | # Evaluate 39 | python llava/eval/eval_science_qa.py \ 40 | --base-dir ./playground/data/eval/scienceqa \ 41 | --result-file ./playground/data/eval/scienceqa/answers/${CKPT}.jsonl \ 42 | --output-file ./playground/data/eval/scienceqa/answers/${CKPT}_output.jsonl \ 43 | --output-result ./playground/data/eval/scienceqa/answers/${CKPT}_result.json -------------------------------------------------------------------------------- /scripts/mmevol/eval/textvqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # srun -p mllm --gres gpu:8 bash scripts/v1_6/eval/textvqa.sh 3 | 4 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 5 | IFS=',' read -ra GPULIST <<< "$gpu_list" 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | CONV_MODE=llava_llama_3 9 | CKPT=$1 10 | CKPT_DIR=${2-"checkpoints"} 11 | 12 | for IDX in $(seq 0 $((CHUNKS-1))); do 13 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_loader \ 14 | --model-path ${CKPT_DIR}/${CKPT} \ 15 | --question-file ./playground/data/eval/textvqa/llava_textvqa_val_v051_ocr.jsonl \ 16 | --image-folder ./playground/data/eval/textvqa/train_images \ 17 | --answers-file ./playground/data/eval/textvqa/answers/$CKPT/${CHUNKS}_${IDX}.jsonl \ 18 | --num-chunks $CHUNKS \ 19 | --chunk-idx $IDX \ 20 | --temperature 0 \ 21 | --square_eval True \ 22 | --conv-mode ${CONV_MODE} & 23 | done 24 | 25 | wait 26 | 27 | output_file=./playground/data/eval/textvqa/answers/$CKPT/merge.jsonl 28 | 29 | # Clear out the output file if it exists. 30 | > "$output_file" 31 | 32 | # Loop through the indices and concatenate each file. 
33 | for IDX in $(seq 0 $((CHUNKS-1))); do 34 | cat ./playground/data/eval/textvqa/answers/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 35 | done 36 | 37 | # Evaluate 38 | python -m llava.eval.eval_textvqa \ 39 | --annotation-file ./playground/data/eval/textvqa/TextVQA_0.5.1_val.json \ 40 | --result-file $output_file -------------------------------------------------------------------------------- /scripts/mmevol/train/llama3/finetune.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | 4 | # wandb login 5 | 6 | export CUDA_DEVICE_MAX_CONNECTIONS=1 7 | export GPUS_PER_NODE=8 8 | export NNODES=4 9 | export BATCH_SIZE=4 10 | export GRADIENT_ACCU_STEPS=4 11 | export MASTER_PORT=29588 12 | export CPUS_PER_TASK=16 13 | export QUOTA=reserved 14 | 15 | export DATA_PATH=datasets/json/mix_evol_sft.json 16 | export SAVE_PATH=llava-v1.6-8b_llama3-8b_clip-large-336_mmevol_sft 17 | export BASE_LR=2e-5 18 | export VIT_LR=2e-6 19 | 20 | 21 | bash -c 'torchrun --nproc_per_node $GPUS_PER_NODE llava/train/train_mem.py \ 22 | --deepspeed ./scripts/zero2.json \ 23 | --model_name_or_path checkpoints/llama-3-8b-Instruct \ 24 | --version llava_llama_3 \ 25 | --data_path ${DATA_PATH} \ 26 | --image_folder datasets \ 27 | --vision_tower checkpoints/openai/clip-vit-large-patch14-336 \ 28 | --mm_projector_type mlp2x_gelu \ 29 | --pretrain_mm_mlp_adapter checkpoints/llava-v1.6-8b_llama3-8b_clip-large-336_pretrain/mm_projector.bin \ 30 | --unfreeze_mm_vision_tower True \ 31 | --mm_vision_tower_lr ${VIT_LR} \ 32 | --image_aspect_ratio anyres \ 33 | --group_by_modality_length True \ 34 | --mm_vision_select_layer -2 \ 35 | --mm_vision_select_feature patch \ 36 | --mm_patch_merge_type spatial_unpad \ 37 | --mm_use_im_start_end False \ 38 | --mm_use_im_patch_token False \ 39 | --bf16 True \ 40 | --output_dir checkpoints/${SAVE_PATH} \ 41 | --num_train_epochs 1 \ 42 | --per_device_train_batch_size ${BATCH_SIZE} \ 43 | --per_device_eval_batch_size 4 \ 44 | --gradient_accumulation_steps ${GRADIENT_ACCU_STEPS} \ 45 | --evaluation_strategy "no" \ 46 | --save_strategy "steps" \ 47 | --save_steps 500 \ 48 | --save_total_limit 20 \ 49 | --learning_rate ${BASE_LR} \ 50 | --weight_decay 0. 
\ 51 | --warmup_ratio 0.03 \ 52 | --lr_scheduler_type "cosine" \ 53 | --logging_steps 1 \ 54 | --tf32 True \ 55 | --model_max_length 4096 \ 56 | --gradient_checkpointing True \ 57 | --dataloader_num_workers 4 \ 58 | --lazy_preprocess True \ 59 | --run_name ${SAVE_PATH} \ 60 | --dataloader_drop_last True \ 61 | --report_to tensorboard' 62 | -------------------------------------------------------------------------------- /scripts/mmevol/train/llama3/pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | 4 | # wandb login 5 | 6 | export CUDA_DEVICE_MAX_CONNECTIONS=1 7 | export GPUS_PER_NODE=8 8 | export NNODES=4 9 | export BATCH_SIZE=4 10 | export GRADIENT_ACCU_STEPS=4 11 | export MASTER_PORT=29504 12 | export CPUS_PER_TASK=16 13 | export QUOTA=reserved 14 | 15 | export DATA_PATH=datasets/llava/llava_pretrain/blip_laion_cc_sbu_558k.json 16 | export SAVE_PATH=llava-v1.6-8b_llama3-8b_clip-large-336_pretrain_blip_laion_cc_sbu_558k 17 | export BASE_LR=1e-3 18 | 19 | 20 | bash -c 'torchrun --nproc_per_node $GPUS_PER_NODE llava/train/train_mem.py \ 21 | --deepspeed ./scripts/zero2.json \ 22 | --model_name_or_path checkpoints/LLM-Research/Meta-Llama-3-8B-Instruct \ 23 | --version plain \ 24 | --data_path ${DATA_PATH} \ 25 | --image_folder datasets/ \ 26 | --vision_tower checkpoints/openai/clip-vit-large-patch14-336 \ 27 | --mm_projector_type mlp2x_gelu \ 28 | --tune_mm_mlp_adapter True \ 29 | --unfreeze_mm_vision_tower False \ 30 | --image_aspect_ratio anyres \ 31 | --mm_vision_select_layer -2 \ 32 | --mm_vision_select_feature patch \ 33 | --mm_patch_merge_type spatial_unpad \ 34 | --mm_use_im_start_end False \ 35 | --mm_use_im_patch_token False \ 36 | --bf16 True \ 37 | --output_dir checkpoints/${SAVE_PATH} \ 38 | --num_train_epochs 1 \ 39 | --per_device_train_batch_size ${BATCH_SIZE} \ 40 | --per_device_eval_batch_size 4 \ 41 | --gradient_accumulation_steps ${GRADIENT_ACCU_STEPS} \ 42 | --evaluation_strategy "no" \ 43 | --save_strategy "steps" \ 44 | --save_steps 500 \ 45 | --save_total_limit 2 \ 46 | --learning_rate ${BASE_LR} \ 47 | --weight_decay 0. 
\ 48 | --warmup_ratio 0.03 \ 49 | --lr_scheduler_type "cosine" \ 50 | --logging_steps 1 \ 51 | --tf32 True \ 52 | --model_max_length 6144 \ 53 | --gradient_checkpointing True \ 54 | --dataloader_num_workers 4 \ 55 | --lazy_preprocess True \ 56 | --run_name ${SAVE_PATH} \ 57 | --dataloader_drop_last True \ 58 | --report_to tensorboard' 59 | -------------------------------------------------------------------------------- /scripts/mmevol/train/qwen2/finetune.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | 4 | # wandb login 5 | 6 | export CUDA_DEVICE_MAX_CONNECTIONS=1 7 | export GPUS_PER_NODE=8 8 | export NNODES=4 9 | export BATCH_SIZE=4 10 | export GRADIENT_ACCU_STEPS=4 11 | export MASTER_PORT=29588 12 | export CPUS_PER_TASK=16 13 | export QUOTA=reserved 14 | 15 | export DATA_PATH=datasets/json/mix_evol_sft.json 16 | export SAVE_PATH=llava-v1.6-8b_qwen2-7b_clip-large-336_mmevol_sft 17 | export BASE_LR=2e-5 18 | export VIT_LR=2e-6 19 | 20 | 21 | bash -c 'torchrun --nproc_per_node $GPUS_PER_NODE llava/train/train_mem.py \ 22 | --deepspeed ./scripts/zero2.json \ 23 | --model_name_or_path checkpoints/qwen/Qwen2-7B-Instruct \ 24 | --version qwen_2 \ 25 | --data_path ${DATA_PATH} \ 26 | --image_folder datasets \ 27 | --vision_tower checkpoints/openai/clip-vit-large-patch14-336 \ 28 | --mm_projector_type mlp2x_gelu \ 29 | --pretrain_mm_mlp_adapter checkpoints/llava-v1.6-7b_qwen2-7b_clip-large-336_pretrain/mm_projector.bin \ 30 | --unfreeze_mm_vision_tower True \ 31 | --mm_vision_tower_lr ${VIT_LR} \ 32 | --image_aspect_ratio anyres \ 33 | --group_by_modality_length True \ 34 | --mm_vision_select_layer -2 \ 35 | --mm_vision_select_feature patch \ 36 | --mm_patch_merge_type spatial_unpad \ 37 | --mm_use_im_start_end False \ 38 | --mm_use_im_patch_token False \ 39 | --bf16 True \ 40 | --output_dir checkpoints/${SAVE_PATH} \ 41 | --num_train_epochs 1 \ 42 | --per_device_train_batch_size ${BATCH_SIZE} \ 43 | --per_device_eval_batch_size 4 \ 44 | --gradient_accumulation_steps ${GRADIENT_ACCU_STEPS} \ 45 | --evaluation_strategy "no" \ 46 | --save_strategy "steps" \ 47 | --save_steps 500 \ 48 | --save_total_limit 20 \ 49 | --learning_rate ${BASE_LR} \ 50 | --weight_decay 0. 
\ 51 | --warmup_ratio 0.03 \ 52 | --lr_scheduler_type "cosine" \ 53 | --logging_steps 1 \ 54 | --tf32 True \ 55 | --model_max_length 4096 \ 56 | --gradient_checkpointing True \ 57 | --dataloader_num_workers 4 \ 58 | --lazy_preprocess True \ 59 | --run_name ${SAVE_PATH} \ 60 | --dataloader_drop_last True \ 61 | --report_to tensorboard' 62 | -------------------------------------------------------------------------------- /scripts/mmevol/train/qwen2/pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | 4 | # wandb login 5 | 6 | export CUDA_DEVICE_MAX_CONNECTIONS=1 7 | export GPUS_PER_NODE=4 8 | export NNODES=4 9 | export BATCH_SIZE=4 10 | export GRADIENT_ACCU_STEPS=8 11 | export MASTER_PORT=29504 12 | export CPUS_PER_TASK=16 13 | export QUOTA=reserved 14 | 15 | export DATA_PATH=datasets/llava/llava_pretrain/blip_laion_cc_sbu_558k.json 16 | export SAVE_PATH=llava-v1.6-8b_qwen2-7b_clip-large-336_pretrain_blip_laion_cc_sbu_558k 17 | export BASE_LR=1e-3 18 | 19 | 20 | bash -c 'torchrun --nproc_per_node $GPUS_PER_NODE llava/train/train_mem.py \ 21 | --deepspeed ./scripts/zero2.json \ 22 | --model_name_or_path checkpoints/qwen/Qwen2-7B-Instruct \ 23 | --version plain \ 24 | --data_path ${DATA_PATH} \ 25 | --image_folder datasets/llava/llava_pretrain/images \ 26 | --vision_tower checkpoints/openai/clip-vit-large-patch14-336 \ 27 | --mm_projector_type mlp2x_gelu \ 28 | --tune_mm_mlp_adapter True \ 29 | --unfreeze_mm_vision_tower False \ 30 | --image_aspect_ratio anyres \ 31 | --mm_vision_select_layer -2 \ 32 | --mm_vision_select_feature patch \ 33 | --mm_patch_merge_type spatial_unpad \ 34 | --mm_use_im_start_end False \ 35 | --mm_use_im_patch_token False \ 36 | --bf16 True \ 37 | --output_dir checkpoints/${SAVE_PATH} \ 38 | --num_train_epochs 1 \ 39 | --per_device_train_batch_size ${BATCH_SIZE} \ 40 | --per_device_eval_batch_size 4 \ 41 | --gradient_accumulation_steps ${GRADIENT_ACCU_STEPS} \ 42 | --evaluation_strategy "no" \ 43 | --save_strategy "steps" \ 44 | --save_steps 500 \ 45 | --save_total_limit 2 \ 46 | --learning_rate ${BASE_LR} \ 47 | --weight_decay 0. 
\ 48 | --warmup_ratio 0.03 \ 49 | --lr_scheduler_type "cosine" \ 50 | --logging_steps 1 \ 51 | --tf32 True \ 52 | --model_max_length 4096 \ 53 | --gradient_checkpointing True \ 54 | --dataloader_num_workers 4 \ 55 | --lazy_preprocess True \ 56 | --run_name ${SAVE_PATH} \ 57 | --dataloader_drop_last True \ 58 | --report_to tensorboard' 59 | -------------------------------------------------------------------------------- /scripts/train/llama3/asr_finetune.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | 4 | # wandb login 5 | echo "DLC seemed world size(actually nodes): ${WORLD_SIZE}" 6 | NNODES=${WORLD_SIZE} 7 | NODE_RANK=${RANK} 8 | 9 | export CUDA_DEVICE_MAX_CONNECTIONS=1 10 | export GPUS_PER_NODE=8 11 | # export NNODES=2 12 | export BATCH_SIZE=8 13 | export GRADIENT_ACCU_STEPS=4 14 | export MASTER_PORT=29588 15 | export CPUS_PER_TASK=16 16 | export QUOTA=reserved 17 | 18 | export DATA_PATH=./datasets/openomni/json/openomni_asr.json 19 | export SAVE_PATH=openomni_asr_qwen_2 20 | export BASE_LR=2e-5 21 | export VIT_LR=2e-6 22 | 23 | DISTRIBUTED_ARGS=" 24 | --nproc_per_node $GPUS_PER_NODE \ 25 | --nnodes $NNODES \ 26 | --node_rank $NODE_RANK \ 27 | --master_addr $MASTER_ADDR \ 28 | --master_port $MASTER_PORT 29 | " 30 | 31 | # 定义重试的最大次数 32 | MAX_RETRIES=5 33 | 34 | # 每次重试之间的等待时间,单位为秒 35 | WAIT_TIME=200 36 | 37 | # 当前的重试次数 38 | retry_count=0 39 | 40 | # 要执行的命令 41 | command_to_run="torchrun --nproc_per_node $GPUS_PER_NODE openomni/train/train_mem.py \ 42 | --deepspeed ./scripts/zero2.json \ 43 | --model_name_or_path ./checkpoints/openomni_stage3_qwen_3/checkpoint-last \ 44 | --version llava_llama_3 \ 45 | --data_path ${DATA_PATH} \ 46 | --image_folder ./datasets \ 47 | --speech_folder ./datasets \ 48 | --vision_tower ./checkpoints/openai/clip-vit-large-patch14-336 \ 49 | --mm_projector_type mlp2x_gelu \ 50 | --freeze_backbone False \ 51 | --tune_speech_adapter True \ 52 | --speech_encoder ./checkpoints/openai-whisper/large-v3.pt \ 53 | --unfreeze_mm_vision_tower True \ 54 | --mm_vision_tower_lr ${VIT_LR} \ 55 | --image_aspect_ratio anyres \ 56 | --group_by_modality_length True \ 57 | --mm_vision_select_layer -2 \ 58 | --mm_vision_select_feature patch \ 59 | --mm_patch_merge_type spatial_unpad \ 60 | --mm_use_im_start_end False \ 61 | --mm_use_im_patch_token False \ 62 | --bf16 True \ 63 | --output_dir checkpoints/${SAVE_PATH} \ 64 | --num_train_epochs 1 \ 65 | --per_device_train_batch_size ${BATCH_SIZE} \ 66 | --per_device_eval_batch_size 4 \ 67 | --gradient_accumulation_steps ${GRADIENT_ACCU_STEPS} \ 68 | --evaluation_strategy "no" \ 69 | --save_strategy "steps" \ 70 | --save_steps 500 \ 71 | --save_total_limit 20 \ 72 | --learning_rate ${BASE_LR} \ 73 | --weight_decay 0. \ 74 | --warmup_ratio 0.03 \ 75 | --lr_scheduler_type "cosine" \ 76 | --logging_steps 1 \ 77 | --tf32 True \ 78 | --model_max_length 4096 \ 79 | --gradient_checkpointing True \ 80 | --dataloader_num_workers 8 \ 81 | --lazy_preprocess True \ 82 | --run_name ${SAVE_PATH} \ 83 | --dataloader_drop_last True \ 84 | --report_to tensorboard | tee train_${SAVE_PATH}.log" 85 | 86 | while (( retry_count < MAX_RETRIES )); do 87 | # 执行命令 88 | eval $command_to_run 89 | echo "命令失败,重试中..." 
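    # Note: the echo above reports "command failed, retrying..."; as written, this loop
    # always reruns the command up to MAX_RETRIES times because the exit status of
    # `eval $command_to_run` is never checked. A hedged sketch of the usual guard,
    # placed immediately after the eval, would be:
    #     if [[ $? -eq 0 ]]; then echo "command succeeded"; break; fi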
90 | ((retry_count++)) 91 | 92 | # 等待一段时间后再重试 93 | sleep $WAIT_TIME 94 | # fi 95 | done 96 | 97 | # 检查是否超过最大重试次数 98 | if (( retry_count == MAX_RETRIES )); then 99 | echo "命令在达到最大重试次数后仍然失败。" 100 | exit 1 101 | fi 102 | 103 | -------------------------------------------------------------------------------- /scripts/train/llama3/image2text_finetune.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | 4 | # wandb login 5 | echo "DLC seemed world size(actually nodes): ${WORLD_SIZE}" 6 | NNODES=${WORLD_SIZE} 7 | NODE_RANK=${RANK} 8 | 9 | export CUDA_DEVICE_MAX_CONNECTIONS=1 10 | export GPUS_PER_NODE=8 11 | # export NNODES=2 12 | export BATCH_SIZE=4 13 | export GRADIENT_ACCU_STEPS=4 14 | export MASTER_PORT=29588 15 | export CPUS_PER_TASK=16 16 | export QUOTA=reserved 17 | 18 | export DATA_PATH=./datasets/openomni/json/openomni_stage2-2.json 19 | export SAVE_PATH=openomni_stage2-2_qwen_2 20 | export BASE_LR=2e-5 21 | export VIT_LR=2e-6 22 | 23 | DISTRIBUTED_ARGS=" 24 | --nproc_per_node $GPUS_PER_NODE \ 25 | --nnodes $NNODES \ 26 | --node_rank $NODE_RANK \ 27 | --master_addr $MASTER_ADDR \ 28 | --master_port $MASTER_PORT 29 | " 30 | 31 | # 定义重试的最大次数 32 | MAX_RETRIES=5 33 | 34 | # 每次重试之间的等待时间,单位为秒 35 | WAIT_TIME=200 36 | 37 | # 当前的重试次数 38 | retry_count=0 39 | 40 | # 要执行的命令 41 | command_to_run="torchrun --nproc_per_node $GPUS_PER_NODE openomni/train/train_mem.py \ 42 | --deepspeed ./scripts/zero2.json \ 43 | --model_name_or_path ./datasets/checkpoints/qwen/Qwen2-7B-Instruct \ 44 | --version llava_llama_3 \ 45 | --data_path ${DATA_PATH} \ 46 | --image_folder ./datasets \ 47 | --speech_folder ./datasets \ 48 | --vision_tower ./checkpoints/openai/clip-vit-large-patch14-336 \ 49 | --mm_projector_type mlp2x_gelu \ 50 | --freeze_backbone False \ 51 | --tune_speech_adapter False \ 52 | --pretrain_mm_mlp_adapter ./checkpoints/openomni_stage2_qwen_2/mm_projector.bin \ 53 | --speech_encoder ./checkpoints/openai-whisper/large-v3.pt \ 54 | --unfreeze_mm_vision_tower True \ 55 | --mm_vision_tower_lr ${VIT_LR} \ 56 | --image_aspect_ratio anyres \ 57 | --group_by_modality_length True \ 58 | --mm_vision_select_layer -2 \ 59 | --mm_vision_select_feature patch \ 60 | --mm_patch_merge_type spatial_unpad \ 61 | --mm_use_im_start_end False \ 62 | --mm_use_im_patch_token False \ 63 | --bf16 True \ 64 | --output_dir ./checkpoints/${SAVE_PATH} \ 65 | --num_train_epochs 1 \ 66 | --per_device_train_batch_size ${BATCH_SIZE} \ 67 | --per_device_eval_batch_size 4 \ 68 | --gradient_accumulation_steps ${GRADIENT_ACCU_STEPS} \ 69 | --evaluation_strategy "no" \ 70 | --save_strategy "steps" \ 71 | --save_steps 100 \ 72 | --save_total_limit 20 \ 73 | --learning_rate ${BASE_LR} \ 74 | --weight_decay 0. \ 75 | --warmup_ratio 0.03 \ 76 | --lr_scheduler_type "cosine" \ 77 | --logging_steps 1 \ 78 | --tf32 True \ 79 | --model_max_length 4096 \ 80 | --gradient_checkpointing True \ 81 | --dataloader_num_workers 16 \ 82 | --lazy_preprocess True \ 83 | --run_name ${SAVE_PATH} \ 84 | --dataloader_drop_last True \ 85 | --report_to tensorboard | tee train_${SAVE_PATH}.log" 86 | 87 | while (( retry_count < MAX_RETRIES )); do 88 | # 执行命令 89 | eval $command_to_run 90 | 91 | # # 检查命令的退出状态 92 | # if [[ $? -eq 0 ]]; then 93 | # # 命令成功,退出循环 94 | # echo "命令成功执行。" 95 | # break 96 | # else 97 | # 命令失败,增加重试计数 98 | echo "命令失败,重试中..." 
99 | ((retry_count++)) 100 | 101 | # 等待一段时间后再重试 102 | sleep $WAIT_TIME 103 | # fi 104 | done 105 | 106 | # 检查是否超过最大重试次数 107 | if (( retry_count == MAX_RETRIES )); then 108 | echo "命令在达到最大重试次数后仍然失败。" 109 | exit 1 110 | fi 111 | 112 | -------------------------------------------------------------------------------- /scripts/train/llama3/image2text_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | 4 | # wandb login 5 | export CUDA_DEVICE_MAX_CONNECTIONS=1 6 | export GPUS_PER_NODE=8 7 | export NNODES=4 8 | export BATCH_SIZE=8 9 | export GRADIENT_ACCU_STEPS=4 10 | export MASTER_PORT=29504 11 | export CPUS_PER_TASK=16 12 | export QUOTA=reserved 13 | 14 | export DATA_PATH=./datasets/openomni/json/openomni_stage2-1.json 15 | export SAVE_PATH=openomni_stage2-1_qwen_2 16 | export BASE_LR=1e-3 17 | 18 | 19 | bash -c "torchrun --nproc_per_node $GPUS_PER_NODE openomni/train/train_mem.py \ 20 | --deepspeed ./scripts/zero2.json \ 21 | --model_name_or_path ./checkpoints/Meta-Llama-3.1-8B-Instruct \ 22 | --version plain \ 23 | --data_path ${DATA_PATH} \ 24 | --image_folder ./datasets/llava/llava_pretrain/images \ 25 | --vision_tower ./checkpoints/openai/clip-vit-large-patch14-336 \ 26 | --speech_encoder ./checkpoints/openai-whisper/large-v3.pt \ 27 | --mm_projector_type mlp2x_gelu \ 28 | --tune_mm_mlp_adapter True \ 29 | --unfreeze_mm_vision_tower False \ 30 | --image_aspect_ratio anyres \ 31 | --mm_vision_select_layer -2 \ 32 | --mm_vision_select_feature patch \ 33 | --mm_patch_merge_type spatial_unpad \ 34 | --mm_use_im_start_end False \ 35 | --mm_use_im_patch_token False \ 36 | --bf16 True \ 37 | --output_dir ./checkpoints/${SAVE_PATH} \ 38 | --num_train_epochs 1 \ 39 | --per_device_train_batch_size ${BATCH_SIZE} \ 40 | --per_device_eval_batch_size 4 \ 41 | --gradient_accumulation_steps ${GRADIENT_ACCU_STEPS} \ 42 | --evaluation_strategy "no" \ 43 | --save_strategy "steps" \ 44 | --save_steps 500 \ 45 | --save_total_limit 2 \ 46 | --learning_rate ${BASE_LR} \ 47 | --weight_decay 0. 
\ 48 | --warmup_ratio 0.03 \ 49 | --lr_scheduler_type "cosine" \ 50 | --logging_steps 1 \ 51 | --tf32 True \ 52 | --model_max_length 8096 \ 53 | --gradient_checkpointing True \ 54 | --dataloader_num_workers 4 \ 55 | --lazy_preprocess True \ 56 | --run_name ${SAVE_PATH} \ 57 | --dataloader_drop_last True \ 58 | --report_to tensorboard | tee train_${SAVE_PATH}.log" 59 | -------------------------------------------------------------------------------- /scripts/train/llama3/speech2text_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | 4 | # wandb login 5 | export CUDA_DEVICE_MAX_CONNECTIONS=1 6 | export GPUS_PER_NODE=8 7 | export NNODES=4 8 | export BATCH_SIZE=8 9 | export GRADIENT_ACCU_STEPS=4 10 | export MASTER_PORT=29504 11 | export CPUS_PER_TASK=16 12 | export QUOTA=reserved 13 | 14 | export DATA_PATH=./datasets/openomni/json/openomni_stage1-1.json 15 | export SAVE_PATH=openomni_stage1-1_qwen_2 16 | export BASE_LR=1e-4 17 | 18 | 19 | bash -c "torchrun --nproc_per_node $GPUS_PER_NODE openomni/train/train_mem.py \ 20 | --deepspeed ./scripts/zero2.json \ 21 | --model_name_or_path ./checkpoints/Meta-Llama-3.1-8B-Instruct \ 22 | --version plain \ 23 | --data_path ${DATA_PATH} \ 24 | --image_folder ./datasets/llava/llava_pretrain/images \ 25 | --vision_tower ./checkpoints/openai/clip-vit-large-patch14-336 \ 26 | --speech_encoder ./checkpoints/openai-whisper/large-v3.pt \ 27 | --mm_projector_type mlp2x_gelu \ 28 | --freeze_backbone True \ 29 | --tune_mm_mlp_adapter False \ 30 | --tune_speech_adapter True \ 31 | --freeze_mm_mlp_adapter True \ 32 | --unfreeze_mm_vision_tower False \ 33 | --image_aspect_ratio anyres \ 34 | --mm_vision_select_layer -2 \ 35 | --mm_vision_select_feature patch \ 36 | --mm_patch_merge_type spatial_unpad \ 37 | --mm_use_im_start_end False \ 38 | --mm_use_im_patch_token False \ 39 | --bf16 True \ 40 | --group_by_modality_length True \ 41 | --output_dir ./checkpoints/${SAVE_PATH} \ 42 | --num_train_epochs 1 \ 43 | --per_device_train_batch_size ${BATCH_SIZE} \ 44 | --per_device_eval_batch_size 4 \ 45 | --gradient_accumulation_steps ${GRADIENT_ACCU_STEPS} \ 46 | --evaluation_strategy "no" \ 47 | --save_strategy "steps" \ 48 | --save_steps 500 \ 49 | --save_total_limit 2 \ 50 | --learning_rate ${BASE_LR} \ 51 | --weight_decay 0. 
\ 52 | --warmup_ratio 0.03 \ 53 | --lr_scheduler_type "cosine" \ 54 | --logging_steps 1 \ 55 | --tf32 True \ 56 | --model_max_length 8096 \ 57 | --gradient_checkpointing True \ 58 | --dataloader_num_workers 8 \ 59 | --lazy_preprocess True \ 60 | --run_name ${SAVE_PATH} \ 61 | --dataloader_drop_last True \ 62 | --report_to tensorboard | tee train_${SAVE_PATH}.log" 63 | -------------------------------------------------------------------------------- /scripts/train/llama3/text2speech_dpo.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | 4 | # wandb login 5 | echo "DLC seemed world size(actually nodes): ${WORLD_SIZE}" 6 | NNODES=${WORLD_SIZE} 7 | NODE_RANK=${RANK} 8 | 9 | export CUDA_DEVICE_MAX_CONNECTIONS=1 10 | export GPUS_PER_NODE=8 11 | # export NNODES=2 12 | export BATCH_SIZE=1 13 | export GRADIENT_ACCU_STEPS=4 14 | export MASTER_PORT=29588 15 | export CPUS_PER_TASK=16 16 | export QUOTA=reserved 17 | 18 | export DATA_PATH=./datasets/openomni/json/openomni_stage3-2.json 19 | export SAVE_PATH=openomni_stage3-2_qwen_2 20 | export BASE_LR=2e-5 21 | export VIT_LR=2e-6 22 | 23 | DISTRIBUTED_ARGS=" 24 | --nproc_per_node $GPUS_PER_NODE \ 25 | --nnodes $NNODES \ 26 | --node_rank $NODE_RANK \ 27 | --master_addr $MASTER_ADDR \ 28 | --master_port $MASTER_PORT 29 | " 30 | 31 | # 定义重试的最大次数 32 | MAX_RETRIES=3 33 | 34 | # 每次重试之间的等待时间,单位为秒 35 | WAIT_TIME=20 36 | 37 | # 当前的重试次数 38 | retry_count=0 39 | 40 | command_to_run="torchrun --nproc_per_node $GPUS_PER_NODE openomni/train/train_mem.py \ 41 | --deepspeed ./scripts/zero2.json \ 42 | --model_name_or_path ./checkpoints/openomni_stage3_qwen_3/checkpoint-last \ 43 | --version llava_llama_3 \ 44 | --data_path ${DATA_PATH} \ 45 | --image_folder ./datasets \ 46 | --speech_folder ./datasets \ 47 | --vision_tower ./datasets/checkpoints/openai/clip-vit-large-patch14-336 \ 48 | --mm_projector_type mlp2x_gelu \ 49 | --freeze_backbone True \ 50 | --tune_mm_mlp_adapter False \ 51 | --freeze_mm_mlp_adapter True \ 52 | --tune_speech_generator_only True \ 53 | --tune_speech_generator_dpo True \ 54 | --pretrain_mm_mlp_adapter ./checkpoints/openomni_stage2_qwen_2/mm_projector.bin \ 55 | --speech_encoder ./checkpoints/openai-whisper/large-v3.pt \ 56 | --speech_generator ./checkpoints/speech_generator/generator_16k.pt \ 57 | --unfreeze_mm_vision_tower False \ 58 | --mm_vision_tower_lr ${VIT_LR} \ 59 | --speech_generator_lr ${VIT_LR} \ 60 | --mm_projector_lr ${VIT_LR} \ 61 | --image_aspect_ratio anyres \ 62 | --group_by_modality_length True \ 63 | --mm_vision_select_layer -2 \ 64 | --mm_vision_select_feature patch \ 65 | --mm_patch_merge_type spatial_unpad \ 66 | --mm_use_im_start_end False \ 67 | --mm_use_im_patch_token False \ 68 | --bf16 True \ 69 | --output_dir ./checkpoints/${SAVE_PATH} \ 70 | --num_train_epochs 3 \ 71 | --per_device_train_batch_size ${BATCH_SIZE} \ 72 | --per_device_eval_batch_size 4 \ 73 | --gradient_accumulation_steps ${GRADIENT_ACCU_STEPS} \ 74 | --evaluation_strategy "no" \ 75 | --save_strategy "steps" \ 76 | --save_steps 1000 \ 77 | --save_total_limit 20 \ 78 | --learning_rate ${BASE_LR} \ 79 | --weight_decay 0. 
\ 80 | --warmup_ratio 0.03 \ 81 | --lr_scheduler_type "cosine" \ 82 | --logging_steps 1 \ 83 | --tf32 True \ 84 | --model_max_length 8196 \ 85 | --gradient_checkpointing True \ 86 | --dataloader_num_workers 8 \ 87 | --lazy_preprocess True \ 88 | --speech_generator_type "ar" \ 89 | --run_name ${SAVE_PATH} \ 90 | --dataloader_drop_last True \ 91 | --report_to tensorboard | tee train_${SAVE_PATH}.log" 92 | 93 | eval $command_to_run 94 | 95 | -------------------------------------------------------------------------------- /scripts/train/llama3/text2speech_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | 4 | # wandb login 5 | echo "DLC seemed world size(actually nodes): ${WORLD_SIZE}" 6 | NNODES=${WORLD_SIZE} 7 | NODE_RANK=${RANK} 8 | 9 | export CUDA_DEVICE_MAX_CONNECTIONS=1 10 | export GPUS_PER_NODE=8 11 | # export NNODES=2 12 | export BATCH_SIZE=1 13 | export GRADIENT_ACCU_STEPS=4 14 | export MASTER_PORT=29588 15 | export CPUS_PER_TASK=16 16 | export QUOTA=reserved 17 | 18 | export DATA_PATH=./datasets/openomni/json/openomni_stage3-1.json 19 | export SAVE_PATH=openomni_stage3-1_qwen_2 20 | export BASE_LR=2e-5 21 | export VIT_LR=2e-6 22 | 23 | DISTRIBUTED_ARGS=" 24 | --nproc_per_node $GPUS_PER_NODE \ 25 | --nnodes $NNODES \ 26 | --node_rank $NODE_RANK \ 27 | --master_addr $MASTER_ADDR \ 28 | --master_port $MASTER_PORT 29 | " 30 | 31 | # 定义重试的最大次数 32 | MAX_RETRIES=3 33 | 34 | # 每次重试之间的等待时间,单位为秒 35 | WAIT_TIME=20 36 | 37 | # 当前的重试次数 38 | retry_count=0 39 | 40 | command_to_run="torchrun --nproc_per_node $GPUS_PER_NODE openomni/train/train_mem.py \ 41 | --deepspeed ./scripts/zero2.json \ 42 | --model_name_or_path ./checkpoints/openomni_stage3_qwen_3/checkpoint-20900 \ 43 | --version llava_llama_3 \ 44 | --data_path ${DATA_PATH} \ 45 | --image_folder ./datasets \ 46 | --speech_folder ./datasets \ 47 | --vision_tower ./checkpoints/openai/clip-vit-large-patch14-336 \ 48 | --mm_projector_type mlp2x_gelu \ 49 | --freeze_backbone True \ 50 | --tune_mm_mlp_adapter False \ 51 | --freeze_mm_mlp_adapter True \ 52 | --tune_speech_generator_only True \ 53 | --pretrain_mm_mlp_adapter ./checkpoints/openomni_stage2_qwen_2/mm_projector.bin \ 54 | --speech_encoder ./checkpoints/openai-whisper/large-v3.pt \ 55 | --speech_generator ./checkpoints/speech_generator/generator_16K.pt \ 56 | --unfreeze_mm_vision_tower False \ 57 | --mm_vision_tower_lr ${VIT_LR} \ 58 | --speech_generator_lr ${VIT_LR} \ 59 | --mm_projector_lr ${VIT_LR} \ 60 | --image_aspect_ratio anyres \ 61 | --group_by_modality_length True \ 62 | --mm_vision_select_layer -2 \ 63 | --mm_vision_select_feature patch \ 64 | --mm_patch_merge_type spatial_unpad \ 65 | --mm_use_im_start_end False \ 66 | --mm_use_im_patch_token False \ 67 | --bf16 True \ 68 | --output_dir checkpoints/${SAVE_PATH} \ 69 | --num_train_epochs 1 \ 70 | --per_device_train_batch_size ${BATCH_SIZE} \ 71 | --per_device_eval_batch_size 4 \ 72 | --gradient_accumulation_steps ${GRADIENT_ACCU_STEPS} \ 73 | --evaluation_strategy "no" \ 74 | --save_strategy "steps" \ 75 | --save_steps 1000 \ 76 | --save_total_limit 20 \ 77 | --learning_rate ${BASE_LR} \ 78 | --weight_decay 0. 
\ 79 | --warmup_ratio 0.03 \ 80 | --lr_scheduler_type "cosine" \ 81 | --logging_steps 1 \ 82 | --tf32 True \ 83 | --model_max_length 8196 \ 84 | --gradient_checkpointing True \ 85 | --dataloader_num_workers 8 \ 86 | --lazy_preprocess True \ 87 | --speech_generator_type "ar" \ 88 | --run_name ${SAVE_PATH} \ 89 | --dataloader_drop_last True \ 90 | --report_to tensorboard | tee train_${SAVE_PATH}.log" 91 | 92 | eval $command_to_run 93 | -------------------------------------------------------------------------------- /scripts/train/llama3/text2speech_pretrain_ctc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | 4 | # wandb login 5 | echo "DLC seemed world size(actually nodes): ${WORLD_SIZE}" 6 | NNODES=${WORLD_SIZE} 7 | NODE_RANK=${RANK} 8 | 9 | export CUDA_DEVICE_MAX_CONNECTIONS=1 10 | export GPUS_PER_NODE=8 11 | # export NNODES=2 12 | export BATCH_SIZE=1 13 | export GRADIENT_ACCU_STEPS=4 14 | export MASTER_PORT=29588 15 | export CPUS_PER_TASK=16 16 | export QUOTA=reserved 17 | 18 | export DATA_PATH=./datasets/openomni/json/openomni_stage3-1.json 19 | export SAVE_PATH=openomni_stage3-1_qwen_2_ctc 20 | export BASE_LR=2e-4 21 | export VIT_LR=2e-4 22 | 23 | DISTRIBUTED_ARGS=" 24 | --nproc_per_node $GPUS_PER_NODE \ 25 | --nnodes $NNODES \ 26 | --node_rank $NODE_RANK \ 27 | --master_addr $MASTER_ADDR \ 28 | --master_port $MASTER_PORT 29 | " 30 | 31 | # 定义重试的最大次数 32 | MAX_RETRIES=3 33 | 34 | # 每次重试之间的等待时间,单位为秒 35 | WAIT_TIME=20 36 | 37 | # 当前的重试次数 38 | retry_count=0 39 | 40 | command_to_run="torchrun --nproc_per_node $GPUS_PER_NODE openomni/train/train_mem.py \ 41 | --deepspeed ./scripts/zero2.json \ 42 | --model_name_or_path ./checkpoints/openomni_stage3_qwen_3/checkpoint-last \ 43 | --version llava_qwen_2 \ 44 | --data_path ${DATA_PATH} \ 45 | --image_folder ./datasets \ 46 | --speech_folder ./datasets \ 47 | --vision_tower ./checkpoints/openai/clip-vit-large-patch14-336 \ 48 | --mm_projector_type mlp2x_gelu \ 49 | --freeze_backbone True \ 50 | --tune_mm_mlp_adapter False \ 51 | --freeze_mm_mlp_adapter True \ 52 | --tune_speech_generator_only True \ 53 | --pretrain_mm_mlp_adapter ./checkpoints/openomni_stage2_qwen_2/mm_projector.bin \ 54 | --speech_encoder ./checkpoints/openai-whipser/large-v3.pt \ 55 | --speech_generator ./checkpoints/speech_generator/generator_16k.pt \ 56 | --unfreeze_mm_vision_tower False \ 57 | --mm_vision_tower_lr ${VIT_LR} \ 58 | --speech_generator_lr ${VIT_LR} \ 59 | --mm_projector_lr ${VIT_LR} \ 60 | --image_aspect_ratio anyres \ 61 | --group_by_modality_length True \ 62 | --mm_vision_select_layer -2 \ 63 | --mm_vision_select_feature patch \ 64 | --mm_patch_merge_type spatial_unpad \ 65 | --mm_use_im_start_end False \ 66 | --mm_use_im_patch_token False \ 67 | --bf16 True \ 68 | --output_dir ./checkpoints/${SAVE_PATH} \ 69 | --num_train_epochs 3 \ 70 | --per_device_train_batch_size ${BATCH_SIZE} \ 71 | --per_device_eval_batch_size 4 \ 72 | --gradient_accumulation_steps ${GRADIENT_ACCU_STEPS} \ 73 | --evaluation_strategy "no" \ 74 | --save_strategy "steps" \ 75 | --save_steps 1000 \ 76 | --save_total_limit 20 \ 77 | --learning_rate ${BASE_LR} \ 78 | --weight_decay 0. 
\ 79 | --warmup_ratio 0.03 \ 80 | --lr_scheduler_type "cosine" \ 81 | --logging_steps 1 \ 82 | --tf32 True \ 83 | --model_max_length 8196 \ 84 | --gradient_checkpointing True \ 85 | --dataloader_num_workers 8 \ 86 | --lazy_preprocess True \ 87 | --run_name ${SAVE_PATH} \ 88 | --dataloader_drop_last True \ 89 | --speech_generator_type "ctc" \ 90 | --unit_vocab_size 6561 \ 91 | --report_to tensorboard | tee train_${SAVE_PATH}.log" 92 | 93 | eval $command_to_run 94 | 95 | 96 | -------------------------------------------------------------------------------- /scripts/train/qwen2/asr_finetune.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | 4 | # wandb login 5 | echo "DLC seemed world size(actually nodes): ${WORLD_SIZE}" 6 | NNODES=${WORLD_SIZE} 7 | NODE_RANK=${RANK} 8 | 9 | export CUDA_DEVICE_MAX_CONNECTIONS=1 10 | export GPUS_PER_NODE=8 11 | # export NNODES=2 12 | export BATCH_SIZE=8 13 | export GRADIENT_ACCU_STEPS=4 14 | export MASTER_PORT=29588 15 | export CPUS_PER_TASK=16 16 | export QUOTA=reserved 17 | 18 | export DATA_PATH=./datasets/openomni/json/openomni_stage1-1.json 19 | export SAVE_PATH=openomni_asr_qwen_2 20 | export BASE_LR=2e-5 21 | export VIT_LR=2e-6 22 | 23 | DISTRIBUTED_ARGS=" 24 | --nproc_per_node $GPUS_PER_NODE \ 25 | --nnodes $NNODES \ 26 | --node_rank $NODE_RANK \ 27 | --master_addr $MASTER_ADDR \ 28 | --master_port $MASTER_PORT 29 | " 30 | 31 | # 定义重试的最大次数 32 | MAX_RETRIES=5 33 | 34 | # 每次重试之间的等待时间,单位为秒 35 | WAIT_TIME=200 36 | 37 | # 当前的重试次数 38 | retry_count=0 39 | 40 | # 要执行的命令 41 | command_to_run="torchrun --nproc_per_node $GPUS_PER_NODE openomni/train/train_mem.py \ 42 | --deepspeed ./scripts/zero2.json \ 43 | --model_name_or_path ./checkpoints/openomni_stage3_qwen_3/checkpoint-last \ 44 | --version llava_qwen_2 \ 45 | --data_path ${DATA_PATH} \ 46 | --image_folder ./datasets \ 47 | --speech_folder ./datasets \ 48 | --vision_tower ./checkpoints/openai/clip-vit-large-patch14-336 \ 49 | --mm_projector_type mlp2x_gelu \ 50 | --freeze_backbone False \ 51 | --tune_speech_adapter True \ 52 | --speech_encoder ./checkpoints/openai-whisper/large-v3.pt \ 53 | --unfreeze_mm_vision_tower True \ 54 | --mm_vision_tower_lr ${VIT_LR} \ 55 | --image_aspect_ratio anyres \ 56 | --group_by_modality_length True \ 57 | --mm_vision_select_layer -2 \ 58 | --mm_vision_select_feature patch \ 59 | --mm_patch_merge_type spatial_unpad \ 60 | --mm_use_im_start_end False \ 61 | --mm_use_im_patch_token False \ 62 | --bf16 True \ 63 | --output_dir ./checkpoints/${SAVE_PATH} \ 64 | --num_train_epochs 1 \ 65 | --per_device_train_batch_size ${BATCH_SIZE} \ 66 | --per_device_eval_batch_size 4 \ 67 | --gradient_accumulation_steps ${GRADIENT_ACCU_STEPS} \ 68 | --evaluation_strategy "no" \ 69 | --save_strategy "steps" \ 70 | --save_steps 500 \ 71 | --save_total_limit 20 \ 72 | --learning_rate ${BASE_LR} \ 73 | --weight_decay 0. \ 74 | --warmup_ratio 0.03 \ 75 | --lr_scheduler_type "cosine" \ 76 | --logging_steps 1 \ 77 | --tf32 True \ 78 | --model_max_length 4096 \ 79 | --gradient_checkpointing True \ 80 | --dataloader_num_workers 8 \ 81 | --lazy_preprocess True \ 82 | --run_name ${SAVE_PATH} \ 83 | --dataloader_drop_last True \ 84 | --report_to tensorboard | tee train_${SAVE_PATH}.log" 85 | 86 | while (( retry_count < MAX_RETRIES )); do 87 | # 执行命令 88 | eval $command_to_run 89 | echo "命令失败,重试中..." 
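    # Note: as in the llama3 variant, this retry loop reruns the command unconditionally
    # up to MAX_RETRIES times; the exit-status check is omitted. A hedged sketch of the
    # usual guard (right after `eval $command_to_run`):
    #     if [[ $? -eq 0 ]]; then break; fi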
90 | ((retry_count++)) 91 | 92 | # 等待一段时间后再重试 93 | sleep $WAIT_TIME 94 | # fi 95 | done 96 | 97 | # 检查是否超过最大重试次数 98 | if (( retry_count == MAX_RETRIES )); then 99 | echo "命令在达到最大重试次数后仍然失败。" 100 | exit 1 101 | fi 102 | 103 | -------------------------------------------------------------------------------- /scripts/train/qwen2/image2text_finetune.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | 4 | # wandb login 5 | echo "DLC seemed world size(actually nodes): ${WORLD_SIZE}" 6 | NNODES=${WORLD_SIZE} 7 | NODE_RANK=${RANK} 8 | 9 | export CUDA_DEVICE_MAX_CONNECTIONS=1 10 | export GPUS_PER_NODE=8 11 | # export NNODES=2 12 | export BATCH_SIZE=4 13 | export GRADIENT_ACCU_STEPS=4 14 | export MASTER_PORT=29588 15 | export CPUS_PER_TASK=16 16 | export QUOTA=reserved 17 | 18 | export DATA_PATH=./datasets/openomni/json/openomni_stage2-2.json 19 | export SAVE_PATH=openomni_stage2-2_qwen_2 20 | export BASE_LR=2e-5 21 | export VIT_LR=2e-6 22 | 23 | DISTRIBUTED_ARGS=" 24 | --nproc_per_node $GPUS_PER_NODE \ 25 | --nnodes $NNODES \ 26 | --node_rank $NODE_RANK \ 27 | --master_addr $MASTER_ADDR \ 28 | --master_port $MASTER_PORT 29 | " 30 | 31 | # 定义重试的最大次数 32 | MAX_RETRIES=5 33 | 34 | # 每次重试之间的等待时间,单位为秒 35 | WAIT_TIME=200 36 | 37 | # 当前的重试次数 38 | retry_count=0 39 | 40 | # 要执行的命令 41 | command_to_run="torchrun --nproc_per_node $GPUS_PER_NODE openomni/train/train_mem.py \ 42 | --deepspeed ./scripts/zero2.json \ 43 | --model_name_or_path ./checkpoints/qwen/Qwen2-7B-Instruct \ 44 | --version llava_qwen_2 \ 45 | --data_path ${DATA_PATH} \ 46 | --image_folder ./datasets \ 47 | --speech_folder ./datasets \ 48 | --vision_tower ./datasets/checkpoints/openai/clip-vit-large-patch14-336 \ 49 | --mm_projector_type mlp2x_gelu \ 50 | --freeze_backbone False \ 51 | --tune_speech_adapter False \ 52 | --pretrain_mm_mlp_adapter ./checkpoints/openomni_stage2_qwen_2/mm_projector.bin \ 53 | --speech_encoder ./checkpoints/openai-whisper/large-v3.pt \ 54 | --unfreeze_mm_vision_tower True \ 55 | --mm_vision_tower_lr ${VIT_LR} \ 56 | --image_aspect_ratio anyres \ 57 | --group_by_modality_length True \ 58 | --mm_vision_select_layer -2 \ 59 | --mm_vision_select_feature patch \ 60 | --mm_patch_merge_type spatial_unpad \ 61 | --mm_use_im_start_end False \ 62 | --mm_use_im_patch_token False \ 63 | --bf16 True \ 64 | --output_dir ./checkpoints/${SAVE_PATH} \ 65 | --num_train_epochs 1 \ 66 | --per_device_train_batch_size ${BATCH_SIZE} \ 67 | --per_device_eval_batch_size 4 \ 68 | --gradient_accumulation_steps ${GRADIENT_ACCU_STEPS} \ 69 | --evaluation_strategy "no" \ 70 | --save_strategy "steps" \ 71 | --save_steps 100 \ 72 | --save_total_limit 20 \ 73 | --learning_rate ${BASE_LR} \ 74 | --weight_decay 0. \ 75 | --warmup_ratio 0.03 \ 76 | --lr_scheduler_type "cosine" \ 77 | --logging_steps 1 \ 78 | --tf32 True \ 79 | --model_max_length 4096 \ 80 | --gradient_checkpointing True \ 81 | --dataloader_num_workers 16 \ 82 | --lazy_preprocess True \ 83 | --run_name ${SAVE_PATH} \ 84 | --dataloader_drop_last True \ 85 | --report_to tensorboard | tee train_${SAVE_PATH}.log" 86 | 87 | while (( retry_count < MAX_RETRIES )); do 88 | # 执行命令 89 | eval $command_to_run 90 | 91 | # # 检查命令的退出状态 92 | # if [[ $? -eq 0 ]]; then 93 | # # 命令成功,退出循环 94 | # echo "命令成功执行。" 95 | # break 96 | # else 97 | # 命令失败,增加重试计数 98 | echo "命令失败,重试中..." 
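# Note: the exit-status check above ("if [[ $? -eq 0 ]] ... break ... else") is commented out,
# and its closing "fi" below is commented as well, so as written this loop re-runs the command
# up to MAX_RETRIES times even when it succeeds.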
99 | ((retry_count++)) 100 | 101 | # 等待一段时间后再重试 102 | sleep $WAIT_TIME 103 | # fi 104 | done 105 | 106 | # 检查是否超过最大重试次数 107 | if (( retry_count == MAX_RETRIES )); then 108 | echo "命令在达到最大重试次数后仍然失败。" 109 | exit 1 110 | fi 111 | 112 | -------------------------------------------------------------------------------- /scripts/train/qwen2/image2text_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | 4 | # wandb login 5 | export CUDA_DEVICE_MAX_CONNECTIONS=1 6 | export GPUS_PER_NODE=8 7 | export NNODES=4 8 | export BATCH_SIZE=4 9 | export GRADIENT_ACCU_STEPS=8 10 | export MASTER_PORT=29504 11 | export CPUS_PER_TASK=16 12 | export QUOTA=reserved 13 | 14 | export DATA_PATH=./datasets/openomni/json/openomni_stage2-1.json 15 | export SAVE_PATH=openomni_stage2-1_qwen_2 16 | export BASE_LR=1e-3 17 | 18 | 19 | bash -c "torchrun --nproc_per_node $GPUS_PER_NODE openomni/train/train_mem.py \ 20 | --deepspeed ./scripts/zero2.json \ 21 | --model_name_or_path ./checkpoints/qwen/Qwen2-7B-Instruct \ 22 | --version plain \ 23 | --data_path ${DATA_PATH} \ 24 | --image_folder ./datasets/llava/llava_pretrain/images \ 25 | --vision_tower ./checkpoints/openai/clip-vit-large-patch14-336 \ 26 | --speech_encoder ./checkpoints/openai-whisper/large-v3.pt \ 27 | --mm_projector_type mlp2x_gelu \ 28 | --tune_mm_mlp_adapter True \ 29 | --unfreeze_mm_vision_tower False \ 30 | --image_aspect_ratio anyres \ 31 | --mm_vision_select_layer -2 \ 32 | --mm_vision_select_feature patch \ 33 | --mm_patch_merge_type spatial_unpad \ 34 | --mm_use_im_start_end False \ 35 | --mm_use_im_patch_token False \ 36 | --bf16 True \ 37 | --output_dir ./checkpoints/${SAVE_PATH} \ 38 | --num_train_epochs 1 \ 39 | --per_device_train_batch_size ${BATCH_SIZE} \ 40 | --per_device_eval_batch_size 4 \ 41 | --gradient_accumulation_steps ${GRADIENT_ACCU_STEPS} \ 42 | --evaluation_strategy "no" \ 43 | --save_strategy "steps" \ 44 | --save_steps 500 \ 45 | --save_total_limit 2 \ 46 | --learning_rate ${BASE_LR} \ 47 | --weight_decay 0. 
\ 48 | --warmup_ratio 0.03 \ 49 | --lr_scheduler_type "cosine" \ 50 | --logging_steps 1 \ 51 | --tf32 True \ 52 | --model_max_length 8096 \ 53 | --gradient_checkpointing True \ 54 | --dataloader_num_workers 4 \ 55 | --lazy_preprocess True \ 56 | --run_name ${SAVE_PATH} \ 57 | --dataloader_drop_last True \ 58 | --report_to tensorboard | tee train_${SAVE_PATH}.log" 59 | -------------------------------------------------------------------------------- /scripts/train/qwen2/speech2text_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | 4 | # wandb login 5 | export CUDA_DEVICE_MAX_CONNECTIONS=1 6 | export GPUS_PER_NODE=8 7 | export NNODES=4 8 | export BATCH_SIZE=8 9 | export GRADIENT_ACCU_STEPS=4 10 | export MASTER_PORT=29504 11 | export CPUS_PER_TASK=16 12 | export QUOTA=reserved 13 | 14 | export DATA_PATH=./datasets/openomni/json/openomni_stage1-1.json 15 | export SAVE_PATH=openomni_stage1-1_qwen_2 16 | export BASE_LR=1e-4 17 | 18 | 19 | bash -c "torchrun --nproc_per_node $GPUS_PER_NODE openomni/train/train_mem.py \ 20 | --deepspeed ./scripts/zero2.json \ 21 | --model_name_or_path ./checkpoints/qwen/Qwen2-7B-Instruct \ 22 | --version plain \ 23 | --data_path ${DATA_PATH} \ 24 | --image_folder ./datasets/llava/llava_pretrain/images \ 25 | --vision_tower ./checkpoints/openai/clip-vit-large-patch14-336 \ 26 | --speech_encoder ./checkpoints/openai-whisper/large-v3.pt \ 27 | --mm_projector_type mlp2x_gelu \ 28 | --freeze_backbone True \ 29 | --tune_mm_mlp_adapter False \ 30 | --tune_speech_adapter True \ 31 | --freeze_mm_mlp_adapter True \ 32 | --unfreeze_mm_vision_tower False \ 33 | --image_aspect_ratio anyres \ 34 | --mm_vision_select_layer -2 \ 35 | --mm_vision_select_feature patch \ 36 | --mm_patch_merge_type spatial_unpad \ 37 | --mm_use_im_start_end False \ 38 | --mm_use_im_patch_token False \ 39 | --bf16 True \ 40 | --group_by_modality_length True \ 41 | --output_dir checkpoints/${SAVE_PATH} \ 42 | --num_train_epochs 1 \ 43 | --per_device_train_batch_size ${BATCH_SIZE} \ 44 | --per_device_eval_batch_size 4 \ 45 | --gradient_accumulation_steps ${GRADIENT_ACCU_STEPS} \ 46 | --evaluation_strategy "no" \ 47 | --save_strategy "steps" \ 48 | --save_steps 500 \ 49 | --save_total_limit 2 \ 50 | --learning_rate ${BASE_LR} \ 51 | --weight_decay 0. 
\ 52 | --warmup_ratio 0.03 \ 53 | --lr_scheduler_type "cosine" \ 54 | --logging_steps 1 \ 55 | --tf32 True \ 56 | --model_max_length 8096 \ 57 | --gradient_checkpointing True \ 58 | --dataloader_num_workers 8 \ 59 | --lazy_preprocess True \ 60 | --run_name ${SAVE_PATH} \ 61 | --dataloader_drop_last True \ 62 | --report_to tensorboard | tee train_${SAVE_PATH}.log" 63 | -------------------------------------------------------------------------------- /scripts/train/qwen2/text2speech_dpo.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | 4 | # wandb login 5 | echo "DLC seemed world size(actually nodes): ${WORLD_SIZE}" 6 | NNODES=${WORLD_SIZE} 7 | NODE_RANK=${RANK} 8 | 9 | export CUDA_DEVICE_MAX_CONNECTIONS=1 10 | export GPUS_PER_NODE=8 11 | # export NNODES=2 12 | export BATCH_SIZE=1 13 | export GRADIENT_ACCU_STEPS=4 14 | export MASTER_PORT=29588 15 | export CPUS_PER_TASK=16 16 | export QUOTA=reserved 17 | 18 | export DATA_PATH=./datasets/openomni/json/openomni_stage3-2.json 19 | export SAVE_PATH=openomni_stage3-2_qwen_2 20 | export BASE_LR=2e-5 21 | export VIT_LR=2e-6 22 | 23 | DISTRIBUTED_ARGS=" 24 | --nproc_per_node $GPUS_PER_NODE \ 25 | --nnodes $NNODES \ 26 | --node_rank $NODE_RANK \ 27 | --master_addr $MASTER_ADDR \ 28 | --master_port $MASTER_PORT 29 | " 30 | 31 | # 定义重试的最大次数 32 | MAX_RETRIES=3 33 | 34 | # 每次重试之间的等待时间,单位为秒 35 | WAIT_TIME=20 36 | 37 | # 当前的重试次数 38 | retry_count=0 39 | 40 | command_to_run="torchrun --nproc_per_node $GPUS_PER_NODE openomni/train/train_mem.py \ 41 | --deepspeed ./scripts/zero2.json \ 42 | --model_name_or_path ./checkpoints/openomni_stage3_qwen_3/checkpoint-last \ 43 | --version llava_qwen_2 \ 44 | --data_path ${DATA_PATH} \ 45 | --image_folder ./datasets \ 46 | --speech_folder ./datasets \ 47 | --vision_tower ./checkpoints/openai/clip-vit-large-patch14-336 \ 48 | --mm_projector_type mlp2x_gelu \ 49 | --freeze_backbone True \ 50 | --tune_mm_mlp_adapter False \ 51 | --freeze_mm_mlp_adapter True \ 52 | --tune_speech_generator_only True \ 53 | --tune_speech_generator_dpo True \ 54 | --pretrain_mm_mlp_adapter ./checkpoints/openomni_stage2_qwen_2/mm_projector.bin \ 55 | --speech_encoder ./checkpoints/openai-whisper/large-v3.pt \ 56 | --speech_generator ./checkpoints/speech_generator/generator_16k.pt \ 57 | --unfreeze_mm_vision_tower False \ 58 | --mm_vision_tower_lr ${VIT_LR} \ 59 | --speech_generator_lr ${VIT_LR} \ 60 | --mm_projector_lr ${VIT_LR} \ 61 | --image_aspect_ratio anyres \ 62 | --group_by_modality_length True \ 63 | --mm_vision_select_layer -2 \ 64 | --mm_vision_select_feature patch \ 65 | --mm_patch_merge_type spatial_unpad \ 66 | --mm_use_im_start_end False \ 67 | --mm_use_im_patch_token False \ 68 | --bf16 True \ 69 | --output_dir ./checkpoints/${SAVE_PATH} \ 70 | --num_train_epochs 3 \ 71 | --per_device_train_batch_size ${BATCH_SIZE} \ 72 | --per_device_eval_batch_size 4 \ 73 | --gradient_accumulation_steps ${GRADIENT_ACCU_STEPS} \ 74 | --evaluation_strategy "no" \ 75 | --save_strategy "steps" \ 76 | --save_steps 1000 \ 77 | --save_total_limit 20 \ 78 | --learning_rate ${BASE_LR} \ 79 | --weight_decay 0. 
\ 80 | --warmup_ratio 0.03 \ 81 | --lr_scheduler_type "cosine" \ 82 | --logging_steps 1 \ 83 | --tf32 True \ 84 | --model_max_length 8196 \ 85 | --gradient_checkpointing True \ 86 | --dataloader_num_workers 8 \ 87 | --lazy_preprocess True \ 88 | --speech_generator_type "ar" \ 89 | --run_name ${SAVE_PATH} \ 90 | --dataloader_drop_last True \ 91 | --report_to tensorboard | tee train_${SAVE_PATH}.log" 92 | 93 | eval $command_to_run 94 | 95 | -------------------------------------------------------------------------------- /scripts/train/qwen2/text2speech_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | 4 | # wandb login 5 | echo "DLC seemed world size(actually nodes): ${WORLD_SIZE}" 6 | NNODES=${WORLD_SIZE} 7 | NODE_RANK=${RANK} 8 | 9 | export CUDA_DEVICE_MAX_CONNECTIONS=1 10 | export GPUS_PER_NODE=8 11 | # export NNODES=2 12 | export BATCH_SIZE=1 13 | export GRADIENT_ACCU_STEPS=4 14 | export MASTER_PORT=29588 15 | export CPUS_PER_TASK=16 16 | export QUOTA=reserved 17 | 18 | export DATA_PATH=./datasets/openomni/json/openomni_stage3-1.json 19 | export SAVE_PATH=openomni_stage3-1_qwen_2 20 | export BASE_LR=2e-5 21 | export VIT_LR=2e-6 22 | 23 | DISTRIBUTED_ARGS=" 24 | --nproc_per_node $GPUS_PER_NODE \ 25 | --nnodes $NNODES \ 26 | --node_rank $NODE_RANK \ 27 | --master_addr $MASTER_ADDR \ 28 | --master_port $MASTER_PORT 29 | " 30 | 31 | # 定义重试的最大次数 32 | MAX_RETRIES=3 33 | 34 | # 每次重试之间的等待时间,单位为秒 35 | WAIT_TIME=20 36 | 37 | # 当前的重试次数 38 | retry_count=0 39 | 40 | command_to_run="torchrun --nproc_per_node $GPUS_PER_NODE openomni/train/train_mem.py \ 41 | --deepspeed ./scripts/zero2.json \ 42 | --model_name_or_path ./checkpoints/openomni_stage3_qwen_3/checkpoint-last \ 43 | --version llava_qwen_2 \ 44 | --data_path ${DATA_PATH} \ 45 | --image_folder ./datasets \ 46 | --speech_folder ./datasets \ 47 | --vision_tower ./checkpoints/openai/clip-vit-large-patch14-336 \ 48 | --mm_projector_type mlp2x_gelu \ 49 | --freeze_backbone True \ 50 | --tune_mm_mlp_adapter False \ 51 | --freeze_mm_mlp_adapter True \ 52 | --tune_speech_generator_only True \ 53 | --pretrain_mm_mlp_adapter ./checkpoints/openomni_stage2_qwen_2/mm_projector.bin \ 54 | --speech_encoder ./checkpoints/openai-whisper/large-v3.pt \ 55 | --speech_generator ./checkpoints/speech_generator/generator_16K.pt \ 56 | --unfreeze_mm_vision_tower False \ 57 | --mm_vision_tower_lr ${VIT_LR} \ 58 | --speech_generator_lr ${VIT_LR} \ 59 | --mm_projector_lr ${VIT_LR} \ 60 | --image_aspect_ratio anyres \ 61 | --group_by_modality_length True \ 62 | --mm_vision_select_layer -2 \ 63 | --mm_vision_select_feature patch \ 64 | --mm_patch_merge_type spatial_unpad \ 65 | --mm_use_im_start_end False \ 66 | --mm_use_im_patch_token False \ 67 | --bf16 True \ 68 | --output_dir ./checkpoints/${SAVE_PATH} \ 69 | --num_train_epochs 1 \ 70 | --per_device_train_batch_size ${BATCH_SIZE} \ 71 | --per_device_eval_batch_size 4 \ 72 | --gradient_accumulation_steps ${GRADIENT_ACCU_STEPS} \ 73 | --evaluation_strategy "no" \ 74 | --save_strategy "steps" \ 75 | --save_steps 1000 \ 76 | --save_total_limit 20 \ 77 | --learning_rate ${BASE_LR} \ 78 | --weight_decay 0. 
\ 79 | --warmup_ratio 0.03 \ 80 | --lr_scheduler_type "cosine" \ 81 | --logging_steps 1 \ 82 | --tf32 True \ 83 | --model_max_length 8196 \ 84 | --gradient_checkpointing True \ 85 | --dataloader_num_workers 8 \ 86 | --lazy_preprocess True \ 87 | --speech_generator_type "ar" \ 88 | --run_name ${SAVE_PATH} \ 89 | --dataloader_drop_last True \ 90 | --report_to tensorboard | tee train_${SAVE_PATH}.log" 91 | 92 | eval $command_to_run 93 | -------------------------------------------------------------------------------- /scripts/train/qwen2/text2speech_pretrain_6k.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | 4 | # wandb login 5 | echo "DLC seemed world size(actually nodes): ${WORLD_SIZE}" 6 | NNODES=${WORLD_SIZE} 7 | NODE_RANK=${RANK} 8 | 9 | export CUDA_DEVICE_MAX_CONNECTIONS=1 10 | export GPUS_PER_NODE=8 11 | # export NNODES=2 12 | export BATCH_SIZE=1 13 | export GRADIENT_ACCU_STEPS=4 14 | export MASTER_PORT=29588 15 | export CPUS_PER_TASK=16 16 | export QUOTA=reserved 17 | 18 | export DATA_PATH=./datasets/openomni/json/openomni_stage3-1-6k.json 19 | export SAVE_PATH=openomni_stage3-1_qwen_2-6k 20 | export BASE_LR=2e-5 21 | export VIT_LR=2e-6 22 | 23 | DISTRIBUTED_ARGS=" 24 | --nproc_per_node $GPUS_PER_NODE \ 25 | --nnodes $NNODES \ 26 | --node_rank $NODE_RANK \ 27 | --master_addr $MASTER_ADDR \ 28 | --master_port $MASTER_PORT 29 | " 30 | 31 | # 定义重试的最大次数 32 | MAX_RETRIES=3 33 | 34 | # 每次重试之间的等待时间,单位为秒 35 | WAIT_TIME=20 36 | 37 | # 当前的重试次数 38 | retry_count=0 39 | 40 | command_to_run="torchrun --nproc_per_node $GPUS_PER_NODE openomni/train/train_mem.py \ 41 | --deepspeed ./scripts/zero2.json \ 42 | --model_name_or_path ./checkpoints/openomni_stage3_qwen_3/checkpoint-last \ 43 | --version llava_qwen_2 \ 44 | --data_path ${DATA_PATH} \ 45 | --image_folder ./datasets \ 46 | --speech_folder ./datasets \ 47 | --vision_tower ./checkpoints/openai/clip-vit-large-patch14-336 \ 48 | --mm_projector_type mlp2x_gelu \ 49 | --freeze_backbone True \ 50 | --tune_mm_mlp_adapter False \ 51 | --freeze_mm_mlp_adapter True \ 52 | --tune_speech_generator_only True \ 53 | --pretrain_mm_mlp_adapter ./checkpoints/openomni_stage2_qwen_2/mm_projector.bin \ 54 | --speech_encoder ./checkpoints/openai-whisper/large-v3.pt \ 55 | --speech_generator ./checkpoints/speech_generator/generator_16k.pt \ 56 | --unfreeze_mm_vision_tower False \ 57 | --mm_vision_tower_lr ${VIT_LR} \ 58 | --speech_generator_lr ${VIT_LR} \ 59 | --mm_projector_lr ${VIT_LR} \ 60 | --image_aspect_ratio anyres \ 61 | --group_by_modality_length True \ 62 | --mm_vision_select_layer -2 \ 63 | --mm_vision_select_feature patch \ 64 | --mm_patch_merge_type spatial_unpad \ 65 | --mm_use_im_start_end False \ 66 | --mm_use_im_patch_token False \ 67 | --bf16 True \ 68 | --output_dir checkpoints/${SAVE_PATH} \ 69 | --num_train_epochs 1 \ 70 | --per_device_train_batch_size ${BATCH_SIZE} \ 71 | --per_device_eval_batch_size 4 \ 72 | --gradient_accumulation_steps ${GRADIENT_ACCU_STEPS} \ 73 | --evaluation_strategy "no" \ 74 | --save_strategy "steps" \ 75 | --save_steps 1000 \ 76 | --save_total_limit 20 \ 77 | --learning_rate ${BASE_LR} \ 78 | --weight_decay 0. 
\ 79 | --warmup_ratio 0.03 \ 80 | --lr_scheduler_type "cosine" \ 81 | --logging_steps 1 \ 82 | --tf32 True \ 83 | --model_max_length 8196 \ 84 | --gradient_checkpointing True \ 85 | --dataloader_num_workers 8 \ 86 | --lazy_preprocess True \ 87 | --speech_generator_type "ar" \ 88 | --run_name ${SAVE_PATH} \ 89 | --dataloader_drop_last True \ 90 | --unit_vocab_size 6561 \ 91 | --report_to tensorboard | tee train_${SAVE_PATH}.log" 92 | 93 | eval $command_to_run 94 | -------------------------------------------------------------------------------- /scripts/train/qwen2/text2speech_pretrain_ctc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | 4 | # wandb login 5 | echo "DLC seemed world size(actually nodes): ${WORLD_SIZE}" 6 | NNODES=${WORLD_SIZE} 7 | NODE_RANK=${RANK} 8 | 9 | export CUDA_DEVICE_MAX_CONNECTIONS=1 10 | export GPUS_PER_NODE=8 11 | # export NNODES=2 12 | export BATCH_SIZE=1 13 | export GRADIENT_ACCU_STEPS=4 14 | export MASTER_PORT=29588 15 | export CPUS_PER_TASK=16 16 | export QUOTA=reserved 17 | 18 | export DATA_PATH=./datasets/openomni/json/openomni_stage3-1-6k.json 19 | export SAVE_PATH=openomni_stage3-1_qwen_2_ctc 20 | export BASE_LR=2e-4 21 | export VIT_LR=2e-4 22 | 23 | DISTRIBUTED_ARGS=" 24 | --nproc_per_node $GPUS_PER_NODE \ 25 | --nnodes $NNODES \ 26 | --node_rank $NODE_RANK \ 27 | --master_addr $MASTER_ADDR \ 28 | --master_port $MASTER_PORT 29 | " 30 | 31 | # 定义重试的最大次数 32 | MAX_RETRIES=3 33 | 34 | # 每次重试之间的等待时间,单位为秒 35 | WAIT_TIME=20 36 | 37 | # 当前的重试次数 38 | retry_count=0 39 | 40 | command_to_run="torchrun --nproc_per_node $GPUS_PER_NODE openomni/train/train_mem.py \ 41 | --deepspeed ./scripts/zero2.json \ 42 | --model_name_or_path ./checkpoints/openomni_stage3_qwen_3/checkpoint-last \ 43 | --version llava_qwen_2 \ 44 | --data_path ${DATA_PATH} \ 45 | --image_folder ./datasets \ 46 | --speech_folder ./datasets \ 47 | --vision_tower ./checkpoints/openai/clip-vit-large-patch14-336 \ 48 | --mm_projector_type mlp2x_gelu \ 49 | --freeze_backbone True \ 50 | --tune_mm_mlp_adapter False \ 51 | --freeze_mm_mlp_adapter True \ 52 | --tune_speech_generator_only True \ 53 | --pretrain_mm_mlp_adapter ./checkpoints/openomni_stage2_qwen_2/mm_projector.bin \ 54 | --speech_encoder ./checkpoints/openai-whisper/large-v3.pt \ 55 | --speech_generator ./checkpoints/speech_generator/generator_16k.pt \ 56 | --unfreeze_mm_vision_tower False \ 57 | --mm_vision_tower_lr ${VIT_LR} \ 58 | --speech_generator_lr ${VIT_LR} \ 59 | --mm_projector_lr ${VIT_LR} \ 60 | --image_aspect_ratio anyres \ 61 | --group_by_modality_length True \ 62 | --mm_vision_select_layer -2 \ 63 | --mm_vision_select_feature patch \ 64 | --mm_patch_merge_type spatial_unpad \ 65 | --mm_use_im_start_end False \ 66 | --mm_use_im_patch_token False \ 67 | --bf16 True \ 68 | --output_dir ./checkpoints/${SAVE_PATH} \ 69 | --num_train_epochs 2 \ 70 | --per_device_train_batch_size ${BATCH_SIZE} \ 71 | --per_device_eval_batch_size 4 \ 72 | --gradient_accumulation_steps ${GRADIENT_ACCU_STEPS} \ 73 | --evaluation_strategy "no" \ 74 | --save_strategy "steps" \ 75 | --save_steps 1000 \ 76 | --save_total_limit 20 \ 77 | --learning_rate ${BASE_LR} \ 78 | --weight_decay 0. 
\ 79 | --warmup_ratio 0.03 \ 80 | --lr_scheduler_type "cosine" \ 81 | --logging_steps 1 \ 82 | --tf32 True \ 83 | --model_max_length 8196 \ 84 | --gradient_checkpointing True \ 85 | --dataloader_num_workers 8 \ 86 | --lazy_preprocess True \ 87 | --run_name ${SAVE_PATH} \ 88 | --dataloader_drop_last True \ 89 | --speech_generator_type "ctc" \ 90 | --unit_vocab_size 6561 \ 91 | --report_to tensorboard | tee train_${SAVE_PATH}.log" 92 | 93 | eval $command_to_run 94 | 95 | 96 | -------------------------------------------------------------------------------- /scripts/zero2.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "train_micro_batch_size_per_gpu": "auto", 14 | "train_batch_size": "auto", 15 | "gradient_accumulation_steps": "auto", 16 | "zero_optimization": { 17 | "stage": 2, 18 | "overlap_comm": true, 19 | "contiguous_gradients": true, 20 | "sub_group_size": 1e9, 21 | "reduce_bucket_size": "auto" 22 | } 23 | } -------------------------------------------------------------------------------- /scripts/zero3.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "train_micro_batch_size_per_gpu": "auto", 14 | "train_batch_size": "auto", 15 | "gradient_accumulation_steps": "auto", 16 | "zero_optimization": { 17 | "stage": 3, 18 | "overlap_comm": true, 19 | "contiguous_gradients": true, 20 | "sub_group_size": 1e9, 21 | "reduce_bucket_size": "auto", 22 | "stage3_prefetch_bucket_size": "auto", 23 | "stage3_param_persistence_threshold": "auto", 24 | "stage3_max_live_parameters": 1e9, 25 | "stage3_max_reuse_distance": 1e9, 26 | "stage3_gather_16bit_weights_on_model_save": true 27 | } 28 | } -------------------------------------------------------------------------------- /scripts/zero3_offload.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "optimizer": { 14 | "type": "AdamW", 15 | "params": { 16 | "lr": "auto", 17 | "betas": "auto", 18 | "eps": "auto", 19 | "weight_decay": "auto" 20 | } 21 | }, 22 | "scheduler": { 23 | "type": "WarmupLR", 24 | "params": { 25 | "warmup_min_lr": "auto", 26 | "warmup_max_lr": "auto", 27 | "warmup_num_steps": "auto" 28 | } 29 | }, 30 | "zero_optimization": { 31 | "stage": 3, 32 | "offload_optimizer": { 33 | "device": "cpu", 34 | "pin_memory": true 35 | }, 36 | "offload_param": { 37 | "device": "cpu", 38 | "pin_memory": true 39 | }, 40 | "overlap_comm": true, 41 | "contiguous_gradients": true, 42 | "sub_group_size": 1e9, 43 | "reduce_bucket_size": "auto", 44 | "stage3_prefetch_bucket_size": "auto", 45 | "stage3_param_persistence_threshold": "auto", 46 | "stage3_max_live_parameters": 1e9, 47 | "stage3_max_reuse_distance": 1e9, 48 | "gather_16bit_weights_on_model_save": true 49 | }, 50 | "gradient_accumulation_steps": "auto", 51 | "gradient_clipping": "auto", 52 | "train_batch_size": "auto", 
53 | "train_micro_batch_size_per_gpu": "auto", 54 | "steps_per_print": 1e5, 55 | "wall_clock_breakdown": false 56 | } -------------------------------------------------------------------------------- /vlmevalkit/script/run_inference_2.sh: -------------------------------------------------------------------------------- 1 | export PATH=/usr/local/cuda/bin:$PATH 2 | 3 | export HF_ENDPOINT=https://hf-mirror.com 4 | export OMP_NUM_THREADS=1 5 | export timestamp=`date +"%Y%m%d%H%M%S"` 6 | export OLD_VERSION='False' 7 | export PYTHONPATH=$(dirname $SELF_DIR):$PYTHONPATH 8 | 9 | # gpu consumed 10 | # fp16 17-18G 11 | # int4 7-8G 12 | 13 | # model to be used 14 | # Example: MODELNAME=OpenOmni_Qwen 15 | MODELNAME=$1 16 | # datasets to be tested 17 | # Example: DATALIST="POPE ScienceQA_TEST ChartQA_TEST" 18 | DATALIST=$2 19 | # test mode, all or infer 20 | MODE=$3 21 | 22 | echo "Starting inference with model $MODELNAME on datasets $DATALIST" 23 | # run on multi gpus with torchrun command 24 | # remember to run twice, the first run may fail 25 | torchrun --nproc_per_node=1 --master_port=28881 run.py --data $DATALIST --model $MODELNAME --mode $MODE --rerun 26 | # torchrun --nproc_per_node=4 run.py --data $DATALIST --model $MODELNAME --mode $MODE --rerun 27 | # run on single gpu with python command 28 | # python run.py --data $DATALIST --model $MODELNAME --verbose --mode $MODE 29 | # python run.py --data $DATALIST --model $MODELNAME --verbose --mode $MODE 30 | # CUDA_VISIBLE_DEVICES=0,1,2,3 ./script/run_inference.sh OpenOmni "MME MMMU_DEV_VAL MathVista_MINI LLaVABench RealWorldQA MMStar MMVet AI2D_TEST OCRBench HallusionBench" all 31 | -------------------------------------------------------------------------------- /vlmevalkit/vlmeval/__init__.py: -------------------------------------------------------------------------------- 1 | try: 2 | import torch 3 | except ImportError: 4 | pass 5 | 6 | from .smp import * 7 | from .api import * 8 | from .evaluate import * 9 | from .utils import * 10 | from .vlm import * 11 | from .config import * 12 | 13 | load_env() 14 | -------------------------------------------------------------------------------- /vlmevalkit/vlmeval/api/__init__.py: -------------------------------------------------------------------------------- 1 | from .gpt import OpenAIWrapper, GPT4V 2 | from .gpt_int import OpenAIWrapperInternal, GPT4V_Internal 3 | 4 | __all__ = [ 5 | 'OpenAIWrapper', 'OpenAIWrapperInternal', 'GPT4V', 'GPT4V_Internal' 6 | ] 7 | -------------------------------------------------------------------------------- /vlmevalkit/vlmeval/api/gpt_int.py: -------------------------------------------------------------------------------- 1 | import json 2 | import warnings 3 | import requests 4 | from ..smp import * 5 | from .gpt import GPT_context_window, OpenAIWrapper 6 | 7 | url = 'http://ecs.sv.us.alles-apin.openxlab.org.cn/v1/openai/v2/text/chat' 8 | headers = { 9 | 'Content-Type': 'application/json' 10 | } 11 | 12 | 13 | class OpenAIWrapperInternal(OpenAIWrapper): 14 | 15 | is_api: bool = True 16 | 17 | def __init__(self, 18 | model: str = 'gpt-3.5-turbo-0613', 19 | retry: int = 5, 20 | wait: int = 3, 21 | verbose: bool = True, 22 | system_prompt: str = None, 23 | temperature: float = 0, 24 | timeout: int = 60, 25 | max_tokens: int = 2000, 26 | img_size: int = 512, 27 | img_detail: str = 'low', 28 | **kwargs): 29 | 30 | self.model = model 31 | if 'KEYS' in os.environ and osp.exists(os.environ['KEYS']): 32 | keys = load(os.environ['KEYS']) 33 | headers['alles-apin-token'] = 
keys.get('alles-apin-token', '') 34 | elif 'ALLES' in os.environ: 35 | headers['alles-apin-token'] = os.environ['ALLES'] 36 | self.headers = headers 37 | self.temperature = temperature 38 | self.timeout = timeout 39 | self.max_tokens = max_tokens 40 | 41 | assert img_size > 0 or img_size == -1 42 | self.img_size = img_size 43 | assert img_detail in ['high', 'low'] 44 | self.img_detail = img_detail 45 | 46 | super(OpenAIWrapper, self).__init__( 47 | wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs) 48 | 49 | def generate_inner(self, inputs, **kwargs) -> str: 50 | input_msgs = self.prepare_inputs(inputs) 51 | 52 | temperature = kwargs.pop('temperature', self.temperature) 53 | max_tokens = kwargs.pop('max_tokens', self.max_tokens) 54 | 55 | # Held out 100 tokens as buffer 56 | context_window = GPT_context_window(self.model) 57 | max_tokens = min(max_tokens, context_window - self.get_token_len(inputs)) 58 | if 0 < max_tokens <= 100: 59 | print('Less than 100 tokens left, may exceed the context window with some additional meta symbols. ') 60 | if max_tokens <= 0: 61 | return 0, self.fail_msg + 'Input string longer than context window. ', 'Length Exceeded. ' 62 | 63 | payload = dict( 64 | model=self.model, 65 | messages=input_msgs, 66 | max_tokens=max_tokens, 67 | n=1, 68 | stop=None, 69 | timeout=self.timeout, 70 | temperature=temperature, 71 | **kwargs) 72 | 73 | response = requests.post(url, headers=headers, data=json.dumps(payload), timeout=self.timeout * 1.1) 74 | ret_code = response.status_code 75 | ret_code = 0 if (200 <= int(ret_code) < 300) else ret_code 76 | 77 | answer = self.fail_msg 78 | try: 79 | resp_struct = json.loads(response.text) 80 | assert resp_struct['msg'] == 'ok' and resp_struct['msgCode'] == '10000', resp_struct 81 | answer = resp_struct['data']['response']['choices'][0]['message']['content'].strip() 82 | except: 83 | pass 84 | return ret_code, answer, response 85 | 86 | 87 | class GPT4V_Internal(OpenAIWrapperInternal): 88 | 89 | def generate(self, message, dataset=None): 90 | return super(GPT4V_Internal, self).generate(message) 91 | -------------------------------------------------------------------------------- /vlmevalkit/vlmeval/config.py: -------------------------------------------------------------------------------- 1 | from vlmeval.vlm import * 2 | from vlmeval.api import * 3 | from functools import partial 4 | 5 | model_path="./checkpoints/openomni_stage3_llama_3/checkpoint-20000" 6 | model_path2="./checkpoints/openomni_stage3_qwen_2/checkpoint-20000" 7 | ungrouped = { 8 | 'OpenOmni-Llama3-V-1_6':partial(OpenOmni_Llama3, model_path=model_path), 9 | 'OpenOmni-Qwen2-V-1_6':partial(OpenOmni_Qwen2, model_path=model_path2), 10 | } 11 | 12 | # "oss://coaidatasets-intern/minzheng/luorun/data/seed_data_15k_mini.json " 13 | # bash ./script/run_inference_2.sh OpenOmni-Qwen2-V-1_6_4 "MME MMMU_DEV_VAL MathVista_MINI RealWorldQA MMStar MMVet AI2D_TEST OCRBench HallusionBench" all 14 | # bash ./script/run_inference_8.sh OpenOmni-Llama3-V-1_6 "MME MMMU_DEV_VAL MathVista_MINI LLaVABench RealWorldQA MMStar MMVet AI2D_TEST OCRBench HallusionBench" all 15 | # bash ./script/run_inference_8.sh OpenOmni-Qwen2-V-1_6_ablation_evol_final_evol_2 "MME MMMU_DEV_VAL MathVista_MINI LLaVABench RealWorldQA MMStar MMVet AI2D_TEST OCRBench HallusionBench POPE BLINK" all 16 | # bash ./script/run_inference_4.sh OpenOmni-Llama3-V-1_6_ablation_seed_11k_seed "MMBench_TEST_EN MMBench_TEST_CN" all 17 | # bash ./script/run_inference_2.sh OpenOmni-Qwen2-V-1_6 "MME 
MMMU_DEV_VAL MathVista_MINI LLaVABench RealWorldQA MMStar MMVet AI2D_TEST OCRBench HallusionBenc" all 18 | 19 | supported_VLM = {} 20 | 21 | model_groups = [ 22 | ungrouped 23 | ] 24 | 25 | for grp in model_groups: 26 | supported_VLM.update(grp) 27 | 28 | -------------------------------------------------------------------------------- /vlmevalkit/vlmeval/evaluate/OCRBench.py: -------------------------------------------------------------------------------- 1 | from vlmeval.smp import * 2 | 3 | 4 | def OCRBench_eval(eval_file): 5 | OCRBench_score = { 6 | 'Regular Text Recognition': 0, 7 | 'Irregular Text Recognition': 0, 8 | 'Artistic Text Recognition': 0, 9 | 'Handwriting Recognition': 0, 10 | 'Digit String Recognition': 0, 11 | 'Non-Semantic Text Recognition': 0, 12 | 'Scene Text-centric VQA': 0, 13 | 'Doc-oriented VQA': 0, 14 | 'Key Information Extraction': 0, 15 | 'Handwritten Mathematical Expression Recognition': 0 16 | } 17 | 18 | logger = get_logger('Evaluation') 19 | 20 | data = load(eval_file) 21 | lt = len(data) 22 | lines = [data.iloc[i] for i in range(lt)] 23 | for i in tqdm(range(len(lines))): 24 | line = lines[i] 25 | predict = str(line['prediction']) 26 | answers = eval(line['answer']) 27 | category = line['category'] 28 | if category == 'Handwritten Mathematical Expression Recognition': 29 | for j in range(len(answers)): 30 | answer = answers[j].strip().replace('\n', ' ').replace(' ', '') 31 | predict = predict.strip().replace('\n', ' ').replace(' ', '') 32 | if answer in predict: 33 | OCRBench_score[category] += 1 34 | break 35 | else: 36 | for j in range(len(answers)): 37 | answer = answers[j].lower().strip().replace('\n', ' ') 38 | predict = predict.lower().strip().replace('\n', ' ') 39 | if answer in predict: 40 | OCRBench_score[category] += 1 41 | break 42 | 43 | final_score_dict = {} 44 | final_score_dict['Text Recognition'] = ( 45 | OCRBench_score['Regular Text Recognition'] + OCRBench_score['Irregular Text Recognition'] 46 | + OCRBench_score['Artistic Text Recognition'] + OCRBench_score['Handwriting Recognition'] 47 | + OCRBench_score['Digit String Recognition'] + OCRBench_score['Non-Semantic Text Recognition'] 48 | ) 49 | final_score_dict['Scene Text-centric VQA'] = OCRBench_score['Scene Text-centric VQA'] 50 | final_score_dict['Doc-oriented VQA'] = OCRBench_score['Doc-oriented VQA'] 51 | final_score_dict['Key Information Extraction'] = OCRBench_score['Key Information Extraction'] 52 | final_score_dict['Handwritten Mathematical Expression Recognition'] = \ 53 | OCRBench_score['Handwritten Mathematical Expression Recognition'] 54 | final_score_dict['Final Score'] = ( 55 | final_score_dict['Text Recognition'] + final_score_dict['Scene Text-centric VQA'] 56 | + final_score_dict['Doc-oriented VQA'] + final_score_dict['Key Information Extraction'] 57 | + final_score_dict['Handwritten Mathematical Expression Recognition'] 58 | ) 59 | final_score_dict['Final Score Norm'] = float(final_score_dict['Final Score']) / 10 60 | score_pth = eval_file.replace('.xlsx', '_score.json') 61 | dump(final_score_dict, score_pth) 62 | logger.info(f'OCRBench_eval successfully finished evaluating {eval_file}, results saved in {score_pth}') 63 | logger.info('Score: ') 64 | for key, value in final_score_dict.items(): 65 | logger.info('{}:{}'.format(key, value)) 66 | -------------------------------------------------------------------------------- /vlmevalkit/vlmeval/evaluate/__init__.py: -------------------------------------------------------------------------------- 1 | from .yes_or_no import 
default_rating, MME_rating, YOrN_eval 2 | from .mmvet_eval import MMVet_eval 3 | from .multiple_choice import multiple_choice_eval 4 | from .coco_eval import COCO_eval 5 | from .vqa_eval import VQAEval 6 | from .mathvista_eval import MathVista_eval 7 | from .llavabench import LLaVABench_eval 8 | from .misc import build_judge 9 | from .OCRBench import OCRBench_eval 10 | -------------------------------------------------------------------------------- /vlmevalkit/vlmeval/evaluate/coco_eval.py: -------------------------------------------------------------------------------- 1 | from vlmeval.smp import * 2 | from pycocoevalcap.bleu.bleu import Bleu 3 | from pycocoevalcap.rouge.rouge import Rouge 4 | from pycocoevalcap.cider.cider import Cider 5 | 6 | 7 | class COCO_Caption_Scorer(): 8 | def __init__(self, ref, gt): 9 | self.ref = ref 10 | self.gt = gt 11 | print('setting up scorers...') 12 | self.scorers = [ 13 | (Bleu(4), ['Bleu_1', 'Bleu_2', 'Bleu_3', 'Bleu_4']), 14 | # (Meteor(), "METEOR"), # need java version 11.0.16+ 15 | (Rouge(), 'ROUGE_L'), 16 | (Cider(), 'CIDEr'), 17 | # (Spice(), "SPICE"), # need java version 11.0.16+ 18 | ] 19 | 20 | def compute_scores(self): 21 | total_scores = {} 22 | for scorer, method in self.scorers: 23 | print('computing %s score...' % (scorer.method())) 24 | score, scores = scorer.compute_score(self.gt, self.ref) 25 | if type(method) == list: 26 | for sc, scs, m in zip(score, scores, method): 27 | print('%s: %0.3f' % (m, sc * 100)) 28 | total_scores['Bleu'] = [x * 100 for x in score] 29 | else: 30 | print('%s: %0.3f' % (method, score * 100)) 31 | total_scores[method] = score * 100 32 | 33 | print('*****DONE*****') 34 | for key, value in total_scores.items(): 35 | print('{}:{}'.format(key, value)) 36 | return total_scores 37 | 38 | 39 | def COCO_eval(eval_file, nproc=4, verbose=False): 40 | logger = get_logger('Evaluation') 41 | 42 | data = load(eval_file) 43 | 44 | lt = len(data) 45 | lines = [data.iloc[i] for i in range(lt)] 46 | ref = {} 47 | gt = {} 48 | for i, line in enumerate(lines): 49 | ref[str(i)] = [str(line['prediction'])] 50 | gt[str(i)] = eval(line['answer']) 51 | 52 | scorer = COCO_Caption_Scorer(ref, gt) 53 | coco_caption_score_dict = scorer.compute_scores() 54 | 55 | score_pth = eval_file.replace('.xlsx', '_score.json') 56 | dump(coco_caption_score_dict, score_pth) 57 | logger.info(f'COCO_eval successfully finished evaluating {eval_file}, results saved in {score_pth}') 58 | logger.info('Score: ') 59 | for key, value in coco_caption_score_dict.items(): 60 | logger.info('{}:{}'.format(key, value)) 61 | 62 | 63 | def parse_args(): 64 | parser = argparse.ArgumentParser(description='Inference LLM Answers. ') 65 | parser.add_argument('--data', type=str, help='The question set for inference, in excel / tsv / json format. 
') 66 | parser.add_argument('--nproc', type=int, default=4) 67 | parser.add_argument('--verbose', action='store_true') 68 | args = parser.parse_args() 69 | return args 70 | 71 | 72 | if __name__ == '__main__': 73 | args = parse_args() 74 | COCO_eval(eval_file=args.data, nproc=args.nproc, verbose=args.verbose) 75 | -------------------------------------------------------------------------------- /vlmevalkit/vlmeval/evaluate/misc.py: -------------------------------------------------------------------------------- 1 | import os 2 | from vlmeval.api import OpenAIWrapper, OpenAIWrapperInternal 3 | from vlmeval.smp import load_env 4 | 5 | INTERNAL = os.environ.get('INTERNAL', 0) 6 | 7 | 8 | def build_judge(**kwargs): 9 | model = kwargs.pop('model', None) 10 | load_env() 11 | LOCAL_LLM = os.environ.get('LOCAL_LLM', None) 12 | if LOCAL_LLM is None: 13 | model_map = { 14 | 'gpt-4-turbo': 'gpt-4-1106-preview', 15 | 'gpt-4-0613': 'gpt-4-0613', 16 | 'gpt-4-0314': 'gpt-4-0314', 17 | 'gpt-4-0125': 'gpt-4-0125-preview', 18 | 'chatgpt-1106': 'gpt-3.5-turbo-1106', 19 | 'chatgpt-0613': 'gpt-3.5-turbo-0613', 20 | 'chatgpt-0125': 'gpt-3.5-turbo-0125' 21 | } 22 | model_version = model_map[model] 23 | else: 24 | model_version = LOCAL_LLM 25 | if INTERNAL: 26 | model = OpenAIWrapperInternal(model_version, **kwargs) 27 | else: 28 | model = OpenAIWrapper(model_version, **kwargs) 29 | return model 30 | -------------------------------------------------------------------------------- /vlmevalkit/vlmeval/smp/__init__.py: -------------------------------------------------------------------------------- 1 | from .file import * 2 | from .vlm import * 3 | from .misc import * 4 | from .log import * 5 | -------------------------------------------------------------------------------- /vlmevalkit/vlmeval/smp/log.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | logger_initialized = {} 4 | 5 | 6 | def get_logger(name, log_file=None, log_level=logging.INFO, file_mode='w'): 7 | logger = logging.getLogger(name) 8 | if name in logger_initialized: 9 | return logger 10 | 11 | for logger_name in logger_initialized: 12 | if name.startswith(logger_name): 13 | return logger 14 | 15 | stream_handler = logging.StreamHandler() 16 | handlers = [stream_handler] 17 | 18 | try: 19 | import torch.distributed as dist 20 | if dist.is_available() and dist.is_initialized(): 21 | rank = dist.get_rank() 22 | else: 23 | rank = 0 24 | except ImportError: 25 | rank = 0 26 | 27 | if rank == 0 and log_file is not None: 28 | file_handler = logging.FileHandler(log_file, file_mode) 29 | handlers.append(file_handler) 30 | 31 | formatter = logging.Formatter( 32 | '%(asctime)s - %(name)s - %(levelname)s - %(message)s') 33 | for handler in handlers: 34 | handler.setFormatter(formatter) 35 | handler.setLevel(log_level) 36 | logger.addHandler(handler) 37 | 38 | if rank == 0: 39 | logger.setLevel(log_level) 40 | else: 41 | logger.setLevel(logging.ERROR) 42 | 43 | logger_initialized[name] = True 44 | return logger 45 | -------------------------------------------------------------------------------- /vlmevalkit/vlmeval/smp/vlm.py: -------------------------------------------------------------------------------- 1 | import os 2 | import io 3 | import pandas as pd 4 | import numpy as np 5 | import string 6 | from uuid import uuid4 7 | import os.path as osp 8 | import base64 9 | from PIL import Image 10 | from .file import load, dump 11 | Image.MAX_IMAGE_PIXELS = 1e9 12 | 13 | 14 | def mmqa_display(question, 
target_size=512): 15 | question = {k.lower(): v for k, v in question.items()} 16 | keys = list(question.keys()) 17 | keys = [k for k in keys if k not in ['index', 'image']] 18 | 19 | images = question['image'] 20 | if isinstance(images, str): 21 | images = [images] 22 | 23 | idx = question.pop('index', 'XXX') 24 | print(f'INDEX: {idx}') 25 | 26 | for im in images: 27 | image = decode_base64_to_image(im, target_size=target_size) 28 | display(image) # noqa: F821 29 | 30 | for k in keys: 31 | try: 32 | if not pd.isna(question[k]): 33 | print(f'{k.upper()}. {question[k]}') 34 | except ValueError: 35 | if False in pd.isna(question[k]): 36 | print(f'{k.upper()}. {question[k]}') 37 | 38 | 39 | def encode_image_to_base64(img, target_size=-1): 40 | # if target_size == -1, will not do resizing 41 | # else, will set the max_size ot (target_size, target_size) 42 | if img.mode in ('RGBA', 'P'): 43 | img = img.convert('RGB') 44 | tmp = osp.join('/tmp', str(uuid4()) + '.jpg') 45 | if target_size > 0: 46 | img.thumbnail((target_size, target_size)) 47 | img.save(tmp) 48 | with open(tmp, 'rb') as image_file: 49 | image_data = image_file.read() 50 | ret = base64.b64encode(image_data).decode('utf-8') 51 | os.remove(tmp) 52 | return ret 53 | 54 | 55 | def encode_image_file_to_base64(image_path, target_size=-1): 56 | image = Image.open(image_path) 57 | return encode_image_to_base64(image, target_size=target_size) 58 | 59 | 60 | def decode_base64_to_image(base64_string, target_size=-1): 61 | image_data = base64.b64decode(base64_string) 62 | image = Image.open(io.BytesIO(image_data)) 63 | if image.mode in ('RGBA', 'P'): 64 | image = image.convert('RGB') 65 | if target_size > 0: 66 | image.thumbnail((target_size, target_size)) 67 | return image 68 | 69 | 70 | def decode_base64_to_image_file(base64_string, image_path, target_size=-1): 71 | image = decode_base64_to_image(base64_string, target_size=target_size) 72 | image.save(image_path) 73 | 74 | 75 | def build_option_str(option_dict): 76 | s = 'There are several options: \n' 77 | for c, content in option_dict.items(): 78 | if not pd.isna(content): 79 | s += f'{c}. 
{content}\n' 80 | return s 81 | 82 | 83 | def isimg(s): 84 | return osp.exists(s) or s.startswith('http') 85 | 86 | 87 | def read_ok(img_path): 88 | if not osp.exists(img_path): 89 | return False 90 | try: 91 | im = Image.open(img_path) 92 | assert im.size[0] > 0 and im.size[1] > 0 93 | return True 94 | except: 95 | return False 96 | 97 | 98 | def gpt_key_set(): 99 | openai_key = os.environ.get('OPENAI_API_KEY', None) 100 | return isinstance(openai_key, str) and openai_key.startswith('sk-') 101 | 102 | 103 | def apiok(wrapper): 104 | s = wrapper.generate('Hello!') 105 | return wrapper.fail_msg not in s 106 | 107 | 108 | def circular_pred(df, extract_func=None): 109 | if extract_func is None: 110 | extract_func = lambda x: x # noqa: E731 111 | df = df.sort_values('index') 112 | from vlmeval.utils import can_infer_option 113 | shift = int(1e6) 114 | 115 | choices = [extract_func(x) for x in df['prediction']] 116 | pred_map = {i: c for i, c in zip(df['index'], choices)} 117 | flag_map = {i: True for i in pred_map if i < 1e6} 118 | valid_map = {i: True for i in pred_map if i < 1e6} 119 | for i in df['index']: 120 | if i >= shift and pred_map[i] and pred_map[i - shift]: 121 | if ( 122 | pred_map[i] not in list(string.ascii_uppercase) or # noqa: W504 123 | pred_map[i - shift] not in list(string.ascii_uppercase) 124 | ): 125 | 126 | valid_map[i % shift] = False 127 | continue 128 | if (ord(pred_map[i]) - ord(pred_map[i - shift])) % 4 == 1: 129 | continue 130 | else: 131 | flag_map[i % shift] = False 132 | flag_map = {k: v for k, v in flag_map.items() if valid_map[k]} 133 | flags = list(flag_map.values()) 134 | return np.mean(flags) 135 | -------------------------------------------------------------------------------- /vlmevalkit/vlmeval/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .matching_util import can_infer, can_infer_option, can_infer_text 2 | from .mp_util import track_progress_rich 3 | from .custom_prompt import CustomPrompt 4 | from .dataset_config import dataset_URLs, img_root_map, DATASET_TYPE, abbr2full 5 | from .dataset import TSVDataset, split_MMMU, MMMU_result_transfer 6 | 7 | 8 | __all__ = [ 9 | 'can_infer', 'can_infer_option', 'can_infer_text', 'track_progress_rich', 10 | 'TSVDataset', 'dataset_URLs', 'img_root_map', 'DATASET_TYPE', 'CustomPrompt', 11 | 'split_MMMU', 'abbr2full' 12 | ] 13 | -------------------------------------------------------------------------------- /vlmevalkit/vlmeval/utils/custom_prompt.py: -------------------------------------------------------------------------------- 1 | from ..smp import * 2 | from .dataset_config import img_root_map 3 | from abc import abstractmethod 4 | 5 | 6 | class CustomPrompt: 7 | 8 | @abstractmethod 9 | def use_custom_prompt(self, dataset): 10 | raise NotImplementedError 11 | 12 | @abstractmethod 13 | def build_prompt(self, line, dataset): 14 | raise NotImplementedError 15 | 16 | def dump_image(self, line, dataset): 17 | ROOT = LMUDataRoot() 18 | assert isinstance(dataset, str) 19 | img_root = osp.join(ROOT, 'images', img_root_map[dataset] if dataset in img_root_map else dataset) 20 | os.makedirs(img_root, exist_ok=True) 21 | if isinstance(line['image'], list): 22 | tgt_path = [] 23 | assert 'image_path' in line 24 | for img, im_name in zip(line['image'], line['image_path']): 25 | path = osp.join(img_root, im_name) 26 | if not read_ok(path): 27 | decode_base64_to_image_file(img, path) 28 | tgt_path.append(path) 29 | else: 30 | tgt_path = osp.join(img_root, 
f"{line['index']}.jpg") 31 | if not read_ok(tgt_path): 32 | decode_base64_to_image_file(line['image'], tgt_path) 33 | return tgt_path 34 | -------------------------------------------------------------------------------- /vlmevalkit/vlmeval/utils/matching_util.py: -------------------------------------------------------------------------------- 1 | import string 2 | import copy as cp 3 | import os 4 | from ..smp import * 5 | 6 | 7 | def can_infer_option(answer, choices): 8 | verbose = os.environ.get('VERBOSE', 0) 9 | # Choices is a dictionary 10 | if 'Failed to obtain answer via API' in answer: 11 | return False 12 | 13 | reject_to_answer = [ 14 | "Sorry, I can't help with images of people yet.", 15 | "I can't process this file.", 16 | "I'm sorry, but without the image provided", 17 | 'Cannot determine the answer' 18 | ] 19 | for err in reject_to_answer: 20 | if err in answer: 21 | return 'Z' 22 | 23 | def count_choice(splits, choices, prefix='', suffix=''): 24 | cnt = 0 25 | for c in choices: 26 | if prefix + c + suffix in splits: 27 | cnt += 1 28 | return cnt 29 | 30 | answer_mod = cp.copy(answer) 31 | chars = '.()[],:;!*#{}' 32 | for c in chars: 33 | answer_mod = answer_mod.replace(c, ' ') 34 | 35 | splits = [x.strip() for x in answer_mod.split()] 36 | count = count_choice(splits, choices) 37 | 38 | if count == 1: 39 | for ch in choices: 40 | if 'A' in splits and len(splits) > 3 and verbose: 41 | logger = get_logger('Evaluation') 42 | logger.info(f'A might be a quantifier in the string: {answer}.') 43 | return False 44 | if ch in splits: 45 | return ch 46 | elif count == 0 and count_choice(splits, {'Z', ''}) == 1: 47 | return 'Z' 48 | return False 49 | 50 | 51 | def can_infer_text(answer, choices): 52 | answer = answer.lower() 53 | assert isinstance(choices, dict) 54 | for k in choices: 55 | assert k in string.ascii_uppercase 56 | choices[k] = str(choices[k]).lower() 57 | cands = [] 58 | for k in choices: 59 | if choices[k] in answer: 60 | cands.append(k) 61 | if len(cands) == 1: 62 | return cands[0] 63 | return False 64 | 65 | 66 | def can_infer(answer, choices): 67 | answer = str(answer) 68 | copt = can_infer_option(answer, choices) 69 | return copt if copt else can_infer_text(answer, choices) 70 | -------------------------------------------------------------------------------- /vlmevalkit/vlmeval/vlm/__init__.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | torch.set_grad_enabled(False) 4 | torch.manual_seed(1234) 5 | from .base import BaseModel 6 | from .openomni_llama import OpenOmni_Llama3 7 | from .openomni_qwen import OpenOmni_Qwen2 --------------------------------------------------------------------------------