├── LICENSE
├── README.md
├── bunny
├── constants.py
├── conversation.py
├── eval
│   ├── m4c_evaluator.py
│   ├── model_vqa.py
│   ├── model_vqa_cmmmu.py
│   ├── model_vqa_loader.py
│   ├── model_vqa_mmbench.py
│   ├── model_vqa_mmmu.py
│   └── model_vqa_science.py
├── model
│   ├── __init__.py
│   ├── builder.py
│   ├── bunny_arch.py
│   ├── language_model
│   │   ├── bunny_llama.py
│   │   ├── bunny_minicpm.py
│   │   ├── bunny_phi.py
│   │   ├── bunny_phi3.py
│   │   ├── bunny_qwen.py
│   │   ├── bunny_stablelm.py
│   │   ├── llama
│   │   │   ├── __init__.py
│   │   │   ├── configuration_llama.py
│   │   │   ├── modeling_llama.py
│   │   │   ├── tokenization_llama.py
│   │   │   └── tokenization_llama_fast.py
│   │   ├── minicpm
│   │   │   ├── configuration_minicpm.py
│   │   │   └── modeling_minicpm.py
│   │   ├── phi
│   │   │   ├── __init__.py
│   │   │   ├── configuration_phi.py
│   │   │   └── modeling_phi.py
│   │   ├── phi3
│   │   │   ├── __init__.py
│   │   │   ├── configuration_phi3.py
│   │   │   └── modeling_phi3.py
│   │   ├── qwen2
│   │   │   ├── __init__.py
│   │   │   ├── configuration_qwen2.py
│   │   │   ├── modeling_qwen2.py
│   │   │   ├── tokenization_qwen2.py
│   │   │   └── tokenization_qwen2_fast.py
│   │   └── stable_lm
│   │   │   ├── configuration_stablelm_epoch.py
│   │   │   └── modeling_stablelm_epoch.py
│   ├── multimodal_encoder
│   │   ├── builder.py
│   │   ├── clip
│   │   │   └── clip_encoder.py
│   │   ├── eva_clip
│   │   │   ├── eva_clip_encoder.py
│   │   │   ├── eva_clip_processors.py
│   │   │   └── eva_vit.py
│   │   └── siglip
│   │   │   └── siglip_encoder.py
│   └── multimodal_projector
│   │   └── builder.py
├── serve
│   ├── cli.py
│   ├── controller.py
│   ├── examples
│   │   ├── example_1.png
│   │   ├── example_2.png
│   │   ├── icon.jpg
│   │   └── user.png
│   ├── gradio_web_server.py
│   ├── model_worker.py
│   └── register_worker.py
├── train
│   ├── bunny_trainer.py
│   └── train.py
└── util
│   ├── data_utils.py
│   ├── mm_utils.py
│   ├── s2wrapper
│   ├── __init__.py
│   ├── core.py
│   └── utils.py
│   └── utils.py
├── comparison_4B.png
├── comparison_8B.png
├── eval
├── cmmmu
│   ├── cmmmu-data-val-answer.jsonl
│   ├── eval_script.py
│   ├── eval_utils.py
│   └── prompt.yaml
├── gqa
│   ├── bunny_gqa_testdev_balanced.jsonl
│   ├── convert_gqa_for_eval.py
│   ├── eval_gqa.py
│   └── testdev_balanced_questions.tar.gz
├── mm-vet
│   ├── bunny-mm-vet.jsonl
│   └── convert_mmvet_for_eval.py
├── mmbench
│   └── convert_mmbench_for_submission.py
├── mme
│   ├── bunny_mme.jsonl
│   ├── calculation_mme.py
│   └── convert_answer_to_mme.py
├── mmmu
│   ├── answer_dict_val.json
│   ├── config.yaml
│   └── eval_mmmu.py
├── pope
│   ├── bunny_pope_test.jsonl
│   └── eval_pope.py
├── scienceqa
│   ├── bunny_test_CQM-A.json
│   └── eval_science_qa.py
├── seed-bench
│   ├── SEED-Bench.json
│   ├── bunny-seed-bench.jsonl
│   ├── convert_seed_for_submission.py
│   └── extract_video_frames.py
├── textvqa
│   ├── bunny_textvqa_val_v051_ocr.jsonl
│   └── eval_textvqa.py
├── viswiz
│   ├── bunny_test.jsonl
│   └── convert_viswiz_for_submission.py
└── vqav2
│   ├── bunny_vqav2_mscoco_test-dev2015.tar.gz
│   ├── bunny_vqav2_mscoco_test2015.tar.gz
│   └── convert_vqav2_for_submission.py
├── icon.png
├── pyproject.toml
└── script
├── batch_inference.py
├── conversion_to_GGUF.md
├── deepspeed
├── zero2.json
└── zero3.json
├── eval
├── full
│   ├── cmmmu.sh
│   ├── evaluation_full.md
│   ├── gqa.sh
│   ├── mmbench.sh
│   ├── mme.sh
│   ├── mmmu.sh
│   ├── mmvet.sh
│   ├── pope.sh
│   ├── scienceqa.sh
│   ├── seedbench.sh
│   └── vqav2.sh
└── lora
│   ├── cmmmu.sh
│   ├── evaluation_lora.md
│   ├── gqa.sh
│   ├── mmbench.sh
│   ├── mme.sh
│   ├── mmmu.sh
│   ├── mmvet.sh
│   ├── pope.sh
│   ├── scienceqa.sh
│   ├── seedbench.sh
│   └── vqav2.sh
├── merge_lora_weights.py
└── train
├── finetune_full.sh
├── finetune_lora.sh
├── pretrain.sh
└── 
tutorials ├── Bunny-Llama-3-8B-V.md ├── Bunny-v1.0-4B.md ├── Bunny-v1.1-4B.md ├── Bunny-v1.1-Llama-3-8B-V.md ├── assets ├── Bunny-Llama-3-8B-V.png ├── Bunny-v1.0-4B.png ├── Bunny-v1.1-4B.png └── Bunny-v1.1-Llama-3-8B-V.png ├── bunny-minicpm-siglip-lora.md ├── bunny-phi-1.5-eva-lora.md ├── bunny-phi-1.5-siglip-lora.md ├── bunny-phi-2-eva-lora.md ├── bunny-phi-2-siglip-lora.md ├── bunny-qwen1.5-1.8b-siglip-lora.md ├── bunny-stablelm-2-eva-lora.md └── bunny-stablelm-2-siglip-lora.md /bunny/constants.py: -------------------------------------------------------------------------------- 1 | # Model Constants 2 | IGNORE_INDEX = -100 3 | IMAGE_TOKEN_INDEX = -200 4 | DEFAULT_IMAGE_TOKEN = "" 5 | CONTROLLER_HEART_BEAT_EXPIRATION = 30 6 | LOGDIR = "gradio-logs" 7 | WORKER_HEART_BEAT_INTERVAL = 15 8 | -------------------------------------------------------------------------------- /bunny/eval/model_vqa.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import os 4 | import json 5 | from tqdm import tqdm 6 | import shortuuid 7 | 8 | from bunny.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN 9 | from bunny.conversation import conv_templates, SeparatorStyle 10 | from bunny.model.builder import load_pretrained_model 11 | from bunny.util.utils import disable_torch_init 12 | from bunny.util.mm_utils import tokenizer_image_token, get_model_name_from_path, process_images 13 | 14 | from PIL import Image 15 | import math 16 | 17 | 18 | def split_list(lst, n): 19 | """Split a list into n (roughly) equal-sized chunks""" 20 | chunk_size = math.ceil(len(lst) / n) # integer division 21 | return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)] 22 | 23 | 24 | def get_chunk(lst, n, k): 25 | chunks = split_list(lst, n) 26 | return chunks[k] 27 | 28 | 29 | def eval_model(args): 30 | # Model 31 | disable_torch_init() 32 | model_path = os.path.expanduser(args.model_path) 33 | model_name = get_model_name_from_path(model_path) 34 | tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name, 35 | args.model_type) 36 | 37 | questions = [json.loads(q) for q in open(os.path.expanduser(args.question_file), "r")] 38 | questions = get_chunk(questions, args.num_chunks, args.chunk_idx) 39 | answers_file = os.path.expanduser(args.answers_file) 40 | os.makedirs(os.path.dirname(answers_file), exist_ok=True) 41 | ans_file = open(answers_file, "w") 42 | for line in tqdm(questions): 43 | idx = line["question_id"] 44 | image_file = line["image"] 45 | qs = line["text"] 46 | cur_prompt = qs 47 | 48 | qs = DEFAULT_IMAGE_TOKEN + '\n' + qs 49 | 50 | conv = conv_templates[args.conv_mode].copy() 51 | conv.append_message(conv.roles[0], qs) 52 | conv.append_message(conv.roles[1], None) 53 | prompt = conv.get_prompt() 54 | 55 | input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() 56 | 57 | image = Image.open(os.path.join(args.image_folder, image_file)) 58 | image_tensor = process_images([image], image_processor, model.config)[0] 59 | 60 | stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2 61 | 62 | with torch.inference_mode(): 63 | output_ids = model.generate( 64 | input_ids, 65 | images=image_tensor.unsqueeze(0).to(dtype=model.dtype, device='cuda', non_blocking=True), 66 | do_sample=True if args.temperature > 0 else False, 67 | temperature=args.temperature, 68 | top_p=args.top_p, 69 | num_beams=args.num_beams, 70 | # 
no_repeat_ngram_size=3, 71 | max_new_tokens=1024, 72 | use_cache=True) 73 | 74 | input_token_len = input_ids.shape[1] 75 | n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() 76 | if n_diff_input_output > 0: 77 | print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 78 | outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 79 | outputs = outputs.strip() 80 | if outputs.endswith(stop_str): 81 | outputs = outputs[:-len(stop_str)] 82 | outputs = outputs.strip() 83 | 84 | ans_id = shortuuid.uuid() 85 | ans_file.write(json.dumps({"question_id": idx, 86 | "prompt": cur_prompt, 87 | "text": outputs, 88 | "answer_id": ans_id, 89 | "model_id": model_name, 90 | "metadata": {}}) + "\n") 91 | ans_file.flush() 92 | ans_file.close() 93 | 94 | 95 | if __name__ == "__main__": 96 | parser = argparse.ArgumentParser() 97 | parser.add_argument("--model-path", type=str, default=None) 98 | parser.add_argument("--model-base", type=str, default=None) 99 | parser.add_argument("--model-type", type=str, default=None) 100 | parser.add_argument("--image-folder", type=str, default=None) 101 | parser.add_argument("--question-file", type=str, default=None) 102 | parser.add_argument("--answers-file", type=str, default=None) 103 | parser.add_argument("--conv-mode", type=str, default=None) 104 | parser.add_argument("--num-chunks", type=int, default=1) 105 | parser.add_argument("--chunk-idx", type=int, default=0) 106 | parser.add_argument("--temperature", type=float, default=0.2) 107 | parser.add_argument("--top_p", type=float, default=None) 108 | parser.add_argument("--num_beams", type=int, default=1) 109 | args = parser.parse_args() 110 | 111 | eval_model(args) 112 | -------------------------------------------------------------------------------- /bunny/eval/model_vqa_loader.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import os 4 | import json 5 | from tqdm import tqdm 6 | import shortuuid 7 | 8 | from bunny.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN 9 | from bunny.conversation import conv_templates 10 | from bunny.model.builder import load_pretrained_model 11 | from bunny.util.utils import disable_torch_init 12 | from bunny.util.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path 13 | from torch.utils.data import Dataset, DataLoader 14 | 15 | from PIL import Image 16 | import math 17 | 18 | 19 | def split_list(lst, n): 20 | """Split a list into n (roughly) equal-sized chunks""" 21 | chunk_size = math.ceil(len(lst) / n) # integer division 22 | return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)] 23 | 24 | 25 | def get_chunk(lst, n, k): 26 | chunks = split_list(lst, n) 27 | return chunks[k] 28 | 29 | 30 | # Custom dataset class 31 | class CustomDataset(Dataset): 32 | def __init__(self, questions, image_folder, tokenizer, image_processor, model_config): 33 | self.questions = questions 34 | self.image_folder = image_folder 35 | self.tokenizer = tokenizer 36 | self.image_processor = image_processor 37 | self.model_config = model_config 38 | 39 | def __getitem__(self, index): 40 | line = self.questions[index] 41 | image_file = line["image"] 42 | qs = line["text"] 43 | 44 | qs = DEFAULT_IMAGE_TOKEN + '\n' + qs 45 | 46 | conv = conv_templates[args.conv_mode].copy() 47 | conv.append_message(conv.roles[0], qs) 48 | conv.append_message(conv.roles[1], None) 49 | prompt = conv.get_prompt() 50 
| 51 | image = Image.open(os.path.join(self.image_folder, image_file)).convert('RGB') 52 | image_tensor = process_images([image], self.image_processor, self.model_config)[0] 53 | 54 | input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt') 55 | 56 | return input_ids, image_tensor 57 | 58 | def __len__(self): 59 | return len(self.questions) 60 | 61 | 62 | # DataLoader 63 | def create_data_loader(questions, image_folder, tokenizer, image_processor, model_config, batch_size=1, num_workers=4): 64 | assert batch_size == 1, "batch_size must be 1" 65 | dataset = CustomDataset(questions, image_folder, tokenizer, image_processor, model_config) 66 | data_loader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False) 67 | return data_loader 68 | 69 | 70 | def eval_model(args): 71 | # Model 72 | disable_torch_init() 73 | model_path = os.path.expanduser(args.model_path) 74 | model_name = get_model_name_from_path(model_path) 75 | tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name, 76 | args.model_type) 77 | 78 | questions = [json.loads(q) for q in open(os.path.expanduser(args.question_file), "r")] 79 | questions = get_chunk(questions, args.num_chunks, args.chunk_idx) 80 | answers_file = os.path.expanduser(args.answers_file) 81 | os.makedirs(os.path.dirname(answers_file), exist_ok=True) 82 | ans_file = open(answers_file, "w") 83 | 84 | if 'plain' in model_name and 'finetune' not in model_name.lower() and 'mmtag' not in args.conv_mode: 85 | args.conv_mode = args.conv_mode + '_mmtag' 86 | print( 87 | f'It seems that this is a plain model, but it is not using a mmtag prompt, auto switching to {args.conv_mode}.') 88 | 89 | data_loader = create_data_loader(questions, args.image_folder, tokenizer, image_processor, model.config) 90 | 91 | for (input_ids, image_tensor), line in tqdm(zip(data_loader, questions), total=len(questions)): 92 | idx = line["question_id"] 93 | cur_prompt = line["text"] 94 | 95 | input_ids = input_ids.to(device='cuda', non_blocking=True) 96 | 97 | with torch.inference_mode(): 98 | output_ids = model.generate( 99 | input_ids, 100 | images=image_tensor.to(dtype=model.dtype, device='cuda', non_blocking=True), 101 | do_sample=True if args.temperature > 0 else False, 102 | temperature=args.temperature, 103 | top_p=args.top_p, 104 | num_beams=args.num_beams, 105 | max_new_tokens=args.max_new_tokens, 106 | use_cache=True) 107 | 108 | input_token_len = input_ids.shape[1] 109 | n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() 110 | if n_diff_input_output > 0: 111 | print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 112 | outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 113 | outputs = outputs.strip() 114 | 115 | ans_id = shortuuid.uuid() 116 | ans_file.write(json.dumps({"question_id": idx, 117 | "prompt": cur_prompt, 118 | "text": outputs, 119 | "answer_id": ans_id, 120 | "model_id": model_name, 121 | "metadata": {}}) + "\n") 122 | # ans_file.flush() 123 | ans_file.close() 124 | 125 | 126 | if __name__ == "__main__": 127 | parser = argparse.ArgumentParser() 128 | parser.add_argument("--model-path", type=str, default=None) 129 | parser.add_argument("--model-base", type=str, default=None) 130 | parser.add_argument("--model-type", type=str, default=None) 131 | parser.add_argument("--image-folder", type=str, default=None) 132 | 
parser.add_argument("--question-file", type=str, default=None) 133 | parser.add_argument("--answers-file", type=str, default=None) 134 | parser.add_argument("--conv-mode", type=str, default=None) 135 | parser.add_argument("--num-chunks", type=int, default=1) 136 | parser.add_argument("--chunk-idx", type=int, default=0) 137 | parser.add_argument("--temperature", type=float, default=0.2) 138 | parser.add_argument("--top_p", type=float, default=None) 139 | parser.add_argument("--num_beams", type=int, default=1) 140 | parser.add_argument("--max_new_tokens", type=int, default=128) 141 | args = parser.parse_args() 142 | 143 | eval_model(args) 144 | -------------------------------------------------------------------------------- /bunny/eval/model_vqa_science.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import os 4 | import json 5 | from tqdm import tqdm 6 | import shortuuid 7 | 8 | from bunny.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN 9 | from bunny.conversation import conv_templates, SeparatorStyle 10 | from bunny.model.builder import load_pretrained_model 11 | from bunny.util.utils import disable_torch_init 12 | from bunny.util.mm_utils import tokenizer_image_token, get_model_name_from_path 13 | 14 | from PIL import Image 15 | import math 16 | 17 | 18 | def split_list(lst, n): 19 | """Split a list into n (roughly) equal-sized chunks""" 20 | chunk_size = math.ceil(len(lst) / n) # integer division 21 | return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)] 22 | 23 | 24 | def get_chunk(lst, n, k): 25 | chunks = split_list(lst, n) 26 | return chunks[k] 27 | 28 | 29 | def eval_model(args): 30 | # Model 31 | disable_torch_init() 32 | model_path = os.path.expanduser(args.model_path) 33 | model_name = get_model_name_from_path(model_path) 34 | tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name, 35 | args.model_type) 36 | 37 | questions = json.load(open(os.path.expanduser(args.question_file), "r")) 38 | questions = get_chunk(questions, args.num_chunks, args.chunk_idx) 39 | answers_file = os.path.expanduser(args.answers_file) 40 | os.makedirs(os.path.dirname(answers_file), exist_ok=True) 41 | ans_file = open(answers_file, "w") 42 | for i, line in enumerate(tqdm(questions)): 43 | idx = line["id"] 44 | question = line['conversations'][0] 45 | qs = question['value'].replace('', '').strip() 46 | cur_prompt = qs 47 | 48 | if 'image' in line: 49 | image_file = line["image"] 50 | image = Image.open(os.path.join(args.image_folder, image_file)) 51 | image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0] 52 | images = image_tensor.unsqueeze(0).to(dtype=model.dtype, device='cuda', non_blocking=True) 53 | 54 | qs = DEFAULT_IMAGE_TOKEN + '\n' + qs 55 | cur_prompt = '' + '\n' + cur_prompt 56 | else: 57 | images = None 58 | 59 | if args.single_pred_prompt: 60 | qs = qs + '\n' + "Answer with the option's letter from the given choices directly." 61 | cur_prompt = cur_prompt + '\n' + "Answer with the option's letter from the given choices directly." 
62 | 63 | conv = conv_templates[args.conv_mode].copy() 64 | conv.append_message(conv.roles[0], qs) 65 | conv.append_message(conv.roles[1], None) 66 | prompt = conv.get_prompt() 67 | 68 | input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() 69 | 70 | stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2 71 | 72 | with torch.inference_mode(): 73 | output_ids = model.generate( 74 | input_ids, 75 | images=images, 76 | do_sample=True if args.temperature > 0 else False, 77 | temperature=args.temperature, 78 | max_new_tokens=1024, 79 | use_cache=True 80 | ) 81 | 82 | input_token_len = input_ids.shape[1] 83 | n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() 84 | if n_diff_input_output > 0: 85 | print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 86 | outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 87 | outputs = outputs.strip() 88 | if outputs.endswith(stop_str): 89 | outputs = outputs[:-len(stop_str)] 90 | outputs = outputs.strip() 91 | 92 | ans_id = shortuuid.uuid() 93 | ans_file.write(json.dumps({"question_id": idx, 94 | "prompt": cur_prompt, 95 | "text": outputs, 96 | "answer_id": ans_id, 97 | "model_id": model_name, 98 | "metadata": {}}) + "\n") 99 | ans_file.flush() 100 | ans_file.close() 101 | 102 | 103 | if __name__ == "__main__": 104 | parser = argparse.ArgumentParser() 105 | parser.add_argument("--model-path", type=str, default=None) 106 | parser.add_argument("--model-base", type=str, default=None) 107 | parser.add_argument("--model-type", type=str, default=None) 108 | parser.add_argument("--image-folder", type=str, default=None) 109 | parser.add_argument("--question-file", type=str, default=None) 110 | parser.add_argument("--answers-file", type=str, default=None) 111 | parser.add_argument("--conv-mode", type=str, default=None) 112 | parser.add_argument("--num-chunks", type=int, default=1) 113 | parser.add_argument("--chunk-idx", type=int, default=0) 114 | parser.add_argument("--temperature", type=float, default=0.2) 115 | parser.add_argument("--single-pred-prompt", action="store_true") 116 | 117 | args = parser.parse_args() 118 | 119 | eval_model(args) 120 | -------------------------------------------------------------------------------- /bunny/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .language_model.bunny_phi import BunnyPhiForCausalLM, BunnyPhiConfig 2 | from .language_model.bunny_stablelm import BunnyStableLMForCausalLM, BunnyStableLMConfig 3 | from .language_model.bunny_qwen import BunnyQwen2ForCausalLM, BunnyQwen2Config 4 | from .language_model.bunny_minicpm import BunnyMiniCPMForCausalLM, BunnyMiniCPMConfig 5 | from .language_model.bunny_llama import BunnyLlamaForCausalLM, BunnyLlamaConfig 6 | from .language_model.bunny_phi3 import BunnyPhi3ForCausalLM, BunnyPhi3Config 7 | -------------------------------------------------------------------------------- /bunny/model/language_model/bunny_llama.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional, Tuple, Union 2 | 3 | import torch 4 | import torch.nn as nn 5 | from transformers import AutoConfig, AutoModelForCausalLM 6 | 7 | from .llama import LlamaModel, LlamaConfig, LlamaForCausalLM 8 | 9 | from transformers.modeling_outputs import CausalLMOutputWithPast 10 | 11 | from ..bunny_arch import BunnyMetaModel, 
BunnyMetaForCausalLM 12 | 13 | 14 | class BunnyLlamaConfig(LlamaConfig): 15 | model_type = "bunny-llama" 16 | 17 | 18 | class BunnyLlamaModel(BunnyMetaModel, LlamaModel): 19 | config_class = BunnyLlamaConfig 20 | 21 | def __init__(self, config: LlamaConfig): 22 | super(BunnyLlamaModel, self).__init__(config) 23 | 24 | 25 | class BunnyLlamaForCausalLM(LlamaForCausalLM, BunnyMetaForCausalLM): 26 | config_class = BunnyLlamaConfig 27 | 28 | def __init__(self, config): 29 | super(LlamaForCausalLM, self).__init__(config) 30 | self.model = BunnyLlamaModel(config) 31 | self.vocab_size = config.vocab_size 32 | self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) 33 | 34 | # Initialize weights and apply final processing 35 | self.post_init() 36 | 37 | def get_model(self): 38 | return self.model 39 | 40 | def forward( 41 | self, 42 | input_ids: torch.LongTensor = None, 43 | attention_mask: Optional[torch.Tensor] = None, 44 | position_ids: Optional[torch.LongTensor] = None, 45 | past_key_values: Optional[List[torch.FloatTensor]] = None, 46 | inputs_embeds: Optional[torch.FloatTensor] = None, 47 | labels: Optional[torch.LongTensor] = None, 48 | use_cache: Optional[bool] = None, 49 | output_attentions: Optional[bool] = None, 50 | output_hidden_states: Optional[bool] = None, 51 | images: Optional[torch.FloatTensor] = None, 52 | return_dict: Optional[bool] = None, 53 | cache_position: Optional[torch.LongTensor] = None, 54 | ) -> Union[Tuple, CausalLMOutputWithPast]: 55 | if inputs_embeds is None: 56 | ( 57 | input_ids, 58 | position_ids, 59 | attention_mask, 60 | past_key_values, 61 | inputs_embeds, 62 | labels 63 | ) = self.prepare_inputs_labels_for_multimodal( 64 | input_ids, 65 | position_ids, 66 | attention_mask, 67 | past_key_values, 68 | labels, 69 | images 70 | ) 71 | 72 | return super().forward( 73 | input_ids=input_ids, 74 | attention_mask=attention_mask, 75 | position_ids=position_ids, 76 | past_key_values=past_key_values, 77 | inputs_embeds=inputs_embeds, 78 | labels=labels, 79 | use_cache=use_cache, 80 | output_attentions=output_attentions, 81 | output_hidden_states=output_hidden_states, 82 | return_dict=return_dict, 83 | cache_position=None 84 | ) 85 | 86 | def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, attention_mask=None, 87 | **kwargs): 88 | images = kwargs.pop("images", None) 89 | 90 | _inputs = super().prepare_inputs_for_generation( 91 | input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, attention_mask=attention_mask, 92 | **kwargs 93 | ) 94 | 95 | if images is not None: 96 | _inputs['images'] = images 97 | 98 | return _inputs 99 | 100 | 101 | AutoConfig.register("bunny-llama", BunnyLlamaConfig) 102 | AutoModelForCausalLM.register(BunnyLlamaConfig, BunnyLlamaForCausalLM) 103 | -------------------------------------------------------------------------------- /bunny/model/language_model/bunny_minicpm.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional, Tuple, Union 2 | 3 | import torch 4 | import torch.nn as nn 5 | from transformers import AutoConfig, AutoModelForCausalLM 6 | 7 | from bunny.model.language_model.minicpm.modeling_minicpm import MiniCPMModel, MiniCPMForCausalLM 8 | from bunny.model.language_model.minicpm.configuration_minicpm import MiniCPMConfig 9 | 10 | from transformers.modeling_outputs import CausalLMOutputWithPast 11 | 12 | from ..bunny_arch import BunnyMetaModel, BunnyMetaForCausalLM 13 | 14 | 15 | class 
BunnyMiniCPMConfig(MiniCPMConfig): 16 | model_type = "bunny-minicpm" 17 | 18 | 19 | class BunnyMiniCPMModel(BunnyMetaModel, MiniCPMModel): 20 | config_class = BunnyMiniCPMConfig 21 | 22 | def __init__(self, config: MiniCPMConfig): 23 | super(BunnyMiniCPMModel, self).__init__(config) 24 | 25 | 26 | class BunnyMiniCPMForCausalLM(MiniCPMForCausalLM, BunnyMetaForCausalLM): 27 | config_class = BunnyMiniCPMConfig 28 | 29 | def __init__(self, config): 30 | super(MiniCPMForCausalLM, self).__init__(config) 31 | self.model = BunnyMiniCPMModel(config) 32 | self.vocab_size = config.vocab_size 33 | self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) 34 | 35 | # Initialize weights and apply final processing 36 | self.post_init() 37 | 38 | def get_model(self): 39 | return self.model 40 | 41 | def forward( 42 | self, 43 | input_ids: torch.LongTensor = None, 44 | attention_mask: Optional[torch.Tensor] = None, 45 | position_ids: Optional[torch.LongTensor] = None, 46 | past_key_values: Optional[List[torch.FloatTensor]] = None, 47 | inputs_embeds: Optional[torch.FloatTensor] = None, 48 | labels: Optional[torch.LongTensor] = None, 49 | use_cache: Optional[bool] = None, 50 | output_attentions: Optional[bool] = None, 51 | output_hidden_states: Optional[bool] = None, 52 | images: Optional[torch.FloatTensor] = None, 53 | return_dict: Optional[bool] = None, 54 | ) -> Union[Tuple, CausalLMOutputWithPast]: 55 | 56 | if inputs_embeds is None: 57 | ( 58 | input_ids, 59 | position_ids, 60 | attention_mask, 61 | past_key_values, 62 | inputs_embeds, 63 | labels 64 | ) = self.prepare_inputs_labels_for_multimodal( 65 | input_ids, 66 | position_ids, 67 | attention_mask, 68 | past_key_values, 69 | labels, 70 | images 71 | ) 72 | if inputs_embeds is not None: 73 | inputs_embeds *= self.get_model().config.scale_emb 74 | 75 | return super().forward( 76 | input_ids=input_ids, 77 | attention_mask=attention_mask, 78 | position_ids=position_ids, 79 | past_key_values=past_key_values, 80 | inputs_embeds=inputs_embeds, 81 | labels=labels, 82 | use_cache=use_cache, 83 | output_attentions=output_attentions, 84 | output_hidden_states=output_hidden_states, 85 | return_dict=return_dict 86 | ) 87 | 88 | def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, attention_mask=None, 89 | **kwargs): 90 | images = kwargs.pop("images", None) 91 | 92 | _inputs = super().prepare_inputs_for_generation( 93 | input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, attention_mask=attention_mask, 94 | **kwargs 95 | ) 96 | 97 | if images is not None: 98 | _inputs['images'] = images 99 | return _inputs 100 | 101 | 102 | AutoConfig.register("bunny-minicpm", BunnyMiniCPMConfig) 103 | AutoModelForCausalLM.register(BunnyMiniCPMConfig, BunnyMiniCPMForCausalLM) 104 | -------------------------------------------------------------------------------- /bunny/model/language_model/bunny_phi.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional, Tuple, Union 2 | 3 | import torch 4 | import torch.nn as nn 5 | from transformers import AutoConfig, AutoModelForCausalLM 6 | 7 | from .phi import PhiModel, PhiConfig, PhiForCausalLM 8 | 9 | from transformers.modeling_outputs import CausalLMOutputWithPast 10 | 11 | from ..bunny_arch import BunnyMetaModel, BunnyMetaForCausalLM 12 | 13 | 14 | class BunnyPhiConfig(PhiConfig): 15 | model_type = "bunny-phi" 16 | 17 | 18 | class BunnyPhiModel(BunnyMetaModel, PhiModel): 19 | config_class = 
BunnyPhiConfig 20 | 21 | def __init__(self, config: PhiConfig): 22 | super(BunnyPhiModel, self).__init__(config) 23 | 24 | 25 | class BunnyPhiForCausalLM(PhiForCausalLM, BunnyMetaForCausalLM): 26 | config_class = BunnyPhiConfig 27 | 28 | def __init__(self, config): 29 | super(PhiForCausalLM, self).__init__(config) 30 | self.model = BunnyPhiModel(config) 31 | self.vocab_size = config.vocab_size 32 | self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) 33 | 34 | # Initialize weights and apply final processing 35 | self.post_init() 36 | 37 | def get_model(self): 38 | return self.model 39 | 40 | def forward( 41 | self, 42 | input_ids: torch.LongTensor = None, 43 | attention_mask: Optional[torch.Tensor] = None, 44 | position_ids: Optional[torch.LongTensor] = None, 45 | past_key_values: Optional[List[torch.FloatTensor]] = None, 46 | inputs_embeds: Optional[torch.FloatTensor] = None, 47 | labels: Optional[torch.LongTensor] = None, 48 | use_cache: Optional[bool] = None, 49 | output_attentions: Optional[bool] = None, 50 | output_hidden_states: Optional[bool] = None, 51 | images: Optional[torch.FloatTensor] = None, 52 | return_dict: Optional[bool] = None, 53 | ) -> Union[Tuple, CausalLMOutputWithPast]: 54 | 55 | if inputs_embeds is None: 56 | ( 57 | input_ids, 58 | position_ids, 59 | attention_mask, 60 | past_key_values, 61 | inputs_embeds, 62 | labels 63 | ) = self.prepare_inputs_labels_for_multimodal( 64 | input_ids, 65 | position_ids, 66 | attention_mask, 67 | past_key_values, 68 | labels, 69 | images 70 | ) 71 | 72 | return super().forward( 73 | input_ids=input_ids, 74 | attention_mask=attention_mask, 75 | position_ids=position_ids, 76 | past_key_values=past_key_values, 77 | inputs_embeds=inputs_embeds, 78 | labels=labels, 79 | use_cache=use_cache, 80 | output_attentions=output_attentions, 81 | output_hidden_states=output_hidden_states, 82 | return_dict=return_dict 83 | ) 84 | 85 | def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, attention_mask=None, 86 | **kwargs): 87 | images = kwargs.pop("images", None) 88 | 89 | _inputs = super().prepare_inputs_for_generation( 90 | input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, attention_mask=attention_mask, 91 | **kwargs 92 | ) 93 | 94 | if images is not None: 95 | _inputs['images'] = images 96 | return _inputs 97 | 98 | 99 | AutoConfig.register("bunny-phi", BunnyPhiConfig) 100 | AutoModelForCausalLM.register(BunnyPhiConfig, BunnyPhiForCausalLM) 101 | -------------------------------------------------------------------------------- /bunny/model/language_model/bunny_phi3.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional, Tuple, Union 2 | 3 | import torch 4 | import torch.nn as nn 5 | from transformers import AutoConfig, AutoModelForCausalLM 6 | 7 | from .phi3 import Phi3Model, Phi3Config, Phi3ForCausalLM 8 | 9 | from transformers.modeling_outputs import CausalLMOutputWithPast 10 | 11 | from ..bunny_arch import BunnyMetaModel, BunnyMetaForCausalLM 12 | 13 | 14 | class BunnyPhi3Config(Phi3Config): 15 | model_type = "bunny-phi3" 16 | 17 | 18 | class BunnyPhi3Model(BunnyMetaModel, Phi3Model): 19 | config_class = BunnyPhi3Config 20 | 21 | def __init__(self, config: Phi3Config): 22 | super(BunnyPhi3Model, self).__init__(config) 23 | 24 | 25 | class BunnyPhi3ForCausalLM(Phi3ForCausalLM, BunnyMetaForCausalLM): 26 | config_class = BunnyPhi3Config 27 | 28 | def __init__(self, config): 29 | 
super(Phi3ForCausalLM, self).__init__(config) 30 | self.model = BunnyPhi3Model(config) 31 | self.vocab_size = config.vocab_size 32 | self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) 33 | 34 | # Initialize weights and apply final processing 35 | self.post_init() 36 | 37 | def get_model(self): 38 | return self.model 39 | 40 | def forward( 41 | self, 42 | input_ids: torch.LongTensor = None, 43 | attention_mask: Optional[torch.Tensor] = None, 44 | position_ids: Optional[torch.LongTensor] = None, 45 | past_key_values: Optional[List[torch.FloatTensor]] = None, 46 | inputs_embeds: Optional[torch.FloatTensor] = None, 47 | labels: Optional[torch.LongTensor] = None, 48 | use_cache: Optional[bool] = None, 49 | output_attentions: Optional[bool] = None, 50 | output_hidden_states: Optional[bool] = None, 51 | images: Optional[torch.FloatTensor] = None, 52 | return_dict: Optional[bool] = None, 53 | ) -> Union[Tuple, CausalLMOutputWithPast]: 54 | 55 | if inputs_embeds is None: 56 | ( 57 | input_ids, 58 | position_ids, 59 | attention_mask, 60 | past_key_values, 61 | inputs_embeds, 62 | labels 63 | ) = self.prepare_inputs_labels_for_multimodal( 64 | input_ids, 65 | position_ids, 66 | attention_mask, 67 | past_key_values, 68 | labels, 69 | images 70 | ) 71 | 72 | return super().forward( 73 | input_ids=input_ids, 74 | attention_mask=attention_mask, 75 | position_ids=position_ids, 76 | past_key_values=past_key_values, 77 | inputs_embeds=inputs_embeds, 78 | labels=labels, 79 | use_cache=use_cache, 80 | output_attentions=output_attentions, 81 | output_hidden_states=output_hidden_states, 82 | return_dict=return_dict 83 | ) 84 | 85 | def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, attention_mask=None, 86 | **kwargs): 87 | images = kwargs.pop("images", None) 88 | 89 | _inputs = super().prepare_inputs_for_generation( 90 | input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, attention_mask=attention_mask, 91 | **kwargs 92 | ) 93 | 94 | if images is not None: 95 | _inputs['images'] = images 96 | return _inputs 97 | 98 | 99 | AutoConfig.register("bunny-phi3", BunnyPhi3Config) 100 | AutoModelForCausalLM.register(BunnyPhi3Config, BunnyPhi3ForCausalLM) 101 | -------------------------------------------------------------------------------- /bunny/model/language_model/bunny_qwen.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional, Tuple, Union 2 | 3 | import torch 4 | import torch.nn as nn 5 | from transformers import AutoConfig, AutoModelForCausalLM 6 | 7 | from .qwen2 import Qwen2Model, Qwen2Config, Qwen2ForCausalLM 8 | 9 | from transformers.modeling_outputs import CausalLMOutputWithPast 10 | 11 | from ..bunny_arch import BunnyMetaModel, BunnyMetaForCausalLM 12 | 13 | 14 | class BunnyQwen2Config(Qwen2Config): 15 | model_type = "bunny-qwen2" 16 | 17 | 18 | class BunnyQwen2Model(BunnyMetaModel, Qwen2Model): 19 | config_class = BunnyQwen2Config 20 | 21 | def __init__(self, config: Qwen2Config): 22 | super(BunnyQwen2Model, self).__init__(config) 23 | 24 | 25 | class BunnyQwen2ForCausalLM(Qwen2ForCausalLM, BunnyMetaForCausalLM): 26 | config_class = BunnyQwen2Config 27 | 28 | def __init__(self, config): 29 | super(Qwen2ForCausalLM, self).__init__(config) 30 | self.model = BunnyQwen2Model(config) 31 | self.vocab_size = config.vocab_size 32 | self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) 33 | 34 | # Initialize weights and apply final 
processing 35 | self.post_init() 36 | 37 | def get_model(self): 38 | return self.model 39 | 40 | def forward( 41 | self, 42 | input_ids: torch.LongTensor = None, 43 | attention_mask: Optional[torch.Tensor] = None, 44 | position_ids: Optional[torch.LongTensor] = None, 45 | past_key_values: Optional[List[torch.FloatTensor]] = None, 46 | inputs_embeds: Optional[torch.FloatTensor] = None, 47 | labels: Optional[torch.LongTensor] = None, 48 | use_cache: Optional[bool] = None, 49 | output_attentions: Optional[bool] = None, 50 | output_hidden_states: Optional[bool] = None, 51 | images: Optional[torch.FloatTensor] = None, 52 | return_dict: Optional[bool] = None, 53 | ) -> Union[Tuple, CausalLMOutputWithPast]: 54 | 55 | if inputs_embeds is None: 56 | ( 57 | input_ids, 58 | position_ids, 59 | attention_mask, 60 | past_key_values, 61 | inputs_embeds, 62 | labels 63 | ) = self.prepare_inputs_labels_for_multimodal( 64 | input_ids, 65 | position_ids, 66 | attention_mask, 67 | past_key_values, 68 | labels, 69 | images 70 | ) 71 | 72 | return super().forward( 73 | input_ids=input_ids, 74 | attention_mask=attention_mask, 75 | position_ids=position_ids, 76 | past_key_values=past_key_values, 77 | inputs_embeds=inputs_embeds, 78 | labels=labels, 79 | use_cache=use_cache, 80 | output_attentions=output_attentions, 81 | output_hidden_states=output_hidden_states, 82 | return_dict=return_dict 83 | ) 84 | 85 | def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, attention_mask=None, 86 | **kwargs): 87 | images = kwargs.pop("images", None) 88 | 89 | _inputs = super().prepare_inputs_for_generation( 90 | input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, attention_mask=attention_mask, 91 | **kwargs 92 | ) 93 | 94 | if images is not None: 95 | _inputs['images'] = images 96 | return _inputs 97 | 98 | 99 | AutoConfig.register("bunny-qwen2", BunnyQwen2Config) 100 | AutoModelForCausalLM.register(BunnyQwen2Config, BunnyQwen2ForCausalLM) 101 | -------------------------------------------------------------------------------- /bunny/model/language_model/bunny_stablelm.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional, Tuple, Union 2 | 3 | import torch 4 | import torch.nn as nn 5 | from transformers import AutoConfig, AutoModelForCausalLM 6 | 7 | from bunny.model.language_model.stable_lm.modeling_stablelm_epoch import StableLMEpochModel, StableLMEpochConfig, \ 8 | StableLMEpochForCausalLM 9 | 10 | from transformers.modeling_outputs import CausalLMOutputWithPast 11 | 12 | from bunny.model.bunny_arch import BunnyMetaModel, BunnyMetaForCausalLM 13 | 14 | 15 | class BunnyStableLMConfig(StableLMEpochConfig): 16 | model_type = "bunny-stablelm" 17 | 18 | 19 | class BunnyStableLMModel(BunnyMetaModel, StableLMEpochModel): 20 | config_class = BunnyStableLMConfig 21 | 22 | def __init__(self, config: StableLMEpochConfig): 23 | super(BunnyStableLMModel, self).__init__(config) 24 | 25 | 26 | class BunnyStableLMForCausalLM(StableLMEpochForCausalLM, BunnyMetaForCausalLM): 27 | config_class = BunnyStableLMConfig 28 | 29 | def __init__(self, config): 30 | super(StableLMEpochForCausalLM, self).__init__(config) 31 | self.model = BunnyStableLMModel(config) 32 | self.vocab_size = config.vocab_size 33 | self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) 34 | 35 | # Initialize weights and apply final processing 36 | self.post_init() 37 | 38 | def get_model(self): 39 | return self.model 40 | 41 | 
def forward( 42 | self, 43 | input_ids: torch.LongTensor = None, 44 | attention_mask: Optional[torch.Tensor] = None, 45 | position_ids: Optional[torch.LongTensor] = None, 46 | past_key_values: Optional[List[torch.FloatTensor]] = None, 47 | inputs_embeds: Optional[torch.FloatTensor] = None, 48 | labels: Optional[torch.LongTensor] = None, 49 | use_cache: Optional[bool] = None, 50 | output_attentions: Optional[bool] = None, 51 | output_hidden_states: Optional[bool] = None, 52 | images: Optional[torch.FloatTensor] = None, 53 | return_dict: Optional[bool] = None, 54 | ) -> Union[Tuple, CausalLMOutputWithPast]: 55 | if inputs_embeds is None: 56 | ( 57 | input_ids, 58 | position_ids, 59 | attention_mask, 60 | past_key_values, 61 | inputs_embeds, 62 | labels 63 | ) = self.prepare_inputs_labels_for_multimodal( 64 | input_ids, 65 | position_ids, 66 | attention_mask, 67 | past_key_values, 68 | labels, 69 | images 70 | ) 71 | 72 | return super().forward( 73 | input_ids=input_ids, 74 | attention_mask=attention_mask, 75 | position_ids=position_ids, 76 | past_key_values=past_key_values, 77 | inputs_embeds=inputs_embeds, 78 | labels=labels, 79 | use_cache=use_cache, 80 | output_attentions=output_attentions, 81 | output_hidden_states=output_hidden_states, 82 | return_dict=return_dict 83 | ) 84 | 85 | def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, attention_mask=None, 86 | **kwargs): 87 | images = kwargs.pop("images", None) 88 | 89 | _inputs = super().prepare_inputs_for_generation( 90 | input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, attention_mask=attention_mask, 91 | **kwargs 92 | ) 93 | 94 | if images is not None: 95 | _inputs['images'] = images 96 | return _inputs 97 | 98 | 99 | AutoConfig.register("bunny-stablelm", BunnyStableLMConfig) 100 | AutoModelForCausalLM.register(BunnyStableLMConfig, BunnyStableLMForCausalLM) 101 | -------------------------------------------------------------------------------- /bunny/model/language_model/llama/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 EleutherAI and The HuggingFace Inc. team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | from typing import TYPE_CHECKING 15 | 16 | from transformers.utils import ( 17 | OptionalDependencyNotAvailable, 18 | _LazyModule, 19 | is_flax_available, 20 | is_sentencepiece_available, 21 | is_tokenizers_available, 22 | is_torch_available, 23 | ) 24 | 25 | 26 | _import_structure = { 27 | "configuration_llama": ["LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP", "LlamaConfig"], 28 | } 29 | 30 | try: 31 | if not is_sentencepiece_available(): 32 | raise OptionalDependencyNotAvailable() 33 | except OptionalDependencyNotAvailable: 34 | pass 35 | else: 36 | _import_structure["tokenization_llama"] = ["LlamaTokenizer"] 37 | 38 | try: 39 | if not is_tokenizers_available(): 40 | raise OptionalDependencyNotAvailable() 41 | except OptionalDependencyNotAvailable: 42 | pass 43 | else: 44 | _import_structure["tokenization_llama_fast"] = ["LlamaTokenizerFast"] 45 | 46 | try: 47 | if not is_torch_available(): 48 | raise OptionalDependencyNotAvailable() 49 | except OptionalDependencyNotAvailable: 50 | pass 51 | else: 52 | _import_structure["modeling_llama"] = [ 53 | "LlamaForCausalLM", 54 | "LlamaModel", 55 | "LlamaPreTrainedModel", 56 | "LlamaForSequenceClassification", 57 | "LlamaForQuestionAnswering", 58 | ] 59 | 60 | try: 61 | if not is_flax_available(): 62 | raise OptionalDependencyNotAvailable() 63 | except OptionalDependencyNotAvailable: 64 | pass 65 | else: 66 | _import_structure["modeling_flax_llama"] = ["FlaxLlamaForCausalLM", "FlaxLlamaModel", "FlaxLlamaPreTrainedModel"] 67 | 68 | 69 | if TYPE_CHECKING: 70 | from .configuration_llama import LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP, LlamaConfig 71 | 72 | try: 73 | if not is_sentencepiece_available(): 74 | raise OptionalDependencyNotAvailable() 75 | except OptionalDependencyNotAvailable: 76 | pass 77 | else: 78 | from .tokenization_llama import LlamaTokenizer 79 | 80 | try: 81 | if not is_tokenizers_available(): 82 | raise OptionalDependencyNotAvailable() 83 | except OptionalDependencyNotAvailable: 84 | pass 85 | else: 86 | from .tokenization_llama_fast import LlamaTokenizerFast 87 | 88 | try: 89 | if not is_torch_available(): 90 | raise OptionalDependencyNotAvailable() 91 | except OptionalDependencyNotAvailable: 92 | pass 93 | else: 94 | from .modeling_llama import ( 95 | LlamaForCausalLM, 96 | LlamaForQuestionAnswering, 97 | LlamaForSequenceClassification, 98 | LlamaModel, 99 | LlamaPreTrainedModel, 100 | ) 101 | 102 | try: 103 | if not is_flax_available(): 104 | raise OptionalDependencyNotAvailable() 105 | except OptionalDependencyNotAvailable: 106 | pass 107 | else: 108 | from .modeling_flax_llama import FlaxLlamaForCausalLM, FlaxLlamaModel, FlaxLlamaPreTrainedModel 109 | 110 | 111 | else: 112 | import sys 113 | 114 | sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) 115 | -------------------------------------------------------------------------------- /bunny/model/language_model/phi/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Microsoft and The HuggingFace Inc. team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | from typing import TYPE_CHECKING 17 | 18 | from transformers.utils import ( 19 | OptionalDependencyNotAvailable, 20 | _LazyModule, 21 | is_sentencepiece_available, 22 | is_tokenizers_available, 23 | is_torch_available, 24 | ) 25 | 26 | 27 | _import_structure = { 28 | "configuration_phi": ["PHI_PRETRAINED_CONFIG_ARCHIVE_MAP", "PhiConfig"], 29 | } 30 | 31 | try: 32 | if not is_torch_available(): 33 | raise OptionalDependencyNotAvailable() 34 | except OptionalDependencyNotAvailable: 35 | pass 36 | else: 37 | _import_structure["modeling_phi"] = [ 38 | "PHI_PRETRAINED_MODEL_ARCHIVE_LIST", 39 | "PhiPreTrainedModel", 40 | "PhiModel", 41 | "PhiForCausalLM", 42 | "PhiForSequenceClassification", 43 | "PhiForTokenClassification", 44 | ] 45 | 46 | 47 | if TYPE_CHECKING: 48 | from .configuration_phi import PHI_PRETRAINED_CONFIG_ARCHIVE_MAP, PhiConfig 49 | 50 | try: 51 | if not is_torch_available(): 52 | raise OptionalDependencyNotAvailable() 53 | except OptionalDependencyNotAvailable: 54 | pass 55 | else: 56 | from .modeling_phi import ( 57 | PHI_PRETRAINED_MODEL_ARCHIVE_LIST, 58 | PhiForCausalLM, 59 | PhiForSequenceClassification, 60 | PhiForTokenClassification, 61 | PhiModel, 62 | PhiPreTrainedModel, 63 | ) 64 | 65 | 66 | else: 67 | import sys 68 | 69 | sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) 70 | -------------------------------------------------------------------------------- /bunny/model/language_model/phi3/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Microsoft and The HuggingFace Inc. team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | 16 | from typing import TYPE_CHECKING 17 | 18 | from transformers.utils import ( 19 | OptionalDependencyNotAvailable, 20 | _LazyModule, 21 | is_sentencepiece_available, 22 | is_tokenizers_available, 23 | is_torch_available, 24 | ) 25 | 26 | 27 | _import_structure = { 28 | "configuration_phi3": ["PHI3_PRETRAINED_CONFIG_ARCHIVE_MAP", "Phi3Config"], 29 | } 30 | 31 | try: 32 | if not is_torch_available(): 33 | raise OptionalDependencyNotAvailable() 34 | except OptionalDependencyNotAvailable: 35 | pass 36 | else: 37 | _import_structure["modeling_phi3"] = [ 38 | "PHI3_PRETRAINED_MODEL_ARCHIVE_LIST", 39 | "Phi3PreTrainedModel", 40 | "Phi3Model", 41 | "Phi3ForCausalLM", 42 | "Phi3ForSequenceClassification", 43 | "Phi3ForTokenClassification", 44 | ] 45 | 46 | 47 | if TYPE_CHECKING: 48 | from .configuration_phi3 import PHI3_PRETRAINED_CONFIG_ARCHIVE_MAP, Phi3Config 49 | 50 | try: 51 | if not is_torch_available(): 52 | raise OptionalDependencyNotAvailable() 53 | except OptionalDependencyNotAvailable: 54 | pass 55 | else: 56 | from .modeling_phi3 import ( 57 | PHI3_PRETRAINED_MODEL_ARCHIVE_LIST, 58 | Phi3ForCausalLM, 59 | Phi3ForSequenceClassification, 60 | Phi3ForTokenClassification, 61 | Phi3Model, 62 | Phi3PreTrainedModel, 63 | ) 64 | 65 | 66 | else: 67 | import sys 68 | 69 | sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) 70 | -------------------------------------------------------------------------------- /bunny/model/language_model/qwen2/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The Qwen Team and The HuggingFace Inc. team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | from typing import TYPE_CHECKING 15 | 16 | from transformers.utils import ( 17 | OptionalDependencyNotAvailable, 18 | _LazyModule, 19 | is_tokenizers_available, 20 | is_torch_available, 21 | ) 22 | 23 | 24 | _import_structure = { 25 | "configuration_qwen2": ["QWEN2_PRETRAINED_CONFIG_ARCHIVE_MAP", "Qwen2Config"], 26 | "tokenization_qwen2": ["Qwen2Tokenizer"], 27 | } 28 | 29 | try: 30 | if not is_tokenizers_available(): 31 | raise OptionalDependencyNotAvailable() 32 | except OptionalDependencyNotAvailable: 33 | pass 34 | else: 35 | _import_structure["tokenization_qwen2_fast"] = ["Qwen2TokenizerFast"] 36 | 37 | try: 38 | if not is_torch_available(): 39 | raise OptionalDependencyNotAvailable() 40 | except OptionalDependencyNotAvailable: 41 | pass 42 | else: 43 | _import_structure["modeling_qwen2"] = [ 44 | "Qwen2ForCausalLM", 45 | "Qwen2Model", 46 | "Qwen2PreTrainedModel", 47 | "Qwen2ForSequenceClassification", 48 | ] 49 | 50 | 51 | if TYPE_CHECKING: 52 | from .configuration_qwen2 import QWEN2_PRETRAINED_CONFIG_ARCHIVE_MAP, Qwen2Config 53 | from .tokenization_qwen2 import Qwen2Tokenizer 54 | 55 | try: 56 | if not is_tokenizers_available(): 57 | raise OptionalDependencyNotAvailable() 58 | except OptionalDependencyNotAvailable: 59 | pass 60 | else: 61 | from .tokenization_qwen2_fast import Qwen2TokenizerFast 62 | 63 | try: 64 | if not is_torch_available(): 65 | raise OptionalDependencyNotAvailable() 66 | except OptionalDependencyNotAvailable: 67 | pass 68 | else: 69 | from .modeling_qwen2 import ( 70 | Qwen2ForCausalLM, 71 | Qwen2ForSequenceClassification, 72 | Qwen2Model, 73 | Qwen2PreTrainedModel, 74 | ) 75 | 76 | 77 | else: 78 | import sys 79 | 80 | sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) -------------------------------------------------------------------------------- /bunny/model/language_model/qwen2/tokenization_qwen2_fast.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2024 The Qwen team, Alibaba Group and The HuggingFace Inc. team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Tokenization classes for Qwen2.""" 16 | 17 | from typing import Optional, Tuple 18 | 19 | from transformers.tokenization_utils import AddedToken 20 | from transformers.tokenization_utils_fast import PreTrainedTokenizerFast 21 | from transformers.utils import logging 22 | from .tokenization_qwen2 import Qwen2Tokenizer 23 | 24 | 25 | logger = logging.get_logger(__name__) 26 | 27 | VOCAB_FILES_NAMES = { 28 | "vocab_file": "vocab.json", 29 | "merges_file": "merges.txt", 30 | "tokenizer_file": "tokenizer.json", 31 | } 32 | 33 | PRETRAINED_VOCAB_FILES_MAP = { 34 | "vocab_file": {"qwen/qwen-tokenizer": "https://huggingface.co/qwen/qwen-tokenizer/resolve/main/vocab.json"}, 35 | "merges_file": {"qwen/qwen-tokenizer": "https://huggingface.co/qwen/qwen-tokenizer/resolve/main/merges.txt"}, 36 | "tokenizer_file": { 37 | "qwen/qwen-tokenizer": "https://huggingface.co/qwen/qwen-tokenizer/resolve/main/tokenizer.json" 38 | }, 39 | } 40 | 41 | MAX_MODEL_INPUT_SIZES = {"qwen/qwen-tokenizer": 32768} 42 | 43 | 44 | class Qwen2TokenizerFast(PreTrainedTokenizerFast): 45 | """ 46 | Construct a "fast" Qwen2 tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level 47 | Byte-Pair-Encoding. 48 | 49 | Same with GPT2Tokenizer, this tokenizer has been trained to treat spaces like parts of the tokens so a word will 50 | be encoded differently whether it is at the beginning of the sentence (without space) or not: 51 | 52 | ```python 53 | >>> from transformers import Qwen2TokenizerFast 54 | 55 | >>> tokenizer = Qwen2TokenizerFast.from_pretrained("Qwen/Qwen-tokenizer") 56 | >>> tokenizer("Hello world")["input_ids"] 57 | [9707, 1879] 58 | 59 | >>> tokenizer(" Hello world")["input_ids"] 60 | [21927, 1879] 61 | ``` 62 | This is expected. 63 | 64 | This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should 65 | refer to this superclass for more information regarding those methods. 66 | 67 | Args: 68 | vocab_file (`str`, *optional*): 69 | Path to the vocabulary file. 70 | merges_file (`str`, *optional*): 71 | Path to the merges file. 72 | tokenizer_file (`str`, *optional*): 73 | Path to [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that 74 | contains everything needed to load the tokenizer. 75 | unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`): 76 | The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this 77 | token instead. Not applicable to this tokenizer. 78 | bos_token (`str`, *optional*): 79 | The beginning of sequence token. Not applicable for this tokenizer. 80 | eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`): 81 | The end of sequence token. 82 | pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`): 83 | The token used for padding, for example when batching sequences of different lengths. 
84 | """ 85 | 86 | vocab_files_names = VOCAB_FILES_NAMES 87 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 88 | max_model_input_sizes = MAX_MODEL_INPUT_SIZES 89 | model_input_names = ["input_ids", "attention_mask"] 90 | slow_tokenizer_class = Qwen2Tokenizer 91 | 92 | def __init__( 93 | self, 94 | vocab_file=None, 95 | merges_file=None, 96 | tokenizer_file=None, 97 | unk_token="<|endoftext|>", 98 | bos_token=None, 99 | eos_token="<|endoftext|>", 100 | pad_token="<|endoftext|>", 101 | **kwargs, 102 | ): 103 | # We need to at least pass vocab_file and merges_file to base class 104 | # in case a slow tokenizer needs to be initialized; other can be 105 | # configured through files. 106 | # following GPT2TokenizerFast, also adding unk_token, bos_token, and eos_token 107 | 108 | bos_token = ( 109 | AddedToken(bos_token, lstrip=False, rstrip=False, special=True, normalized=False) 110 | if isinstance(bos_token, str) 111 | else bos_token 112 | ) 113 | eos_token = ( 114 | AddedToken(eos_token, lstrip=False, rstrip=False, special=True, normalized=False) 115 | if isinstance(eos_token, str) 116 | else eos_token 117 | ) 118 | unk_token = ( 119 | AddedToken(unk_token, lstrip=False, rstrip=False, special=True, normalized=False) 120 | if isinstance(unk_token, str) 121 | else unk_token 122 | ) 123 | pad_token = ( 124 | AddedToken(pad_token, lstrip=False, rstrip=False, special=True, normalized=False) 125 | if isinstance(pad_token, str) 126 | else pad_token 127 | ) 128 | 129 | super().__init__( 130 | vocab_file, 131 | merges_file, 132 | tokenizer_file=tokenizer_file, 133 | unk_token=unk_token, 134 | bos_token=bos_token, 135 | eos_token=eos_token, 136 | pad_token=pad_token, 137 | **kwargs, 138 | ) 139 | 140 | # Copied from transformers.models.gpt2.tokenization_gpt2_fast.GPT2TokenizerFast.save_vocabulary 141 | def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: 142 | files = self._tokenizer.model.save(save_directory, name=filename_prefix) 143 | return tuple(files) -------------------------------------------------------------------------------- /bunny/model/language_model/stable_lm/configuration_stablelm_epoch.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Stability and The HuggingFace Inc. team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """ StableLM Epoch model configuration""" 15 | from transformers import PretrainedConfig 16 | from transformers.utils import logging 17 | 18 | 19 | logger = logging.get_logger(__name__) 20 | 21 | 22 | class StableLMEpochConfig(PretrainedConfig): 23 | r""" 24 | Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the 25 | documentation from [`PretrainedConfig`] for more information. 26 | 27 | Args: 28 | vocab_size (`int`, *optional*, defaults to 50_304): 29 | Vocabulary size of the StableLM model. 
Defines the number of different tokens that 30 | can be represented by the `inputs_ids` passed when calling [`StableLMEpochModel`]. 31 | intermediate_size (`int`, *optional*, defaults to 6912): 32 | Dimension of the MLP representations. 33 | hidden_size (`int`, *optional*, defaults to 2560): 34 | Dimension of the decoder layers and the pooler layer. 35 | num_hidden_layers (`int`, *optional*, defaults to 32): 36 | Number of hidden layers in the Transformer decoder. 37 | num_attention_heads (`int`, *optional*, defaults to 32): 38 | Number of attention heads for each attention layer in the Transformer decoder. 39 | num_key_value_heads (`int`, *optional*): 40 | This is the number of key_value heads that should be used to implement Grouped Query Attention. If 41 | `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA); if 42 | `num_key_value_heads=1`, the model will use Multi Query Attention (MQA); otherwise GQA is used. When 43 | converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed 44 | by mean-pooling all the original heads within that group. For more details check out [this 45 | paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, it will default to 46 | `num_attention_heads`. 47 | hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): 48 | The non-linear activation function (function or string). 49 | rope_pct (`float`, *optional*, defaults to 0.25): 50 | Percentage of hidden dimensions to allocate to rotary embeddings. 51 | rope_theta (`float`, *optional*, defaults to 10000.0): 52 | The base period of the RoPE embeddings. 53 | max_position_embeddings (`int`, *optional*, defaults to 4096): 54 | The maximum sequence length that this model might ever be used with. 55 | Typically set this to something large just in case (e.g., 512 or 1024 or 2048). 56 | initializer_range (`float`, *optional*, defaults to 0.02): 57 | The standard deviation of the truncated_normal_initializer for initializing 58 | all weight matrices. 59 | norm_eps (`float`, *optional*, defaults to 1e-5): 60 | The epsilon used by the normalization layers. 61 | use_cache (`bool`, *optional*, defaults to `True`): 62 | Whether or not the model should return the last key/values attentions 63 | (not used by all models). Only relevant if `config.is_decoder=True`. 64 | use_qkv_bias (`bool`, *optional*, defaults to `True`): 65 | Whether or not the model should use bias for qkv layers.
66 | tie_word_embeddings(`bool`, *optional*, defaults to `False`): 67 | Whether to tie weight embeddings 68 | """ 69 | model_type = "stablelm_epoch" 70 | keys_to_ignore_at_inference = ["past_key_values"] 71 | 72 | def __init__( 73 | self, 74 | vocab_size=50_304, 75 | intermediate_size=6912, 76 | hidden_size=2560, 77 | num_hidden_layers=32, 78 | num_attention_heads=32, 79 | num_key_value_heads=32, 80 | hidden_act="silu", 81 | rope_pct=0.25, 82 | rope_theta=10_000, 83 | max_position_embeddings=4096, 84 | initializer_range=0.02, 85 | norm_eps=1.0e-5, 86 | use_cache=True, 87 | use_qkv_bias=True, 88 | bos_token_id=0, 89 | eos_token_id=2, 90 | tie_word_embeddings=False, 91 | **kwargs, 92 | ): 93 | self.vocab_size = vocab_size 94 | self.max_position_embeddings = max_position_embeddings 95 | self.intermediate_size = intermediate_size 96 | self.hidden_size = hidden_size 97 | self.num_hidden_layers = num_hidden_layers 98 | self.num_attention_heads = num_attention_heads 99 | self.num_key_value_heads = num_key_value_heads 100 | self.hidden_act = hidden_act 101 | self.rope_pct = rope_pct 102 | self.rope_theta = rope_theta 103 | self.initializer_range = initializer_range 104 | self.norm_eps = norm_eps 105 | self.use_cache = use_cache 106 | self.use_qkv_bias = use_qkv_bias 107 | self.tie_word_embeddings = tie_word_embeddings 108 | super().__init__( 109 | bos_token_id=bos_token_id, 110 | eos_token_id=eos_token_id, 111 | tie_word_embeddings=tie_word_embeddings, 112 | **kwargs, 113 | ) 114 | -------------------------------------------------------------------------------- /bunny/model/multimodal_encoder/builder.py: -------------------------------------------------------------------------------- 1 | import os 2 | from .eva_clip.eva_clip_encoder import EvaClipVisionTower 3 | from .siglip.siglip_encoder import SiglipVisionTower, SiglipVisionTowerS2 4 | from .clip.clip_encoder import CLIPVisionTower 5 | 6 | 7 | def build_vision_tower(vision_tower_cfg, **kwargs): 8 | vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None)) 9 | use_s2 = getattr(vision_tower_cfg, 'use_s2', False) 10 | 11 | if 'sig' in vision_tower.lower(): 12 | if use_s2: 13 | return SiglipVisionTowerS2(vision_tower, args=vision_tower_cfg, **kwargs) 14 | else: 15 | return SiglipVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 16 | elif 'eva' in vision_tower.lower(): 17 | if use_s2: 18 | raise ValueError(f'Currently not supporting S2 for EVA-CLIP') 19 | else: 20 | return EvaClipVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 21 | 22 | elif 'clip' in vision_tower.lower(): 23 | if use_s2: 24 | raise ValueError(f'Currently not supporting S2 for CLIP') 25 | else: 26 | return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 27 | 28 | else: 29 | raise ValueError(f'Unknown vision tower: {vision_tower}') 30 | -------------------------------------------------------------------------------- /bunny/model/multimodal_encoder/clip/clip_encoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from transformers import CLIPVisionModel, CLIPImageProcessor, CLIPVisionConfig 5 | 6 | 7 | class CLIPVisionTower(nn.Module): 8 | def __init__(self, vision_tower, args, delay_load=False): 9 | super().__init__() 10 | 11 | self.is_loaded = False 12 | 13 | self.vision_tower_name = vision_tower 14 | self.select_layer = -2 15 | 16 | if not delay_load: 17 | self.load_model() 18 | else: 19 | self.cfg_only = 
CLIPVisionConfig.from_pretrained(self.vision_tower_name) 20 | 21 | def load_model(self): 22 | if self.is_loaded: 23 | return 24 | self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name) 25 | self.vision_tower = CLIPVisionModel.from_pretrained(self.vision_tower_name) 26 | self.vision_tower.requires_grad_(False) 27 | 28 | self.is_loaded = True 29 | 30 | def feature_select(self, image_forward_outs): 31 | image_features = image_forward_outs.hidden_states[self.select_layer] 32 | 33 | image_features = image_features[:, 1:] 34 | 35 | return image_features 36 | 37 | def forward(self, images): 38 | if type(images) is list: 39 | image_features = [] 40 | for image in images: 41 | image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), 42 | output_hidden_states=True) 43 | image_feature = self.feature_select(image_forward_out).to(image.dtype) 44 | image_features.append(image_feature) 45 | else: 46 | image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), 47 | output_hidden_states=True) 48 | image_features = self.feature_select(image_forward_outs).to(images.dtype) 49 | 50 | return image_features 51 | 52 | @property 53 | def dummy_feature(self): 54 | return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype) 55 | 56 | @property 57 | def dtype(self): 58 | return self.vision_tower.dtype 59 | 60 | @property 61 | def device(self): 62 | return self.vision_tower.device 63 | 64 | @property 65 | def config(self): 66 | if self.is_loaded: 67 | return self.vision_tower.config 68 | else: 69 | return self.cfg_only 70 | 71 | @property 72 | def hidden_size(self): 73 | return self.config.hidden_size 74 | 75 | @property 76 | def num_patches(self): 77 | return (self.config.image_size // self.config.patch_size) ** 2 78 | -------------------------------------------------------------------------------- /bunny/model/multimodal_encoder/eva_clip/eva_clip_encoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from .eva_clip_processors import EvaClipImageTrainProcessor 5 | from .eva_vit import Eva2LargePlusEncoder 6 | 7 | 8 | class EvaClipVisionTower(nn.Module): 9 | def __init__(self, vision_tower, args, delay_load=False): 10 | super().__init__() 11 | 12 | self.is_loaded = False 13 | 14 | self.vision_tower_path = vision_tower 15 | self.config = VisionTowerConfig() 16 | 17 | if not delay_load: 18 | self.load_model() 19 | else: 20 | self.cfg_only = self.config 21 | 22 | def load_model(self): 23 | if self.is_loaded: 24 | return 25 | self.image_processor = EvaClipImageTrainProcessor(self.config.image_size) 26 | self.vision_tower = Eva2LargePlusEncoder(self.vision_tower_path) 27 | self.vision_tower.requires_grad_(False) 28 | 29 | self.is_loaded = True 30 | 31 | def forward(self, images): 32 | if type(images) is list: 33 | image_features = [] 34 | for image in images: 35 | image_feature = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0)).to( 36 | image.dtype) 37 | image_features.append(image_feature) 38 | else: 39 | image_features = self.vision_tower(images.to(device=self.device, dtype=self.dtype)).to(images.dtype) 40 | 41 | return image_features 42 | 43 | @property 44 | def dtype(self): 45 | return self.vision_tower.dtype 46 | 47 | @property 48 | def device(self): 49 | return self.vision_tower.device 50 | 51 | @property 52 | def hidden_size(self): 53 | return self.config.hidden_size 54 | 55 | 
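# With the VisionTowerConfig defined below (image_size=336, patch_size=14), this property evaluates to (336 // 14) ** 2 = 576 patch tokens per image.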
@property 56 | def num_patches(self): 57 | return (self.config.image_size // self.config.patch_size) ** 2 58 | 59 | 60 | class VisionTowerConfig(): 61 | def __init__(self): 62 | self.image_size = 336 63 | self.patch_size = 14 64 | self.hidden_size = 1024 65 | -------------------------------------------------------------------------------- /bunny/model/multimodal_encoder/eva_clip/eva_clip_processors.py: -------------------------------------------------------------------------------- 1 | ''' 2 | # Adapted from https://github.com/baaivision/EVA/tree/master/EVA-CLIP 3 | ''' 4 | 5 | from torchvision import transforms 6 | from torchvision.transforms.functional import InterpolationMode 7 | from transformers.image_processing_utils import BatchFeature 8 | from PIL import Image 9 | from transformers.image_transforms import convert_to_rgb 10 | 11 | 12 | class BaseProcessor: 13 | def __init__(self): 14 | self.transform = lambda x: x 15 | return 16 | 17 | def __call__(self, item): 18 | return self.transform(item) 19 | 20 | 21 | class EvaClipImageBaseProcessor(BaseProcessor): 22 | def __init__(self, mean=None, std=None): 23 | self.mean = (0.48145466, 0.4578275, 0.40821073) if mean is None else mean 24 | self.std = (0.26862954, 0.26130258, 0.27577711) if std is None else std 25 | 26 | self.normalize = transforms.Normalize(self.mean, self.std) 27 | 28 | @property 29 | def image_mean(self): 30 | return self.mean 31 | 32 | 33 | class EvaClipImageTrainProcessor(EvaClipImageBaseProcessor): 34 | def __init__(self, image_size=224, mean=None, std=None, min_scale=0.5, max_scale=1.0): 35 | super().__init__(mean=mean, std=std) 36 | 37 | self.transform = transforms.Compose( 38 | [ 39 | convert_to_rgb, 40 | transforms.Resize( 41 | image_size, 42 | interpolation=InterpolationMode.BICUBIC, 43 | ), 44 | transforms.CenterCrop(image_size), 45 | transforms.ToTensor(), 46 | self.normalize, 47 | ] 48 | ) 49 | 50 | self.image_size = image_size 51 | 52 | def preprocess(self, images, return_tensors): 53 | if isinstance(images, Image.Image): 54 | images = [images] 55 | else: 56 | assert isinstance(images, list) 57 | 58 | transformed_images = [self.transform(image).numpy() for image in images] 59 | data = {"pixel_values": transformed_images} 60 | 61 | return BatchFeature(data=data, tensor_type=return_tensors) 62 | 63 | def __call__(self, item): 64 | return self.transform(item) 65 | 66 | @property 67 | def crop_size(self): 68 | return {'height': self.image_size, 'width': self.image_size} 69 | -------------------------------------------------------------------------------- /bunny/model/multimodal_encoder/siglip/siglip_encoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from transformers import SiglipVisionModel, SiglipImageProcessor, SiglipVisionConfig 5 | from bunny.util.s2wrapper import forward as multiscale_forward 6 | 7 | 8 | class SiglipVisionTower(nn.Module): 9 | def __init__(self, vision_tower, args, delay_load=False): 10 | super().__init__() 11 | 12 | self.is_loaded = False 13 | 14 | self.vision_tower_name = vision_tower 15 | self.select_layer = -2 16 | 17 | if not delay_load: 18 | self.load_model() 19 | else: 20 | self.cfg_only = SiglipVisionConfig.from_pretrained(self.vision_tower_name) 21 | 22 | def load_model(self): 23 | if self.is_loaded: 24 | return 25 | self.image_processor = SiglipImageProcessor.from_pretrained(self.vision_tower_name) 26 | self.image_processor.crop_size = self.image_processor.size 27 | self.vision_tower = 
SiglipVisionModel.from_pretrained(self.vision_tower_name) 28 | self.vision_tower.requires_grad_(False) 29 | 30 | self.is_loaded = True 31 | 32 | def feature_select(self, image_forward_outs): 33 | image_features = image_forward_outs.hidden_states[self.select_layer] 34 | 35 | return image_features 36 | 37 | def forward(self, images): 38 | if type(images) is list: 39 | image_features = [] 40 | for image in images: 41 | image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), 42 | output_hidden_states=True) 43 | image_feature = self.feature_select(image_forward_out).to(image.dtype) 44 | image_features.append(image_feature) 45 | else: 46 | image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), 47 | output_hidden_states=True) 48 | image_features = self.feature_select(image_forward_outs).to(images.dtype) 49 | 50 | return image_features 51 | 52 | @property 53 | def dummy_feature(self): 54 | return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype) 55 | 56 | @property 57 | def dtype(self): 58 | return self.vision_tower.dtype 59 | 60 | @property 61 | def device(self): 62 | return self.vision_tower.device 63 | 64 | @property 65 | def config(self): 66 | if self.is_loaded: 67 | return self.vision_tower.config 68 | else: 69 | return self.cfg_only 70 | 71 | @property 72 | def hidden_size(self): 73 | return self.config.hidden_size 74 | 75 | @property 76 | def num_patches(self): 77 | return (self.config.image_size // self.config.patch_size) ** 2 78 | 79 | 80 | class SiglipVisionTowerS2(SiglipVisionTower): 81 | def __init__(self, vision_tower, args, delay_load=False): 82 | self.s2_scales = getattr(args, 's2_scales', '384,768,1152') 83 | self.s2_scales = list(map(int, self.s2_scales.split(','))) 84 | self.s2_scales.sort() 85 | self.s2_split_size = self.s2_scales[0] 86 | self.s2_image_size = self.s2_scales[-1] 87 | 88 | super().__init__(vision_tower, args, delay_load) 89 | 90 | self.multiscale_forward = multiscale_forward 91 | 92 | if not delay_load: 93 | self.image_processor.size['height'] = self.image_processor.size['width'] = self.s2_image_size 94 | self.image_processor.crop_size['height'] = self.image_processor.crop_size['width'] = self.s2_image_size 95 | 96 | def load_model(self): 97 | if self.is_loaded: 98 | return 99 | self.image_processor = SiglipImageProcessor.from_pretrained(self.vision_tower_name) 100 | self.image_processor.crop_size = self.image_processor.size 101 | self.vision_tower = SiglipVisionModel.from_pretrained(self.vision_tower_name) 102 | self.vision_tower.requires_grad_(False) 103 | 104 | self.image_processor.size['height'] = self.image_processor.size['width'] = self.s2_image_size 105 | self.image_processor.crop_size['height'] = self.image_processor.crop_size['width'] = self.s2_image_size 106 | 107 | self.is_loaded = True 108 | 109 | def forward_feature(self, images): 110 | image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), 111 | output_hidden_states=True) 112 | image_features = self.feature_select(image_forward_outs).to(images.dtype) 113 | return image_features 114 | 115 | def forward(self, images): 116 | if type(images) is list: 117 | image_features = [] 118 | for image in images: 119 | image_feature = self.multiscale_forward(self.forward_feature, image.unsqueeze(0), 120 | img_sizes=self.s2_scales, max_split_size=self.s2_split_size) 121 | image_features.append(image_feature) 122 | else: 123 | image_features = self.multiscale_forward(self.forward_feature, 
images, img_sizes=self.s2_scales, 124 | max_split_size=self.s2_split_size) 125 | 126 | return image_features 127 | 128 | @property 129 | def hidden_size(self): 130 | return self.config.hidden_size * len(self.s2_scales) 131 | -------------------------------------------------------------------------------- /bunny/model/multimodal_projector/builder.py: -------------------------------------------------------------------------------- 1 | import re 2 | import math 3 | from torch import nn 4 | from functools import partial 5 | from timm.layers.norm_act import LayerNormAct2d 6 | from torchvision.ops.misc import SqueezeExcitation as SELayer 7 | from torchvision.models.mobilenetv3 import InvertedResidual, InvertedResidualConfig 8 | 9 | 10 | class IdentityMap(nn.Module): 11 | def __init__(self): 12 | super().__init__() 13 | 14 | def forward(self, x, *args, **kwargs): 15 | return x 16 | 17 | @property 18 | def config(self): 19 | return {"mm_projector_type": 'identity'} 20 | 21 | 22 | class Minigpt(nn.Module): 23 | def __init__(self, config=None): 24 | super(Minigpt, self).__init__() 25 | # c*4 is the input size, and c is the output size for the linear layer 26 | inc, ouc = config.mm_hidden_size, config.hidden_size 27 | self.linear = nn.Linear(inc * 4, ouc) 28 | 29 | def forward(self, x): 30 | # x is the input tensor with shape [b, num_tokens, c] 31 | b, num_tokens, c = x.shape 32 | 33 | # Check if num_tokens is divisible by 4 34 | if num_tokens % 4 != 0: 35 | raise ValueError("num_tokens must be divisible by 4") 36 | 37 | # Reshape x to [b, num_tokens/4, c*4] 38 | x = x.view(b, num_tokens // 4, c * 4) 39 | 40 | # Apply the linear transformation 41 | x = self.linear(x) 42 | return x 43 | 44 | 45 | class Vanilla(nn.Module): 46 | def __init__(self, config=None): 47 | super(Vanilla, self).__init__() 48 | # c*4 is the input size, and c is the output size for the linear layer 49 | inc, ouc = config.mm_hidden_size, config.hidden_size 50 | self.linear = nn.Linear(inc * 4, ouc) 51 | 52 | def forward(self, x): 53 | b, num_tokens, c = x.shape 54 | 55 | # Check if num_tokens is divisible by 4 56 | if num_tokens % 4 != 0: 57 | raise ValueError("num_tokens must be divisible by 4") 58 | 59 | # First, reshape to [b, num_tokens//4, 4, c] 60 | x = x.view(b, num_tokens // 4, 4, c) 61 | 62 | # Then, permute to interleave the tokens 63 | x = x.permute(0, 1, 3, 2).contiguous() 64 | 65 | # Finally, reshape to [b, num_tokens//4, c*4] to interleave features of 4 tokens 66 | x = x.view(b, num_tokens // 4, c * 4) 67 | 68 | # Apply the linear transformation 69 | x = self.linear(x) 70 | return x 71 | 72 | 73 | class LDPBlock(nn.Module): 74 | # Lightweight Downsample Projector Block 75 | 76 | def __init__(self, config=None): 77 | super().__init__() 78 | 79 | inc, ouc = config.mm_hidden_size, config.hidden_size 80 | layer_norm = partial(LayerNormAct2d, act_layer=None) 81 | se_layer = partial(SELayer, scale_activation=nn.Hardsigmoid) 82 | self.mlp = nn.Sequential( 83 | nn.Identity(), nn.Linear(inc, ouc), nn.GELU(), nn.Linear(ouc, ouc) 84 | ) 85 | self.mb_block = nn.Sequential( 86 | nn.Identity(), 87 | InvertedResidual(InvertedResidualConfig(ouc, 3, ouc, ouc, True, "HS", 1, 1, 1), layer_norm, se_layer), 88 | InvertedResidual(InvertedResidualConfig(ouc, 3, ouc, ouc, True, "HS", 2, 1, 1), layer_norm, se_layer) 89 | ) 90 | 91 | def forward(self, x): 92 | b, num_tokens, c = x.shape 93 | h = int(math.sqrt(num_tokens)) 94 | x = self.mlp(x) 95 | x = x.permute(0, 2, 1).reshape(b, -1, h, h) 96 | x = self.mb_block(x) 97 | x = 
x.flatten(2).permute(0, 2, 1) 98 | return x 99 | 100 | 101 | class LDPNetProjector(nn.Module): 102 | 103 | def __init__(self, config=None): 104 | super().__init__() 105 | self.model = LDPBlock(config) 106 | 107 | def forward(self, x): 108 | return self.model(x) 109 | 110 | 111 | class SPP(nn.Module): 112 | 113 | def __init__(self, config=None, projector_type='v1'): 114 | super().__init__() 115 | 116 | self.projector_type = projector_type 117 | 118 | inc, ouc = config.mm_hidden_size, config.hidden_size 119 | self.linear_0 = nn.Linear(inc, inc) 120 | 121 | self.linear_1 = nn.Linear(inc, ouc) 122 | 123 | self.pooling = nn.AvgPool2d(kernel_size=2) 124 | 125 | self.linear_2 = nn.Linear(ouc, ouc) 126 | 127 | def forward(self, x): 128 | b, num_tokens, c = x.shape 129 | h = int(math.sqrt(num_tokens)) 130 | if 'v1' in self.projector_type: 131 | x = self.linear_1(x) 132 | x = x.permute(0, 2, 1).reshape(b, -1, h, h) 133 | x = self.pooling(x) 134 | x = x.flatten(2).permute(0, 2, 1) 135 | x = self.linear_2(x) 136 | elif 'v2' in self.projector_type: 137 | x = self.linear_1(x) 138 | x = self.linear_2(x) 139 | x = x.permute(0, 2, 1).reshape(b, -1, h, h) 140 | x = self.pooling(x) 141 | x = x.flatten(2).permute(0, 2, 1) 142 | elif 'v3' in self.projector_type: 143 | x = self.linear_0(x) 144 | x = x.permute(0, 2, 1).reshape(b, -1, h, h) 145 | x = self.pooling(x) 146 | x = x.flatten(2).permute(0, 2, 1) 147 | x = self.linear_1(x) 148 | x = self.linear_2(x) 149 | return x 150 | 151 | 152 | def build_vision_projector(config, delay_load=False, **kwargs): 153 | projector_type = getattr(config, 'mm_projector_type', 'mlp2x_gelu') 154 | 155 | if projector_type == 'linear': 156 | return nn.Linear(config.mm_hidden_size, config.hidden_size) 157 | 158 | elif projector_type.startswith('mlp'): 159 | mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type) 160 | if mlp_gelu_match: 161 | mlp_depth = int(mlp_gelu_match.group(1)) 162 | modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)] 163 | for _ in range(1, mlp_depth): 164 | modules.append(nn.GELU()) 165 | modules.append(nn.Linear(config.hidden_size, config.hidden_size)) 166 | return nn.Sequential(*modules) 167 | 168 | elif projector_type.startswith('spp'): 169 | return SPP(config, projector_type) 170 | 171 | elif projector_type == 'ldp': 172 | return LDPNetProjector(config) 173 | 174 | elif projector_type == 'vanilla': 175 | return Vanilla(config) 176 | 177 | elif projector_type == 'minigpt': 178 | return Minigpt(config) 179 | 180 | elif projector_type == 'identity': 181 | return IdentityMap() 182 | 183 | raise ValueError(f'Unknown projector type: {projector_type}') 184 | -------------------------------------------------------------------------------- /bunny/serve/cli.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import requests 4 | 5 | from PIL import Image 6 | from io import BytesIO 7 | from transformers import TextStreamer 8 | 9 | from bunny.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN 10 | from bunny.conversation import conv_templates, SeparatorStyle 11 | from bunny.model.builder import load_pretrained_model 12 | from bunny.util.utils import disable_torch_init 13 | from bunny.util.mm_utils import process_images, tokenizer_image_token, get_model_name_from_path, \ 14 | KeywordsStoppingCriteria 15 | 16 | 17 | def load_image(image_file): 18 | if image_file.startswith('http://') or image_file.startswith('https://'): 19 | response = requests.get(image_file) 20 | image 
= Image.open(BytesIO(response.content)).convert('RGB') 21 | else: 22 | image = Image.open(image_file).convert('RGB') 23 | return image 24 | 25 | 26 | def main(args): 27 | # Model 28 | disable_torch_init() 29 | 30 | model_name = get_model_name_from_path(args.model_path) 31 | tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_path, args.model_base, model_name, 32 | args.model_type, args.load_8bit, 33 | args.load_4bit, device=args.device) 34 | 35 | conv_mode = "bunny" 36 | 37 | if args.conv_mode is not None and conv_mode != args.conv_mode: 38 | print( 39 | '[WARNING] the auto inferred conversation mode is {}, while `--conv-mode` is {}, using {}'.format(conv_mode, 40 | args.conv_mode, 41 | args.conv_mode)) 42 | else: 43 | args.conv_mode = conv_mode 44 | 45 | conv = conv_templates[args.conv_mode].copy() 46 | roles = conv.roles 47 | 48 | image = load_image(args.image_file) 49 | # Similar operation in model_worker.py 50 | image_tensor = process_images([image], image_processor, model.config) 51 | if type(image_tensor) is list: 52 | image_tensor = [image.to(model.device, dtype=model.dtype) for image in image_tensor] 53 | else: 54 | image_tensor = image_tensor.to(model.device, dtype=model.dtype) 55 | 56 | while True: 57 | try: 58 | inp = input(f"{roles[0]}: ") 59 | except EOFError: 60 | inp = "" 61 | if not inp: 62 | print("exit...") 63 | break 64 | 65 | print(f"{roles[1]}: ", end="") 66 | 67 | if image is not None: 68 | # first message 69 | inp = DEFAULT_IMAGE_TOKEN + '\n' + inp 70 | conv.append_message(conv.roles[0], inp) 71 | image = None 72 | else: 73 | conv.append_message(conv.roles[0], inp) 74 | conv.append_message(conv.roles[1], None) 75 | prompt = conv.get_prompt() 76 | 77 | input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to( 78 | model.device) 79 | stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2 80 | keywords = [stop_str] 81 | stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids) 82 | streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True) 83 | 84 | with torch.inference_mode(): 85 | output_ids = model.generate( 86 | input_ids, 87 | images=image_tensor, 88 | do_sample=True if args.temperature > 0 else False, 89 | temperature=args.temperature, 90 | max_new_tokens=args.max_new_tokens, 91 | streamer=streamer, 92 | use_cache=True, 93 | repetition_penalty=args.repetition_penalty, 94 | stopping_criteria=[stopping_criteria]) 95 | 96 | outputs = tokenizer.decode(output_ids[0, input_ids.shape[1]:]).strip() 97 | conv.messages[-1][-1] = outputs 98 | 99 | if args.debug: 100 | print("\n", {"prompt": prompt, "outputs": outputs}, "\n") 101 | 102 | 103 | if __name__ == "__main__": 104 | parser = argparse.ArgumentParser() 105 | parser.add_argument("--model-path", type=str, default=None) 106 | parser.add_argument("--model-base", type=str, default=None) 107 | parser.add_argument("--model-type", type=str, default=None) 108 | parser.add_argument("--image-file", type=str, required=True) 109 | parser.add_argument("--device", type=str, default="cuda") 110 | parser.add_argument("--conv-mode", type=str, default=None) 111 | parser.add_argument("--temperature", type=float, default=0.2) 112 | parser.add_argument("--repetition-penalty", type=float, default=1.0) 113 | parser.add_argument("--max-new-tokens", type=int, default=512) 114 | parser.add_argument("--load-8bit", action="store_true") 115 | parser.add_argument("--load-4bit", action="store_true") 116 | 
parser.add_argument("--debug", action="store_true") 117 | args = parser.parse_args() 118 | main(args) 119 | -------------------------------------------------------------------------------- /bunny/serve/examples/example_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BAAI-DCAI/Bunny/08273acbd83b6836c03dce25cc134eb6b9cedba5/bunny/serve/examples/example_1.png -------------------------------------------------------------------------------- /bunny/serve/examples/example_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BAAI-DCAI/Bunny/08273acbd83b6836c03dce25cc134eb6b9cedba5/bunny/serve/examples/example_2.png -------------------------------------------------------------------------------- /bunny/serve/examples/icon.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BAAI-DCAI/Bunny/08273acbd83b6836c03dce25cc134eb6b9cedba5/bunny/serve/examples/icon.jpg -------------------------------------------------------------------------------- /bunny/serve/examples/user.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BAAI-DCAI/Bunny/08273acbd83b6836c03dce25cc134eb6b9cedba5/bunny/serve/examples/user.png -------------------------------------------------------------------------------- /bunny/serve/register_worker.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import requests 3 | 4 | if __name__ == "__main__": 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("--controller-address", type=str) 7 | parser.add_argument("--worker-name", type=str) 8 | parser.add_argument("--check-heart-beat", action="store_true") 9 | args = parser.parse_args() 10 | 11 | url = args.controller_address + "/register_worker" 12 | data = { 13 | "worker_name": args.worker_name, 14 | "check_heart_beat": args.check_heart_beat, 15 | "worker_status": None, 16 | } 17 | r = requests.post(url, json=data) 18 | assert r.status_code == 200 19 | -------------------------------------------------------------------------------- /bunny/util/mm_utils.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import torch 3 | 4 | from PIL import Image 5 | from io import BytesIO 6 | from transformers import StoppingCriteria 7 | 8 | from bunny.constants import IMAGE_TOKEN_INDEX 9 | 10 | 11 | def load_image_from_base64(image): 12 | return Image.open(BytesIO(base64.b64decode(image))) 13 | 14 | 15 | def expand2square(pil_img, background_color): 16 | width, height = pil_img.size 17 | if width == height: 18 | return pil_img 19 | elif width > height: 20 | result = Image.new(pil_img.mode, (width, width), background_color) 21 | result.paste(pil_img, (0, (width - height) // 2)) 22 | return result 23 | else: 24 | result = Image.new(pil_img.mode, (height, height), background_color) 25 | result.paste(pil_img, ((height - width) // 2, 0)) 26 | return result 27 | 28 | 29 | def process_images(images, image_processor, model_cfg): 30 | image_aspect_ratio = getattr(model_cfg, "image_aspect_ratio", None) 31 | new_images = [] 32 | if image_aspect_ratio == 'pad': 33 | for image in images: 34 | image = expand2square(image, tuple(int(x * 255) for x in image_processor.image_mean)) 35 | image = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0] 36 | 
new_images.append(image) 37 | else: 38 | return image_processor(images, return_tensors='pt')['pixel_values'] 39 | if all(x.shape == new_images[0].shape for x in new_images): 40 | new_images = torch.stack(new_images, dim=0) 41 | return new_images 42 | 43 | 44 | def tokenizer_image_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None): 45 | prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split('')] 46 | 47 | def insert_separator(X, sep): 48 | return [ele for sublist in zip(X, [sep] * len(X)) for ele in sublist][:-1] 49 | 50 | input_ids = [] 51 | offset = 0 52 | if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id: 53 | offset = 1 54 | input_ids.append(prompt_chunks[0][0]) 55 | 56 | for x in insert_separator(prompt_chunks, [image_token_index] * (offset + 1)): 57 | input_ids.extend(x[offset:]) 58 | 59 | if return_tensors is not None: 60 | if return_tensors == 'pt': 61 | return torch.tensor(input_ids, dtype=torch.long) 62 | raise ValueError(f'Unsupported tensor type: {return_tensors}') 63 | return input_ids 64 | 65 | 66 | def get_model_name_from_path(model_path): 67 | model_path = model_path.strip("/") 68 | model_paths = model_path.split("/") 69 | if model_paths[-1].startswith('checkpoint-'): 70 | return model_paths[-2] + "_" + model_paths[-1] 71 | else: 72 | return model_paths[-1] 73 | 74 | 75 | class KeywordsStoppingCriteria(StoppingCriteria): 76 | def __init__(self, keywords, tokenizer, input_ids): 77 | self.keywords = keywords 78 | self.keyword_ids = [] 79 | self.max_keyword_len = 0 80 | for keyword in keywords: 81 | cur_keyword_ids = tokenizer(keyword).input_ids 82 | if len(cur_keyword_ids) > 1 and cur_keyword_ids[0] == tokenizer.bos_token_id: 83 | cur_keyword_ids = cur_keyword_ids[1:] 84 | if len(cur_keyword_ids) > self.max_keyword_len: 85 | self.max_keyword_len = len(cur_keyword_ids) 86 | self.keyword_ids.append(torch.tensor(cur_keyword_ids)) 87 | self.tokenizer = tokenizer 88 | self.start_len = input_ids.shape[1] 89 | 90 | def call_for_batch(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool: 91 | offset = min(output_ids.shape[1] - self.start_len, self.max_keyword_len) 92 | self.keyword_ids = [keyword_id.to(output_ids.device) for keyword_id in self.keyword_ids] 93 | for keyword_id in self.keyword_ids: 94 | truncated_output_ids = output_ids[0, -keyword_id.shape[0]:] 95 | if torch.equal(truncated_output_ids, keyword_id): 96 | return True 97 | outputs = self.tokenizer.batch_decode(output_ids[:, -offset:], skip_special_tokens=True)[0] 98 | for keyword in self.keywords: 99 | if keyword in outputs: 100 | return True 101 | return False 102 | 103 | def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool: 104 | outputs = [] 105 | for i in range(output_ids.shape[0]): 106 | outputs.append(self.call_for_batch(output_ids[i].unsqueeze(0), scores)) 107 | return all(outputs) 108 | -------------------------------------------------------------------------------- /bunny/util/s2wrapper/__init__.py: -------------------------------------------------------------------------------- 1 | from .core import * 2 | from .utils import * -------------------------------------------------------------------------------- /bunny/util/s2wrapper/core.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------ 2 | # Copyright (c) 2024 Baifeng 
Shi. 3 | # All rights reserved. 4 | # 5 | # Licensed under the MIT License (MIT). See LICENSE in the repo root for license information. 6 | # ------------------------------------------------------------------------------------------ 7 | 8 | import math 9 | import torch 10 | import torch.nn.functional as F 11 | from einops import rearrange 12 | from .utils import split_chessboard, merge_chessboard, batched_forward 13 | 14 | def forward(model, input, scales=None, img_sizes=None, max_split_size=None, resize_output_to_idx=0, num_prefix_token=0, 15 | output_shape='bnc', split_forward=False): 16 | 17 | assert input.dim() == 4, "Input image must be in the shape of BxCxHxW." 18 | assert input.shape[2] == input.shape[3], "Currently only square images are supported." 19 | assert output_shape in ['bnc', 'bchw'], "Output shape should be either BxNxC (e.g., ViT) or BxCxHxW (e.g., ConvNet)." 20 | assert output_shape == 'bnc' or num_prefix_token == 0, "For ConvNet there shouldn't be any prefix token." 21 | 22 | b, c, input_size, _ = input.shape 23 | 24 | # image size for each scale 25 | assert scales is not None or img_sizes is not None, "Please assign either scales or img_sizes." 26 | img_sizes = img_sizes or [int(input_size * scale) for scale in scales] 27 | 28 | # prepare multiscale inputs 29 | max_split_size = max_split_size or input_size # The maximum size of each split of image. Set as the input size by default 30 | num_splits = [math.ceil(size / max_split_size) for size in img_sizes] # number of splits each scale 31 | input_multiscale = [] 32 | for size, num_split in zip(img_sizes, num_splits): 33 | x = F.interpolate(input.to(torch.float32), size=size, mode='bicubic').to(input.dtype) 34 | x = split_chessboard(x, num_split=num_split) 35 | input_multiscale.append(x) 36 | 37 | # run feedforward on each scale 38 | outs_multiscale = [batched_forward(model, x, b) if split_forward else model(x) for x in input_multiscale] 39 | if num_prefix_token > 0: 40 | outs_prefix_multiscale = [out[:, :num_prefix_token] for out in outs_multiscale] 41 | outs_multiscale = [out[:, num_prefix_token:] for out in outs_multiscale] 42 | if output_shape == 'bnc': 43 | outs_multiscale = [rearrange(out, 'b (h w) c -> b c h w', h=int(out.shape[1] ** 0.5), w=int(out.shape[1] ** 0.5)) 44 | for out in outs_multiscale] 45 | 46 | # merge outputs of different splits for each scale separately 47 | outs_multiscale = [merge_chessboard(out, num_split=num_split) for num_split, out in zip(num_splits, outs_multiscale)] 48 | 49 | # interpolate outputs from different scales and concat together 50 | output_size = outs_multiscale[resize_output_to_idx].shape[-2] 51 | out = torch.cat([F.interpolate(outs_multiscale[i].to(torch.float32), size=output_size, 52 | mode='area').to(outs_multiscale[i].dtype) 53 | for i in range(len(outs_multiscale))], dim=1) 54 | if output_shape == 'bnc': 55 | out = rearrange(out, 'b c h w -> b (h w) c') 56 | if num_prefix_token > 0: 57 | # take the mean of prefix tokens from different splits for each scale 58 | outs_prefix_multiscale = [torch.stack(out.split(b, dim=0), dim=0).mean(dim=0) for out in outs_prefix_multiscale] 59 | out_prefix_multiscale = torch.cat(outs_prefix_multiscale, dim=-1) 60 | out = torch.cat([out_prefix_multiscale, out], dim=1) 61 | 62 | return out 63 | -------------------------------------------------------------------------------- /bunny/util/s2wrapper/utils.py: -------------------------------------------------------------------------------- 1 | # 
------------------------------------------------------------------------------------------ 2 | # Copyright (c) 2024 Baifeng Shi. 3 | # All rights reserved. 4 | # 5 | # Licensed under the MIT License (MIT). See LICENSE in the repo root for license information. 6 | # ------------------------------------------------------------------------------------------ 7 | 8 | import torch 9 | 10 | def split_chessboard(x, num_split): 11 | """ 12 | x: b * c * h * w 13 | Divide x into num_split**2 sub-squares and concatenate all the sub-squares along the batch dimension 14 | """ 15 | B, C, H, W = x.shape 16 | assert H % num_split == 0 and W % num_split == 0 17 | h, w = H // num_split, W // num_split 18 | x_split = torch.cat([x[:, :, i*h:(i+1)*h, j*w:(j+1)*w] for i in range(num_split) for j in range(num_split)], dim=0) 19 | return x_split 20 | 21 | def merge_chessboard(x, num_split): 22 | """ 23 | x: b * c * h * w 24 | Assuming x contains num_split**2 sub-squares concatenated along batch dimension, merge the sub-squares back to the original whole square. 25 | (inverse of split_chessboard) 26 | """ 27 | B, C, H, W = x.shape 28 | assert B % (num_split**2) == 0 29 | b = B // (num_split**2) 30 | x_merge = torch.cat([torch.cat([x[(i*num_split + j)*b:(i*num_split + j + 1)*b] for j in range(num_split)], dim=-1) 31 | for i in range(num_split)], dim=-2) 32 | return x_merge 33 | 34 | def batched_forward(model, x, batch_size=-1): 35 | if batch_size == -1: 36 | return model(x) 37 | else: 38 | x_batched = x.split(batch_size) 39 | outs = [model(x) for x in x_batched] 40 | return torch.cat(outs, dim=0) 41 | 42 | -------------------------------------------------------------------------------- /bunny/util/utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import logging.handlers 3 | import os 4 | import sys 5 | import requests  # used by violates_moderation() below 6 | from bunny.constants import LOGDIR 7 | 8 | server_error_msg = "**NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.**" 9 | moderation_msg = "YOUR INPUT VIOLATES OUR CONTENT MODERATION GUIDELINES. PLEASE TRY AGAIN." 10 | 11 | handler = None 12 | 13 | 14 | def disable_torch_init(): 15 | """ 16 | Disable the redundant torch default initialization to accelerate model creation.
17 | """ 18 | import torch 19 | setattr(torch.nn.Linear, "reset_parameters", lambda self: None) 20 | setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None) 21 | 22 | 23 | def build_logger(logger_name, logger_filename): 24 | global handler 25 | 26 | formatter = logging.Formatter( 27 | fmt="%(asctime)s | %(levelname)s | %(name)s | %(message)s", 28 | datefmt="%Y-%m-%d %H:%M:%S", 29 | ) 30 | 31 | # Set the format of root handlers 32 | if not logging.getLogger().handlers: 33 | logging.basicConfig(level=logging.INFO) 34 | logging.getLogger().handlers[0].setFormatter(formatter) 35 | 36 | # Redirect stdout and stderr to loggers 37 | stdout_logger = logging.getLogger("stdout") 38 | stdout_logger.setLevel(logging.INFO) 39 | sl = StreamToLogger(stdout_logger, logging.INFO) 40 | sys.stdout = sl 41 | 42 | stderr_logger = logging.getLogger("stderr") 43 | stderr_logger.setLevel(logging.ERROR) 44 | sl = StreamToLogger(stderr_logger, logging.ERROR) 45 | sys.stderr = sl 46 | 47 | # Get logger 48 | logger = logging.getLogger(logger_name) 49 | logger.setLevel(logging.INFO) 50 | 51 | # Add a file handler for all loggers 52 | if handler is None: 53 | os.makedirs(LOGDIR, exist_ok=True) 54 | filename = os.path.join(LOGDIR, logger_filename) 55 | handler = logging.handlers.TimedRotatingFileHandler( 56 | filename, when='D', utc=True, encoding='UTF-8') 57 | handler.setFormatter(formatter) 58 | 59 | for name, item in logging.root.manager.loggerDict.items(): 60 | if isinstance(item, logging.Logger): 61 | item.addHandler(handler) 62 | 63 | return logger 64 | 65 | 66 | class StreamToLogger(object): 67 | """ 68 | Fake file-like stream object that redirects writes to a logger instance. 69 | """ 70 | 71 | def __init__(self, logger, log_level=logging.INFO): 72 | self.terminal = sys.stdout 73 | self.logger = logger 74 | self.log_level = log_level 75 | self.linebuf = '' 76 | 77 | def __getattr__(self, attr): 78 | return getattr(self.terminal, attr) 79 | 80 | def write(self, buf): 81 | temp_linebuf = self.linebuf + buf 82 | self.linebuf = '' 83 | for line in temp_linebuf.splitlines(True): 84 | # From the io.TextIOWrapper docs: 85 | # On output, if newline is None, any '\n' characters written 86 | # are translated to the system default line separator. 87 | # By default sys.stdout.write() expects '\n' newlines and then 88 | # translates them so this is still cross platform. 89 | if line[-1] == '\n': 90 | self.logger.log(self.log_level, line.rstrip()) 91 | else: 92 | self.linebuf += line 93 | 94 | def flush(self): 95 | if self.linebuf != '': 96 | self.logger.log(self.log_level, self.linebuf.rstrip()) 97 | self.linebuf = '' 98 | 99 | 100 | def violates_moderation(text): 101 | """ 102 | Check whether the text violates OpenAI moderation API. 
103 | """ 104 | url = "https://api.openai.com/v1/moderations" 105 | headers = {"Content-Type": "application/json", 106 | "Authorization": "Bearer " + os.environ["OPENAI_API_KEY"]} 107 | text = text.replace("\n", "") 108 | data = "{" + '"input": ' + f'"{text}"' + "}" 109 | data = data.encode("utf-8") 110 | try: 111 | ret = requests.post(url, headers=headers, data=data, timeout=5) 112 | flagged = ret.json()["results"][0]["flagged"] 113 | except requests.exceptions.RequestException as e: 114 | flagged = False 115 | except KeyError as e: 116 | flagged = False 117 | 118 | return flagged 119 | 120 | 121 | def pretty_print_semaphore(semaphore): 122 | if semaphore is None: 123 | return "None" 124 | return f"Semaphore(value={semaphore._value}, locked={semaphore.locked()})" 125 | -------------------------------------------------------------------------------- /comparison_4B.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BAAI-DCAI/Bunny/08273acbd83b6836c03dce25cc134eb6b9cedba5/comparison_4B.png -------------------------------------------------------------------------------- /comparison_8B.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BAAI-DCAI/Bunny/08273acbd83b6836c03dce25cc134eb6b9cedba5/comparison_8B.png -------------------------------------------------------------------------------- /eval/cmmmu/eval_script.py: -------------------------------------------------------------------------------- 1 | import json 2 | from argparse import ArgumentParser 3 | from tabulate import tabulate 4 | from eval_utils import evaluate_answer, evaluate_response 5 | 6 | 7 | def read_jsonl_to_dict(data_path, output_path, category): 8 | with open(data_path, 'r', encoding='utf-8') as file: 9 | data = {int(parsed_line['id']): parsed_line for line in file if 10 | (parsed_line := json.loads(line)).get('category') == category} 11 | 12 | with open(output_path, 'r', encoding='utf-8') as file: 13 | output = {int(parsed_line['id']): parsed_line for line in file if 14 | int((parsed_line := json.loads(line)).get('id')) in data.keys()} 15 | 16 | return data, output 17 | 18 | 19 | def process_answer_jsonl_file(data_path, output_path, category): 20 | global global_cnt 21 | global global_correct_cnt 22 | 23 | data_dict, output_dict = read_jsonl_to_dict(data_path, output_path, category) 24 | 25 | assert set(data_dict.keys()) == set( 26 | output_dict.keys()), "The ids are not exactly the same and cannot be processed further, please check files" 27 | 28 | for data_key, data_value in data_dict.items(): 29 | output_dict[data_key]['predicted_answer'] = output_dict[data_key].get('answer') 30 | output_dict[data_key]['answer'] = data_value.get('answer') 31 | 32 | results_count = evaluate_answer(output_dict.values()) 33 | 34 | return results_count 35 | 36 | 37 | def process_response_jsonl_file(data_path, output_path, category): 38 | global global_cnt 39 | global global_correct_cnt 40 | 41 | data_dict, output_dict = read_jsonl_to_dict(data_path, output_path, category) 42 | 43 | assert set(data_dict.keys()) == set( 44 | output_dict.keys()), "The ids are not exactly the same and cannot be processed further, please check files" 45 | 46 | for data_key, data_value in data_dict.items(): 47 | if data_value.get('type') == "选择": 48 | index2ans = { 49 | 'A': data_value.get('option1', ''), 50 | 'B': data_value.get('option2', ''), 51 | 'C': data_value.get('option3', ''), 52 | 'D': data_value.get('option4', '') 53 | 
} 54 | output_dict[data_key]['index2ans'] = index2ans 55 | output_dict[data_key]['answer'] = data_value.get('answer') 56 | 57 | results_count = evaluate_response(output_dict.values()) 58 | 59 | return results_count 60 | 61 | 62 | if __name__ == '__main__': 63 | 64 | parser = ArgumentParser() 65 | parser.add_argument('--output_path', type=str, default="eval/example/Yi-VL-34B-answer.jsonl", 66 | help="The path to model output file.") 67 | parser.add_argument('--data_path', type=str, default="eval/cmmmu/cmmmu-data-val-answer.jsonl", 68 | help="Answer file path.") 69 | args = parser.parse_args() 70 | 71 | category_list = ['艺术与设计', '商业', '科学', '健康与医学', '人文社会科学', '技术与工程'] 72 | category_dict = {'艺术与设计': 'Art & Design', '商业': 'Business', '科学': 'Science', 73 | '健康与医学': 'Health & Medicine', '人文社会科学': 'Humanities & Social Sciences', 74 | '技术与工程': 'Technology & Engineering'} 75 | 76 | headers = ['Subject', 'Correct Num', 'Entries Num', 'Acc'] 77 | table = [] 78 | correct_sum = 0 79 | entries_sum = 0 80 | 81 | is_answer = True 82 | is_response = True 83 | with open(args.output_path, 'r') as file: 84 | for line in file: 85 | data = json.loads(line) 86 | if set(data.keys()) != {'id', 'type', 'answer'}: 87 | is_answer = False 88 | if set(data.keys()) != {'id', 'type', 'response'}: 89 | is_response = False 90 | assert is_answer or is_response, "The file should contain either 'answer' or 'response'" 91 | 92 | for category in category_list: 93 | if is_answer: 94 | results_count = process_answer_jsonl_file(args.data_path, args.output_path, category) 95 | elif is_response: 96 | results_count = process_response_jsonl_file(args.data_path, args.output_path, category) 97 | correct_sum += results_count['correct_num'] 98 | entries_sum += results_count['entries_num'] 99 | table.append( 100 | [category_dict[category], results_count['correct_num'], results_count['entries_num'], results_count['acc']]) 101 | 102 | table.append(['Overall', correct_sum, entries_sum, correct_sum / entries_sum]) 103 | print(tabulate(table, headers=headers, tablefmt='orgtbl')) 104 | -------------------------------------------------------------------------------- /eval/cmmmu/prompt.yaml: -------------------------------------------------------------------------------- 1 | task_instructions: 2 | - '请回答以下多项选择题,并选出正确选项。这些题目可能包括单选和多选题型。如果所提供的信息不足以确定一个明确的答案,那么请根据可用的数据和你的判断来选择最可能正确的选项。' 3 | - '请回答以下判断题,并根据题目描述和所给的信息来判断问题中陈述的对错。如果信息不完整或不足以作出绝对判断,请运用你的逻辑推理和现有信息来做出最可能的判断。' 4 | - '请回答以下填空题,并根据题目的要求和所提供的信息来给出最恰当的答案。如果信息不足以确切回答,那么请依据现有的数据和你的推理能力来填写最合理的答案。' 5 | multi_choice_example_format: 6 | - | 7 | 问题:{} 8 | 选项: 9 | {} 10 | 正确答案: 11 | 12 | T/F_example_format: 13 | - | 14 | 问题:{} 15 | 正确答案: 16 | 17 | short_ans_example_format: 18 | - | 19 | 问题:{} 20 | 正确答案: 21 | 22 | temperature: 23 | - 0 -------------------------------------------------------------------------------- /eval/gqa/convert_gqa_for_eval.py: -------------------------------------------------------------------------------- 1 | import json 2 | import argparse 3 | 4 | parser = argparse.ArgumentParser() 5 | parser.add_argument("--src", type=str) 6 | parser.add_argument("--dst", type=str) 7 | args = parser.parse_args() 8 | 9 | all_answers = [] 10 | for line_idx, line in enumerate(open(args.src)): 11 | res = json.loads(line) 12 | question_id = res['question_id'] 13 | text = res['text'].rstrip('.').lower() 14 | all_answers.append({"questionId": question_id, "prediction": text}) 15 | 16 | with open(args.dst, 'w') as f: 17 | json.dump(all_answers, f) 18 | 
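# Each input line is a JSON record with 'question_id' and 'text'; the answer text is lower-cased with any trailing period stripped, and all predictions are dumped as a single JSON list of {'questionId': ..., 'prediction': ...} entries in the format expected by the GQA evaluation script.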
-------------------------------------------------------------------------------- /eval/gqa/testdev_balanced_questions.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BAAI-DCAI/Bunny/08273acbd83b6836c03dce25cc134eb6b9cedba5/eval/gqa/testdev_balanced_questions.tar.gz -------------------------------------------------------------------------------- /eval/mm-vet/convert_mmvet_for_eval.py: -------------------------------------------------------------------------------- 1 | import json 2 | import argparse 3 | 4 | parser = argparse.ArgumentParser() 5 | parser.add_argument("--src", type=str) 6 | parser.add_argument("--dst", type=str) 7 | args = parser.parse_args() 8 | 9 | cur_result = {} 10 | 11 | for line in open(args.src): 12 | data = json.loads(line) 13 | qid = data['question_id'] 14 | cur_result[f'v1_{qid}'] = data['text'] 15 | 16 | with open(args.dst, 'w') as f: 17 | json.dump(cur_result, f, indent=2) 18 | -------------------------------------------------------------------------------- /eval/mmbench/convert_mmbench_for_submission.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | import pandas as pd 5 | 6 | 7 | def get_args(): 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument("--annotation-file", type=str, required=True) 10 | parser.add_argument("--result-dir", type=str, required=True) 11 | parser.add_argument("--upload-dir", type=str, required=True) 12 | parser.add_argument("--experiment", type=str, required=True) 13 | 14 | return parser.parse_args() 15 | 16 | 17 | if __name__ == "__main__": 18 | args = get_args() 19 | 20 | df = pd.read_table(args.annotation_file) 21 | 22 | cur_df = df.copy() 23 | cur_df = cur_df.drop(columns=['hint', 'category', 'source', 'image', 'comment', 'l2-category']) 24 | cur_df.insert(6, 'prediction', None) 25 | for pred in open(os.path.join(args.result_dir, f"{args.experiment}.jsonl")): 26 | pred = json.loads(pred) 27 | cur_df.loc[df['index'] == pred['question_id'], 'prediction'] = pred['text'] 28 | 29 | cur_df.to_excel(os.path.join(args.upload_dir, f"{args.experiment}.xlsx"), index=False, engine='openpyxl') 30 | -------------------------------------------------------------------------------- /eval/mme/calculation_mme.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument('--results_dir', default='./LaVIN', type=str) 7 | 8 | eval_type_dict = { 9 | "Perception": ["existence", "count", "position", "color", "posters", "celebrity", "scene", "landmark", "artwork", 10 | "OCR"], 11 | "Cognition": ["commonsense_reasoning", "numerical_calculation", "text_translation", "code_reasoning"] 12 | } 13 | 14 | 15 | class calculate_metrics: 16 | def divide_chunks(self, l, n=2): 17 | # looping till length l 18 | for i in range(0, len(l), n): 19 | yield l[i:i + n] 20 | 21 | return 22 | 23 | def parse_pred_ans(self, pred_ans): 24 | pred_label = None 25 | if pred_ans in ["yes", "no"]: 26 | pred_label = pred_ans 27 | else: 28 | prefix_pred_ans = pred_ans[:4] 29 | 30 | if "yes" in prefix_pred_ans: 31 | pred_label = "yes" 32 | elif "no" in prefix_pred_ans: 33 | pred_label = "no" 34 | else: 35 | pred_label = "other" 36 | 37 | return pred_label 38 | 39 | def compute_metric(self, gts, preds): 40 | 
assert len(gts) == len(preds) 41 | 42 | label_map = { 43 | "yes": 1, 44 | "no": 0, 45 | "other": -1, 46 | } 47 | 48 | gts = [label_map[x] for x in gts] 49 | preds = [label_map[x] for x in preds] 50 | 51 | acc = accuracy_score(gts, preds) 52 | 53 | clean_gts = [] 54 | clean_preds = [] 55 | other_num = 0 56 | for gt, pred in zip(gts, preds): 57 | if pred == -1: 58 | other_num += 1 59 | continue 60 | clean_gts.append(gt) 61 | clean_preds.append(pred) 62 | 63 | conf_mat = confusion_matrix(clean_gts, clean_preds, labels=[1, 0]) 64 | precision = precision_score(clean_gts, clean_preds, average='binary') 65 | recall = recall_score(clean_gts, clean_preds, average='binary') 66 | tp, fn = conf_mat[0] 67 | fp, tn = conf_mat[1] 68 | 69 | metric_dict = dict() 70 | metric_dict = { 71 | "TP": tp, 72 | "FN": fn, 73 | "TN": tn, 74 | "FP": fp, 75 | "precision": precision, 76 | "recall": recall, 77 | "other_num": other_num, 78 | "acc": acc, 79 | } 80 | 81 | return metric_dict 82 | 83 | def process_result(self, results_dir): 84 | 85 | model_score_dict = dict() 86 | for eval_type, task_name_list in eval_type_dict.items(): 87 | print("===========", eval_type, "===========") 88 | 89 | scores = 0 90 | task_score_dict = dict() 91 | 92 | for task_name in task_name_list: 93 | 94 | task_txt = os.path.join(results_dir, task_name + ".txt") 95 | lines = open(task_txt, 'r').readlines() 96 | filtered_lines = [] 97 | for line in lines: 98 | try: 99 | img_name, question, gt_ans, pred_ans = line.split("\t") 100 | filtered_lines.append(line) 101 | except: 102 | pass 103 | lines = filtered_lines[:] 104 | 105 | chunk_lines = list(self.divide_chunks(lines)) # one image corresponds to two questions 106 | 107 | img_num = len(chunk_lines) 108 | task_other_ans_num = 0 109 | task_score = 0 110 | acc_plus_correct_num = 0 111 | gts = [] 112 | preds = [] 113 | 114 | for img_items in chunk_lines: 115 | assert len(img_items) == 2 116 | img_correct_num = 0 117 | 118 | for img_item in img_items: 119 | img_name, question, gt_ans, pred_ans = img_item.split("\t") 120 | 121 | gt_ans = gt_ans.lower() 122 | pred_ans = pred_ans.lower() 123 | 124 | assert gt_ans in ["yes", "no"] # gt can only be yes or no. 125 | 126 | pred_ans = self.parse_pred_ans(pred_ans) 127 | assert pred_ans in ["yes", "no", "other"] 128 | 129 | gts.append(gt_ans) 130 | preds.append(pred_ans) 131 | 132 | if gt_ans == pred_ans: 133 | img_correct_num += 1 134 | 135 | if pred_ans not in ["yes", "no"]: 136 | task_other_ans_num += 1 137 | 138 | if img_correct_num == 2: 139 | acc_plus_correct_num += 1 140 | 141 | # cal TP precision acc, etc. 
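# The per-task score computed below adds acc and acc_plus (both in [0, 1]) scaled by 100, so each sub-task contributes at most 200 points and the Perception/Cognition totals are sums over their sub-tasks.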
142 | metric_dict = self.compute_metric(gts, preds) 143 | acc_plus = acc_plus_correct_num / img_num 144 | metric_dict["acc_plus"] = acc_plus 145 | 146 | for k, v in metric_dict.items(): 147 | if k in ["acc", "acc_plus"]: 148 | task_score += v * 100 149 | 150 | task_score_dict[task_name] = task_score 151 | 152 | scores += task_score 153 | 154 | print("total score:", scores, "\n") 155 | for task_name, score in task_score_dict.items(): 156 | print("\t", task_name, " score:", score) 157 | print("\n") 158 | 159 | return 160 | 161 | 162 | if __name__ == "__main__": 163 | cal = calculate_metrics() 164 | 165 | args = parser.parse_args() 166 | results_dir = args.results_dir 167 | cal.process_result(results_dir) 168 | -------------------------------------------------------------------------------- /eval/mme/convert_answer_to_mme.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | from collections import defaultdict 5 | 6 | 7 | def get_args(): 8 | parser = argparse.ArgumentParser() 9 | 10 | parser.add_argument('--experiment', 11 | type=str, 12 | required=True) 13 | 14 | args = parser.parse_args() 15 | return args 16 | 17 | 18 | def get_gt(data_path): 19 | GT = {} 20 | for category in os.listdir(data_path): 21 | category_dir = os.path.join(data_path, category) 22 | if not os.path.isdir(category_dir): 23 | continue 24 | if os.path.exists(os.path.join(category_dir, 'images')): 25 | image_path = os.path.join(category_dir, 'images') 26 | qa_path = os.path.join(category_dir, 'questions_answers_YN') 27 | else: 28 | image_path = qa_path = category_dir 29 | assert os.path.isdir(image_path), image_path 30 | assert os.path.isdir(qa_path), qa_path 31 | for file in os.listdir(qa_path): 32 | if not file.endswith('.txt'): 33 | continue 34 | for line in open(os.path.join(qa_path, file)): 35 | question, answer = line.strip().split('\t') 36 | GT[(category, file, question)] = answer 37 | return GT 38 | 39 | 40 | if __name__ == "__main__": 41 | 42 | args = get_args() 43 | 44 | GT = get_gt( 45 | data_path='MME_Benchmark_release_version' 46 | ) 47 | 48 | experiment = args.experiment 49 | 50 | result_dir = os.path.join('answers_upload', experiment) 51 | os.makedirs(result_dir, exist_ok=True) 52 | 53 | answers = [json.loads(line) for line in open(os.path.join('answers', f'{experiment}.jsonl'))] 54 | 55 | results = defaultdict(list) 56 | for answer in answers: 57 | category = answer['question_id'].split('/')[0] 58 | file = answer['question_id'].split('/')[-1].split('.')[0] + '.txt' 59 | question = answer['prompt'] 60 | results[category].append((file, answer['prompt'], answer['text'])) 61 | 62 | for category, cate_tups in results.items(): 63 | with open(os.path.join(result_dir, f'{category}.txt'), 'w') as fp: 64 | for file, prompt, answer in cate_tups: 65 | if 'Answer the question using a single word or phrase.' in prompt: 66 | prompt = prompt.replace('Answer the question using a single word or phrase.', '').strip() 67 | if 'Answer the question directly with a short sentence or phrase.' in prompt: 68 | prompt = prompt.replace('Answer the question directly with a short sentence or phrase.', '').strip() 69 | if 'Please answer yes or no.' not in prompt: 70 | prompt = prompt + ' Please answer yes or no.' 
71 | if (category, file, prompt) not in GT: 72 | prompt = prompt.replace(' Please answer yes or no.', ' Please answer yes or no.') 73 | gt_ans = GT[category, file, prompt] 74 | tup = file, prompt, gt_ans, answer 75 | fp.write('\t'.join(tup) + '\n') 76 | -------------------------------------------------------------------------------- /eval/mmmu/config.yaml: -------------------------------------------------------------------------------- 1 | task_instructions: 2 | - "" 3 | multi_choice_example_format: 4 | - "{} 5 | 6 | {} 7 | 8 | Answer with the option's letter from the given choices directly." 9 | 10 | short_ans_example_format: 11 | - "{} 12 | 13 | Answer the question using a single word or phrase." 14 | temperature: 15 | - 0 -------------------------------------------------------------------------------- /eval/pope/eval_pope.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | 6 | def eval_pope(answers, label_file): 7 | label_list = [json.loads(q)['label'] for q in open(label_file, 'r')] 8 | 9 | for answer in answers: 10 | text = answer['text'] 11 | 12 | # Only keep the first sentence 13 | if text.find('.') != -1: 14 | text = text.split('.')[0] 15 | 16 | text = text.replace(',', '') 17 | words = text.split(' ') 18 | if 'No' in words or 'not' in words or 'no' in words: 19 | answer['text'] = 'no' 20 | else: 21 | answer['text'] = 'yes' 22 | 23 | for i in range(len(label_list)): 24 | if label_list[i] == 'no': 25 | label_list[i] = 0 26 | else: 27 | label_list[i] = 1 28 | 29 | pred_list = [] 30 | for answer in answers: 31 | if answer['text'] == 'no': 32 | pred_list.append(0) 33 | else: 34 | pred_list.append(1) 35 | 36 | pos = 1 37 | neg = 0 38 | yes_ratio = pred_list.count(1) / len(pred_list) 39 | 40 | TP, TN, FP, FN = 0, 0, 0, 0 41 | for pred, label in zip(pred_list, label_list): 42 | if pred == pos and label == pos: 43 | TP += 1 44 | elif pred == pos and label == neg: 45 | FP += 1 46 | elif pred == neg and label == neg: 47 | TN += 1 48 | elif pred == neg and label == pos: 49 | FN += 1 50 | 51 | print('TP\tFP\tTN\tFN\t') 52 | print('{}\t{}\t{}\t{}'.format(TP, FP, TN, FN)) 53 | 54 | precision = float(TP) / float(TP + FP) 55 | recall = float(TP) / float(TP + FN) 56 | f1 = 2 * precision * recall / (precision + recall) 57 | acc = (TP + TN) / (TP + TN + FP + FN) 58 | print('Accuracy: {}'.format(acc)) 59 | print('Precision: {}'.format(precision)) 60 | print('Recall: {}'.format(recall)) 61 | print('F1 score: {}'.format(f1)) 62 | print('Yes ratio: {}'.format(yes_ratio)) 63 | print('%.3f, %.3f, %.3f, %.3f, %.3f' % (f1, acc, precision, recall, yes_ratio)) 64 | return f1 65 | 66 | 67 | if __name__ == "__main__": 68 | parser = argparse.ArgumentParser() 69 | parser.add_argument("--annotation-dir", type=str) 70 | parser.add_argument("--question-file", type=str) 71 | parser.add_argument("--result-file", type=str) 72 | args = parser.parse_args() 73 | 74 | questions = [json.loads(line) for line in open(args.question_file)] 75 | questions = {question['question_id']: question for question in questions} 76 | answers = [json.loads(q) for q in open(args.result_file)] 77 | 78 | average_f1 = 0 79 | for file in os.listdir(args.annotation_dir): 80 | assert file.startswith('coco_pope_') 81 | assert file.endswith('.json') 82 | category = file[10:-5] 83 | cur_answers = [x for x in answers if questions[x['question_id']]['category'] == category] 84 | print('Category: {}, # samples: {}'.format(category, len(cur_answers))) 85 | average_f1 
+= eval_pope(cur_answers, os.path.join(args.annotation_dir, file)) 86 | print("====================================") 87 | 88 | average_f1 /= len(os.listdir(args.annotation_dir)) 89 | 90 | print(f'Average F1 score: {average_f1}') 91 | -------------------------------------------------------------------------------- /eval/scienceqa/eval_science_qa.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | import re 5 | import random 6 | 7 | 8 | def get_args(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--base-dir', type=str) 11 | parser.add_argument('--result-file', type=str) 12 | parser.add_argument('--output-file', type=str) 13 | parser.add_argument('--output-result', type=str) 14 | parser.add_argument('--split', type=str, default='test') 15 | parser.add_argument('--options', type=list, default=["A", "B", "C", "D", "E"]) 16 | return parser.parse_args() 17 | 18 | 19 | def convert_caps(results): 20 | fakecaps = [] 21 | for result in results: 22 | image_id = result['question_id'] 23 | caption = result['text'] 24 | fakecaps.append({"image_id": int(image_id), "caption": caption}) 25 | return fakecaps 26 | 27 | 28 | def get_pred_idx(prediction, choices, options): 29 | """ 30 | Get the index (e.g. 2) from the prediction (e.g. 'C') 31 | """ 32 | if prediction in options[:len(choices)]: 33 | return options.index(prediction) 34 | else: 35 | return -1 36 | return random.choice(range(len(choices))) 37 | 38 | 39 | if __name__ == "__main__": 40 | args = get_args() 41 | 42 | base_dir = args.base_dir 43 | split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[args.split] 44 | problems = json.load(open(os.path.join(base_dir, "problems.json"))) 45 | predictions = [json.loads(line) for line in open(args.result_file)] 46 | predictions = {pred['question_id']: pred for pred in predictions} 47 | split_problems = {idx: problems[idx] for idx in split_indices} 48 | 49 | results = {'correct': [], 'incorrect': []} 50 | sqa_results = {} 51 | sqa_results['acc'] = None 52 | sqa_results['correct'] = None 53 | sqa_results['count'] = None 54 | sqa_results['results'] = {} 55 | sqa_results['outputs'] = {} 56 | 57 | for prob_id, prob in split_problems.items(): 58 | if prob_id not in predictions: 59 | pred = {'text': 'FAILED', 'prompt': 'Unknown'} 60 | pred_text = 'FAILED' 61 | else: 62 | pred = predictions[prob_id] 63 | pred_text = pred['text'] 64 | 65 | if pred_text in args.options: 66 | answer = pred_text 67 | elif len(pred_text) >= 3 and pred_text[0] in args.options and pred_text[1:3] == ". ": 68 | answer = pred_text[0] 69 | else: 70 | pattern = re.compile(r'The answer is ([A-Z]).') 71 | res = pattern.findall(pred_text) 72 | if len(res) == 1: 73 | answer = res[0] # 'A', 'B', ... 
74 | else: 75 | answer = "FAILED" 76 | 77 | pred_idx = get_pred_idx(answer, prob['choices'], args.options) 78 | 79 | analysis = { 80 | 'question_id': prob_id, 81 | 'parsed_ans': answer, 82 | 'ground_truth': args.options[prob['answer']], 83 | 'question': pred['prompt'], 84 | 'pred': pred_text, 85 | 'is_multimodal': '<image>' in pred['prompt'], 86 | } 87 | 88 | sqa_results['results'][prob_id] = get_pred_idx(answer, prob['choices'], args.options) 89 | sqa_results['outputs'][prob_id] = pred_text 90 | 91 | if pred_idx == prob['answer']: 92 | results['correct'].append(analysis) 93 | else: 94 | results['incorrect'].append(analysis) 95 | 96 | correct = len(results['correct']) 97 | total = len(results['correct']) + len(results['incorrect']) 98 | 99 | ###### IMG ###### 100 | multimodal_correct = len([x for x in results['correct'] if x['is_multimodal']]) 101 | multimodal_incorrect = len([x for x in results['incorrect'] if x['is_multimodal']]) 102 | multimodal_total = multimodal_correct + multimodal_incorrect 103 | ###### IMG ###### 104 | 105 | print( 106 | f'Total: {total}, Correct: {correct}, Accuracy: {correct / total * 100:.2f}%, IMG-Accuracy: {multimodal_correct / multimodal_total * 100:.2f}%') 107 | 108 | sqa_results['acc'] = correct / total * 100 109 | sqa_results['correct'] = correct 110 | sqa_results['count'] = total 111 | sqa_results['image_acc'] = multimodal_correct / multimodal_total * 100 112 | sqa_results['image_correct'] = multimodal_correct 113 | sqa_results['image_count'] = multimodal_total 114 | 115 | with open(args.output_file, 'w') as f: 116 | json.dump(results, f, indent=2) 117 | with open(args.output_result, 'w') as f: 118 | json.dump(sqa_results, f, indent=2) 119 | -------------------------------------------------------------------------------- /eval/seed-bench/convert_seed_for_submission.py: -------------------------------------------------------------------------------- 1 | import json 2 | import argparse 3 | 4 | 5 | def get_args(): 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument("--annotation-file", type=str) 8 | parser.add_argument("--result-file", type=str) 9 | parser.add_argument("--result-upload-file", type=str) 10 | return parser.parse_args() 11 | 12 | 13 | def eval_single(result_file, eval_only_type=None): 14 | results = {} 15 | for line in open(result_file): 16 | row = json.loads(line) 17 | results[row['question_id']] = row 18 | 19 | type_counts = {} 20 | correct_counts = {} 21 | for question_data in data['questions']: 22 | if eval_only_type is not None and question_data['data_type'] != eval_only_type: continue 23 | data_type = question_data['question_type_id'] 24 | type_counts[data_type] = type_counts.get(data_type, 0) + 1 25 | try: 26 | question_id = int(question_data['question_id']) 27 | except: 28 | question_id = question_data['question_id'] 29 | if question_id not in results: 30 | correct_counts[data_type] = correct_counts.get(data_type, 0) 31 | continue 32 | row = results[question_id] 33 | if row['text'] == question_data['answer']: 34 | correct_counts[data_type] = correct_counts.get(data_type, 0) + 1 35 | 36 | total_count = 0 37 | total_correct = 0 38 | for data_type in sorted(type_counts.keys()): 39 | accuracy = correct_counts[data_type] / type_counts[data_type] * 100 40 | if eval_only_type is None: 41 | print(f"{ques_type_id_to_name[data_type]}: {accuracy:.2f}%") 42 | 43 | total_count += type_counts[data_type] 44 | total_correct += correct_counts[data_type] 45 | 46 | total_accuracy = total_correct / total_count * 100 47 | if eval_only_type is None: 48 |
print(f"Total accuracy: {total_accuracy:.2f}%") 49 | else: 50 | print(f"{eval_only_type} accuracy: {total_accuracy:.2f}%") 51 | 52 | return results 53 | 54 | 55 | if __name__ == "__main__": 56 | args = get_args() 57 | data = json.load(open(args.annotation_file)) 58 | ques_type_id_to_name = {id: n for n, id in data['question_type'].items()} 59 | 60 | results = eval_single(args.result_file) 61 | eval_single(args.result_file, eval_only_type='image') 62 | eval_single(args.result_file, eval_only_type='video') 63 | 64 | with open(args.result_upload_file, 'w') as fp: 65 | for question in data['questions']: 66 | qid = question['question_id'] 67 | if qid in results: 68 | result = results[qid] 69 | else: 70 | result = results[int(qid)] 71 | fp.write(json.dumps({ 72 | 'question_id': qid, 73 | 'prediction': result['text'] 74 | }) + '\n') 75 | -------------------------------------------------------------------------------- /eval/seed-bench/extract_video_frames.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import numpy as np 4 | import torch 5 | import av 6 | from decord import VideoReader, cpu 7 | from PIL import Image 8 | 9 | from tqdm.auto import tqdm 10 | import concurrent.futures 11 | 12 | num_segments = 1 13 | 14 | # root directory of evaluation dimension 10 15 | dimension10_dir = "eval/seed-bench/SEED-Bench-video/20bn-something-something-v2" 16 | # root directory of evaluation dimension 11 17 | dimension11_dir = "eval/seed-bench/SEED-Bench-video/EPIC-KITCHENS" 18 | # root directory of evaluation dimension 12 19 | dimension12_dir = "eval/seed-bench/SEED-Bench-video/BreakfastII_15fps_qvga_sync" 20 | 21 | 22 | def transform_video(buffer): 23 | try: 24 | buffer = buffer.numpy() 25 | except AttributeError: 26 | try: 27 | buffer = buffer.asnumpy() 28 | except AttributeError: 29 | print("Both buffer.numpy() and buffer.asnumpy() failed.") 30 | buffer = None 31 | images_group = list() 32 | for fid in range(len(buffer)): 33 | images_group.append(Image.fromarray(buffer[fid])) 34 | return images_group 35 | 36 | 37 | def get_index(num_frames, num_segments): 38 | if num_segments > num_frames: 39 | offsets = np.array([ 40 | idx for idx in range(num_frames) 41 | ]) 42 | else: 43 | # uniform sampling 44 | seg_size = float(num_frames - 1) / num_segments 45 | start = int(seg_size / 2) 46 | offsets = np.array([ 47 | start + int(np.round(seg_size * idx)) for idx in range(num_segments) 48 | ]) 49 | return offsets 50 | 51 | 52 | def fetch_images(qa_item): 53 | use_pyav = False 54 | segment = None 55 | if qa_item['question_type_id'] == 10: 56 | data_path = os.path.join(dimension10_dir, qa_item['data_id']) 57 | start = 0.0 58 | end = 0.0 59 | elif qa_item['question_type_id'] == 11: 60 | data_path = os.path.join(dimension11_dir, qa_item['data_id'].split('/')[-1]) 61 | segment = qa_item['segment'] 62 | start, end = segment[0], segment[1] 63 | elif qa_item['question_type_id'] == 12: 64 | data_path = os.path.join(dimension12_dir, qa_item['data_id']) 65 | segment = qa_item['segment'] 66 | start, end = segment[0], segment[1] 67 | use_pyav = True 68 | 69 | if use_pyav: 70 | # using pyav for decoding videos in evaluation dimension 12 71 | reader = av.open(data_path) 72 | frames = [torch.from_numpy(f.to_rgb().to_ndarray()) for f in reader.decode(video=0)] 73 | video_len = len(frames) 74 | start_frame, end_frame = start, end 75 | end_frame = min(end_frame, video_len) 76 | offset = get_index(end_frame - start_frame, num_segments) 77 | frame_indices = offset + 
start_frame 78 | buffer = torch.stack([frames[idx] for idx in frame_indices]) 79 | else: 80 | # using decord for decoding videos in evaluation dimension 10-11 81 | vr = VideoReader(data_path, num_threads=1, ctx=cpu(0)) 82 | video_len = len(vr) 83 | fps = vr.get_avg_fps() 84 | if segment is not None: 85 | # obtain start and end frame for the video segment in evaluation dimension 11 86 | start_frame = int(min(max(start * fps, 0), video_len - 1)) 87 | end_frame = int(min(max(end * fps, 0), video_len - 1)) 88 | tot_frames = int(end_frame - start_frame) 89 | offset = get_index(tot_frames, num_segments) 90 | frame_indices = offset + start_frame 91 | else: 92 | # sample frames of the video in evaluation dimension 10 93 | frame_indices = get_index(video_len - 1, num_segments) 94 | vr.seek(0) 95 | buffer = vr.get_batch(frame_indices) 96 | return transform_video(buffer) 97 | 98 | 99 | def fetch_images_parallel(qa_item): 100 | return qa_item, fetch_images(qa_item) 101 | 102 | 103 | if __name__ == "__main__": 104 | data = json.load(open('eval/seed-bench/SEED-Bench.json')) 105 | video_img_dir = 'eval/seed-bench/SEED-Bench-video-image' 106 | os.makedirs(video_img_dir, exist_ok=True) 107 | ques_type_id_to_name = {id: n for n, id in data['question_type'].items()} 108 | 109 | video_data = [x for x in data['questions'] if x['data_type'] == 'video'] 110 | output = 'temp' 111 | with open(output, 'w') as f, concurrent.futures.ThreadPoolExecutor() as executor: 112 | future_to_images = {executor.submit(fetch_images_parallel, qa_item): qa_item for qa_item in video_data} 113 | for future in tqdm(concurrent.futures.as_completed(future_to_images), total=len(future_to_images)): 114 | qa_item = future_to_images[future] 115 | try: 116 | qa_item, images = future.result() 117 | except Exception as exc: 118 | print(f'{qa_item} generated an exception: {exc}') 119 | else: 120 | img_file = f"{qa_item['question_type_id']}_{qa_item['question_id']}.png" 121 | images[0].save(os.path.join(video_img_dir, img_file)) 122 | -------------------------------------------------------------------------------- /eval/textvqa/eval_textvqa.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import json 4 | import re 5 | 6 | from bunny.eval.m4c_evaluator import TextVQAAccuracyEvaluator 7 | 8 | 9 | def get_args(): 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--annotation-file', type=str) 12 | parser.add_argument('--result-file', type=str) 13 | parser.add_argument('--result-dir', type=str) 14 | return parser.parse_args() 15 | 16 | 17 | def prompt_processor(prompt): 18 | if prompt.startswith('OCR tokens: '): 19 | pattern = r"Question: (.*?) 
Short answer:" 20 | match = re.search(pattern, prompt, re.DOTALL) 21 | question = match.group(1) 22 | elif 'Reference OCR token: ' in prompt and len(prompt.split('\n')) == 3: 23 | if prompt.startswith('Reference OCR token:'): 24 | question = prompt.split('\n')[1] 25 | else: 26 | question = prompt.split('\n')[0] 27 | elif len(prompt.split('\n')) == 2: 28 | question = prompt.split('\n')[0] 29 | else: 30 | assert False 31 | 32 | return question.lower() 33 | 34 | 35 | def eval_single(annotation_file, result_file): 36 | experiment_name = os.path.splitext(os.path.basename(result_file))[0] 37 | print(experiment_name) 38 | annotations = json.load(open(annotation_file))['data'] 39 | annotations = {(annotation['image_id'], annotation['question'].lower()): annotation for annotation in annotations} 40 | results = [json.loads(line) for line in open(result_file)] 41 | 42 | pred_list = [] 43 | for result in results: 44 | annotation = annotations[(result['question_id'], prompt_processor(result['prompt']))] 45 | pred_list.append({ 46 | "pred_answer": result['text'], 47 | "gt_answers": annotation['answers'], 48 | }) 49 | 50 | evaluator = TextVQAAccuracyEvaluator() 51 | print('Samples: {}\nAccuracy: {:.2f}%\n'.format(len(pred_list), 100. * evaluator.eval_pred_list(pred_list))) 52 | 53 | 54 | if __name__ == "__main__": 55 | args = get_args() 56 | 57 | if args.result_file is not None: 58 | eval_single(args.annotation_file, args.result_file) 59 | 60 | if args.result_dir is not None: 61 | for result_file in sorted(os.listdir(args.result_dir)): 62 | if not result_file.endswith('.jsonl'): 63 | print(f'Skipping {result_file}') 64 | continue 65 | eval_single(args.annotation_file, os.path.join(args.result_dir, result_file)) 66 | -------------------------------------------------------------------------------- /eval/viswiz/convert_viswiz_for_submission.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import json 4 | 5 | from bunny.eval.m4c_evaluator import EvalAIAnswerProcessor 6 | 7 | 8 | def parse_args(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--annotation-file', type=str, required=True) 11 | parser.add_argument('--result-file', type=str, required=True) 12 | parser.add_argument('--result-upload-file', type=str, required=True) 13 | return parser.parse_args() 14 | 15 | 16 | if __name__ == '__main__': 17 | 18 | args = parse_args() 19 | 20 | os.makedirs(os.path.dirname(args.result_upload_file), exist_ok=True) 21 | 22 | results = [] 23 | error_line = 0 24 | for line_idx, line in enumerate(open(args.result_file)): 25 | try: 26 | results.append(json.loads(line)) 27 | except: 28 | error_line += 1 29 | results = {x['question_id']: x['text'] for x in results} 30 | test_split = [json.loads(line) for line in open(args.annotation_file)] 31 | split_ids = set([x['question_id'] for x in test_split]) 32 | 33 | print(f'total results: {len(results)}, total split: {len(test_split)}, error_line: {error_line}') 34 | 35 | all_answers = [] 36 | 37 | answer_processor = EvalAIAnswerProcessor() 38 | 39 | for x in test_split: 40 | assert x['question_id'] in results 41 | all_answers.append({ 42 | 'image': x['image'], 43 | 'answer': answer_processor(results[x['question_id']]) 44 | }) 45 | 46 | with open(args.result_upload_file, 'w') as f: 47 | json.dump(all_answers, f) 48 | -------------------------------------------------------------------------------- /eval/vqav2/bunny_vqav2_mscoco_test-dev2015.tar.gz: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/BAAI-DCAI/Bunny/08273acbd83b6836c03dce25cc134eb6b9cedba5/eval/vqav2/bunny_vqav2_mscoco_test-dev2015.tar.gz -------------------------------------------------------------------------------- /eval/vqav2/bunny_vqav2_mscoco_test2015.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BAAI-DCAI/Bunny/08273acbd83b6836c03dce25cc134eb6b9cedba5/eval/vqav2/bunny_vqav2_mscoco_test2015.tar.gz -------------------------------------------------------------------------------- /eval/vqav2/convert_vqav2_for_submission.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import json 4 | 5 | from bunny.eval.m4c_evaluator import EvalAIAnswerProcessor 6 | 7 | 8 | def parse_args(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--dir', type=str, default="./eval/vqav2") 11 | parser.add_argument('--ckpt', type=str, required=True) 12 | parser.add_argument('--split', type=str, required=True) 13 | return parser.parse_args() 14 | 15 | 16 | if __name__ == '__main__': 17 | 18 | args = parse_args() 19 | 20 | src = os.path.join(args.dir, 'answers', args.split, args.ckpt, 'merge.jsonl') 21 | test_split = os.path.join(args.dir, 'bunny_vqav2_mscoco_test2015.jsonl') 22 | dst = os.path.join(args.dir, 'answers_upload', args.split, f'{args.ckpt}.json') 23 | os.makedirs(os.path.dirname(dst), exist_ok=True) 24 | 25 | results = [] 26 | error_line = 0 27 | for line_idx, line in enumerate(open(src)): 28 | try: 29 | results.append(json.loads(line)) 30 | except: 31 | error_line += 1 32 | 33 | results = {x['question_id']: x['text'] for x in results} 34 | test_split = [json.loads(line) for line in open(test_split)] 35 | split_ids = set([x['question_id'] for x in test_split]) 36 | 37 | print(f'total results: {len(results)}, total split: {len(test_split)}, error_line: {error_line}') 38 | 39 | all_answers = [] 40 | 41 | answer_processor = EvalAIAnswerProcessor() 42 | 43 | for x in test_split: 44 | if x['question_id'] not in results: 45 | all_answers.append({ 46 | 'question_id': x['question_id'], 47 | 'answer': '' 48 | }) 49 | else: 50 | all_answers.append({ 51 | 'question_id': x['question_id'], 52 | 'answer': answer_processor(results[x['question_id']]) 53 | }) 54 | 55 | with open(dst, 'w') as f: 56 | json.dump(all_answers, open(dst, 'w')) 57 | -------------------------------------------------------------------------------- /icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BAAI-DCAI/Bunny/08273acbd83b6836c03dce25cc134eb6b9cedba5/icon.png -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "bunny" 7 | version = "1.0" 8 | description = "A family of lightweight multimodal models." 
9 | readme = "README.md" 10 | requires-python = ">=3.8" 11 | classifiers = [ 12 | "Programming Language :: Python :: 3", 13 | "License :: OSI Approved :: Apache Software License", 14 | ] 15 | dependencies = [ 16 | 'accelerate', 'apex', 'bitsandbytes', 'datasets', 'deepspeed', 'einops', 'einops-exts', 17 | 'fastapi', 'flash_attn', 'gradio', 'gradio_client', 'httpx', 'markdown2', 'numpy', 'openpyxl', 18 | 'peft', 'protobuf', 'pydantic', 'pypandoc', 'requests', 'scikit-learn', 'sentencepiece', 'shortuuid', 19 | 'tabulate', 'timm', 'tiktoken', 'tokenizers', 'torch', 'torchvision', 'transformers', 'uvicorn', 'xformers' 20 | ] 21 | 22 | 23 | [project.urls] 24 | "Homepage" = "https://github.com/BAAI-DCAI/Bunny" 25 | "Discussion" = "https://github.com/BAAI-DCAI/Bunny/issues" 26 | 27 | [tool.setuptools.packages.find] 28 | exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"] 29 | 30 | [tool.wheel] 31 | exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"] 32 | -------------------------------------------------------------------------------- /script/batch_inference.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import transformers 3 | from transformers import AutoModelForCausalLM, AutoTokenizer 4 | from PIL import Image 5 | import warnings 6 | 7 | # disable some warnings 8 | transformers.logging.set_verbosity_error() 9 | transformers.logging.disable_progress_bar() 10 | warnings.filterwarnings('ignore') 11 | 12 | # set device 13 | device = 'cuda' # or cpu 14 | torch.set_default_device(device) 15 | 16 | model_name = 'BAAI/Bunny-v1_1-Llama-3-8B-V' # or 'BAAI/Bunny-Llama-3-8B-V' or 'BAAI/Bunny-v1_1-4B' or 'BAAI/Bunny-v1_0-4B' or 'BAAI/Bunny-v1_0-3B' or 'BAAI/Bunny-v1_0-3B-zh' or 'BAAI/Bunny-v1_0-2B-zh' 17 | 18 | # create model 19 | model = AutoModelForCausalLM.from_pretrained( 20 | model_name, 21 | torch_dtype=torch.float16, # float32 for cpu 22 | device_map='auto', 23 | trust_remote_code=True) 24 | tokenizer = AutoTokenizer.from_pretrained( 25 | model_name, 26 | trust_remote_code=True) 27 | 28 | # for batch inference 29 | tokenizer.padding_side = "left" 30 | tokenizer.pad_token_id = model.generation_config.pad_token_id 31 | padding_max_length = 128 # customize for your circumstance 32 | tokenizer.add_tokens(['<image>']) 33 | image_token_id = tokenizer.convert_tokens_to_ids('<image>') 34 | 35 | # text prompts 36 | prompts = [ 37 | 'What is the astronaut holding in his hand?', 38 | 'Why is the image funny?', 39 | 'What is the occupation of the person in the picture?', 40 | 'What animal is in the picture?' 41 | ] 42 | texts = [ 43 | f"A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. 
USER: <image>\n{prompt} ASSISTANT:" 44 | for prompt in prompts] 45 | input_ids = torch.tensor( 46 | [tokenizer(text, padding='max_length', max_length=padding_max_length).input_ids for text in texts], 47 | dtype=torch.long).to(device) 48 | input_ids[input_ids == image_token_id] = -200 49 | 50 | # images, sample images can be found in https://huggingface.co/BAAI/Bunny-v1_1-Llama-3-8B-V/tree/main/images 51 | image_paths = [ 52 | 'example_1.png', 53 | 'example_2.png', 54 | 'example_1.png', 55 | 'example_2.png' 56 | ] 57 | images = [Image.open(image_path) for image_path in image_paths] 58 | image_tensor = model.process_images(images, model.config).to(dtype=model.dtype, device=device) 59 | 60 | # generate 61 | output_ids = model.generate( 62 | input_ids, 63 | images=image_tensor, 64 | max_new_tokens=100, 65 | use_cache=True, 66 | repetition_penalty=1.0 # increase this to avoid chattering 67 | ) 68 | 69 | print([ans.strip() for ans in tokenizer.batch_decode(output_ids[:, input_ids.shape[1]:], skip_special_tokens=True)]) 70 | -------------------------------------------------------------------------------- /script/deepspeed/zero2.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "train_micro_batch_size_per_gpu": "auto", 14 | "train_batch_size": "auto", 15 | "gradient_accumulation_steps": "auto", 16 | "zero_optimization": { 17 | "stage": 2, 18 | "overlap_comm": true, 19 | "contiguous_gradients": true, 20 | "sub_group_size": 1e9, 21 | "reduce_bucket_size": "auto" 22 | } 23 | } -------------------------------------------------------------------------------- /script/deepspeed/zero3.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "train_micro_batch_size_per_gpu": "auto", 14 | "train_batch_size": "auto", 15 | "gradient_accumulation_steps": "auto", 16 | "zero_optimization": { 17 | "stage": 3, 18 | "overlap_comm": true, 19 | "contiguous_gradients": true, 20 | "sub_group_size": 1e9, 21 | "reduce_bucket_size": "auto", 22 | "stage3_prefetch_bucket_size": "auto", 23 | "stage3_param_persistence_threshold": "auto", 24 | "stage3_max_live_parameters": 1e9, 25 | "stage3_max_reuse_distance": 1e9, 26 | "stage3_gather_16bit_weights_on_model_save": true 27 | } 28 | } -------------------------------------------------------------------------------- /script/eval/full/cmmmu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SPLIT="val" 4 | MODEL_TYPE=phi-2 5 | TARGET_DIR=bunny-phi-2 6 | 7 | python -m bunny.eval.model_vqa_cmmmu \ 8 | --model-path ./checkpoints-$MODEL_TYPE/$TARGET_DIR \ 9 | --model-type $MODEL_TYPE \ 10 | --data-path ./eval/cmmmu/CMMMU \ 11 | --config-path ./eval/cmmmu/prompt.yaml \ 12 | --output-path ./eval/cmmmu/answers_upload/$SPLIT/$TARGET_DIR.jsonl \ 13 | --split $SPLIT \ 14 | --conv-mode bunny 15 | -------------------------------------------------------------------------------- /script/eval/full/gqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 |
gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | MODEL_TYPE=phi-2 9 | TARGET_DIR=bunny-phi-2 10 | 11 | SPLIT="bunny_gqa_testdev_balanced" 12 | 13 | for IDX in $(seq 0 $((CHUNKS-1))); do 14 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m bunny.eval.model_vqa_loader \ 15 | --model-path ./checkpoints-$MODEL_TYPE/$TARGET_DIR \ 16 | --model-type $MODEL_TYPE \ 17 | --question-file ./eval/gqa/$SPLIT.jsonl \ 18 | --image-folder ./eval/gqa/images \ 19 | --answers-file ./eval/gqa/answers/$SPLIT/$TARGET_DIR/${CHUNKS}_${IDX}.jsonl \ 20 | --num-chunks $CHUNKS \ 21 | --chunk-idx $IDX \ 22 | --temperature 0 \ 23 | --conv-mode bunny & 24 | done 25 | 26 | wait 27 | 28 | output_file=./eval/gqa/answers/$SPLIT/$TARGET_DIR/merge.jsonl 29 | 30 | # Clear out the output file if it exists. 31 | > "$output_file" 32 | 33 | # Loop through the indices and concatenate each file. 34 | for IDX in $(seq 0 $((CHUNKS-1))); do 35 | cat ./eval/gqa/answers/$SPLIT/$TARGET_DIR/${CHUNKS}_${IDX}.jsonl >> "$output_file" 36 | done 37 | 38 | python eval/gqa/convert_gqa_for_eval.py --src $output_file --dst ./eval/gqa/testdev_balanced_predictions.json 39 | 40 | cd eval/gqa 41 | python eval_gqa.py --tier testdev_balanced 42 | -------------------------------------------------------------------------------- /script/eval/full/mmbench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SPLIT="MMBench_DEV_EN_legacy" 4 | LANG=en 5 | MODEL_TYPE=phi-2 6 | TARGET_DIR=bunny-phi-2 7 | 8 | 9 | python -m bunny.eval.model_vqa_mmbench \ 10 | --model-path ./checkpoints-$MODEL_TYPE/$TARGET_DIR \ 11 | --model-type $MODEL_TYPE \ 12 | --question-file ./eval/mmbench/$SPLIT.tsv \ 13 | --answers-file ./eval/mmbench/answers/$SPLIT/$TARGET_DIR.jsonl \ 14 | --lang $LANG \ 15 | --single-pred-prompt \ 16 | --temperature 0 \ 17 | --conv-mode bunny 18 | 19 | mkdir -p eval/mmbench/answers_upload/$SPLIT 20 | 21 | python eval/mmbench/convert_mmbench_for_submission.py \ 22 | --annotation-file ./eval/mmbench/$SPLIT.tsv \ 23 | --result-dir ./eval/mmbench/answers/$SPLIT \ 24 | --upload-dir ./eval/mmbench/answers_upload/$SPLIT \ 25 | --experiment $TARGET_DIR 26 | -------------------------------------------------------------------------------- /script/eval/full/mme.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MODEL_TYPE=phi-2 4 | TARGET_DIR=bunny-phi-2 5 | 6 | python -m bunny.eval.model_vqa_loader \ 7 | --model-path ./checkpoints-$MODEL_TYPE/$TARGET_DIR \ 8 | --model-type $MODEL_TYPE \ 9 | --image-folder ./eval/mme/MME_Benchmark_release_version \ 10 | --question-file ./eval/mme/bunny_mme.jsonl \ 11 | --answers-file ./eval/mme/answers/$TARGET_DIR.jsonl \ 12 | --temperature 0 \ 13 | --conv-mode bunny 14 | 15 | cd ./eval/mme 16 | 17 | python convert_answer_to_mme.py --experiment $TARGET_DIR 18 | 19 | python calculation_mme.py --results_dir answers_upload/$TARGET_DIR \ 20 | | tee 2>&1 answers_upload/$TARGET_DIR/res.txt 21 | -------------------------------------------------------------------------------- /script/eval/full/mmmu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SPLIT="test" 4 | MODEL_TYPE=phi-2 5 | TARGET_DIR=bunny-phi-2 6 | 7 | python -m bunny.eval.model_vqa_mmmu \ 8 | --model-path ./checkpoints-$MODEL_TYPE/$TARGET_DIR \ 9 | --model-type $MODEL_TYPE \ 10 | --data-path ./eval/mmmu/MMMU \ 11 | --config-path 
./eval/mmmu/config.yaml \ 12 | --output-path ./eval/mmmu/answers_upload/$SPLIT/$TARGET_DIR.json \ 13 | --split $SPLIT \ 14 | --conv-mode bunny 15 | -------------------------------------------------------------------------------- /script/eval/full/mmvet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MODEL_TYPE=phi-2 4 | TARGET_DIR=bunny-phi-2 5 | 6 | python -m bunny.eval.model_vqa \ 7 | --model-path ./checkpoints-$MODEL_TYPE/$TARGET_DIR \ 8 | --model-type $MODEL_TYPE \ 9 | --question-file ./eval/mm-vet/bunny-mm-vet.jsonl \ 10 | --image-folder ./eval/mm-vet/images \ 11 | --answers-file ./eval/mm-vet/answers/$TARGET_DIR.jsonl \ 12 | --temperature 0 \ 13 | --conv-mode bunny 14 | 15 | mkdir -p ./eval/mm-vet/answers_upload 16 | 17 | python ./eval/mm-vet/convert_mmvet_for_eval.py \ 18 | --src ./eval/mm-vet/answers/$TARGET_DIR.jsonl \ 19 | --dst ./eval/mm-vet/answers_upload/$TARGET_DIR.json -------------------------------------------------------------------------------- /script/eval/full/pope.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MODEL_TYPE=phi-2 4 | TARGET_DIR=bunny-phi-2 5 | 6 | python -m bunny.eval.model_vqa_loader \ 7 | --model-path ./checkpoints-$MODEL_TYPE/$TARGET_DIR \ 8 | --model-type $MODEL_TYPE \ 9 | --question-file ./eval/pope/bunny_pope_test.jsonl \ 10 | --image-folder ./eval/pope/val2014 \ 11 | --answers-file ./eval/pope/answers/$TARGET_DIR.jsonl \ 12 | --temperature 0 \ 13 | --conv-mode bunny 14 | 15 | python eval/pope/eval_pope.py \ 16 | --annotation-dir ./eval/pope/coco \ 17 | --question-file ./eval/pope/bunny_pope_test.jsonl \ 18 | --result-file ./eval/pope/answers/$TARGET_DIR.jsonl 19 | -------------------------------------------------------------------------------- /script/eval/full/scienceqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MODEL_TYPE=phi-2 4 | TARGET_DIR=bunny-phi-2 5 | 6 | python -m bunny.eval.model_vqa_science \ 7 | --model-path ./checkpoints-$MODEL_TYPE/$TARGET_DIR \ 8 | --model-type $MODEL_TYPE \ 9 | --question-file ./eval/scienceqa/bunny_test_CQM-A.json \ 10 | --image-folder ./eval/scienceqa/test \ 11 | --answers-file ./eval/scienceqa/answers/$TARGET_DIR.jsonl \ 12 | --single-pred-prompt \ 13 | --temperature 0 \ 14 | --conv-mode bunny 15 | 16 | mkdir -p ./eval/scienceqa/outputs/ 17 | mkdir -p ./eval/scienceqa/results/ 18 | 19 | python ./eval/scienceqa/eval_science_qa.py \ 20 | --base-dir ./eval/scienceqa \ 21 | --result-file ./eval/scienceqa/answers/$TARGET_DIR.jsonl \ 22 | --output-file ./eval/scienceqa/outputs/$TARGET_DIR.jsonl \ 23 | --output-result ./eval/scienceqa/results/$TARGET_DIR.json -------------------------------------------------------------------------------- /script/eval/full/seedbench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | 9 | MODEL_TYPE=phi-2 10 | TARGET_DIR=bunny-phi-2 11 | 12 | for IDX in $(seq 0 $((CHUNKS-1))); do 13 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m bunny.eval.model_vqa_loader \ 14 | --model-path ./checkpoints-$MODEL_TYPE/$TARGET_DIR \ 15 | --model-type $MODEL_TYPE \ 16 | --question-file ./eval/seed-bench/bunny-seed-bench.jsonl \ 17 | --image-folder ./eval/seed-bench \ 18 | --answers-file 
./eval/seed-bench/answers/$TARGET_DIR/${CHUNKS}_${IDX}.jsonl \ 19 | --num-chunks $CHUNKS \ 20 | --chunk-idx $IDX \ 21 | --temperature 0 \ 22 | --conv-mode bunny & 23 | done 24 | 25 | wait 26 | 27 | output_file=./eval/seed-bench/answers/$TARGET_DIR/merge.jsonl 28 | 29 | # Clear out the output file if it exists. 30 | > "$output_file" 31 | 32 | # Loop through the indices and concatenate each file. 33 | for IDX in $(seq 0 $((CHUNKS-1))); do 34 | cat ./eval/seed-bench/answers/$TARGET_DIR/${CHUNKS}_${IDX}.jsonl >> "$output_file" 35 | done 36 | 37 | mkdir -p ./eval/seed-bench/answers_upload 38 | mkdir -p ./eval/seed-bench/scores 39 | 40 | # Evaluate 41 | python ./eval/seed-bench/convert_seed_for_submission.py \ 42 | --annotation-file ./eval/seed-bench/SEED-Bench.json \ 43 | --result-file $output_file \ 44 | --result-upload-file ./eval/seed-bench/answers_upload/$TARGET_DIR.jsonl | tee 2>&1 ./eval/seed-bench/scores/$TARGET_DIR.txt -------------------------------------------------------------------------------- /script/eval/full/vqav2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | MODEL_TYPE=phi-2 9 | TARGET_DIR=bunny-phi-2 10 | 11 | SPLIT="bunny_vqav2_mscoco_test-dev2015" 12 | 13 | for IDX in $(seq 0 $((CHUNKS-1))); do 14 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m bunny.eval.model_vqa_loader \ 15 | --model-path ./checkpoints-$MODEL_TYPE/$TARGET_DIR \ 16 | --model-type $MODEL_TYPE \ 17 | --question-file ./eval/vqav2/$SPLIT.jsonl \ 18 | --image-folder ./eval/vqav2/test2015 \ 19 | --answers-file ./eval/vqav2/answers/$SPLIT/$TARGET_DIR/${CHUNKS}_${IDX}.jsonl \ 20 | --num-chunks $CHUNKS \ 21 | --chunk-idx $IDX \ 22 | --temperature 0 \ 23 | --conv-mode bunny & 24 | done 25 | 26 | wait 27 | 28 | output_file=./eval/vqav2/answers/$SPLIT/$TARGET_DIR/merge.jsonl 29 | 30 | # Clear out the output file if it exists. 31 | > "$output_file" 32 | 33 | # Loop through the indices and concatenate each file. 
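# e.g. with CUDA_VISIBLE_DEVICES=0,1,2,3 this concatenates 4_0.jsonl ... 4_3.jsonl into merge.jsonl, which convert_vqav2_for_submission.py below turns into the upload JSON.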
34 | for IDX in $(seq 0 $((CHUNKS-1))); do 35 | cat ./eval/vqav2/answers/$SPLIT/$TARGET_DIR/${CHUNKS}_${IDX}.jsonl >> "$output_file" 36 | done 37 | 38 | python eval/vqav2/convert_vqav2_for_submission.py --split $SPLIT --ckpt $TARGET_DIR 39 | 40 | -------------------------------------------------------------------------------- /script/eval/lora/cmmmu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SPLIT="val" 4 | MODEL_TYPE=phi-2 5 | MODEL_BASE=/path/to/base_llm_model 6 | TARGET_DIR=bunny-lora-phi-2 7 | 8 | python -m bunny.eval.model_vqa_cmmmu \ 9 | --model-path ./checkpoints-$MODEL_TYPE/$TARGET_DIR \ 10 | --model-base $MODEL_BASE \ 11 | --model-type $MODEL_TYPE \ 12 | --data-path ./eval/cmmmu/CMMMU \ 13 | --config-path ./eval/cmmmu/prompt.yaml \ 14 | --output-path ./eval/cmmmu/answers_upload/$SPLIT/$TARGET_DIR.jsonl \ 15 | --split $SPLIT \ 16 | --conv-mode bunny 17 | -------------------------------------------------------------------------------- /script/eval/lora/gqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | MODEL_TYPE=phi-2 9 | MODEL_BASE=/path/to/base_llm_model 10 | TARGET_DIR=bunny-lora-phi-2 11 | 12 | SPLIT="bunny_gqa_testdev_balanced" 13 | 14 | for IDX in $(seq 0 $((CHUNKS-1))); do 15 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m bunny.eval.model_vqa_loader \ 16 | --model-path ./checkpoints-$MODEL_TYPE/$TARGET_DIR \ 17 | --model-base $MODEL_BASE \ 18 | --model-type $MODEL_TYPE \ 19 | --question-file ./eval/gqa/$SPLIT.jsonl \ 20 | --image-folder ./eval/gqa/images \ 21 | --answers-file ./eval/gqa/answers/$SPLIT/$TARGET_DIR/${CHUNKS}_${IDX}.jsonl \ 22 | --num-chunks $CHUNKS \ 23 | --chunk-idx $IDX \ 24 | --temperature 0 \ 25 | --conv-mode bunny & 26 | done 27 | 28 | wait 29 | 30 | output_file=./eval/gqa/answers/$SPLIT/$TARGET_DIR/merge.jsonl 31 | 32 | # Clear out the output file if it exists. 33 | > "$output_file" 34 | 35 | # Loop through the indices and concatenate each file. 
36 | for IDX in $(seq 0 $((CHUNKS-1))); do 37 | cat ./eval/gqa/answers/$SPLIT/$TARGET_DIR/${CHUNKS}_${IDX}.jsonl >> "$output_file" 38 | done 39 | 40 | python eval/gqa/convert_gqa_for_eval.py --src $output_file --dst ./eval/gqa/testdev_balanced_predictions.json 41 | 42 | cd eval/gqa 43 | python eval_gqa.py --tier testdev_balanced 44 | -------------------------------------------------------------------------------- /script/eval/lora/mmbench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SPLIT="MMBench_DEV_EN_legacy" 4 | LANG=en 5 | MODEL_TYPE=phi-2 6 | MODEL_BASE=/path/to/base_llm_model 7 | TARGET_DIR=bunny-lora-phi-2 8 | 9 | 10 | python -m bunny.eval.model_vqa_mmbench \ 11 | --model-path ./checkpoints-$MODEL_TYPE/$TARGET_DIR \ 12 | --model-base $MODEL_BASE \ 13 | --model-type $MODEL_TYPE \ 14 | --question-file ./eval/mmbench/$SPLIT.tsv \ 15 | --answers-file ./eval/mmbench/answers/$SPLIT/$TARGET_DIR.jsonl \ 16 | --lang $LANG \ 17 | --single-pred-prompt \ 18 | --temperature 0 \ 19 | --conv-mode bunny 20 | 21 | mkdir -p eval/mmbench/answers_upload/$SPLIT 22 | 23 | python eval/mmbench/convert_mmbench_for_submission.py \ 24 | --annotation-file ./eval/mmbench/$SPLIT.tsv \ 25 | --result-dir ./eval/mmbench/answers/$SPLIT \ 26 | --upload-dir ./eval/mmbench/answers_upload/$SPLIT \ 27 | --experiment $TARGET_DIR 28 | -------------------------------------------------------------------------------- /script/eval/lora/mme.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MODEL_TYPE=phi-2 4 | MODEL_BASE=/path/to/base_llm_model 5 | TARGET_DIR=bunny-lora-phi-2 6 | 7 | python -m bunny.eval.model_vqa_loader \ 8 | --model-path ./checkpoints-$MODEL_TYPE/$TARGET_DIR \ 9 | --model-base $MODEL_BASE \ 10 | --model-type $MODEL_TYPE \ 11 | --image-folder ./eval/mme/MME_Benchmark_release_version \ 12 | --question-file ./eval/mme/bunny_mme.jsonl \ 13 | --answers-file ./eval/mme/answers/$TARGET_DIR.jsonl \ 14 | --temperature 0 \ 15 | --conv-mode bunny 16 | 17 | cd ./eval/mme 18 | 19 | python convert_answer_to_mme.py --experiment $TARGET_DIR 20 | 21 | python calculation_mme.py --results_dir answers_upload/$TARGET_DIR \ 22 | | tee 2>&1 answers_upload/$TARGET_DIR/res.txt 23 | -------------------------------------------------------------------------------- /script/eval/lora/mmmu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SPLIT="test" 4 | MODEL_TYPE=phi-2 5 | MODEL_BASE=/path/to/base_llm_model 6 | TARGET_DIR=bunny-lora-phi-2 7 | 8 | python -m bunny.eval.model_vqa_mmmu \ 9 | --model-path ./checkpoints-$MODEL_TYPE/$TARGET_DIR \ 10 | --model-base $MODEL_BASE \ 11 | --model-type $MODEL_TYPE \ 12 | --data-path ./eval/mmmu/MMMU \ 13 | --config-path ./eval/mmmu/config.yaml \ 14 | --output-path ./eval/mmmu/answers_upload/$SPLIT/$TARGET_DIR.json \ 15 | --split $SPLIT \ 16 | --conv-mode bunny 17 | -------------------------------------------------------------------------------- /script/eval/lora/mmvet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MODEL_TYPE=phi-2 4 | MODEL_BASE=/path/to/base_llm_model 5 | TARGET_DIR=bunny-lora-phi-2 6 | 7 | python -m bunny.eval.model_vqa \ 8 | --model-path ./checkpoints-$MODEL_TYPE/$TARGET_DIR \ 9 | --model-base $MODEL_BASE \ 10 | --model-type $MODEL_TYPE \ 11 | --question-file ./eval/mm-vet/bunny-mm-vet.jsonl \ 12 | --image-folder ./eval/mm-vet/images \ 13 | 
--answers-file ./eval/mm-vet/answers/$TARGET_DIR.jsonl \ 14 | --temperature 0 \ 15 | --conv-mode bunny 16 | 17 | mkdir -p ./eval/mm-vet/answers_upload 18 | 19 | python ./eval/mm-vet/convert_mmvet_for_eval.py \ 20 | --src ./eval/mm-vet/answers/$TARGET_DIR.jsonl \ 21 | --dst ./eval/mm-vet/answers_upload/$TARGET_DIR.json -------------------------------------------------------------------------------- /script/eval/lora/pope.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MODEL_TYPE=phi-2 4 | MODEL_BASE=/path/to/base_llm_model 5 | TARGET_DIR=bunny-lora-phi-2 6 | 7 | python -m bunny.eval.model_vqa_loader \ 8 | --model-path ./checkpoints-$MODEL_TYPE/$TARGET_DIR \ 9 | --model-base $MODEL_BASE \ 10 | --model-type $MODEL_TYPE \ 11 | --question-file ./eval/pope/bunny_pope_test.jsonl \ 12 | --image-folder ./eval/pope/val2014 \ 13 | --answers-file ./eval/pope/answers/$TARGET_DIR.jsonl \ 14 | --temperature 0 \ 15 | --conv-mode bunny 16 | 17 | python eval/pope/eval_pope.py \ 18 | --annotation-dir ./eval/pope/coco \ 19 | --question-file ./eval/pope/bunny_pope_test.jsonl \ 20 | --result-file ./eval/pope/answers/$TARGET_DIR.jsonl 21 | -------------------------------------------------------------------------------- /script/eval/lora/scienceqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MODEL_TYPE=phi-2 4 | MODEL_BASE=/path/to/base_llm_model 5 | TARGET_DIR=bunny-lora-phi-2 6 | 7 | python -m bunny.eval.model_vqa_science \ 8 | --model-path ./checkpoints-$MODEL_TYPE/$TARGET_DIR \ 9 | --model-base $MODEL_BASE \ 10 | --model-type $MODEL_TYPE \ 11 | --question-file ./eval/scienceqa/bunny_test_CQM-A.json \ 12 | --image-folder ./eval/scienceqa/test \ 13 | --answers-file ./eval/scienceqa/answers/$TARGET_DIR.jsonl \ 14 | --single-pred-prompt \ 15 | --temperature 0 \ 16 | --conv-mode bunny 17 | 18 | mkdir -p ./eval/scienceqa/outputs/ 19 | mkdir -p ./eval/scienceqa/results/ 20 | 21 | python ./eval/scienceqa/eval_science_qa.py \ 22 | --base-dir ./eval/scienceqa \ 23 | --result-file ./eval/scienceqa/answers/$TARGET_DIR.jsonl \ 24 | --output-file ./eval/scienceqa/outputs/$TARGET_DIR.jsonl \ 25 | --output-result ./eval/scienceqa/results/$TARGET_DIR.json -------------------------------------------------------------------------------- /script/eval/lora/seedbench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | 9 | MODEL_TYPE=phi-2 10 | MODEL_BASE=/path/to/base_llm_model 11 | TARGET_DIR=bunny-lora-phi-2 12 | 13 | for IDX in $(seq 0 $((CHUNKS-1))); do 14 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m bunny.eval.model_vqa_loader \ 15 | --model-path ./checkpoints-$MODEL_TYPE/$TARGET_DIR \ 16 | --model-base $MODEL_BASE \ 17 | --model-type $MODEL_TYPE \ 18 | --question-file ./eval/seed-bench/bunny-seed-bench.jsonl \ 19 | --image-folder ./eval/seed-bench \ 20 | --answers-file ./eval/seed-bench/answers/$TARGET_DIR/${CHUNKS}_${IDX}.jsonl \ 21 | --num-chunks $CHUNKS \ 22 | --chunk-idx $IDX \ 23 | --temperature 0 \ 24 | --conv-mode bunny & 25 | done 26 | 27 | wait 28 | 29 | output_file=./eval/seed-bench/answers/$TARGET_DIR/merge.jsonl 30 | 31 | # Clear out the output file if it exists. 32 | > "$output_file" 33 | 34 | # Loop through the indices and concatenate each file. 
35 | for IDX in $(seq 0 $((CHUNKS-1))); do 36 | cat ./eval/seed-bench/answers/$TARGET_DIR/${CHUNKS}_${IDX}.jsonl >> "$output_file" 37 | done 38 | 39 | mkdir -p ./eval/seed-bench/answers_upload 40 | mkdir -p ./eval/seed-bench/scores 41 | 42 | # Evaluate 43 | python ./eval/seed-bench/convert_seed_for_submission.py \ 44 | --annotation-file ./eval/seed-bench/SEED-Bench.json \ 45 | --result-file $output_file \ 46 | --result-upload-file ./eval/seed-bench/answers_upload/$TARGET_DIR.jsonl | tee 2>&1 ./eval/seed-bench/scores/$TARGET_DIR.txt -------------------------------------------------------------------------------- /script/eval/lora/vqav2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | MODEL_TYPE=phi-2 9 | MODEL_BASE=/path/to/base_llm_model 10 | TARGET_DIR=bunny-lora-phi-2 11 | 12 | SPLIT="bunny_vqav2_mscoco_test-dev2015" 13 | 14 | for IDX in $(seq 0 $((CHUNKS-1))); do 15 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m bunny.eval.model_vqa_loader \ 16 | --model-path ./checkpoints-$MODEL_TYPE/$TARGET_DIR \ 17 | --model-base $MODEL_BASE \ 18 | --model-type $MODEL_TYPE \ 19 | --question-file ./eval/vqav2/$SPLIT.jsonl \ 20 | --image-folder ./eval/vqav2/test2015 \ 21 | --answers-file ./eval/vqav2/answers/$SPLIT/$TARGET_DIR/${CHUNKS}_${IDX}.jsonl \ 22 | --num-chunks $CHUNKS \ 23 | --chunk-idx $IDX \ 24 | --temperature 0 \ 25 | --conv-mode bunny & 26 | done 27 | 28 | wait 29 | 30 | output_file=./eval/vqav2/answers/$SPLIT/$TARGET_DIR/merge.jsonl 31 | 32 | # Clear out the output file if it exists. 33 | > "$output_file" 34 | 35 | # Loop through the indices and concatenate each file. 
36 | for IDX in $(seq 0 $((CHUNKS-1))); do 37 | cat ./eval/vqav2/answers/$SPLIT/$TARGET_DIR/${CHUNKS}_${IDX}.jsonl >> "$output_file" 38 | done 39 | 40 | python eval/vqav2/convert_vqav2_for_submission.py --split $SPLIT --ckpt $TARGET_DIR 41 | 42 | -------------------------------------------------------------------------------- /script/merge_lora_weights.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | from bunny.model.builder import load_pretrained_model 4 | from bunny.util.mm_utils import get_model_name_from_path 5 | 6 | 7 | def merge_lora(args): 8 | model_path = os.path.expanduser(args.model_path) 9 | model_name = get_model_name_from_path(model_path) 10 | tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name, 11 | args.model_type) 12 | 13 | model.save_pretrained(args.save_model_path) 14 | tokenizer.save_pretrained(args.save_model_path) 15 | 16 | 17 | if __name__ == "__main__": 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument("--model-path", type=str, required=True) 20 | parser.add_argument("--model-base", type=str, required=True) 21 | parser.add_argument("--model-type", type=str, required=True) 22 | parser.add_argument("--save-model-path", type=str, required=True) 23 | 24 | args = parser.parse_args() 25 | 26 | merge_lora(args) 27 | -------------------------------------------------------------------------------- /script/train/finetune_full.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MODEL_TYPE=phi-2 4 | 5 | PRETRAIN_DIR=bunny-$MODEL_TYPE-pretrain 6 | OUTPUT_DIR=bunny-$MODEL_TYPE 7 | 8 | mkdir -p ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR 9 | 10 | deepspeed bunny/train/train.py \ 11 | --deepspeed ./script/deepspeed/zero3.json \ 12 | --model_name_or_path /path/to/base_llm_model \ 13 | --model_type $MODEL_TYPE \ 14 | --version bunny \ 15 | --data_path ./data/finetune/bunny_695k.json \ 16 | --image_folder ./data/finetune/images \ 17 | --vision_tower /path/to/siglip-so400m-patch14-384 \ 18 | --pretrain_mm_mlp_adapter ./checkpoints-pretrain/$PRETRAIN_DIR/mm_projector.bin \ 19 | --mm_projector_type mlp2x_gelu \ 20 | --image_aspect_ratio pad \ 21 | --group_by_modality_length False \ 22 | --bf16 True \ 23 | --output_dir ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR \ 24 | --num_train_epochs 1 \ 25 | --per_device_train_batch_size 8 \ 26 | --per_device_eval_batch_size 4 \ 27 | --gradient_accumulation_steps 2 \ 28 | --evaluation_strategy "no" \ 29 | --save_strategy "steps" \ 30 | --save_steps 500 \ 31 | --save_total_limit 1 \ 32 | --learning_rate 2e-5 \ 33 | --weight_decay 0. 
\ 34 | --warmup_ratio 0.03 \ 35 | --lr_scheduler_type "cosine" \ 36 | --logging_steps 1 \ 37 | --tf32 True \ 38 | --model_max_length 2048 \ 39 | --gradient_checkpointing True \ 40 | --dataloader_num_workers 4 \ 41 | --lazy_preprocess True \ 42 | --report_to none | tee 2>&1 ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR/log.txt 43 | -------------------------------------------------------------------------------- /script/train/finetune_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MODEL_TYPE=phi-2 4 | 5 | PRETRAIN_DIR=bunny-$MODEL_TYPE-pretrain 6 | OUTPUT_DIR=bunny-lora-$MODEL_TYPE 7 | 8 | mkdir -p ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR 9 | 10 | deepspeed bunny/train/train.py \ 11 | --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 2e-5 \ 12 | --deepspeed ./script/deepspeed/zero3.json \ 13 | --model_name_or_path /path/to/base_llm_model \ 14 | --model_type $MODEL_TYPE \ 15 | --version bunny \ 16 | --data_path ./data/finetune/bunny_695k.json \ 17 | --image_folder ./data/finetune/images \ 18 | --vision_tower /path/to/siglip-so400m-patch14-384 \ 19 | --pretrain_mm_mlp_adapter ./checkpoints-pretrain/$PRETRAIN_DIR/mm_projector.bin \ 20 | --mm_projector_type mlp2x_gelu \ 21 | --image_aspect_ratio pad \ 22 | --group_by_modality_length False \ 23 | --bf16 True \ 24 | --output_dir ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR \ 25 | --num_train_epochs 1 \ 26 | --per_device_train_batch_size 8 \ 27 | --per_device_eval_batch_size 4 \ 28 | --gradient_accumulation_steps 2 \ 29 | --evaluation_strategy "no" \ 30 | --save_strategy "steps" \ 31 | --save_steps 500 \ 32 | --save_total_limit 1 \ 33 | --learning_rate 2e-4 \ 34 | --weight_decay 0. \ 35 | --warmup_ratio 0.03 \ 36 | --lr_scheduler_type "cosine" \ 37 | --logging_steps 1 \ 38 | --tf32 True \ 39 | --model_max_length 2048 \ 40 | --gradient_checkpointing True \ 41 | --dataloader_num_workers 4 \ 42 | --lazy_preprocess True \ 43 | --report_to none | tee 2>&1 ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR/log.txt -------------------------------------------------------------------------------- /script/train/pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MODEL_TYPE=phi-2 4 | OUTPUT_DIR=bunny-$MODEL_TYPE-pretrain 5 | 6 | mkdir -p ./checkpoints-pretrain/$OUTPUT_DIR 7 | 8 | deepspeed bunny/train/train.py \ 9 | --deepspeed ./script/deepspeed/zero2.json \ 10 | --model_name_or_path /path/to/base_llm_model \ 11 | --model_type $MODEL_TYPE \ 12 | --version plain \ 13 | --data_path ./data/pretrain/bunny_pretrain_laion_2m.json \ 14 | --image_folder ./data/pretrain/images \ 15 | --vision_tower /path/to/siglip-so400m-patch14-384 \ 16 | --mm_projector_type mlp2x_gelu \ 17 | --tune_mm_mlp_adapter True \ 18 | --image_aspect_ratio square \ 19 | --bf16 True \ 20 | --output_dir ./checkpoints-pretrain/$OUTPUT_DIR \ 21 | --num_train_epochs 1 \ 22 | --per_device_train_batch_size 8 \ 23 | --per_device_eval_batch_size 4 \ 24 | --gradient_accumulation_steps 4 \ 25 | --evaluation_strategy "no" \ 26 | --save_strategy "steps" \ 27 | --save_steps 24000 \ 28 | --save_total_limit 1 \ 29 | --learning_rate 5e-4 \ 30 | --weight_decay 0. 
\ 31 | --warmup_ratio 0.03 \ 32 | --lr_scheduler_type "cosine" \ 33 | --logging_steps 1 \ 34 | --tf32 True \ 35 | --model_max_length 2048 \ 36 | --gradient_checkpointing True \ 37 | --dataloader_num_workers 4 \ 38 | --lazy_preprocess True \ 39 | --report_to none | tee 2>&1 ./checkpoints-pretrain/$OUTPUT_DIR/log.txt 40 | -------------------------------------------------------------------------------- /script/train/tutorials/Bunny-Llama-3-8B-V.md: -------------------------------------------------------------------------------- 1 | # Training Tutorial of Bunny-Llama-3-8B-V 2 | 3 | ![Bunny-Llama-3-8B-V](assets/Bunny-Llama-3-8B-V.png) 4 | 5 | ## Pretrain 6 | 7 | ```shell 8 | #!/bin/bash 9 | 10 | MODEL_TYPE=llama3-8b 11 | OUTPUT_DIR=bunny-$MODEL_TYPE-pretrain 12 | 13 | mkdir -p ./checkpoints-pretrain/$OUTPUT_DIR 14 | 15 | deepspeed bunny/train/train.py \ 16 | --deepspeed ./script/deepspeed/zero2.json \ 17 | --model_name_or_path /path/to/meta-llama/Meta-Llama-3-8B-Instruct \ 18 | --model_type $MODEL_TYPE \ 19 | --version plain \ 20 | --data_path ./data/pretrain/bunny_pretrain_laion_2m.json \ 21 | --image_folder ./data/pretrain/images \ 22 | --vision_tower /path/to/siglip-so400m-patch14-384 \ 23 | --mm_projector_type mlp2x_gelu \ 24 | --tune_mm_mlp_adapter True \ 25 | --image_aspect_ratio square \ 26 | --bf16 True \ 27 | --output_dir ./checkpoints-pretrain/$OUTPUT_DIR \ 28 | --num_train_epochs 1 \ 29 | --per_device_train_batch_size 8 \ 30 | --per_device_eval_batch_size 4 \ 31 | --gradient_accumulation_steps 4 \ 32 | --evaluation_strategy "no" \ 33 | --save_strategy "steps" \ 34 | --save_steps 24000 \ 35 | --save_total_limit 1 \ 36 | --learning_rate 1e-3 \ 37 | --weight_decay 0. \ 38 | --warmup_ratio 0.03 \ 39 | --lr_scheduler_type "cosine" \ 40 | --logging_steps 1 \ 41 | --tf32 True \ 42 | --model_max_length 2048 \ 43 | --gradient_checkpointing True \ 44 | --dataloader_num_workers 4 \ 45 | --lazy_preprocess True \ 46 | --report_to none | tee 2>&1 ./checkpoints-pretrain/$OUTPUT_DIR/log.txt 47 | ``` 48 | 49 | ## Visual Instruction Tuning 50 | 51 | ### Recipe-1 52 | 53 | ```shell 54 | #!/bin/bash 55 | 56 | MODEL_TYPE=llama3-8b 57 | 58 | PRETRAIN_DIR=bunny-$MODEL_TYPE-pretrain 59 | OUTPUT_DIR=bunny-lora-$MODEL_TYPE-recipe-1 60 | 61 | mkdir -p ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR 62 | 63 | deepspeed bunny/train/train.py \ 64 | --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 2e-5 \ 65 | --deepspeed ./script/deepspeed/zero3.json \ 66 | --model_name_or_path /path/to/meta-llama/Meta-Llama-3-8B-Instruct \ 67 | --model_type $MODEL_TYPE \ 68 | --version llama \ 69 | --data_path ./data/finetune/bunny_695k.json \ 70 | --image_folder ./data/finetune/images \ 71 | --vision_tower /path/to/siglip-so400m-patch14-384 \ 72 | --pretrain_mm_mlp_adapter ./checkpoints-pretrain/$PRETRAIN_DIR/mm_projector.bin \ 73 | --mm_projector_type mlp2x_gelu \ 74 | --image_aspect_ratio pad \ 75 | --group_by_modality_length False \ 76 | --bf16 True \ 77 | --output_dir ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR \ 78 | --num_train_epochs 1 \ 79 | --per_device_train_batch_size 8 \ 80 | --per_device_eval_batch_size 4 \ 81 | --gradient_accumulation_steps 2 \ 82 | --evaluation_strategy "no" \ 83 | --save_strategy "steps" \ 84 | --save_steps 500 \ 85 | --save_total_limit 1 \ 86 | --learning_rate 2e-4 \ 87 | --weight_decay 0. 
\ 88 | --warmup_ratio 0.03 \ 89 | --lr_scheduler_type "cosine" \ 90 | --logging_steps 1 \ 91 | --tf32 True \ 92 | --model_max_length 2048 \ 93 | --gradient_checkpointing True \ 94 | --dataloader_num_workers 4 \ 95 | --lazy_preprocess True \ 96 | --report_to none | tee 2>&1 ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR/log.txt 97 | ``` 98 | 99 | ### Recipe-2 100 | 101 | ```shell 102 | #!/bin/bash 103 | 104 | MODEL_TYPE=llama3-8b 105 | 106 | PRETRAIN_DIR=bunny-$MODEL_TYPE-pretrain 107 | OUTPUT_DIR=bunny-lora-$MODEL_TYPE-recipe-2 108 | 109 | mkdir -p ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR 110 | 111 | deepspeed bunny/train/train.py \ 112 | --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 1e-5 \ 113 | --deepspeed ./script/deepspeed/zero3.json \ 114 | --model_name_or_path /path/to/meta-llama/Meta-Llama-3-8B-Instruct \ 115 | --model_type $MODEL_TYPE \ 116 | --version llama \ 117 | --data_path ./data/finetune/bunny_llava_1.4m.json \ 118 | --image_folder ./data/finetune/images \ 119 | --vision_tower /path/to/siglip-so400m-patch14-384 \ 120 | --unfreeze_vision_tower True \ 121 | --pretrain_mm_mlp_adapter ./checkpoints-pretrain/$PRETRAIN_DIR/mm_projector.bin \ 122 | --mm_projector_type mlp2x_gelu \ 123 | --image_aspect_ratio pad \ 124 | --group_by_modality_length False \ 125 | --bf16 True \ 126 | --output_dir ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR \ 127 | --num_train_epochs 1 \ 128 | --per_device_train_batch_size 8 \ 129 | --per_device_eval_batch_size 4 \ 130 | --gradient_accumulation_steps 2 \ 131 | --evaluation_strategy "no" \ 132 | --save_strategy "steps" \ 133 | --save_steps 500 \ 134 | --save_total_limit 1 \ 135 | --learning_rate 1e-4 \ 136 | --weight_decay 0. \ 137 | --warmup_ratio 0.03 \ 138 | --lr_scheduler_type "cosine" \ 139 | --logging_steps 1 \ 140 | --tf32 True \ 141 | --model_max_length 2048 \ 142 | --gradient_checkpointing True \ 143 | --dataloader_num_workers 4 \ 144 | --lazy_preprocess True \ 145 | --report_to none | tee 2>&1 ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR/log.txt 146 | ``` 147 | 148 | ### Weight Merging 149 | 150 | * Firstly, merge the LoRA weights into the base LLM 151 | 152 | ```shell 153 | python script/merge_lora_weights.py \ 154 | --model-path ./checkpoints-llama3-8b/bunny-lora-llama3-8b-recipe-1 \ 155 | --model-base /path/to/meta-llama/Meta-Llama-3-8B-Instruct \ 156 | --model-type llama3-8b \ 157 | --save-model-path ./checkpoints-llama3-8b/bunny-llama3-8b-recipe-1 158 | ``` 159 | 160 | ```shell 161 | python script/merge_lora_weights.py \ 162 | --model-path ./checkpoints-llama3-8b/bunny-lora-llama3-8b-recipe-2 \ 163 | --model-base /path/to/meta-llama/Meta-Llama-3-8B-Instruct \ 164 | --model-type llama3-8b \ 165 | --save-model-path ./checkpoints-llama3-8b/bunny-llama3-8b-recipe-2 166 | ``` 167 | 168 | * Then, inherit the configuration files from recipe-2 by copying its merged checkpoint 169 | 170 | ```shell 171 | cp -r ./checkpoints-llama3-8b/bunny-llama3-8b-recipe-2 ./checkpoints-llama3-8b/bunny-llama3-8b-avg 172 | ``` 173 | 174 | * Lastly, linearly average the two merged models 175 | 176 | ```python 177 | from safetensors.torch import load_file, save_file 178 | 179 | total = 4 180 | for i in range(1, total + 1): 181 | model_1 = load_file(f'./checkpoints-llama3-8b/bunny-llama3-8b-recipe-1/model-{i:05d}-of-{total:05d}.safetensors') 182 | model_2 = load_file(f'./checkpoints-llama3-8b/bunny-llama3-8b-recipe-2/model-{i:05d}-of-{total:05d}.safetensors') 183 | 184 | assert model_1.keys() == model_2.keys() 185 | 186 | avg = {} 187 | for k in model_1.keys(): 188 | avg[k] = model_1[k] * 0.5 + model_2[k] * 0.5 # the 
weight factor is selected empirically 189 | 190 | save_file(avg, f'./checkpoints-llama3-8b/bunny-llama3-8b-avg/model-{i:05d}-of-{total:05d}.safetensors', {'format': 'pt'}) 191 | ``` 192 | 193 | 194 | -------------------------------------------------------------------------------- /script/train/tutorials/Bunny-v1.0-4B.md: -------------------------------------------------------------------------------- 1 | # Training Tutorial of Bunny-v1.0-4B 2 | 3 | ![Bunny-v1.0-4B](assets/Bunny-v1.0-4B.png) 4 | 5 | ## Pretrain 6 | 7 | ```shell 8 | #!/bin/bash 9 | 10 | MODEL_TYPE=phi-3 11 | OUTPUT_DIR=bunny-$MODEL_TYPE-pretrain 12 | 13 | mkdir -p ./checkpoints-pretrain/$OUTPUT_DIR 14 | 15 | deepspeed bunny/train/train.py \ 16 | --deepspeed ./script/deepspeed/zero2.json \ 17 | --model_name_or_path /path/to/microsoft/Phi-3-mini-4k-instruct \ 18 | --model_type $MODEL_TYPE \ 19 | --version plain \ 20 | --data_path ./data/pretrain/bunny_pretrain_laion_2m.json \ 21 | --image_folder ./data/pretrain/images \ 22 | --vision_tower /path/to/siglip-so400m-patch14-384 \ 23 | --mm_projector_type mlp2x_gelu \ 24 | --tune_mm_mlp_adapter True \ 25 | --image_aspect_ratio square \ 26 | --bf16 True \ 27 | --output_dir ./checkpoints-pretrain/$OUTPUT_DIR \ 28 | --num_train_epochs 1 \ 29 | --per_device_train_batch_size 8 \ 30 | --per_device_eval_batch_size 4 \ 31 | --gradient_accumulation_steps 4 \ 32 | --evaluation_strategy "no" \ 33 | --save_strategy "steps" \ 34 | --save_steps 24000 \ 35 | --save_total_limit 1 \ 36 | --learning_rate 1e-3 \ 37 | --weight_decay 0. \ 38 | --warmup_ratio 0.03 \ 39 | --lr_scheduler_type "cosine" \ 40 | --logging_steps 1 \ 41 | --tf32 True \ 42 | --model_max_length 2048 \ 43 | --gradient_checkpointing True \ 44 | --dataloader_num_workers 4 \ 45 | --lazy_preprocess True \ 46 | --report_to none | tee 2>&1 ./checkpoints-pretrain/$OUTPUT_DIR/log.txt 47 | ``` 48 | 49 | ## Visual Instruction Tuning 50 | 51 | ### Recipe-1 52 | 53 | ```shell 54 | #!/bin/bash 55 | 56 | MODEL_TYPE=phi-3 57 | 58 | PRETRAIN_DIR=bunny-$MODEL_TYPE-pretrain 59 | OUTPUT_DIR=bunny-lora-$MODEL_TYPE-recipe-1 60 | 61 | mkdir -p ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR 62 | 63 | deepspeed bunny/train/train.py \ 64 | --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 2e-5 \ 65 | --deepspeed ./script/deepspeed/zero3.json \ 66 | --model_name_or_path /path/to/microsoft/Phi-3-mini-4k-instruct \ 67 | --model_type $MODEL_TYPE \ 68 | --version phi3 \ 69 | --data_path ./data/finetune/bunny_695k.json \ 70 | --image_folder ./data/finetune/images \ 71 | --vision_tower /path/to/siglip-so400m-patch14-384 \ 72 | --pretrain_mm_mlp_adapter ./checkpoints-pretrain/$PRETRAIN_DIR/mm_projector.bin \ 73 | --mm_projector_type mlp2x_gelu \ 74 | --image_aspect_ratio pad \ 75 | --group_by_modality_length False \ 76 | --bf16 True \ 77 | --output_dir ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR \ 78 | --num_train_epochs 1 \ 79 | --per_device_train_batch_size 4 \ 80 | --per_device_eval_batch_size 4 \ 81 | --gradient_accumulation_steps 4 \ 82 | --evaluation_strategy "no" \ 83 | --save_strategy "steps" \ 84 | --save_steps 500 \ 85 | --save_total_limit 1 \ 86 | --learning_rate 2e-4 \ 87 | --weight_decay 0. 
\ 88 | --warmup_ratio 0.03 \ 89 | --lr_scheduler_type "cosine" \ 90 | --logging_steps 1 \ 91 | --tf32 True \ 92 | --model_max_length 4096 \ 93 | --gradient_checkpointing True \ 94 | --dataloader_num_workers 4 \ 95 | --lazy_preprocess True \ 96 | --report_to none | tee 2>&1 ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR/log.txt 97 | ``` 98 | 99 | ### Recipe-2 100 | 101 | ```shell 102 | #!/bin/bash 103 | 104 | MODEL_TYPE=phi-3 105 | 106 | PRETRAIN_DIR=bunny-$MODEL_TYPE-pretrain 107 | OUTPUT_DIR=bunny-lora-$MODEL_TYPE-recipe-2 108 | 109 | mkdir -p ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR 110 | 111 | deepspeed bunny/train/train.py \ 112 | --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 2e-5 \ 113 | --deepspeed ./script/deepspeed/zero3.json \ 114 | --model_name_or_path /path/to/microsoft/Phi-3-mini-4k-instruct \ 115 | --model_type $MODEL_TYPE \ 116 | --version phi3 \ 117 | --data_path ./data/finetune/bunny_llava_1.4m.json \ 118 | --image_folder ./data/finetune/images \ 119 | --vision_tower /path/to/siglip-so400m-patch14-384 \ 120 | --unfreeze_vision_tower True \ 121 | --pretrain_mm_mlp_adapter ./checkpoints-pretrain/$PRETRAIN_DIR/mm_projector.bin \ 122 | --mm_projector_type mlp2x_gelu \ 123 | --image_aspect_ratio pad \ 124 | --group_by_modality_length False \ 125 | --bf16 True \ 126 | --output_dir ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR \ 127 | --num_train_epochs 1 \ 128 | --per_device_train_batch_size 4 \ 129 | --per_device_eval_batch_size 4 \ 130 | --gradient_accumulation_steps 4 \ 131 | --evaluation_strategy "no" \ 132 | --save_strategy "steps" \ 133 | --save_steps 500 \ 134 | --save_total_limit 1 \ 135 | --learning_rate 2e-4 \ 136 | --weight_decay 0. \ 137 | --warmup_ratio 0.03 \ 138 | --lr_scheduler_type "cosine" \ 139 | --logging_steps 1 \ 140 | --tf32 True \ 141 | --model_max_length 4096 \ 142 | --gradient_checkpointing True \ 143 | --dataloader_num_workers 4 \ 144 | --lazy_preprocess True \ 145 | --report_to none | tee 2>&1 ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR/log.txt 146 | ``` 147 | 148 | ### Weight Merging 149 | 150 | * Firstly, merge the LoRA weights into the base LLM 151 | 152 | ```shell 153 | python script/merge_lora_weights.py \ 154 | --model-path ./checkpoints-phi-3/bunny-lora-phi-3-recipe-1 \ 155 | --model-base /path/to/microsoft/Phi-3-mini-4k-instruct \ 156 | --model-type phi-3 \ 157 | --save-model-path ./checkpoints-phi-3/bunny-phi-3-recipe-1 158 | ``` 159 | 160 | ```shell 161 | python script/merge_lora_weights.py \ 162 | --model-path ./checkpoints-phi-3/bunny-lora-phi-3-recipe-2 \ 163 | --model-base /path/to/microsoft/Phi-3-mini-4k-instruct \ 164 | --model-type phi-3 \ 165 | --save-model-path ./checkpoints-phi-3/bunny-phi-3-recipe-2 166 | ``` 167 | 168 | * Then, inherit the configuration files from recipe-2 by copying its merged checkpoint 169 | 170 | ```shell 171 | cp -r ./checkpoints-phi-3/bunny-phi-3-recipe-2 ./checkpoints-phi-3/bunny-phi-3-avg 172 | ``` 173 | 174 | * Lastly, linearly average the two merged models 175 | 176 | ```python 177 | from safetensors.torch import load_file, save_file 178 | 179 | total = 2 180 | for i in range(1, total + 1): 181 | model_1 = load_file(f'./checkpoints-phi-3/bunny-phi-3-recipe-1/model-{i:05d}-of-{total:05d}.safetensors') 182 | model_2 = load_file(f'./checkpoints-phi-3/bunny-phi-3-recipe-2/model-{i:05d}-of-{total:05d}.safetensors') 183 | 184 | assert model_1.keys() == model_2.keys() 185 | 186 | avg = {} 187 | for k in model_1.keys(): 188 | avg[k] = model_1[k] * 0.3 + model_2[k] * 0.7 # the weight factor is selected empirically 189 | 190 | save_file(avg, 
f'./checkpoints-phi-3/bunny-phi-3-avg/model-{i:05d}-of-{total:05d}.safetensors', {'format': 'pt'}) 191 | ``` 192 | 193 | 194 | -------------------------------------------------------------------------------- /script/train/tutorials/Bunny-v1.1-4B.md: -------------------------------------------------------------------------------- 1 | # Training Tutorial of Bunny-v1.1-4B 2 | 3 | ![Bunny-v1.1-4B](assets/Bunny-v1.1-4B.png) 4 | 5 | ## Pretrain 6 | 7 | ```shell 8 | #!/bin/bash 9 | 10 | MODEL_TYPE=phi-3 11 | OUTPUT_DIR=bunny-$MODEL_TYPE-pretrain 12 | 13 | mkdir -p ./checkpoints-pretrain/$OUTPUT_DIR 14 | 15 | deepspeed bunny/train/train.py \ 16 | --deepspeed ./script/deepspeed/zero2.json \ 17 | --model_name_or_path /path/to/microsoft/Phi-3-mini-4k-instruct \ 18 | --model_type $MODEL_TYPE \ 19 | --version plain \ 20 | --data_path ./data/pretrain/bunny_pretrain_laion_2m.json \ 21 | --image_folder ./data/pretrain/images \ 22 | --vision_tower /path/to/siglip-so400m-patch14-384 \ 23 | --use_s2 True \ 24 | --mm_projector_type mlp2x_gelu \ 25 | --tune_mm_mlp_adapter True \ 26 | --image_aspect_ratio square \ 27 | --bf16 True \ 28 | --output_dir ./checkpoints-pretrain/$OUTPUT_DIR \ 29 | --num_train_epochs 1 \ 30 | --per_device_train_batch_size 8 \ 31 | --per_device_eval_batch_size 4 \ 32 | --gradient_accumulation_steps 4 \ 33 | --evaluation_strategy "no" \ 34 | --save_strategy "steps" \ 35 | --save_steps 24000 \ 36 | --save_total_limit 1 \ 37 | --learning_rate 1e-3 \ 38 | --weight_decay 0. \ 39 | --warmup_ratio 0.03 \ 40 | --lr_scheduler_type "cosine" \ 41 | --logging_steps 1 \ 42 | --tf32 True \ 43 | --model_max_length 2048 \ 44 | --gradient_checkpointing True \ 45 | --dataloader_num_workers 4 \ 46 | --lazy_preprocess True \ 47 | --report_to none | tee 2>&1 ./checkpoints-pretrain/$OUTPUT_DIR/log.txt 48 | ``` 49 | 50 | ## Visual Instruction Tuning 51 | 52 | ### Recipe-1 53 | 54 | ```shell 55 | #!/bin/bash 56 | 57 | MODEL_TYPE=phi-3 58 | 59 | PRETRAIN_DIR=bunny-$MODEL_TYPE-pretrain 60 | OUTPUT_DIR=bunny-lora-$MODEL_TYPE-recipe-1 61 | 62 | mkdir -p ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR 63 | 64 | deepspeed bunny/train/train.py \ 65 | --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 2e-5 \ 66 | --deepspeed ./script/deepspeed/zero3.json \ 67 | --model_name_or_path /path/to/microsoft/Phi-3-mini-4k-instruct \ 68 | --model_type $MODEL_TYPE \ 69 | --version phi3 \ 70 | --data_path ./data/finetune/bunny_allava_1.3m.json \ 71 | --image_folder ./data/finetune/images \ 72 | --vision_tower /path/to/siglip-so400m-patch14-384 \ 73 | --use_s2 True \ 74 | --pretrain_mm_mlp_adapter ./checkpoints-pretrain/$PRETRAIN_DIR/mm_projector.bin \ 75 | --mm_projector_type mlp2x_gelu \ 76 | --image_aspect_ratio pad \ 77 | --group_by_modality_length False \ 78 | --bf16 True \ 79 | --output_dir ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR \ 80 | --num_train_epochs 1 \ 81 | --per_device_train_batch_size 4 \ 82 | --per_device_eval_batch_size 4 \ 83 | --gradient_accumulation_steps 4 \ 84 | --evaluation_strategy "no" \ 85 | --save_strategy "steps" \ 86 | --save_steps 500 \ 87 | --save_total_limit 1 \ 88 | --learning_rate 2e-4 \ 89 | --weight_decay 0. 
\ 90 | --warmup_ratio 0.03 \ 91 | --lr_scheduler_type "cosine" \ 92 | --logging_steps 1 \ 93 | --tf32 True \ 94 | --model_max_length 4096 \ 95 | --gradient_checkpointing True \ 96 | --dataloader_num_workers 4 \ 97 | --lazy_preprocess True \ 98 | --report_to none | tee 2>&1 ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR/log.txt 99 | ``` 100 | 101 | ### Recipe-2 102 | 103 | ```shell 104 | #!/bin/bash 105 | 106 | MODEL_TYPE=phi-3 107 | 108 | PRETRAIN_DIR=bunny-$MODEL_TYPE-pretrain 109 | OUTPUT_DIR=bunny-lora-$MODEL_TYPE-recipe-2 110 | 111 | mkdir -p ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR 112 | 113 | deepspeed bunny/train/train.py \ 114 | --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 2e-5 \ 115 | --deepspeed ./script/deepspeed/zero3.json \ 116 | --model_name_or_path /path/to/microsoft/Phi-3-mini-4k-instruct \ 117 | --model_type $MODEL_TYPE \ 118 | --version phi3 \ 119 | --data_path ./data/finetune/bunny_llava_allava_2m.json \ 120 | --image_folder ./data/finetune/images \ 121 | --vision_tower /path/to/siglip-so400m-patch14-384 \ 122 | --use_s2 True \ 123 | --unfreeze_vision_tower True \ 124 | --pretrain_mm_mlp_adapter ./checkpoints-pretrain/$PRETRAIN_DIR/mm_projector.bin \ 125 | --mm_projector_type mlp2x_gelu \ 126 | --image_aspect_ratio pad \ 127 | --group_by_modality_length False \ 128 | --bf16 True \ 129 | --output_dir ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR \ 130 | --num_train_epochs 1 \ 131 | --per_device_train_batch_size 4 \ 132 | --per_device_eval_batch_size 4 \ 133 | --gradient_accumulation_steps 4 \ 134 | --evaluation_strategy "no" \ 135 | --save_strategy "steps" \ 136 | --save_steps 500 \ 137 | --save_total_limit 1 \ 138 | --learning_rate 2e-4 \ 139 | --weight_decay 0. \ 140 | --warmup_ratio 0.03 \ 141 | --lr_scheduler_type "cosine" \ 142 | --logging_steps 1 \ 143 | --tf32 True \ 144 | --model_max_length 4096 \ 145 | --gradient_checkpointing True \ 146 | --dataloader_num_workers 4 \ 147 | --lazy_preprocess True \ 148 | --report_to none | tee 2>&1 ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR/log.txt 149 | ``` 150 | 151 | ### Weight Merging 152 | 153 | * Firstly, merge the LoRA weights into the base LLM 154 | 155 | ```shell 156 | python script/merge_lora_weights.py \ 157 | --model-path ./checkpoints-phi-3/bunny-lora-phi-3-recipe-1 \ 158 | --model-base /path/to/microsoft/Phi-3-mini-4k-instruct \ 159 | --model-type phi-3 \ 160 | --save-model-path ./checkpoints-phi-3/bunny-phi-3-recipe-1 161 | ``` 162 | 163 | ```shell 164 | python script/merge_lora_weights.py \ 165 | --model-path ./checkpoints-phi-3/bunny-lora-phi-3-recipe-2 \ 166 | --model-base /path/to/microsoft/Phi-3-mini-4k-instruct \ 167 | --model-type phi-3 \ 168 | --save-model-path ./checkpoints-phi-3/bunny-phi-3-recipe-2 169 | ``` 170 | 171 | * Then, inherit the configuration files from recipe-2 by copying its merged checkpoint 172 | 173 | ```shell 174 | cp -r ./checkpoints-phi-3/bunny-phi-3-recipe-2 ./checkpoints-phi-3/bunny-phi-3-avg 175 | ``` 176 | 177 | * Lastly, linearly average the two merged models 178 | 179 | ```python 180 | from safetensors.torch import load_file, save_file 181 | 182 | total = 2 183 | for i in range(1, total + 1): 184 | model_1 = load_file(f'./checkpoints-phi-3/bunny-phi-3-recipe-1/model-{i:05d}-of-{total:05d}.safetensors') 185 | model_2 = load_file(f'./checkpoints-phi-3/bunny-phi-3-recipe-2/model-{i:05d}-of-{total:05d}.safetensors') 186 | 187 | assert model_1.keys() == model_2.keys() 188 | 189 | avg = {} 190 | for k in model_1.keys(): 191 | avg[k] = model_1[k] * 0.3 + model_2[k] * 0.7 # the weight factor is selected empirically 192 | 193 | 
save_file(avg, f'./checkpoints-phi-3/bunny-phi-3-avg/model-{i:05d}-of-{total:05d}.safetensors', {'format': 'pt'}) 194 | ``` 195 | 196 | 197 | -------------------------------------------------------------------------------- /script/train/tutorials/assets/Bunny-Llama-3-8B-V.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BAAI-DCAI/Bunny/08273acbd83b6836c03dce25cc134eb6b9cedba5/script/train/tutorials/assets/Bunny-Llama-3-8B-V.png -------------------------------------------------------------------------------- /script/train/tutorials/assets/Bunny-v1.0-4B.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BAAI-DCAI/Bunny/08273acbd83b6836c03dce25cc134eb6b9cedba5/script/train/tutorials/assets/Bunny-v1.0-4B.png -------------------------------------------------------------------------------- /script/train/tutorials/assets/Bunny-v1.1-4B.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BAAI-DCAI/Bunny/08273acbd83b6836c03dce25cc134eb6b9cedba5/script/train/tutorials/assets/Bunny-v1.1-4B.png -------------------------------------------------------------------------------- /script/train/tutorials/assets/Bunny-v1.1-Llama-3-8B-V.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BAAI-DCAI/Bunny/08273acbd83b6836c03dce25cc134eb6b9cedba5/script/train/tutorials/assets/Bunny-v1.1-Llama-3-8B-V.png -------------------------------------------------------------------------------- /script/train/tutorials/bunny-minicpm-siglip-lora.md: -------------------------------------------------------------------------------- 1 | # Training Tutorial of bunny-minicpm-siglip-lora 2 | 3 | ## Pretrain 4 | 5 | ```shell 6 | #!/bin/bash 7 | 8 | MODEL_TYPE=minicpm 9 | OUTPUT_DIR=bunny-$MODEL_TYPE-pretrain 10 | 11 | mkdir -p ./checkpoints-pretrain/$OUTPUT_DIR 12 | 13 | deepspeed bunny/train/train.py \ 14 | --deepspeed ./script/deepspeed/zero2.json \ 15 | --model_name_or_path /path/to/openbmb/MiniCPM-2B \ 16 | --model_type $MODEL_TYPE \ 17 | --version plain \ 18 | --data_path ./data/pretrain/bunny_pretrain_laion_2m.json \ 19 | --image_folder ./data/pretrain/images \ 20 | --vision_tower /path/to/siglip-so400m-patch14-384 \ 21 | --mm_projector_type mlp2x_gelu \ 22 | --tune_mm_mlp_adapter True \ 23 | --image_aspect_ratio square \ 24 | --bf16 True \ 25 | --output_dir ./checkpoints-pretrain/$OUTPUT_DIR \ 26 | --num_train_epochs 1 \ 27 | --per_device_train_batch_size 8 \ 28 | --per_device_eval_batch_size 4 \ 29 | --gradient_accumulation_steps 4 \ 30 | --evaluation_strategy "no" \ 31 | --save_strategy "steps" \ 32 | --save_steps 24000 \ 33 | --save_total_limit 1 \ 34 | --learning_rate 5e-4 \ 35 | --weight_decay 0. 
\ 36 | --warmup_ratio 0.03 \ 37 | --lr_scheduler_type "cosine" \ 38 | --logging_steps 1 \ 39 | --tf32 True \ 40 | --model_max_length 2048 \ 41 | --gradient_checkpointing True \ 42 | --dataloader_num_workers 4 \ 43 | --lazy_preprocess True \ 44 | --report_to none | tee 2>&1 ./checkpoints-pretrain/$OUTPUT_DIR/log.txt 45 | ``` 46 | 47 | ## Visual Instruction Tuning 48 | 49 | ```shell 50 | #!/bin/bash 51 | 52 | MODEL_TYPE=minicpm 53 | 54 | PRETRAIN_DIR=bunny-$MODEL_TYPE-pretrain 55 | OUTPUT_DIR=bunny-lora-$MODEL_TYPE 56 | 57 | mkdir -p ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR 58 | 59 | deepspeed bunny/train/train.py \ 60 | --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 2e-5 \ 61 | --deepspeed ./script/deepspeed/zero3.json \ 62 | --model_name_or_path /path/to/openbmb/MiniCPM-2B \ 63 | --model_type $MODEL_TYPE \ 64 | --version minicpm \ 65 | --data_path ./data/finetune/bunny_695k.json \ 66 | --image_folder ./data/finetune/images \ 67 | --vision_tower /path/to/siglip-so400m-patch14-384 \ 68 | --pretrain_mm_mlp_adapter ./checkpoints-pretrain/$PRETRAIN_DIR/mm_projector.bin \ 69 | --mm_projector_type mlp2x_gelu \ 70 | --image_aspect_ratio pad \ 71 | --group_by_modality_length False \ 72 | --bf16 True \ 73 | --output_dir ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR \ 74 | --num_train_epochs 1 \ 75 | --per_device_train_batch_size 8 \ 76 | --per_device_eval_batch_size 4 \ 77 | --gradient_accumulation_steps 2 \ 78 | --evaluation_strategy "no" \ 79 | --save_strategy "steps" \ 80 | --save_steps 500 \ 81 | --save_total_limit 1 \ 82 | --learning_rate 2e-4 \ 83 | --weight_decay 0. \ 84 | --warmup_ratio 0.03 \ 85 | --lr_scheduler_type "cosine" \ 86 | --logging_steps 1 \ 87 | --tf32 True \ 88 | --model_max_length 2048 \ 89 | --gradient_checkpointing True \ 90 | --dataloader_num_workers 4 \ 91 | --lazy_preprocess True \ 92 | --report_to none | tee 2>&1 ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR/log.txt 93 | ``` 94 | 95 | -------------------------------------------------------------------------------- /script/train/tutorials/bunny-phi-1.5-eva-lora.md: -------------------------------------------------------------------------------- 1 | # Training Tutorial of bunny-phi-1.5-eva-lora 2 | 3 | ## Pretrain 4 | 5 | ```shell 6 | #!/bin/bash 7 | 8 | MODEL_TYPE=phi-1.5 9 | OUTPUT_DIR=bunny-$MODEL_TYPE-pretrain 10 | 11 | mkdir -p ./checkpoints-pretrain/$OUTPUT_DIR 12 | 13 | deepspeed bunny/train/train.py \ 14 | --deepspeed ./script/deepspeed/zero2.json \ 15 | --model_name_or_path /path/to/microsoft/phi-1_5 \ 16 | --model_type $MODEL_TYPE \ 17 | --version plain \ 18 | --data_path ./data/pretrain/bunny_pretrain_laion_2m.json \ 19 | --image_folder ./data/pretrain/images \ 20 | --vision_tower /path/to/EVA02_CLIP_L_336_psz14_s6B \ 21 | --mm_projector_type mlp2x_gelu \ 22 | --tune_mm_mlp_adapter True \ 23 | --image_aspect_ratio square \ 24 | --bf16 True \ 25 | --output_dir ./checkpoints-pretrain/$OUTPUT_DIR \ 26 | --num_train_epochs 1 \ 27 | --per_device_train_batch_size 8 \ 28 | --per_device_eval_batch_size 4 \ 29 | --gradient_accumulation_steps 4 \ 30 | --evaluation_strategy "no" \ 31 | --save_strategy "steps" \ 32 | --save_steps 24000 \ 33 | --save_total_limit 1 \ 34 | --learning_rate 1e-3 \ 35 | --weight_decay 0. 
\ 36 | --warmup_ratio 0.03 \ 37 | --lr_scheduler_type "cosine" \ 38 | --logging_steps 1 \ 39 | --tf32 True \ 40 | --model_max_length 2048 \ 41 | --gradient_checkpointing True \ 42 | --dataloader_num_workers 4 \ 43 | --lazy_preprocess True \ 44 | --report_to none | tee 2>&1 ./checkpoints-pretrain/$OUTPUT_DIR/log.txt 45 | ``` 46 | 47 | ## Visual Instruction Tuning 48 | 49 | ```shell 50 | #!/bin/bash 51 | 52 | MODEL_TYPE=phi-1.5 53 | 54 | PRETRAIN_DIR=bunny-$MODEL_TYPE-pretrain 55 | OUTPUT_DIR=bunny-lora-$MODEL_TYPE 56 | 57 | mkdir -p ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR 58 | 59 | deepspeed bunny/train/train.py \ 60 | --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 2e-5 \ 61 | --deepspeed ./script/deepspeed/zero3.json \ 62 | --model_name_or_path /path/to/microsoft/phi-1_5 \ 63 | --model_type $MODEL_TYPE \ 64 | --version bunny \ 65 | --data_path ./data/finetune/bunny_695k.json \ 66 | --image_folder ./data/finetune/images \ 67 | --vision_tower /path/to/EVA02_CLIP_L_336_psz14_s6B \ 68 | --pretrain_mm_mlp_adapter ./checkpoints-pretrain/$PRETRAIN_DIR/mm_projector.bin \ 69 | --mm_projector_type mlp2x_gelu \ 70 | --image_aspect_ratio pad \ 71 | --group_by_modality_length False \ 72 | --bf16 True \ 73 | --output_dir ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR \ 74 | --num_train_epochs 1 \ 75 | --per_device_train_batch_size 8 \ 76 | --per_device_eval_batch_size 4 \ 77 | --gradient_accumulation_steps 2 \ 78 | --evaluation_strategy "no" \ 79 | --save_strategy "steps" \ 80 | --save_steps 500 \ 81 | --save_total_limit 1 \ 82 | --learning_rate 2e-4 \ 83 | --weight_decay 0. \ 84 | --warmup_ratio 0.03 \ 85 | --lr_scheduler_type "cosine" \ 86 | --logging_steps 1 \ 87 | --tf32 True \ 88 | --model_max_length 2048 \ 89 | --gradient_checkpointing True \ 90 | --dataloader_num_workers 4 \ 91 | --lazy_preprocess True \ 92 | --report_to none | tee 2>&1 ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR/log.txt 93 | ``` 94 | 95 | -------------------------------------------------------------------------------- /script/train/tutorials/bunny-phi-1.5-siglip-lora.md: -------------------------------------------------------------------------------- 1 | # Training Tutorial of bunny-phi-1.5-siglip-lora 2 | 3 | ## Pretrain 4 | 5 | ```shell 6 | #!/bin/bash 7 | 8 | MODEL_TYPE=phi-1.5 9 | OUTPUT_DIR=bunny-$MODEL_TYPE-pretrain 10 | 11 | mkdir -p ./checkpoints-pretrain/$OUTPUT_DIR 12 | 13 | deepspeed bunny/train/train.py \ 14 | --deepspeed ./script/deepspeed/zero2.json \ 15 | --model_name_or_path /path/to/microsoft/phi-1_5 \ 16 | --model_type $MODEL_TYPE \ 17 | --version plain \ 18 | --data_path ./data/pretrain/bunny_pretrain_laion_2m.json \ 19 | --image_folder ./data/pretrain/images \ 20 | --vision_tower /path/to/siglip-so400m-patch14-384 \ 21 | --mm_projector_type mlp2x_gelu \ 22 | --tune_mm_mlp_adapter True \ 23 | --image_aspect_ratio square \ 24 | --bf16 True \ 25 | --output_dir ./checkpoints-pretrain/$OUTPUT_DIR \ 26 | --num_train_epochs 1 \ 27 | --per_device_train_batch_size 8 \ 28 | --per_device_eval_batch_size 4 \ 29 | --gradient_accumulation_steps 4 \ 30 | --evaluation_strategy "no" \ 31 | --save_strategy "steps" \ 32 | --save_steps 24000 \ 33 | --save_total_limit 1 \ 34 | --learning_rate 5e-4 \ 35 | --weight_decay 0. 
\ 36 | --warmup_ratio 0.03 \ 37 | --lr_scheduler_type "cosine" \ 38 | --logging_steps 1 \ 39 | --tf32 True \ 40 | --model_max_length 2048 \ 41 | --gradient_checkpointing True \ 42 | --dataloader_num_workers 4 \ 43 | --lazy_preprocess True \ 44 | --report_to none | tee 2>&1 ./checkpoints-pretrain/$OUTPUT_DIR/log.txt 45 | ``` 46 | 47 | ## Visual Instruction Tuning 48 | 49 | ```shell 50 | #!/bin/bash 51 | 52 | MODEL_TYPE=phi-1.5 53 | 54 | PRETRAIN_DIR=bunny-$MODEL_TYPE-pretrain 55 | OUTPUT_DIR=bunny-lora-$MODEL_TYPE 56 | 57 | mkdir -p ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR 58 | 59 | deepspeed bunny/train/train.py \ 60 | --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 2e-5 \ 61 | --deepspeed ./script/deepspeed/zero3.json \ 62 | --model_name_or_path /path/to/microsoft/phi-1_5 \ 63 | --model_type $MODEL_TYPE \ 64 | --version bunny \ 65 | --data_path ./data/finetune/bunny_695k.json \ 66 | --image_folder ./data/finetune/images \ 67 | --vision_tower /path/to/siglip-so400m-patch14-384 \ 68 | --pretrain_mm_mlp_adapter ./checkpoints-pretrain/$PRETRAIN_DIR/mm_projector.bin \ 69 | --mm_projector_type mlp2x_gelu \ 70 | --image_aspect_ratio pad \ 71 | --group_by_modality_length False \ 72 | --bf16 True \ 73 | --output_dir ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR \ 74 | --num_train_epochs 1 \ 75 | --per_device_train_batch_size 8 \ 76 | --per_device_eval_batch_size 4 \ 77 | --gradient_accumulation_steps 2 \ 78 | --evaluation_strategy "no" \ 79 | --save_strategy "steps" \ 80 | --save_steps 500 \ 81 | --save_total_limit 1 \ 82 | --learning_rate 2e-4 \ 83 | --weight_decay 0. \ 84 | --warmup_ratio 0.03 \ 85 | --lr_scheduler_type "cosine" \ 86 | --logging_steps 1 \ 87 | --tf32 True \ 88 | --model_max_length 2048 \ 89 | --gradient_checkpointing True \ 90 | --dataloader_num_workers 4 \ 91 | --lazy_preprocess True \ 92 | --report_to none | tee 2>&1 ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR/log.txt 93 | ``` 94 | 95 | -------------------------------------------------------------------------------- /script/train/tutorials/bunny-phi-2-eva-lora.md: -------------------------------------------------------------------------------- 1 | # Training Tutorial of bunny-phi-2-eva-lora 2 | 3 | ## Pretrain 4 | 5 | ```shell 6 | #!/bin/bash 7 | 8 | MODEL_TYPE=phi-2 9 | OUTPUT_DIR=bunny-$MODEL_TYPE-pretrain 10 | 11 | mkdir -p ./checkpoints-pretrain/$OUTPUT_DIR 12 | 13 | deepspeed bunny/train/train.py \ 14 | --deepspeed ./script/deepspeed/zero2.json \ 15 | --model_name_or_path /path/to/microsoft/phi-2 \ 16 | --model_type $MODEL_TYPE \ 17 | --version plain \ 18 | --data_path ./data/pretrain/bunny_pretrain_laion_2m.json \ 19 | --image_folder ./data/pretrain/images \ 20 | --vision_tower /path/to/EVA02_CLIP_L_336_psz14_s6B \ 21 | --mm_projector_type mlp2x_gelu \ 22 | --tune_mm_mlp_adapter True \ 23 | --image_aspect_ratio square \ 24 | --bf16 True \ 25 | --output_dir ./checkpoints-pretrain/$OUTPUT_DIR \ 26 | --num_train_epochs 1 \ 27 | --per_device_train_batch_size 8 \ 28 | --per_device_eval_batch_size 4 \ 29 | --gradient_accumulation_steps 4 \ 30 | --evaluation_strategy "no" \ 31 | --save_strategy "steps" \ 32 | --save_steps 24000 \ 33 | --save_total_limit 1 \ 34 | --learning_rate 5e-5 \ 35 | --weight_decay 0. 
\ 36 | --warmup_ratio 0.03 \ 37 | --lr_scheduler_type "cosine" \ 38 | --logging_steps 1 \ 39 | --tf32 True \ 40 | --model_max_length 2048 \ 41 | --gradient_checkpointing True \ 42 | --dataloader_num_workers 4 \ 43 | --lazy_preprocess True \ 44 | --report_to none | tee 2>&1 ./checkpoints-pretrain/$OUTPUT_DIR/log.txt 45 | ``` 46 | 47 | ## Visual Instruction Tuning 48 | 49 | ```shell 50 | #!/bin/bash 51 | 52 | MODEL_TYPE=phi-2 53 | 54 | PRETRAIN_DIR=bunny-$MODEL_TYPE-pretrain 55 | OUTPUT_DIR=bunny-lora-$MODEL_TYPE 56 | 57 | mkdir -p ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR 58 | 59 | deepspeed bunny/train/train.py \ 60 | --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 2e-5 \ 61 | --deepspeed ./script/deepspeed/zero3.json \ 62 | --model_name_or_path /path/to/microsoft/phi-2 \ 63 | --model_type $MODEL_TYPE \ 64 | --version bunny \ 65 | --data_path ./data/finetune/bunny_695k.json \ 66 | --image_folder ./data/finetune/images \ 67 | --vision_tower /path/to/EVA02_CLIP_L_336_psz14_s6B \ 68 | --pretrain_mm_mlp_adapter ./checkpoints-pretrain/$PRETRAIN_DIR/mm_projector.bin \ 69 | --mm_projector_type mlp2x_gelu \ 70 | --image_aspect_ratio pad \ 71 | --group_by_modality_length False \ 72 | --bf16 True \ 73 | --output_dir ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR \ 74 | --num_train_epochs 1 \ 75 | --per_device_train_batch_size 8 \ 76 | --per_device_eval_batch_size 4 \ 77 | --gradient_accumulation_steps 2 \ 78 | --evaluation_strategy "no" \ 79 | --save_strategy "steps" \ 80 | --save_steps 500 \ 81 | --save_total_limit 1 \ 82 | --learning_rate 2e-4 \ 83 | --weight_decay 0. \ 84 | --warmup_ratio 0.03 \ 85 | --lr_scheduler_type "cosine" \ 86 | --logging_steps 1 \ 87 | --tf32 True \ 88 | --model_max_length 2048 \ 89 | --gradient_checkpointing True \ 90 | --dataloader_num_workers 4 \ 91 | --lazy_preprocess True \ 92 | --report_to none | tee 2>&1 ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR/log.txt 93 | ``` 94 | 95 | -------------------------------------------------------------------------------- /script/train/tutorials/bunny-phi-2-siglip-lora.md: -------------------------------------------------------------------------------- 1 | # Training Tutorial of bunny-phi-2-siglip-lora 2 | 3 | ## Pretrain 4 | 5 | ```shell 6 | #!/bin/bash 7 | 8 | MODEL_TYPE=phi-2 9 | OUTPUT_DIR=bunny-$MODEL_TYPE-pretrain 10 | 11 | mkdir -p ./checkpoints-pretrain/$OUTPUT_DIR 12 | 13 | deepspeed bunny/train/train.py \ 14 | --deepspeed ./script/deepspeed/zero2.json \ 15 | --model_name_or_path /path/to/microsoft/phi-2 \ 16 | --model_type $MODEL_TYPE \ 17 | --version plain \ 18 | --data_path ./data/pretrain/bunny_pretrain_laion_2m.json \ 19 | --image_folder ./data/pretrain/images \ 20 | --vision_tower /path/to/siglip-so400m-patch14-384 \ 21 | --mm_projector_type mlp2x_gelu \ 22 | --tune_mm_mlp_adapter True \ 23 | --image_aspect_ratio square \ 24 | --bf16 True \ 25 | --output_dir ./checkpoints-pretrain/$OUTPUT_DIR \ 26 | --num_train_epochs 1 \ 27 | --per_device_train_batch_size 8 \ 28 | --per_device_eval_batch_size 4 \ 29 | --gradient_accumulation_steps 4 \ 30 | --evaluation_strategy "no" \ 31 | --save_strategy "steps" \ 32 | --save_steps 24000 \ 33 | --save_total_limit 1 \ 34 | --learning_rate 5e-4 \ 35 | --weight_decay 0. 
\ 36 | --warmup_ratio 0.03 \ 37 | --lr_scheduler_type "cosine" \ 38 | --logging_steps 1 \ 39 | --tf32 True \ 40 | --model_max_length 2048 \ 41 | --gradient_checkpointing True \ 42 | --dataloader_num_workers 4 \ 43 | --lazy_preprocess True \ 44 | --report_to none | tee 2>&1 ./checkpoints-pretrain/$OUTPUT_DIR/log.txt 45 | ``` 46 | 47 | ## Visual Instruction Tuning 48 | 49 | ```shell 50 | #!/bin/bash 51 | 52 | MODEL_TYPE=phi-2 53 | 54 | PRETRAIN_DIR=bunny-$MODEL_TYPE-pretrain 55 | OUTPUT_DIR=bunny-lora-$MODEL_TYPE 56 | 57 | mkdir -p ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR 58 | 59 | deepspeed bunny/train/train.py \ 60 | --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 2e-5 \ 61 | --deepspeed ./script/deepspeed/zero3.json \ 62 | --model_name_or_path /path/to/microsoft/phi-2 \ 63 | --model_type $MODEL_TYPE \ 64 | --version bunny \ 65 | --data_path ./data/finetune/bunny_695k.json \ 66 | --image_folder ./data/finetune/images \ 67 | --vision_tower /path/to/siglip-so400m-patch14-384 \ 68 | --pretrain_mm_mlp_adapter ./checkpoints-pretrain/$PRETRAIN_DIR/mm_projector.bin \ 69 | --mm_projector_type mlp2x_gelu \ 70 | --image_aspect_ratio pad \ 71 | --group_by_modality_length False \ 72 | --bf16 True \ 73 | --output_dir ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR \ 74 | --num_train_epochs 1 \ 75 | --per_device_train_batch_size 8 \ 76 | --per_device_eval_batch_size 4 \ 77 | --gradient_accumulation_steps 2 \ 78 | --evaluation_strategy "no" \ 79 | --save_strategy "steps" \ 80 | --save_steps 500 \ 81 | --save_total_limit 1 \ 82 | --learning_rate 2e-4 \ 83 | --weight_decay 0. \ 84 | --warmup_ratio 0.03 \ 85 | --lr_scheduler_type "cosine" \ 86 | --logging_steps 1 \ 87 | --tf32 True \ 88 | --model_max_length 2048 \ 89 | --gradient_checkpointing True \ 90 | --dataloader_num_workers 4 \ 91 | --lazy_preprocess True \ 92 | --report_to none | tee 2>&1 ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR/log.txt 93 | ``` 94 | 95 | -------------------------------------------------------------------------------- /script/train/tutorials/bunny-qwen1.5-1.8b-siglip-lora.md: -------------------------------------------------------------------------------- 1 | # Training Tutorial of bunny-qwen1.5-1.8b-siglip-lora 2 | 3 | ## Pretrain 4 | 5 | ```shell 6 | #!/bin/bash 7 | 8 | MODEL_TYPE=qwen1.5-1.8b 9 | OUTPUT_DIR=bunny-$MODEL_TYPE-pretrain 10 | 11 | mkdir -p ./checkpoints-pretrain/$OUTPUT_DIR 12 | 13 | deepspeed bunny/train/train.py \ 14 | --deepspeed ./script/deepspeed/zero2.json \ 15 | --model_name_or_path /path/to/Qwen/Qwen1.5-1.8B \ 16 | --model_type $MODEL_TYPE \ 17 | --version plain \ 18 | --data_path ./data/pretrain/bunny_pretrain_laion_2m.json \ 19 | --image_folder ./data/pretrain/images \ 20 | --vision_tower /path/to/siglip-so400m-patch14-384 \ 21 | --mm_projector_type mlp2x_gelu \ 22 | --tune_mm_mlp_adapter True \ 23 | --image_aspect_ratio square \ 24 | --bf16 True \ 25 | --output_dir ./checkpoints-pretrain/$OUTPUT_DIR \ 26 | --num_train_epochs 1 \ 27 | --per_device_train_batch_size 8 \ 28 | --per_device_eval_batch_size 4 \ 29 | --gradient_accumulation_steps 4 \ 30 | --evaluation_strategy "no" \ 31 | --save_strategy "steps" \ 32 | --save_steps 24000 \ 33 | --save_total_limit 1 \ 34 | --learning_rate 5e-4 \ 35 | --weight_decay 0. 
\ 36 | --warmup_ratio 0.03 \ 37 | --lr_scheduler_type "cosine" \ 38 | --logging_steps 1 \ 39 | --tf32 True \ 40 | --model_max_length 2048 \ 41 | --gradient_checkpointing True \ 42 | --dataloader_num_workers 4 \ 43 | --lazy_preprocess True \ 44 | --report_to none | tee 2>&1 ./checkpoints-pretrain/$OUTPUT_DIR/log.txt 45 | ``` 46 | 47 | ## Visual Instruction Tuning 48 | 49 | ```shell 50 | #!/bin/bash 51 | 52 | MODEL_TYPE=qwen1.5-1.8b 53 | 54 | PRETRAIN_DIR=bunny-$MODEL_TYPE-pretrain 55 | OUTPUT_DIR=bunny-lora-$MODEL_TYPE 56 | 57 | mkdir -p ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR 58 | 59 | deepspeed bunny/train/train.py \ 60 | --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 2e-5 \ 61 | --deepspeed ./script/deepspeed/zero3.json \ 62 | --model_name_or_path /path/to/Qwen/Qwen1.5-1.8B \ 63 | --model_type $MODEL_TYPE \ 64 | --version bunny \ 65 | --data_path ./data/finetune/bunny_695k.json \ 66 | --image_folder ./data/finetune/images \ 67 | --vision_tower /path/to/siglip-so400m-patch14-384 \ 68 | --pretrain_mm_mlp_adapter ./checkpoints-pretrain/$PRETRAIN_DIR/mm_projector.bin \ 69 | --mm_projector_type mlp2x_gelu \ 70 | --image_aspect_ratio pad \ 71 | --group_by_modality_length False \ 72 | --bf16 True \ 73 | --output_dir ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR \ 74 | --num_train_epochs 1 \ 75 | --per_device_train_batch_size 8 \ 76 | --per_device_eval_batch_size 4 \ 77 | --gradient_accumulation_steps 2 \ 78 | --evaluation_strategy "no" \ 79 | --save_strategy "steps" \ 80 | --save_steps 500 \ 81 | --save_total_limit 1 \ 82 | --learning_rate 2e-4 \ 83 | --weight_decay 0. \ 84 | --warmup_ratio 0.03 \ 85 | --lr_scheduler_type "cosine" \ 86 | --logging_steps 1 \ 87 | --tf32 True \ 88 | --model_max_length 2048 \ 89 | --gradient_checkpointing True \ 90 | --dataloader_num_workers 4 \ 91 | --lazy_preprocess True \ 92 | --report_to none | tee 2>&1 ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR/log.txt 93 | ``` 94 | 95 | -------------------------------------------------------------------------------- /script/train/tutorials/bunny-stablelm-2-eva-lora.md: -------------------------------------------------------------------------------- 1 | # Training Tutorial of bunny-stablelm-2-eva-lora 2 | 3 | ## Pretrain 4 | 5 | ```shell 6 | #!/bin/bash 7 | 8 | MODEL_TYPE=stablelm-2 9 | OUTPUT_DIR=bunny-$MODEL_TYPE-pretrain 10 | 11 | mkdir -p ./checkpoints-pretrain/$OUTPUT_DIR 12 | 13 | deepspeed bunny/train/train.py \ 14 | --deepspeed ./script/deepspeed/zero2.json \ 15 | --model_name_or_path /path/to/stabilityai/stablelm-2-1_6b \ 16 | --model_type $MODEL_TYPE \ 17 | --version plain \ 18 | --data_path ./data/pretrain/bunny_pretrain_laion_2m.json \ 19 | --image_folder ./data/pretrain/images \ 20 | --vision_tower /path/to/EVA02_CLIP_L_336_psz14_s6B \ 21 | --mm_projector_type mlp2x_gelu \ 22 | --tune_mm_mlp_adapter True \ 23 | --image_aspect_ratio square \ 24 | --bf16 True \ 25 | --output_dir ./checkpoints-pretrain/$OUTPUT_DIR \ 26 | --num_train_epochs 1 \ 27 | --per_device_train_batch_size 8 \ 28 | --per_device_eval_batch_size 4 \ 29 | --gradient_accumulation_steps 4 \ 30 | --evaluation_strategy "no" \ 31 | --save_strategy "steps" \ 32 | --save_steps 24000 \ 33 | --save_total_limit 1 \ 34 | --learning_rate 1e-3 \ 35 | --weight_decay 0. 
\ 36 | --warmup_ratio 0.03 \ 37 | --lr_scheduler_type "cosine" \ 38 | --logging_steps 1 \ 39 | --tf32 True \ 40 | --model_max_length 2048 \ 41 | --gradient_checkpointing True \ 42 | --dataloader_num_workers 4 \ 43 | --lazy_preprocess True \ 44 | --report_to none | tee 2>&1 ./checkpoints-pretrain/$OUTPUT_DIR/log.txt 45 | ``` 46 | 47 | ## Visual Instruction Tuning 48 | 49 | ```shell 50 | #!/bin/bash 51 | 52 | MODEL_TYPE=stablelm-2 53 | 54 | PRETRAIN_DIR=bunny-$MODEL_TYPE-pretrain 55 | OUTPUT_DIR=bunny-lora-$MODEL_TYPE 56 | 57 | mkdir -p ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR 58 | 59 | deepspeed bunny/train/train.py \ 60 | --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 2e-5 \ 61 | --deepspeed ./script/deepspeed/zero3.json \ 62 | --model_name_or_path /path/to/stabilityai/stablelm-2-1_6b \ 63 | --model_type $MODEL_TYPE \ 64 | --version bunny \ 65 | --data_path ./data/finetune/bunny_695k.json \ 66 | --image_folder ./data/finetune/images \ 67 | --vision_tower /path/to/EVA02_CLIP_L_336_psz14_s6B \ 68 | --pretrain_mm_mlp_adapter ./checkpoints-pretrain/$PRETRAIN_DIR/mm_projector.bin \ 69 | --mm_projector_type mlp2x_gelu \ 70 | --image_aspect_ratio pad \ 71 | --group_by_modality_length False \ 72 | --bf16 True \ 73 | --output_dir ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR \ 74 | --num_train_epochs 1 \ 75 | --per_device_train_batch_size 8 \ 76 | --per_device_eval_batch_size 4 \ 77 | --gradient_accumulation_steps 2 \ 78 | --evaluation_strategy "no" \ 79 | --save_strategy "steps" \ 80 | --save_steps 500 \ 81 | --save_total_limit 1 \ 82 | --learning_rate 2e-4 \ 83 | --weight_decay 0. \ 84 | --warmup_ratio 0.03 \ 85 | --lr_scheduler_type "cosine" \ 86 | --logging_steps 1 \ 87 | --tf32 True \ 88 | --model_max_length 2048 \ 89 | --gradient_checkpointing True \ 90 | --dataloader_num_workers 4 \ 91 | --lazy_preprocess True \ 92 | --report_to none | tee 2>&1 ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR/log.txt 93 | ``` 94 | 95 | -------------------------------------------------------------------------------- /script/train/tutorials/bunny-stablelm-2-siglip-lora.md: -------------------------------------------------------------------------------- 1 | # Training Tutorial of bunny-stablelm-2-siglip-lora 2 | 3 | ## Pretrain 4 | 5 | ```shell 6 | #!/bin/bash 7 | 8 | MODEL_TYPE=stablelm-2 9 | OUTPUT_DIR=bunny-$MODEL_TYPE-pretrain 10 | 11 | mkdir -p ./checkpoints-pretrain/$OUTPUT_DIR 12 | 13 | deepspeed bunny/train/train.py \ 14 | --deepspeed ./script/deepspeed/zero2.json \ 15 | --model_name_or_path /path/to/stabilityai/stablelm-2-1_6b \ 16 | --model_type $MODEL_TYPE \ 17 | --version plain \ 18 | --data_path ./data/pretrain/bunny_pretrain_laion_2m.json \ 19 | --image_folder ./data/pretrain/images \ 20 | --vision_tower /path/to/siglip-so400m-patch14-384 \ 21 | --mm_projector_type mlp2x_gelu \ 22 | --tune_mm_mlp_adapter True \ 23 | --image_aspect_ratio square \ 24 | --bf16 True \ 25 | --output_dir ./checkpoints-pretrain/$OUTPUT_DIR \ 26 | --num_train_epochs 1 \ 27 | --per_device_train_batch_size 8 \ 28 | --per_device_eval_batch_size 4 \ 29 | --gradient_accumulation_steps 4 \ 30 | --evaluation_strategy "no" \ 31 | --save_strategy "steps" \ 32 | --save_steps 24000 \ 33 | --save_total_limit 1 \ 34 | --learning_rate 5e-4 \ 35 | --weight_decay 0. 
\ 36 | --warmup_ratio 0.03 \ 37 | --lr_scheduler_type "cosine" \ 38 | --logging_steps 1 \ 39 | --tf32 True \ 40 | --model_max_length 2048 \ 41 | --gradient_checkpointing True \ 42 | --dataloader_num_workers 4 \ 43 | --lazy_preprocess True \ 44 | --report_to none | tee 2>&1 ./checkpoints-pretrain/$OUTPUT_DIR/log.txt 45 | ``` 46 | 47 | ## Visual Instruction Tuning 48 | 49 | ```shell 50 | #!/bin/bash 51 | 52 | MODEL_TYPE=stablelm-2 53 | 54 | PRETRAIN_DIR=bunny-$MODEL_TYPE-pretrain 55 | OUTPUT_DIR=bunny-lora-$MODEL_TYPE 56 | 57 | mkdir -p ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR 58 | 59 | deepspeed bunny/train/train.py \ 60 | --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 2e-5 \ 61 | --deepspeed ./script/deepspeed/zero3.json \ 62 | --model_name_or_path /path/to/stabilityai/stablelm-2-1_6b \ 63 | --model_type $MODEL_TYPE \ 64 | --version bunny \ 65 | --data_path ./data/finetune/bunny_695k.json \ 66 | --image_folder ./data/finetune/images \ 67 | --vision_tower /path/to/siglip-so400m-patch14-384 \ 68 | --pretrain_mm_mlp_adapter ./checkpoints-pretrain/$PRETRAIN_DIR/mm_projector.bin \ 69 | --mm_projector_type mlp2x_gelu \ 70 | --image_aspect_ratio pad \ 71 | --group_by_modality_length False \ 72 | --bf16 True \ 73 | --output_dir ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR \ 74 | --num_train_epochs 1 \ 75 | --per_device_train_batch_size 8 \ 76 | --per_device_eval_batch_size 4 \ 77 | --gradient_accumulation_steps 2 \ 78 | --evaluation_strategy "no" \ 79 | --save_strategy "steps" \ 80 | --save_steps 500 \ 81 | --save_total_limit 1 \ 82 | --learning_rate 2e-4 \ 83 | --weight_decay 0. \ 84 | --warmup_ratio 0.03 \ 85 | --lr_scheduler_type "cosine" \ 86 | --logging_steps 1 \ 87 | --tf32 True \ 88 | --model_max_length 2048 \ 89 | --gradient_checkpointing True \ 90 | --dataloader_num_workers 4 \ 91 | --lazy_preprocess True \ 92 | --report_to none | tee 2>&1 ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR/log.txt 93 | ``` 94 | 95 | --------------------------------------------------------------------------------