├── LICENSE
├── README.md
├── bunny
├── constants.py
├── conversation.py
├── eval
│   ├── m4c_evaluator.py
│   ├── model_vqa.py
│   ├── model_vqa_cmmmu.py
│   ├── model_vqa_loader.py
│   ├── model_vqa_mmbench.py
│   ├── model_vqa_mmmu.py
│   └── model_vqa_science.py
├── model
│   ├── __init__.py
│   ├── builder.py
│   ├── bunny_arch.py
│   ├── language_model
│   │   ├── bunny_llama.py
│   │   ├── bunny_minicpm.py
│   │   ├── bunny_phi.py
│   │   ├── bunny_phi3.py
│   │   ├── bunny_qwen.py
│   │   ├── bunny_stablelm.py
│   │   ├── llama
│   │   │   ├── __init__.py
│   │   │   ├── configuration_llama.py
│   │   │   ├── modeling_llama.py
│   │   │   ├── tokenization_llama.py
│   │   │   └── tokenization_llama_fast.py
│   │   ├── minicpm
│   │   │   ├── configuration_minicpm.py
│   │   │   └── modeling_minicpm.py
│   │   ├── phi
│   │   │   ├── __init__.py
│   │   │   ├── configuration_phi.py
│   │   │   └── modeling_phi.py
│   │   ├── phi3
│   │   │   ├── __init__.py
│   │   │   ├── configuration_phi3.py
│   │   │   └── modeling_phi3.py
│   │   ├── qwen2
│   │   │   ├── __init__.py
│   │   │   ├── configuration_qwen2.py
│   │   │   ├── modeling_qwen2.py
│   │   │   ├── tokenization_qwen2.py
│   │   │   └── tokenization_qwen2_fast.py
│   │   └── stable_lm
│   │   │   ├── configuration_stablelm_epoch.py
│   │   │   └── modeling_stablelm_epoch.py
│   ├── multimodal_encoder
│   │   ├── builder.py
│   │   ├── clip
│   │   │   └── clip_encoder.py
│   │   ├── eva_clip
│   │   │   ├── eva_clip_encoder.py
│   │   │   ├── eva_clip_processors.py
│   │   │   └── eva_vit.py
│   │   └── siglip
│   │   │   └── siglip_encoder.py
│   └── multimodal_projector
│   │   └── builder.py
├── serve
│   ├── cli.py
│   ├── controller.py
│   ├── examples
│   │   ├── example_1.png
│   │   ├── example_2.png
│   │   ├── icon.jpg
│   │   └── user.png
│   ├── gradio_web_server.py
│   ├── model_worker.py
│   └── register_worker.py
├── train
│   ├── bunny_trainer.py
│   └── train.py
└── util
│   ├── data_utils.py
│   ├── mm_utils.py
│   ├── s2wrapper
│   ├── __init__.py
│   ├── core.py
│   └── utils.py
│   └── utils.py
├── comparison_4B.png
├── comparison_8B.png
├── eval
├── cmmmu
│   ├── cmmmu-data-val-answer.jsonl
│   ├── eval_script.py
│   ├── eval_utils.py
│   └── prompt.yaml
├── gqa
│   ├── bunny_gqa_testdev_balanced.jsonl
│   ├── convert_gqa_for_eval.py
│   ├── eval_gqa.py
│   └── testdev_balanced_questions.tar.gz
├── mm-vet
│   ├── bunny-mm-vet.jsonl
│   └── convert_mmvet_for_eval.py
├── mmbench
│   └── convert_mmbench_for_submission.py
├── mme
│   ├── bunny_mme.jsonl
│   ├── calculation_mme.py
│   └── convert_answer_to_mme.py
├── mmmu
│   ├── answer_dict_val.json
│   ├── config.yaml
│   └── eval_mmmu.py
├── pope
│   ├── bunny_pope_test.jsonl
│   └── eval_pope.py
├── scienceqa
│   ├── bunny_test_CQM-A.json
│   └── eval_science_qa.py
├── seed-bench
│   ├── SEED-Bench.json
│   ├── bunny-seed-bench.jsonl
│   ├── convert_seed_for_submission.py
│   └── extract_video_frames.py
├── textvqa
│   ├── bunny_textvqa_val_v051_ocr.jsonl
│   └── eval_textvqa.py
├── viswiz
│   ├── bunny_test.jsonl
│   └── convert_viswiz_for_submission.py
└── vqav2
│   ├── bunny_vqav2_mscoco_test-dev2015.tar.gz
│   ├── bunny_vqav2_mscoco_test2015.tar.gz
│   └── convert_vqav2_for_submission.py
├── icon.png
├── pyproject.toml
└── script
├── batch_inference.py
├── conversion_to_GGUF.md
├── deepspeed
├── zero2.json
└── zero3.json
├── eval
├── full
│   ├── cmmmu.sh
│   ├── evaluation_full.md
│   ├── gqa.sh
│   ├── mmbench.sh
│   ├── mme.sh
│   ├── mmmu.sh
│   ├── mmvet.sh
│   ├── pope.sh
│   ├── scienceqa.sh
│   ├── seedbench.sh
│   └── vqav2.sh
└── lora
│   ├── cmmmu.sh
│   ├── evaluation_lora.md
│   ├── gqa.sh
│   ├── mmbench.sh
│   ├── mme.sh
│   ├── mmmu.sh
│   ├── mmvet.sh
│   ├── pope.sh
│   ├── scienceqa.sh
│   ├── seedbench.sh
│   └── vqav2.sh
├── merge_lora_weights.py
└── train
├── finetune_full.sh
├── finetune_lora.sh
├── pretrain.sh
└── 
tutorials ├── Bunny-Llama-3-8B-V.md ├── Bunny-v1.0-4B.md ├── Bunny-v1.1-4B.md ├── Bunny-v1.1-Llama-3-8B-V.md ├── assets ├── Bunny-Llama-3-8B-V.png ├── Bunny-v1.0-4B.png ├── Bunny-v1.1-4B.png └── Bunny-v1.1-Llama-3-8B-V.png ├── bunny-minicpm-siglip-lora.md ├── bunny-phi-1.5-eva-lora.md ├── bunny-phi-1.5-siglip-lora.md ├── bunny-phi-2-eva-lora.md ├── bunny-phi-2-siglip-lora.md ├── bunny-qwen1.5-1.8b-siglip-lora.md ├── bunny-stablelm-2-eva-lora.md └── bunny-stablelm-2-siglip-lora.md /bunny/constants.py: -------------------------------------------------------------------------------- 1 | # Model Constants 2 | IGNORE_INDEX = -100 3 | IMAGE_TOKEN_INDEX = -200 4 | DEFAULT_IMAGE_TOKEN = "" 5 | CONTROLLER_HEART_BEAT_EXPIRATION = 30 6 | LOGDIR = "gradio-logs" 7 | WORKER_HEART_BEAT_INTERVAL = 15 8 | -------------------------------------------------------------------------------- /bunny/eval/model_vqa.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import os 4 | import json 5 | from tqdm import tqdm 6 | import shortuuid 7 | 8 | from bunny.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN 9 | from bunny.conversation import conv_templates, SeparatorStyle 10 | from bunny.model.builder import load_pretrained_model 11 | from bunny.util.utils import disable_torch_init 12 | from bunny.util.mm_utils import tokenizer_image_token, get_model_name_from_path, process_images 13 | 14 | from PIL import Image 15 | import math 16 | 17 | 18 | def split_list(lst, n): 19 | """Split a list into n (roughly) equal-sized chunks""" 20 | chunk_size = math.ceil(len(lst) / n) # integer division 21 | return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)] 22 | 23 | 24 | def get_chunk(lst, n, k): 25 | chunks = split_list(lst, n) 26 | return chunks[k] 27 | 28 | 29 | def eval_model(args): 30 | # Model 31 | disable_torch_init() 32 | model_path = os.path.expanduser(args.model_path) 33 | model_name = get_model_name_from_path(model_path) 34 | tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name, 35 | args.model_type) 36 | 37 | questions = [json.loads(q) for q in open(os.path.expanduser(args.question_file), "r")] 38 | questions = get_chunk(questions, args.num_chunks, args.chunk_idx) 39 | answers_file = os.path.expanduser(args.answers_file) 40 | os.makedirs(os.path.dirname(answers_file), exist_ok=True) 41 | ans_file = open(answers_file, "w") 42 | for line in tqdm(questions): 43 | idx = line["question_id"] 44 | image_file = line["image"] 45 | qs = line["text"] 46 | cur_prompt = qs 47 | 48 | qs = DEFAULT_IMAGE_TOKEN + '\n' + qs 49 | 50 | conv = conv_templates[args.conv_mode].copy() 51 | conv.append_message(conv.roles[0], qs) 52 | conv.append_message(conv.roles[1], None) 53 | prompt = conv.get_prompt() 54 | 55 | input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() 56 | 57 | image = Image.open(os.path.join(args.image_folder, image_file)) 58 | image_tensor = process_images([image], image_processor, model.config)[0] 59 | 60 | stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2 61 | 62 | with torch.inference_mode(): 63 | output_ids = model.generate( 64 | input_ids, 65 | images=image_tensor.unsqueeze(0).to(dtype=model.dtype, device='cuda', non_blocking=True), 66 | do_sample=True if args.temperature > 0 else False, 67 | temperature=args.temperature, 68 | top_p=args.top_p, 69 | num_beams=args.num_beams, 70 | # 
no_repeat_ngram_size=3, 71 | max_new_tokens=1024, 72 | use_cache=True) 73 | 74 | input_token_len = input_ids.shape[1] 75 | n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() 76 | if n_diff_input_output > 0: 77 | print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 78 | outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 79 | outputs = outputs.strip() 80 | if outputs.endswith(stop_str): 81 | outputs = outputs[:-len(stop_str)] 82 | outputs = outputs.strip() 83 | 84 | ans_id = shortuuid.uuid() 85 | ans_file.write(json.dumps({"question_id": idx, 86 | "prompt": cur_prompt, 87 | "text": outputs, 88 | "answer_id": ans_id, 89 | "model_id": model_name, 90 | "metadata": {}}) + "\n") 91 | ans_file.flush() 92 | ans_file.close() 93 | 94 | 95 | if __name__ == "__main__": 96 | parser = argparse.ArgumentParser() 97 | parser.add_argument("--model-path", type=str, default=None) 98 | parser.add_argument("--model-base", type=str, default=None) 99 | parser.add_argument("--model-type", type=str, default=None) 100 | parser.add_argument("--image-folder", type=str, default=None) 101 | parser.add_argument("--question-file", type=str, default=None) 102 | parser.add_argument("--answers-file", type=str, default=None) 103 | parser.add_argument("--conv-mode", type=str, default=None) 104 | parser.add_argument("--num-chunks", type=int, default=1) 105 | parser.add_argument("--chunk-idx", type=int, default=0) 106 | parser.add_argument("--temperature", type=float, default=0.2) 107 | parser.add_argument("--top_p", type=float, default=None) 108 | parser.add_argument("--num_beams", type=int, default=1) 109 | args = parser.parse_args() 110 | 111 | eval_model(args) 112 | -------------------------------------------------------------------------------- /bunny/eval/model_vqa_loader.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import os 4 | import json 5 | from tqdm import tqdm 6 | import shortuuid 7 | 8 | from bunny.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN 9 | from bunny.conversation import conv_templates 10 | from bunny.model.builder import load_pretrained_model 11 | from bunny.util.utils import disable_torch_init 12 | from bunny.util.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path 13 | from torch.utils.data import Dataset, DataLoader 14 | 15 | from PIL import Image 16 | import math 17 | 18 | 19 | def split_list(lst, n): 20 | """Split a list into n (roughly) equal-sized chunks""" 21 | chunk_size = math.ceil(len(lst) / n) # integer division 22 | return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)] 23 | 24 | 25 | def get_chunk(lst, n, k): 26 | chunks = split_list(lst, n) 27 | return chunks[k] 28 | 29 | 30 | # Custom dataset class 31 | class CustomDataset(Dataset): 32 | def __init__(self, questions, image_folder, tokenizer, image_processor, model_config): 33 | self.questions = questions 34 | self.image_folder = image_folder 35 | self.tokenizer = tokenizer 36 | self.image_processor = image_processor 37 | self.model_config = model_config 38 | 39 | def __getitem__(self, index): 40 | line = self.questions[index] 41 | image_file = line["image"] 42 | qs = line["text"] 43 | 44 | qs = DEFAULT_IMAGE_TOKEN + '\n' + qs 45 | 46 | conv = conv_templates[args.conv_mode].copy() 47 | conv.append_message(conv.roles[0], qs) 48 | conv.append_message(conv.roles[1], None) 49 | prompt = conv.get_prompt() 50 
| 51 | image = Image.open(os.path.join(self.image_folder, image_file)).convert('RGB') 52 | image_tensor = process_images([image], self.image_processor, self.model_config)[0] 53 | 54 | input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt') 55 | 56 | return input_ids, image_tensor 57 | 58 | def __len__(self): 59 | return len(self.questions) 60 | 61 | 62 | # DataLoader 63 | def create_data_loader(questions, image_folder, tokenizer, image_processor, model_config, batch_size=1, num_workers=4): 64 | assert batch_size == 1, "batch_size must be 1" 65 | dataset = CustomDataset(questions, image_folder, tokenizer, image_processor, model_config) 66 | data_loader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False) 67 | return data_loader 68 | 69 | 70 | def eval_model(args): 71 | # Model 72 | disable_torch_init() 73 | model_path = os.path.expanduser(args.model_path) 74 | model_name = get_model_name_from_path(model_path) 75 | tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name, 76 | args.model_type) 77 | 78 | questions = [json.loads(q) for q in open(os.path.expanduser(args.question_file), "r")] 79 | questions = get_chunk(questions, args.num_chunks, args.chunk_idx) 80 | answers_file = os.path.expanduser(args.answers_file) 81 | os.makedirs(os.path.dirname(answers_file), exist_ok=True) 82 | ans_file = open(answers_file, "w") 83 | 84 | if 'plain' in model_name and 'finetune' not in model_name.lower() and 'mmtag' not in args.conv_mode: 85 | args.conv_mode = args.conv_mode + '_mmtag' 86 | print( 87 | f'It seems that this is a plain model, but it is not using a mmtag prompt, auto switching to {args.conv_mode}.') 88 | 89 | data_loader = create_data_loader(questions, args.image_folder, tokenizer, image_processor, model.config) 90 | 91 | for (input_ids, image_tensor), line in tqdm(zip(data_loader, questions), total=len(questions)): 92 | idx = line["question_id"] 93 | cur_prompt = line["text"] 94 | 95 | input_ids = input_ids.to(device='cuda', non_blocking=True) 96 | 97 | with torch.inference_mode(): 98 | output_ids = model.generate( 99 | input_ids, 100 | images=image_tensor.to(dtype=model.dtype, device='cuda', non_blocking=True), 101 | do_sample=True if args.temperature > 0 else False, 102 | temperature=args.temperature, 103 | top_p=args.top_p, 104 | num_beams=args.num_beams, 105 | max_new_tokens=args.max_new_tokens, 106 | use_cache=True) 107 | 108 | input_token_len = input_ids.shape[1] 109 | n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() 110 | if n_diff_input_output > 0: 111 | print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 112 | outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 113 | outputs = outputs.strip() 114 | 115 | ans_id = shortuuid.uuid() 116 | ans_file.write(json.dumps({"question_id": idx, 117 | "prompt": cur_prompt, 118 | "text": outputs, 119 | "answer_id": ans_id, 120 | "model_id": model_name, 121 | "metadata": {}}) + "\n") 122 | # ans_file.flush() 123 | ans_file.close() 124 | 125 | 126 | if __name__ == "__main__": 127 | parser = argparse.ArgumentParser() 128 | parser.add_argument("--model-path", type=str, default=None) 129 | parser.add_argument("--model-base", type=str, default=None) 130 | parser.add_argument("--model-type", type=str, default=None) 131 | parser.add_argument("--image-folder", type=str, default=None) 132 | 
parser.add_argument("--question-file", type=str, default=None) 133 | parser.add_argument("--answers-file", type=str, default=None) 134 | parser.add_argument("--conv-mode", type=str, default=None) 135 | parser.add_argument("--num-chunks", type=int, default=1) 136 | parser.add_argument("--chunk-idx", type=int, default=0) 137 | parser.add_argument("--temperature", type=float, default=0.2) 138 | parser.add_argument("--top_p", type=float, default=None) 139 | parser.add_argument("--num_beams", type=int, default=1) 140 | parser.add_argument("--max_new_tokens", type=int, default=128) 141 | args = parser.parse_args() 142 | 143 | eval_model(args) 144 | -------------------------------------------------------------------------------- /bunny/eval/model_vqa_science.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import os 4 | import json 5 | from tqdm import tqdm 6 | import shortuuid 7 | 8 | from bunny.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN 9 | from bunny.conversation import conv_templates, SeparatorStyle 10 | from bunny.model.builder import load_pretrained_model 11 | from bunny.util.utils import disable_torch_init 12 | from bunny.util.mm_utils import tokenizer_image_token, get_model_name_from_path 13 | 14 | from PIL import Image 15 | import math 16 | 17 | 18 | def split_list(lst, n): 19 | """Split a list into n (roughly) equal-sized chunks""" 20 | chunk_size = math.ceil(len(lst) / n) # integer division 21 | return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)] 22 | 23 | 24 | def get_chunk(lst, n, k): 25 | chunks = split_list(lst, n) 26 | return chunks[k] 27 | 28 | 29 | def eval_model(args): 30 | # Model 31 | disable_torch_init() 32 | model_path = os.path.expanduser(args.model_path) 33 | model_name = get_model_name_from_path(model_path) 34 | tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name, 35 | args.model_type) 36 | 37 | questions = json.load(open(os.path.expanduser(args.question_file), "r")) 38 | questions = get_chunk(questions, args.num_chunks, args.chunk_idx) 39 | answers_file = os.path.expanduser(args.answers_file) 40 | os.makedirs(os.path.dirname(answers_file), exist_ok=True) 41 | ans_file = open(answers_file, "w") 42 | for i, line in enumerate(tqdm(questions)): 43 | idx = line["id"] 44 | question = line['conversations'][0] 45 | qs = question['value'].replace('', '').strip() 46 | cur_prompt = qs 47 | 48 | if 'image' in line: 49 | image_file = line["image"] 50 | image = Image.open(os.path.join(args.image_folder, image_file)) 51 | image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0] 52 | images = image_tensor.unsqueeze(0).to(dtype=model.dtype, device='cuda', non_blocking=True) 53 | 54 | qs = DEFAULT_IMAGE_TOKEN + '\n' + qs 55 | cur_prompt = '' + '\n' + cur_prompt 56 | else: 57 | images = None 58 | 59 | if args.single_pred_prompt: 60 | qs = qs + '\n' + "Answer with the option's letter from the given choices directly." 61 | cur_prompt = cur_prompt + '\n' + "Answer with the option's letter from the given choices directly." 
62 | 63 | conv = conv_templates[args.conv_mode].copy() 64 | conv.append_message(conv.roles[0], qs) 65 | conv.append_message(conv.roles[1], None) 66 | prompt = conv.get_prompt() 67 | 68 | input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() 69 | 70 | stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2 71 | 72 | with torch.inference_mode(): 73 | output_ids = model.generate( 74 | input_ids, 75 | images=images, 76 | do_sample=True if args.temperature > 0 else False, 77 | temperature=args.temperature, 78 | max_new_tokens=1024, 79 | use_cache=True 80 | ) 81 | 82 | input_token_len = input_ids.shape[1] 83 | n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() 84 | if n_diff_input_output > 0: 85 | print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 86 | outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 87 | outputs = outputs.strip() 88 | if outputs.endswith(stop_str): 89 | outputs = outputs[:-len(stop_str)] 90 | outputs = outputs.strip() 91 | 92 | ans_id = shortuuid.uuid() 93 | ans_file.write(json.dumps({"question_id": idx, 94 | "prompt": cur_prompt, 95 | "text": outputs, 96 | "answer_id": ans_id, 97 | "model_id": model_name, 98 | "metadata": {}}) + "\n") 99 | ans_file.flush() 100 | ans_file.close() 101 | 102 | 103 | if __name__ == "__main__": 104 | parser = argparse.ArgumentParser() 105 | parser.add_argument("--model-path", type=str, default=None) 106 | parser.add_argument("--model-base", type=str, default=None) 107 | parser.add_argument("--model-type", type=str, default=None) 108 | parser.add_argument("--image-folder", type=str, default=None) 109 | parser.add_argument("--question-file", type=str, default=None) 110 | parser.add_argument("--answers-file", type=str, default=None) 111 | parser.add_argument("--conv-mode", type=str, default=None) 112 | parser.add_argument("--num-chunks", type=int, default=1) 113 | parser.add_argument("--chunk-idx", type=int, default=0) 114 | parser.add_argument("--temperature", type=float, default=0.2) 115 | parser.add_argument("--single-pred-prompt", action="store_true") 116 | 117 | args = parser.parse_args() 118 | 119 | eval_model(args) 120 | -------------------------------------------------------------------------------- /bunny/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .language_model.bunny_phi import BunnyPhiForCausalLM, BunnyPhiConfig 2 | from .language_model.bunny_stablelm import BunnyStableLMForCausalLM, BunnyStableLMConfig 3 | from .language_model.bunny_qwen import BunnyQwen2ForCausalLM, BunnyQwen2Config 4 | from .language_model.bunny_minicpm import BunnyMiniCPMForCausalLM, BunnyMiniCPMConfig 5 | from .language_model.bunny_llama import BunnyLlamaForCausalLM, BunnyLlamaConfig 6 | from .language_model.bunny_phi3 import BunnyPhi3ForCausalLM, BunnyPhi3Config 7 | -------------------------------------------------------------------------------- /bunny/model/language_model/bunny_llama.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional, Tuple, Union 2 | 3 | import torch 4 | import torch.nn as nn 5 | from transformers import AutoConfig, AutoModelForCausalLM 6 | 7 | from .llama import LlamaModel, LlamaConfig, LlamaForCausalLM 8 | 9 | from transformers.modeling_outputs import CausalLMOutputWithPast 10 | 11 | from ..bunny_arch import BunnyMetaModel, 
BunnyMetaForCausalLM 12 | 13 | 14 | class BunnyLlamaConfig(LlamaConfig): 15 | model_type = "bunny-llama" 16 | 17 | 18 | class BunnyLlamaModel(BunnyMetaModel, LlamaModel): 19 | config_class = BunnyLlamaConfig 20 | 21 | def __init__(self, config: LlamaConfig): 22 | super(BunnyLlamaModel, self).__init__(config) 23 | 24 | 25 | class BunnyLlamaForCausalLM(LlamaForCausalLM, BunnyMetaForCausalLM): 26 | config_class = BunnyLlamaConfig 27 | 28 | def __init__(self, config): 29 | super(LlamaForCausalLM, self).__init__(config) 30 | self.model = BunnyLlamaModel(config) 31 | self.vocab_size = config.vocab_size 32 | self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) 33 | 34 | # Initialize weights and apply final processing 35 | self.post_init() 36 | 37 | def get_model(self): 38 | return self.model 39 | 40 | def forward( 41 | self, 42 | input_ids: torch.LongTensor = None, 43 | attention_mask: Optional[torch.Tensor] = None, 44 | position_ids: Optional[torch.LongTensor] = None, 45 | past_key_values: Optional[List[torch.FloatTensor]] = None, 46 | inputs_embeds: Optional[torch.FloatTensor] = None, 47 | labels: Optional[torch.LongTensor] = None, 48 | use_cache: Optional[bool] = None, 49 | output_attentions: Optional[bool] = None, 50 | output_hidden_states: Optional[bool] = None, 51 | images: Optional[torch.FloatTensor] = None, 52 | return_dict: Optional[bool] = None, 53 | cache_position: Optional[torch.LongTensor] = None, 54 | ) -> Union[Tuple, CausalLMOutputWithPast]: 55 | if inputs_embeds is None: 56 | ( 57 | input_ids, 58 | position_ids, 59 | attention_mask, 60 | past_key_values, 61 | inputs_embeds, 62 | labels 63 | ) = self.prepare_inputs_labels_for_multimodal( 64 | input_ids, 65 | position_ids, 66 | attention_mask, 67 | past_key_values, 68 | labels, 69 | images 70 | ) 71 | 72 | return super().forward( 73 | input_ids=input_ids, 74 | attention_mask=attention_mask, 75 | position_ids=position_ids, 76 | past_key_values=past_key_values, 77 | inputs_embeds=inputs_embeds, 78 | labels=labels, 79 | use_cache=use_cache, 80 | output_attentions=output_attentions, 81 | output_hidden_states=output_hidden_states, 82 | return_dict=return_dict, 83 | cache_position=None 84 | ) 85 | 86 | def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, attention_mask=None, 87 | **kwargs): 88 | images = kwargs.pop("images", None) 89 | 90 | _inputs = super().prepare_inputs_for_generation( 91 | input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, attention_mask=attention_mask, 92 | **kwargs 93 | ) 94 | 95 | if images is not None: 96 | _inputs['images'] = images 97 | 98 | return _inputs 99 | 100 | 101 | AutoConfig.register("bunny-llama", BunnyLlamaConfig) 102 | AutoModelForCausalLM.register(BunnyLlamaConfig, BunnyLlamaForCausalLM) 103 | -------------------------------------------------------------------------------- /bunny/model/language_model/bunny_minicpm.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional, Tuple, Union 2 | 3 | import torch 4 | import torch.nn as nn 5 | from transformers import AutoConfig, AutoModelForCausalLM 6 | 7 | from bunny.model.language_model.minicpm.modeling_minicpm import MiniCPMModel, MiniCPMForCausalLM 8 | from bunny.model.language_model.minicpm.configuration_minicpm import MiniCPMConfig 9 | 10 | from transformers.modeling_outputs import CausalLMOutputWithPast 11 | 12 | from ..bunny_arch import BunnyMetaModel, BunnyMetaForCausalLM 13 | 14 | 15 | class 
BunnyMiniCPMConfig(MiniCPMConfig): 16 | model_type = "bunny-minicpm" 17 | 18 | 19 | class BunnyMiniCPMModel(BunnyMetaModel, MiniCPMModel): 20 | config_class = BunnyMiniCPMConfig 21 | 22 | def __init__(self, config: MiniCPMConfig): 23 | super(BunnyMiniCPMModel, self).__init__(config) 24 | 25 | 26 | class BunnyMiniCPMForCausalLM(MiniCPMForCausalLM, BunnyMetaForCausalLM): 27 | config_class = BunnyMiniCPMConfig 28 | 29 | def __init__(self, config): 30 | super(MiniCPMForCausalLM, self).__init__(config) 31 | self.model = BunnyMiniCPMModel(config) 32 | self.vocab_size = config.vocab_size 33 | self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) 34 | 35 | # Initialize weights and apply final processing 36 | self.post_init() 37 | 38 | def get_model(self): 39 | return self.model 40 | 41 | def forward( 42 | self, 43 | input_ids: torch.LongTensor = None, 44 | attention_mask: Optional[torch.Tensor] = None, 45 | position_ids: Optional[torch.LongTensor] = None, 46 | past_key_values: Optional[List[torch.FloatTensor]] = None, 47 | inputs_embeds: Optional[torch.FloatTensor] = None, 48 | labels: Optional[torch.LongTensor] = None, 49 | use_cache: Optional[bool] = None, 50 | output_attentions: Optional[bool] = None, 51 | output_hidden_states: Optional[bool] = None, 52 | images: Optional[torch.FloatTensor] = None, 53 | return_dict: Optional[bool] = None, 54 | ) -> Union[Tuple, CausalLMOutputWithPast]: 55 | 56 | if inputs_embeds is None: 57 | ( 58 | input_ids, 59 | position_ids, 60 | attention_mask, 61 | past_key_values, 62 | inputs_embeds, 63 | labels 64 | ) = self.prepare_inputs_labels_for_multimodal( 65 | input_ids, 66 | position_ids, 67 | attention_mask, 68 | past_key_values, 69 | labels, 70 | images 71 | ) 72 | if inputs_embeds is not None: 73 | inputs_embeds *= self.get_model().config.scale_emb 74 | 75 | return super().forward( 76 | input_ids=input_ids, 77 | attention_mask=attention_mask, 78 | position_ids=position_ids, 79 | past_key_values=past_key_values, 80 | inputs_embeds=inputs_embeds, 81 | labels=labels, 82 | use_cache=use_cache, 83 | output_attentions=output_attentions, 84 | output_hidden_states=output_hidden_states, 85 | return_dict=return_dict 86 | ) 87 | 88 | def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, attention_mask=None, 89 | **kwargs): 90 | images = kwargs.pop("images", None) 91 | 92 | _inputs = super().prepare_inputs_for_generation( 93 | input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, attention_mask=attention_mask, 94 | **kwargs 95 | ) 96 | 97 | if images is not None: 98 | _inputs['images'] = images 99 | return _inputs 100 | 101 | 102 | AutoConfig.register("bunny-minicpm", BunnyMiniCPMConfig) 103 | AutoModelForCausalLM.register(BunnyMiniCPMConfig, BunnyMiniCPMForCausalLM) 104 | -------------------------------------------------------------------------------- /bunny/model/language_model/bunny_phi.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional, Tuple, Union 2 | 3 | import torch 4 | import torch.nn as nn 5 | from transformers import AutoConfig, AutoModelForCausalLM 6 | 7 | from .phi import PhiModel, PhiConfig, PhiForCausalLM 8 | 9 | from transformers.modeling_outputs import CausalLMOutputWithPast 10 | 11 | from ..bunny_arch import BunnyMetaModel, BunnyMetaForCausalLM 12 | 13 | 14 | class BunnyPhiConfig(PhiConfig): 15 | model_type = "bunny-phi" 16 | 17 | 18 | class BunnyPhiModel(BunnyMetaModel, PhiModel): 19 | config_class = 
BunnyPhiConfig 20 | 21 | def __init__(self, config: PhiConfig): 22 | super(BunnyPhiModel, self).__init__(config) 23 | 24 | 25 | class BunnyPhiForCausalLM(PhiForCausalLM, BunnyMetaForCausalLM): 26 | config_class = BunnyPhiConfig 27 | 28 | def __init__(self, config): 29 | super(PhiForCausalLM, self).__init__(config) 30 | self.model = BunnyPhiModel(config) 31 | self.vocab_size = config.vocab_size 32 | self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) 33 | 34 | # Initialize weights and apply final processing 35 | self.post_init() 36 | 37 | def get_model(self): 38 | return self.model 39 | 40 | def forward( 41 | self, 42 | input_ids: torch.LongTensor = None, 43 | attention_mask: Optional[torch.Tensor] = None, 44 | position_ids: Optional[torch.LongTensor] = None, 45 | past_key_values: Optional[List[torch.FloatTensor]] = None, 46 | inputs_embeds: Optional[torch.FloatTensor] = None, 47 | labels: Optional[torch.LongTensor] = None, 48 | use_cache: Optional[bool] = None, 49 | output_attentions: Optional[bool] = None, 50 | output_hidden_states: Optional[bool] = None, 51 | images: Optional[torch.FloatTensor] = None, 52 | return_dict: Optional[bool] = None, 53 | ) -> Union[Tuple, CausalLMOutputWithPast]: 54 | 55 | if inputs_embeds is None: 56 | ( 57 | input_ids, 58 | position_ids, 59 | attention_mask, 60 | past_key_values, 61 | inputs_embeds, 62 | labels 63 | ) = self.prepare_inputs_labels_for_multimodal( 64 | input_ids, 65 | position_ids, 66 | attention_mask, 67 | past_key_values, 68 | labels, 69 | images 70 | ) 71 | 72 | return super().forward( 73 | input_ids=input_ids, 74 | attention_mask=attention_mask, 75 | position_ids=position_ids, 76 | past_key_values=past_key_values, 77 | inputs_embeds=inputs_embeds, 78 | labels=labels, 79 | use_cache=use_cache, 80 | output_attentions=output_attentions, 81 | output_hidden_states=output_hidden_states, 82 | return_dict=return_dict 83 | ) 84 | 85 | def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, attention_mask=None, 86 | **kwargs): 87 | images = kwargs.pop("images", None) 88 | 89 | _inputs = super().prepare_inputs_for_generation( 90 | input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, attention_mask=attention_mask, 91 | **kwargs 92 | ) 93 | 94 | if images is not None: 95 | _inputs['images'] = images 96 | return _inputs 97 | 98 | 99 | AutoConfig.register("bunny-phi", BunnyPhiConfig) 100 | AutoModelForCausalLM.register(BunnyPhiConfig, BunnyPhiForCausalLM) 101 | -------------------------------------------------------------------------------- /bunny/model/language_model/bunny_phi3.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional, Tuple, Union 2 | 3 | import torch 4 | import torch.nn as nn 5 | from transformers import AutoConfig, AutoModelForCausalLM 6 | 7 | from .phi3 import Phi3Model, Phi3Config, Phi3ForCausalLM 8 | 9 | from transformers.modeling_outputs import CausalLMOutputWithPast 10 | 11 | from ..bunny_arch import BunnyMetaModel, BunnyMetaForCausalLM 12 | 13 | 14 | class BunnyPhi3Config(Phi3Config): 15 | model_type = "bunny-phi3" 16 | 17 | 18 | class BunnyPhi3Model(BunnyMetaModel, Phi3Model): 19 | config_class = BunnyPhi3Config 20 | 21 | def __init__(self, config: Phi3Config): 22 | super(BunnyPhi3Model, self).__init__(config) 23 | 24 | 25 | class BunnyPhi3ForCausalLM(Phi3ForCausalLM, BunnyMetaForCausalLM): 26 | config_class = BunnyPhi3Config 27 | 28 | def __init__(self, config): 29 | 
super(Phi3ForCausalLM, self).__init__(config) 30 | self.model = BunnyPhi3Model(config) 31 | self.vocab_size = config.vocab_size 32 | self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) 33 | 34 | # Initialize weights and apply final processing 35 | self.post_init() 36 | 37 | def get_model(self): 38 | return self.model 39 | 40 | def forward( 41 | self, 42 | input_ids: torch.LongTensor = None, 43 | attention_mask: Optional[torch.Tensor] = None, 44 | position_ids: Optional[torch.LongTensor] = None, 45 | past_key_values: Optional[List[torch.FloatTensor]] = None, 46 | inputs_embeds: Optional[torch.FloatTensor] = None, 47 | labels: Optional[torch.LongTensor] = None, 48 | use_cache: Optional[bool] = None, 49 | output_attentions: Optional[bool] = None, 50 | output_hidden_states: Optional[bool] = None, 51 | images: Optional[torch.FloatTensor] = None, 52 | return_dict: Optional[bool] = None, 53 | ) -> Union[Tuple, CausalLMOutputWithPast]: 54 | 55 | if inputs_embeds is None: 56 | ( 57 | input_ids, 58 | position_ids, 59 | attention_mask, 60 | past_key_values, 61 | inputs_embeds, 62 | labels 63 | ) = self.prepare_inputs_labels_for_multimodal( 64 | input_ids, 65 | position_ids, 66 | attention_mask, 67 | past_key_values, 68 | labels, 69 | images 70 | ) 71 | 72 | return super().forward( 73 | input_ids=input_ids, 74 | attention_mask=attention_mask, 75 | position_ids=position_ids, 76 | past_key_values=past_key_values, 77 | inputs_embeds=inputs_embeds, 78 | labels=labels, 79 | use_cache=use_cache, 80 | output_attentions=output_attentions, 81 | output_hidden_states=output_hidden_states, 82 | return_dict=return_dict 83 | ) 84 | 85 | def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, attention_mask=None, 86 | **kwargs): 87 | images = kwargs.pop("images", None) 88 | 89 | _inputs = super().prepare_inputs_for_generation( 90 | input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, attention_mask=attention_mask, 91 | **kwargs 92 | ) 93 | 94 | if images is not None: 95 | _inputs['images'] = images 96 | return _inputs 97 | 98 | 99 | AutoConfig.register("bunny-phi3", BunnyPhi3Config) 100 | AutoModelForCausalLM.register(BunnyPhi3Config, BunnyPhi3ForCausalLM) 101 | -------------------------------------------------------------------------------- /bunny/model/language_model/bunny_qwen.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional, Tuple, Union 2 | 3 | import torch 4 | import torch.nn as nn 5 | from transformers import AutoConfig, AutoModelForCausalLM 6 | 7 | from .qwen2 import Qwen2Model, Qwen2Config, Qwen2ForCausalLM 8 | 9 | from transformers.modeling_outputs import CausalLMOutputWithPast 10 | 11 | from ..bunny_arch import BunnyMetaModel, BunnyMetaForCausalLM 12 | 13 | 14 | class BunnyQwen2Config(Qwen2Config): 15 | model_type = "bunny-qwen2" 16 | 17 | 18 | class BunnyQwen2Model(BunnyMetaModel, Qwen2Model): 19 | config_class = BunnyQwen2Config 20 | 21 | def __init__(self, config: Qwen2Config): 22 | super(BunnyQwen2Model, self).__init__(config) 23 | 24 | 25 | class BunnyQwen2ForCausalLM(Qwen2ForCausalLM, BunnyMetaForCausalLM): 26 | config_class = BunnyQwen2Config 27 | 28 | def __init__(self, config): 29 | super(Qwen2ForCausalLM, self).__init__(config) 30 | self.model = BunnyQwen2Model(config) 31 | self.vocab_size = config.vocab_size 32 | self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) 33 | 34 | # Initialize weights and apply final 
processing 35 | self.post_init() 36 | 37 | def get_model(self): 38 | return self.model 39 | 40 | def forward( 41 | self, 42 | input_ids: torch.LongTensor = None, 43 | attention_mask: Optional[torch.Tensor] = None, 44 | position_ids: Optional[torch.LongTensor] = None, 45 | past_key_values: Optional[List[torch.FloatTensor]] = None, 46 | inputs_embeds: Optional[torch.FloatTensor] = None, 47 | labels: Optional[torch.LongTensor] = None, 48 | use_cache: Optional[bool] = None, 49 | output_attentions: Optional[bool] = None, 50 | output_hidden_states: Optional[bool] = None, 51 | images: Optional[torch.FloatTensor] = None, 52 | return_dict: Optional[bool] = None, 53 | ) -> Union[Tuple, CausalLMOutputWithPast]: 54 | 55 | if inputs_embeds is None: 56 | ( 57 | input_ids, 58 | position_ids, 59 | attention_mask, 60 | past_key_values, 61 | inputs_embeds, 62 | labels 63 | ) = self.prepare_inputs_labels_for_multimodal( 64 | input_ids, 65 | position_ids, 66 | attention_mask, 67 | past_key_values, 68 | labels, 69 | images 70 | ) 71 | 72 | return super().forward( 73 | input_ids=input_ids, 74 | attention_mask=attention_mask, 75 | position_ids=position_ids, 76 | past_key_values=past_key_values, 77 | inputs_embeds=inputs_embeds, 78 | labels=labels, 79 | use_cache=use_cache, 80 | output_attentions=output_attentions, 81 | output_hidden_states=output_hidden_states, 82 | return_dict=return_dict 83 | ) 84 | 85 | def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, attention_mask=None, 86 | **kwargs): 87 | images = kwargs.pop("images", None) 88 | 89 | _inputs = super().prepare_inputs_for_generation( 90 | input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, attention_mask=attention_mask, 91 | **kwargs 92 | ) 93 | 94 | if images is not None: 95 | _inputs['images'] = images 96 | return _inputs 97 | 98 | 99 | AutoConfig.register("bunny-qwen2", BunnyQwen2Config) 100 | AutoModelForCausalLM.register(BunnyQwen2Config, BunnyQwen2ForCausalLM) 101 | -------------------------------------------------------------------------------- /bunny/model/language_model/bunny_stablelm.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional, Tuple, Union 2 | 3 | import torch 4 | import torch.nn as nn 5 | from transformers import AutoConfig, AutoModelForCausalLM 6 | 7 | from bunny.model.language_model.stable_lm.modeling_stablelm_epoch import StableLMEpochModel, StableLMEpochConfig, \ 8 | StableLMEpochForCausalLM 9 | 10 | from transformers.modeling_outputs import CausalLMOutputWithPast 11 | 12 | from bunny.model.bunny_arch import BunnyMetaModel, BunnyMetaForCausalLM 13 | 14 | 15 | class BunnyStableLMConfig(StableLMEpochConfig): 16 | model_type = "bunny-stablelm" 17 | 18 | 19 | class BunnyStableLMModel(BunnyMetaModel, StableLMEpochModel): 20 | config_class = BunnyStableLMConfig 21 | 22 | def __init__(self, config: StableLMEpochConfig): 23 | super(BunnyStableLMModel, self).__init__(config) 24 | 25 | 26 | class BunnyStableLMForCausalLM(StableLMEpochForCausalLM, BunnyMetaForCausalLM): 27 | config_class = BunnyStableLMConfig 28 | 29 | def __init__(self, config): 30 | super(StableLMEpochForCausalLM, self).__init__(config) 31 | self.model = BunnyStableLMModel(config) 32 | self.vocab_size = config.vocab_size 33 | self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) 34 | 35 | # Initialize weights and apply final processing 36 | self.post_init() 37 | 38 | def get_model(self): 39 | return self.model 40 | 41 | 
def forward( 42 | self, 43 | input_ids: torch.LongTensor = None, 44 | attention_mask: Optional[torch.Tensor] = None, 45 | position_ids: Optional[torch.LongTensor] = None, 46 | past_key_values: Optional[List[torch.FloatTensor]] = None, 47 | inputs_embeds: Optional[torch.FloatTensor] = None, 48 | labels: Optional[torch.LongTensor] = None, 49 | use_cache: Optional[bool] = None, 50 | output_attentions: Optional[bool] = None, 51 | output_hidden_states: Optional[bool] = None, 52 | images: Optional[torch.FloatTensor] = None, 53 | return_dict: Optional[bool] = None, 54 | ) -> Union[Tuple, CausalLMOutputWithPast]: 55 | if inputs_embeds is None: 56 | ( 57 | input_ids, 58 | position_ids, 59 | attention_mask, 60 | past_key_values, 61 | inputs_embeds, 62 | labels 63 | ) = self.prepare_inputs_labels_for_multimodal( 64 | input_ids, 65 | position_ids, 66 | attention_mask, 67 | past_key_values, 68 | labels, 69 | images 70 | ) 71 | 72 | return super().forward( 73 | input_ids=input_ids, 74 | attention_mask=attention_mask, 75 | position_ids=position_ids, 76 | past_key_values=past_key_values, 77 | inputs_embeds=inputs_embeds, 78 | labels=labels, 79 | use_cache=use_cache, 80 | output_attentions=output_attentions, 81 | output_hidden_states=output_hidden_states, 82 | return_dict=return_dict 83 | ) 84 | 85 | def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, attention_mask=None, 86 | **kwargs): 87 | images = kwargs.pop("images", None) 88 | 89 | _inputs = super().prepare_inputs_for_generation( 90 | input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, attention_mask=attention_mask, 91 | **kwargs 92 | ) 93 | 94 | if images is not None: 95 | _inputs['images'] = images 96 | return _inputs 97 | 98 | 99 | AutoConfig.register("bunny-stablelm", BunnyStableLMConfig) 100 | AutoModelForCausalLM.register(BunnyStableLMConfig, BunnyStableLMForCausalLM) 101 | -------------------------------------------------------------------------------- /bunny/model/language_model/llama/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 EleutherAI and The HuggingFace Inc. team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | from typing import TYPE_CHECKING 15 | 16 | from transformers.utils import ( 17 | OptionalDependencyNotAvailable, 18 | _LazyModule, 19 | is_flax_available, 20 | is_sentencepiece_available, 21 | is_tokenizers_available, 22 | is_torch_available, 23 | ) 24 | 25 | 26 | _import_structure = { 27 | "configuration_llama": ["LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP", "LlamaConfig"], 28 | } 29 | 30 | try: 31 | if not is_sentencepiece_available(): 32 | raise OptionalDependencyNotAvailable() 33 | except OptionalDependencyNotAvailable: 34 | pass 35 | else: 36 | _import_structure["tokenization_llama"] = ["LlamaTokenizer"] 37 | 38 | try: 39 | if not is_tokenizers_available(): 40 | raise OptionalDependencyNotAvailable() 41 | except OptionalDependencyNotAvailable: 42 | pass 43 | else: 44 | _import_structure["tokenization_llama_fast"] = ["LlamaTokenizerFast"] 45 | 46 | try: 47 | if not is_torch_available(): 48 | raise OptionalDependencyNotAvailable() 49 | except OptionalDependencyNotAvailable: 50 | pass 51 | else: 52 | _import_structure["modeling_llama"] = [ 53 | "LlamaForCausalLM", 54 | "LlamaModel", 55 | "LlamaPreTrainedModel", 56 | "LlamaForSequenceClassification", 57 | "LlamaForQuestionAnswering", 58 | ] 59 | 60 | try: 61 | if not is_flax_available(): 62 | raise OptionalDependencyNotAvailable() 63 | except OptionalDependencyNotAvailable: 64 | pass 65 | else: 66 | _import_structure["modeling_flax_llama"] = ["FlaxLlamaForCausalLM", "FlaxLlamaModel", "FlaxLlamaPreTrainedModel"] 67 | 68 | 69 | if TYPE_CHECKING: 70 | from .configuration_llama import LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP, LlamaConfig 71 | 72 | try: 73 | if not is_sentencepiece_available(): 74 | raise OptionalDependencyNotAvailable() 75 | except OptionalDependencyNotAvailable: 76 | pass 77 | else: 78 | from .tokenization_llama import LlamaTokenizer 79 | 80 | try: 81 | if not is_tokenizers_available(): 82 | raise OptionalDependencyNotAvailable() 83 | except OptionalDependencyNotAvailable: 84 | pass 85 | else: 86 | from .tokenization_llama_fast import LlamaTokenizerFast 87 | 88 | try: 89 | if not is_torch_available(): 90 | raise OptionalDependencyNotAvailable() 91 | except OptionalDependencyNotAvailable: 92 | pass 93 | else: 94 | from .modeling_llama import ( 95 | LlamaForCausalLM, 96 | LlamaForQuestionAnswering, 97 | LlamaForSequenceClassification, 98 | LlamaModel, 99 | LlamaPreTrainedModel, 100 | ) 101 | 102 | try: 103 | if not is_flax_available(): 104 | raise OptionalDependencyNotAvailable() 105 | except OptionalDependencyNotAvailable: 106 | pass 107 | else: 108 | from .modeling_flax_llama import FlaxLlamaForCausalLM, FlaxLlamaModel, FlaxLlamaPreTrainedModel 109 | 110 | 111 | else: 112 | import sys 113 | 114 | sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) 115 | -------------------------------------------------------------------------------- /bunny/model/language_model/phi/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Microsoft and The HuggingFace Inc. team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | from typing import TYPE_CHECKING 17 | 18 | from transformers.utils import ( 19 | OptionalDependencyNotAvailable, 20 | _LazyModule, 21 | is_sentencepiece_available, 22 | is_tokenizers_available, 23 | is_torch_available, 24 | ) 25 | 26 | 27 | _import_structure = { 28 | "configuration_phi": ["PHI_PRETRAINED_CONFIG_ARCHIVE_MAP", "PhiConfig"], 29 | } 30 | 31 | try: 32 | if not is_torch_available(): 33 | raise OptionalDependencyNotAvailable() 34 | except OptionalDependencyNotAvailable: 35 | pass 36 | else: 37 | _import_structure["modeling_phi"] = [ 38 | "PHI_PRETRAINED_MODEL_ARCHIVE_LIST", 39 | "PhiPreTrainedModel", 40 | "PhiModel", 41 | "PhiForCausalLM", 42 | "PhiForSequenceClassification", 43 | "PhiForTokenClassification", 44 | ] 45 | 46 | 47 | if TYPE_CHECKING: 48 | from .configuration_phi import PHI_PRETRAINED_CONFIG_ARCHIVE_MAP, PhiConfig 49 | 50 | try: 51 | if not is_torch_available(): 52 | raise OptionalDependencyNotAvailable() 53 | except OptionalDependencyNotAvailable: 54 | pass 55 | else: 56 | from .modeling_phi import ( 57 | PHI_PRETRAINED_MODEL_ARCHIVE_LIST, 58 | PhiForCausalLM, 59 | PhiForSequenceClassification, 60 | PhiForTokenClassification, 61 | PhiModel, 62 | PhiPreTrainedModel, 63 | ) 64 | 65 | 66 | else: 67 | import sys 68 | 69 | sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) 70 | -------------------------------------------------------------------------------- /bunny/model/language_model/phi3/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Microsoft and The HuggingFace Inc. team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | 16 | from typing import TYPE_CHECKING 17 | 18 | from transformers.utils import ( 19 | OptionalDependencyNotAvailable, 20 | _LazyModule, 21 | is_sentencepiece_available, 22 | is_tokenizers_available, 23 | is_torch_available, 24 | ) 25 | 26 | 27 | _import_structure = { 28 | "configuration_phi3": ["PHI3_PRETRAINED_CONFIG_ARCHIVE_MAP", "Phi3Config"], 29 | } 30 | 31 | try: 32 | if not is_torch_available(): 33 | raise OptionalDependencyNotAvailable() 34 | except OptionalDependencyNotAvailable: 35 | pass 36 | else: 37 | _import_structure["modeling_phi3"] = [ 38 | "PHI3_PRETRAINED_MODEL_ARCHIVE_LIST", 39 | "Phi3PreTrainedModel", 40 | "Phi3Model", 41 | "Phi3ForCausalLM", 42 | "Phi3ForSequenceClassification", 43 | "Phi3ForTokenClassification", 44 | ] 45 | 46 | 47 | if TYPE_CHECKING: 48 | from .configuration_phi3 import PHI3_PRETRAINED_CONFIG_ARCHIVE_MAP, Phi3Config 49 | 50 | try: 51 | if not is_torch_available(): 52 | raise OptionalDependencyNotAvailable() 53 | except OptionalDependencyNotAvailable: 54 | pass 55 | else: 56 | from .modeling_phi3 import ( 57 | PHI3_PRETRAINED_MODEL_ARCHIVE_LIST, 58 | Phi3ForCausalLM, 59 | Phi3ForSequenceClassification, 60 | Phi3ForTokenClassification, 61 | Phi3Model, 62 | Phi3PreTrainedModel, 63 | ) 64 | 65 | 66 | else: 67 | import sys 68 | 69 | sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) 70 | -------------------------------------------------------------------------------- /bunny/model/language_model/qwen2/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The Qwen Team and The HuggingFace Inc. team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | from typing import TYPE_CHECKING 15 | 16 | from transformers.utils import ( 17 | OptionalDependencyNotAvailable, 18 | _LazyModule, 19 | is_tokenizers_available, 20 | is_torch_available, 21 | ) 22 | 23 | 24 | _import_structure = { 25 | "configuration_qwen2": ["QWEN2_PRETRAINED_CONFIG_ARCHIVE_MAP", "Qwen2Config"], 26 | "tokenization_qwen2": ["Qwen2Tokenizer"], 27 | } 28 | 29 | try: 30 | if not is_tokenizers_available(): 31 | raise OptionalDependencyNotAvailable() 32 | except OptionalDependencyNotAvailable: 33 | pass 34 | else: 35 | _import_structure["tokenization_qwen2_fast"] = ["Qwen2TokenizerFast"] 36 | 37 | try: 38 | if not is_torch_available(): 39 | raise OptionalDependencyNotAvailable() 40 | except OptionalDependencyNotAvailable: 41 | pass 42 | else: 43 | _import_structure["modeling_qwen2"] = [ 44 | "Qwen2ForCausalLM", 45 | "Qwen2Model", 46 | "Qwen2PreTrainedModel", 47 | "Qwen2ForSequenceClassification", 48 | ] 49 | 50 | 51 | if TYPE_CHECKING: 52 | from .configuration_qwen2 import QWEN2_PRETRAINED_CONFIG_ARCHIVE_MAP, Qwen2Config 53 | from .tokenization_qwen2 import Qwen2Tokenizer 54 | 55 | try: 56 | if not is_tokenizers_available(): 57 | raise OptionalDependencyNotAvailable() 58 | except OptionalDependencyNotAvailable: 59 | pass 60 | else: 61 | from .tokenization_qwen2_fast import Qwen2TokenizerFast 62 | 63 | try: 64 | if not is_torch_available(): 65 | raise OptionalDependencyNotAvailable() 66 | except OptionalDependencyNotAvailable: 67 | pass 68 | else: 69 | from .modeling_qwen2 import ( 70 | Qwen2ForCausalLM, 71 | Qwen2ForSequenceClassification, 72 | Qwen2Model, 73 | Qwen2PreTrainedModel, 74 | ) 75 | 76 | 77 | else: 78 | import sys 79 | 80 | sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) -------------------------------------------------------------------------------- /bunny/model/language_model/qwen2/tokenization_qwen2_fast.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2024 The Qwen team, Alibaba Group and The HuggingFace Inc. team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Tokenization classes for Qwen2.""" 16 | 17 | from typing import Optional, Tuple 18 | 19 | from transformers.tokenization_utils import AddedToken 20 | from transformers.tokenization_utils_fast import PreTrainedTokenizerFast 21 | from transformers.utils import logging 22 | from .tokenization_qwen2 import Qwen2Tokenizer 23 | 24 | 25 | logger = logging.get_logger(__name__) 26 | 27 | VOCAB_FILES_NAMES = { 28 | "vocab_file": "vocab.json", 29 | "merges_file": "merges.txt", 30 | "tokenizer_file": "tokenizer.json", 31 | } 32 | 33 | PRETRAINED_VOCAB_FILES_MAP = { 34 | "vocab_file": {"qwen/qwen-tokenizer": "https://huggingface.co/qwen/qwen-tokenizer/resolve/main/vocab.json"}, 35 | "merges_file": {"qwen/qwen-tokenizer": "https://huggingface.co/qwen/qwen-tokenizer/resolve/main/merges.txt"}, 36 | "tokenizer_file": { 37 | "qwen/qwen-tokenizer": "https://huggingface.co/qwen/qwen-tokenizer/resolve/main/tokenizer.json" 38 | }, 39 | } 40 | 41 | MAX_MODEL_INPUT_SIZES = {"qwen/qwen-tokenizer": 32768} 42 | 43 | 44 | class Qwen2TokenizerFast(PreTrainedTokenizerFast): 45 | """ 46 | Construct a "fast" Qwen2 tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level 47 | Byte-Pair-Encoding. 48 | 49 | Same with GPT2Tokenizer, this tokenizer has been trained to treat spaces like parts of the tokens so a word will 50 | be encoded differently whether it is at the beginning of the sentence (without space) or not: 51 | 52 | ```python 53 | >>> from transformers import Qwen2TokenizerFast 54 | 55 | >>> tokenizer = Qwen2TokenizerFast.from_pretrained("Qwen/Qwen-tokenizer") 56 | >>> tokenizer("Hello world")["input_ids"] 57 | [9707, 1879] 58 | 59 | >>> tokenizer(" Hello world")["input_ids"] 60 | [21927, 1879] 61 | ``` 62 | This is expected. 63 | 64 | This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should 65 | refer to this superclass for more information regarding those methods. 66 | 67 | Args: 68 | vocab_file (`str`, *optional*): 69 | Path to the vocabulary file. 70 | merges_file (`str`, *optional*): 71 | Path to the merges file. 72 | tokenizer_file (`str`, *optional*): 73 | Path to [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that 74 | contains everything needed to load the tokenizer. 75 | unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`): 76 | The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this 77 | token instead. Not applicable to this tokenizer. 78 | bos_token (`str`, *optional*): 79 | The beginning of sequence token. Not applicable for this tokenizer. 80 | eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`): 81 | The end of sequence token. 82 | pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`): 83 | The token used for padding, for example when batching sequences of different lengths. 
84 | """ 85 | 86 | vocab_files_names = VOCAB_FILES_NAMES 87 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 88 | max_model_input_sizes = MAX_MODEL_INPUT_SIZES 89 | model_input_names = ["input_ids", "attention_mask"] 90 | slow_tokenizer_class = Qwen2Tokenizer 91 | 92 | def __init__( 93 | self, 94 | vocab_file=None, 95 | merges_file=None, 96 | tokenizer_file=None, 97 | unk_token="<|endoftext|>", 98 | bos_token=None, 99 | eos_token="<|endoftext|>", 100 | pad_token="<|endoftext|>", 101 | **kwargs, 102 | ): 103 | # We need to at least pass vocab_file and merges_file to base class 104 | # in case a slow tokenizer needs to be initialized; other can be 105 | # configured through files. 106 | # following GPT2TokenizerFast, also adding unk_token, bos_token, and eos_token 107 | 108 | bos_token = ( 109 | AddedToken(bos_token, lstrip=False, rstrip=False, special=True, normalized=False) 110 | if isinstance(bos_token, str) 111 | else bos_token 112 | ) 113 | eos_token = ( 114 | AddedToken(eos_token, lstrip=False, rstrip=False, special=True, normalized=False) 115 | if isinstance(eos_token, str) 116 | else eos_token 117 | ) 118 | unk_token = ( 119 | AddedToken(unk_token, lstrip=False, rstrip=False, special=True, normalized=False) 120 | if isinstance(unk_token, str) 121 | else unk_token 122 | ) 123 | pad_token = ( 124 | AddedToken(pad_token, lstrip=False, rstrip=False, special=True, normalized=False) 125 | if isinstance(pad_token, str) 126 | else pad_token 127 | ) 128 | 129 | super().__init__( 130 | vocab_file, 131 | merges_file, 132 | tokenizer_file=tokenizer_file, 133 | unk_token=unk_token, 134 | bos_token=bos_token, 135 | eos_token=eos_token, 136 | pad_token=pad_token, 137 | **kwargs, 138 | ) 139 | 140 | # Copied from transformers.models.gpt2.tokenization_gpt2_fast.GPT2TokenizerFast.save_vocabulary 141 | def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: 142 | files = self._tokenizer.model.save(save_directory, name=filename_prefix) 143 | return tuple(files) -------------------------------------------------------------------------------- /bunny/model/language_model/stable_lm/configuration_stablelm_epoch.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Stability and The HuggingFace Inc. team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """ StableLM Epoch model configuration""" 15 | from transformers import PretrainedConfig 16 | from transformers.utils import logging 17 | 18 | 19 | logger = logging.get_logger(__name__) 20 | 21 | 22 | class StableLMEpochConfig(PretrainedConfig): 23 | r""" 24 | Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the 25 | documentation from [`PretrainedConfig`] for more information. 26 | 27 | Args: 28 | vocab_size (`int`, *optional*, defaults to 50_304): 29 | Vocabulary size of the StableLM model. 
Defines the number of different tokens that 30 | can be represented by the `inputs_ids` passed when calling [`StableLMEpochModel`]. 31 | intermediate_size (`int`, *optional*, defaults to 6912): 32 | Dimension of the MLP representations. 33 | hidden_size (`int`, *optional*, defaults to 2560): 34 | Dimension of the decoder layers and the pooler layer. 35 | num_hidden_layers (`int`, *optional*, defaults to 32): 36 | Number of hidden layers in the Transformer decoder. 37 | num_attention_heads (`int`, *optional*, defaults to 32): 38 | Number of attention heads for each attention layer in the Transformer decoder. 39 | num_key_value_heads (`int`, *optional*): 40 | This is the number of key_value heads that should be used to implement Grouped Query Attention. If 41 | `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA); if 42 | `num_key_value_heads=1`, the model will use Multi Query Attention (MQA); otherwise GQA is used. When 43 | converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed 44 | by mean-pooling all the original heads within that group. For more details check out [this 45 | paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, it will default to 46 | `num_attention_heads`. 47 | hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): 48 | The non-linear activation function (function or string). 49 | rope_pct (`float`, *optional*, defaults to 0.25): 50 | Percentage of hidden dimensions to allocate to rotary embeddings. 51 | rope_theta (`float`, *optional*, defaults to 10000.0): 52 | The base period of the RoPE embeddings. 53 | max_position_embeddings (`int`, *optional*, defaults to 4096): 54 | The maximum sequence length that this model might ever be used with. 55 | Typically set this to something large just in case (e.g., 512 or 1024 or 2048). 56 | initializer_range (`float`, *optional*, defaults to 0.02): 57 | The standard deviation of the truncated_normal_initializer for initializing 58 | all weight matrices. 59 | norm_eps (`float`, *optional*, defaults to 1e-5): 60 | The epsilon used by the normalization layers. 61 | use_cache (`bool`, *optional*, defaults to `True`): 62 | Whether or not the model should return the last key/values attentions 63 | (not used by all models). Only relevant if `config.is_decoder=True`. 64 | use_qkv_bias (`bool`, *optional*, defaults to `True`): 65 | Whether or not the model should use bias for qkv layers.
66 | tie_word_embeddings(`bool`, *optional*, defaults to `False`): 67 | Whether to tie weight embeddings 68 | """ 69 | model_type = "stablelm_epoch" 70 | keys_to_ignore_at_inference = ["past_key_values"] 71 | 72 | def __init__( 73 | self, 74 | vocab_size=50_304, 75 | intermediate_size=6912, 76 | hidden_size=2560, 77 | num_hidden_layers=32, 78 | num_attention_heads=32, 79 | num_key_value_heads=32, 80 | hidden_act="silu", 81 | rope_pct=0.25, 82 | rope_theta=10_000, 83 | max_position_embeddings=4096, 84 | initializer_range=0.02, 85 | norm_eps=1.0e-5, 86 | use_cache=True, 87 | use_qkv_bias=True, 88 | bos_token_id=0, 89 | eos_token_id=2, 90 | tie_word_embeddings=False, 91 | **kwargs, 92 | ): 93 | self.vocab_size = vocab_size 94 | self.max_position_embeddings = max_position_embeddings 95 | self.intermediate_size = intermediate_size 96 | self.hidden_size = hidden_size 97 | self.num_hidden_layers = num_hidden_layers 98 | self.num_attention_heads = num_attention_heads 99 | self.num_key_value_heads = num_key_value_heads 100 | self.hidden_act = hidden_act 101 | self.rope_pct = rope_pct 102 | self.rope_theta = rope_theta 103 | self.initializer_range = initializer_range 104 | self.norm_eps = norm_eps 105 | self.use_cache = use_cache 106 | self.use_qkv_bias = use_qkv_bias 107 | self.tie_word_embeddings = tie_word_embeddings 108 | super().__init__( 109 | bos_token_id=bos_token_id, 110 | eos_token_id=eos_token_id, 111 | tie_word_embeddings=tie_word_embeddings, 112 | **kwargs, 113 | ) 114 | -------------------------------------------------------------------------------- /bunny/model/multimodal_encoder/builder.py: -------------------------------------------------------------------------------- 1 | import os 2 | from .eva_clip.eva_clip_encoder import EvaClipVisionTower 3 | from .siglip.siglip_encoder import SiglipVisionTower, SiglipVisionTowerS2 4 | from .clip.clip_encoder import CLIPVisionTower 5 | 6 | 7 | def build_vision_tower(vision_tower_cfg, **kwargs): 8 | vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None)) 9 | use_s2 = getattr(vision_tower_cfg, 'use_s2', False) 10 | 11 | if 'sig' in vision_tower.lower(): 12 | if use_s2: 13 | return SiglipVisionTowerS2(vision_tower, args=vision_tower_cfg, **kwargs) 14 | else: 15 | return SiglipVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 16 | elif 'eva' in vision_tower.lower(): 17 | if use_s2: 18 | raise ValueError(f'Currently not supporting S2 for EVA-CLIP') 19 | else: 20 | return EvaClipVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 21 | 22 | elif 'clip' in vision_tower.lower(): 23 | if use_s2: 24 | raise ValueError(f'Currently not supporting S2 for CLIP') 25 | else: 26 | return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 27 | 28 | else: 29 | raise ValueError(f'Unknown vision tower: {vision_tower}') 30 | -------------------------------------------------------------------------------- /bunny/model/multimodal_encoder/clip/clip_encoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from transformers import CLIPVisionModel, CLIPImageProcessor, CLIPVisionConfig 5 | 6 | 7 | class CLIPVisionTower(nn.Module): 8 | def __init__(self, vision_tower, args, delay_load=False): 9 | super().__init__() 10 | 11 | self.is_loaded = False 12 | 13 | self.vision_tower_name = vision_tower 14 | self.select_layer = -2 15 | 16 | if not delay_load: 17 | self.load_model() 18 | else: 19 | self.cfg_only = 
CLIPVisionConfig.from_pretrained(self.vision_tower_name) 20 | 21 | def load_model(self): 22 | if self.is_loaded: 23 | return 24 | self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name) 25 | self.vision_tower = CLIPVisionModel.from_pretrained(self.vision_tower_name) 26 | self.vision_tower.requires_grad_(False) 27 | 28 | self.is_loaded = True 29 | 30 | def feature_select(self, image_forward_outs): 31 | image_features = image_forward_outs.hidden_states[self.select_layer] 32 | 33 | image_features = image_features[:, 1:] 34 | 35 | return image_features 36 | 37 | def forward(self, images): 38 | if type(images) is list: 39 | image_features = [] 40 | for image in images: 41 | image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), 42 | output_hidden_states=True) 43 | image_feature = self.feature_select(image_forward_out).to(image.dtype) 44 | image_features.append(image_feature) 45 | else: 46 | image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), 47 | output_hidden_states=True) 48 | image_features = self.feature_select(image_forward_outs).to(images.dtype) 49 | 50 | return image_features 51 | 52 | @property 53 | def dummy_feature(self): 54 | return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype) 55 | 56 | @property 57 | def dtype(self): 58 | return self.vision_tower.dtype 59 | 60 | @property 61 | def device(self): 62 | return self.vision_tower.device 63 | 64 | @property 65 | def config(self): 66 | if self.is_loaded: 67 | return self.vision_tower.config 68 | else: 69 | return self.cfg_only 70 | 71 | @property 72 | def hidden_size(self): 73 | return self.config.hidden_size 74 | 75 | @property 76 | def num_patches(self): 77 | return (self.config.image_size // self.config.patch_size) ** 2 78 | -------------------------------------------------------------------------------- /bunny/model/multimodal_encoder/eva_clip/eva_clip_encoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from .eva_clip_processors import EvaClipImageTrainProcessor 5 | from .eva_vit import Eva2LargePlusEncoder 6 | 7 | 8 | class EvaClipVisionTower(nn.Module): 9 | def __init__(self, vision_tower, args, delay_load=False): 10 | super().__init__() 11 | 12 | self.is_loaded = False 13 | 14 | self.vision_tower_path = vision_tower 15 | self.config = VisionTowerConfig() 16 | 17 | if not delay_load: 18 | self.load_model() 19 | else: 20 | self.cfg_only = self.config 21 | 22 | def load_model(self): 23 | if self.is_loaded: 24 | return 25 | self.image_processor = EvaClipImageTrainProcessor(self.config.image_size) 26 | self.vision_tower = Eva2LargePlusEncoder(self.vision_tower_path) 27 | self.vision_tower.requires_grad_(False) 28 | 29 | self.is_loaded = True 30 | 31 | def forward(self, images): 32 | if type(images) is list: 33 | image_features = [] 34 | for image in images: 35 | image_feature = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0)).to( 36 | image.dtype) 37 | image_features.append(image_feature) 38 | else: 39 | image_features = self.vision_tower(images.to(device=self.device, dtype=self.dtype)).to(images.dtype) 40 | 41 | return image_features 42 | 43 | @property 44 | def dtype(self): 45 | return self.vision_tower.dtype 46 | 47 | @property 48 | def device(self): 49 | return self.vision_tower.device 50 | 51 | @property 52 | def hidden_size(self): 53 | return self.config.hidden_size 54 | 55 | 
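# With the VisionTowerConfig defined below (image_size=336, patch_size=14), this property evaluates to (336 // 14) ** 2 = 576 patch tokens per image.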
@property 56 | def num_patches(self): 57 | return (self.config.image_size // self.config.patch_size) ** 2 58 | 59 | 60 | class VisionTowerConfig(): 61 | def __init__(self): 62 | self.image_size = 336 63 | self.patch_size = 14 64 | self.hidden_size = 1024 65 | -------------------------------------------------------------------------------- /bunny/model/multimodal_encoder/eva_clip/eva_clip_processors.py: -------------------------------------------------------------------------------- 1 | ''' 2 | # Adapted from https://github.com/baaivision/EVA/tree/master/EVA-CLIP 3 | ''' 4 | 5 | from torchvision import transforms 6 | from torchvision.transforms.functional import InterpolationMode 7 | from transformers.image_processing_utils import BatchFeature 8 | from PIL import Image 9 | from transformers.image_transforms import convert_to_rgb 10 | 11 | 12 | class BaseProcessor: 13 | def __init__(self): 14 | self.transform = lambda x: x 15 | return 16 | 17 | def __call__(self, item): 18 | return self.transform(item) 19 | 20 | 21 | class EvaClipImageBaseProcessor(BaseProcessor): 22 | def __init__(self, mean=None, std=None): 23 | self.mean = (0.48145466, 0.4578275, 0.40821073) if mean is None else mean 24 | self.std = (0.26862954, 0.26130258, 0.27577711) if std is None else std 25 | 26 | self.normalize = transforms.Normalize(self.mean, self.std) 27 | 28 | @property 29 | def image_mean(self): 30 | return self.mean 31 | 32 | 33 | class EvaClipImageTrainProcessor(EvaClipImageBaseProcessor): 34 | def __init__(self, image_size=224, mean=None, std=None, min_scale=0.5, max_scale=1.0): 35 | super().__init__(mean=mean, std=std) 36 | 37 | self.transform = transforms.Compose( 38 | [ 39 | convert_to_rgb, 40 | transforms.Resize( 41 | image_size, 42 | interpolation=InterpolationMode.BICUBIC, 43 | ), 44 | transforms.CenterCrop(image_size), 45 | transforms.ToTensor(), 46 | self.normalize, 47 | ] 48 | ) 49 | 50 | self.image_size = image_size 51 | 52 | def preprocess(self, images, return_tensors): 53 | if isinstance(images, Image.Image): 54 | images = [images] 55 | else: 56 | assert isinstance(images, list) 57 | 58 | transformed_images = [self.transform(image).numpy() for image in images] 59 | data = {"pixel_values": transformed_images} 60 | 61 | return BatchFeature(data=data, tensor_type=return_tensors) 62 | 63 | def __call__(self, item): 64 | return self.transform(item) 65 | 66 | @property 67 | def crop_size(self): 68 | return {'height': self.image_size, 'width': self.image_size} 69 | -------------------------------------------------------------------------------- /bunny/model/multimodal_encoder/siglip/siglip_encoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from transformers import SiglipVisionModel, SiglipImageProcessor, SiglipVisionConfig 5 | from bunny.util.s2wrapper import forward as multiscale_forward 6 | 7 | 8 | class SiglipVisionTower(nn.Module): 9 | def __init__(self, vision_tower, args, delay_load=False): 10 | super().__init__() 11 | 12 | self.is_loaded = False 13 | 14 | self.vision_tower_name = vision_tower 15 | self.select_layer = -2 16 | 17 | if not delay_load: 18 | self.load_model() 19 | else: 20 | self.cfg_only = SiglipVisionConfig.from_pretrained(self.vision_tower_name) 21 | 22 | def load_model(self): 23 | if self.is_loaded: 24 | return 25 | self.image_processor = SiglipImageProcessor.from_pretrained(self.vision_tower_name) 26 | self.image_processor.crop_size = self.image_processor.size 27 | self.vision_tower = 
SiglipVisionModel.from_pretrained(self.vision_tower_name) 28 | self.vision_tower.requires_grad_(False) 29 | 30 | self.is_loaded = True 31 | 32 | def feature_select(self, image_forward_outs): 33 | image_features = image_forward_outs.hidden_states[self.select_layer] 34 | 35 | return image_features 36 | 37 | def forward(self, images): 38 | if type(images) is list: 39 | image_features = [] 40 | for image in images: 41 | image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), 42 | output_hidden_states=True) 43 | image_feature = self.feature_select(image_forward_out).to(image.dtype) 44 | image_features.append(image_feature) 45 | else: 46 | image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), 47 | output_hidden_states=True) 48 | image_features = self.feature_select(image_forward_outs).to(images.dtype) 49 | 50 | return image_features 51 | 52 | @property 53 | def dummy_feature(self): 54 | return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype) 55 | 56 | @property 57 | def dtype(self): 58 | return self.vision_tower.dtype 59 | 60 | @property 61 | def device(self): 62 | return self.vision_tower.device 63 | 64 | @property 65 | def config(self): 66 | if self.is_loaded: 67 | return self.vision_tower.config 68 | else: 69 | return self.cfg_only 70 | 71 | @property 72 | def hidden_size(self): 73 | return self.config.hidden_size 74 | 75 | @property 76 | def num_patches(self): 77 | return (self.config.image_size // self.config.patch_size) ** 2 78 | 79 | 80 | class SiglipVisionTowerS2(SiglipVisionTower): 81 | def __init__(self, vision_tower, args, delay_load=False): 82 | self.s2_scales = getattr(args, 's2_scales', '384,768,1152') 83 | self.s2_scales = list(map(int, self.s2_scales.split(','))) 84 | self.s2_scales.sort() 85 | self.s2_split_size = self.s2_scales[0] 86 | self.s2_image_size = self.s2_scales[-1] 87 | 88 | super().__init__(vision_tower, args, delay_load) 89 | 90 | self.multiscale_forward = multiscale_forward 91 | 92 | if not delay_load: 93 | self.image_processor.size['height'] = self.image_processor.size['width'] = self.s2_image_size 94 | self.image_processor.crop_size['height'] = self.image_processor.crop_size['width'] = self.s2_image_size 95 | 96 | def load_model(self): 97 | if self.is_loaded: 98 | return 99 | self.image_processor = SiglipImageProcessor.from_pretrained(self.vision_tower_name) 100 | self.image_processor.crop_size = self.image_processor.size 101 | self.vision_tower = SiglipVisionModel.from_pretrained(self.vision_tower_name) 102 | self.vision_tower.requires_grad_(False) 103 | 104 | self.image_processor.size['height'] = self.image_processor.size['width'] = self.s2_image_size 105 | self.image_processor.crop_size['height'] = self.image_processor.crop_size['width'] = self.s2_image_size 106 | 107 | self.is_loaded = True 108 | 109 | def forward_feature(self, images): 110 | image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), 111 | output_hidden_states=True) 112 | image_features = self.feature_select(image_forward_outs).to(images.dtype) 113 | return image_features 114 | 115 | def forward(self, images): 116 | if type(images) is list: 117 | image_features = [] 118 | for image in images: 119 | image_feature = self.multiscale_forward(self.forward_feature, image.unsqueeze(0), 120 | img_sizes=self.s2_scales, max_split_size=self.s2_split_size) 121 | image_features.append(image_feature) 122 | else: 123 | image_features = self.multiscale_forward(self.forward_feature, 
images, img_sizes=self.s2_scales, 124 | max_split_size=self.s2_split_size) 125 | 126 | return image_features 127 | 128 | @property 129 | def hidden_size(self): 130 | return self.config.hidden_size * len(self.s2_scales) 131 | -------------------------------------------------------------------------------- /bunny/model/multimodal_projector/builder.py: -------------------------------------------------------------------------------- 1 | import re 2 | import math 3 | from torch import nn 4 | from functools import partial 5 | from timm.layers.norm_act import LayerNormAct2d 6 | from torchvision.ops.misc import SqueezeExcitation as SELayer 7 | from torchvision.models.mobilenetv3 import InvertedResidual, InvertedResidualConfig 8 | 9 | 10 | class IdentityMap(nn.Module): 11 | def __init__(self): 12 | super().__init__() 13 | 14 | def forward(self, x, *args, **kwargs): 15 | return x 16 | 17 | @property 18 | def config(self): 19 | return {"mm_projector_type": 'identity'} 20 | 21 | 22 | class Minigpt(nn.Module): 23 | def __init__(self, config=None): 24 | super(Minigpt, self).__init__() 25 | # c*4 is the input size, and c is the output size for the linear layer 26 | inc, ouc = config.mm_hidden_size, config.hidden_size 27 | self.linear = nn.Linear(inc * 4, ouc) 28 | 29 | def forward(self, x): 30 | # x is the input tensor with shape [b, num_tokens, c] 31 | b, num_tokens, c = x.shape 32 | 33 | # Check if num_tokens is divisible by 4 34 | if num_tokens % 4 != 0: 35 | raise ValueError("num_tokens must be divisible by 4") 36 | 37 | # Reshape x to [b, num_tokens/4, c*4] 38 | x = x.view(b, num_tokens // 4, c * 4) 39 | 40 | # Apply the linear transformation 41 | x = self.linear(x) 42 | return x 43 | 44 | 45 | class Vanilla(nn.Module): 46 | def __init__(self, config=None): 47 | super(Vanilla, self).__init__() 48 | # c*4 is the input size, and c is the output size for the linear layer 49 | inc, ouc = config.mm_hidden_size, config.hidden_size 50 | self.linear = nn.Linear(inc * 4, ouc) 51 | 52 | def forward(self, x): 53 | b, num_tokens, c = x.shape 54 | 55 | # Check if num_tokens is divisible by 4 56 | if num_tokens % 4 != 0: 57 | raise ValueError("num_tokens must be divisible by 4") 58 | 59 | # First, reshape to [b, num_tokens//4, 4, c] 60 | x = x.view(b, num_tokens // 4, 4, c) 61 | 62 | # Then, permute to interleave the tokens 63 | x = x.permute(0, 1, 3, 2).contiguous() 64 | 65 | # Finally, reshape to [b, num_tokens//4, c*4] to interleave features of 4 tokens 66 | x = x.view(b, num_tokens // 4, c * 4) 67 | 68 | # Apply the linear transformation 69 | x = self.linear(x) 70 | return x 71 | 72 | 73 | class LDPBlock(nn.Module): 74 | # Lightweight Downsample Projector Block 75 | 76 | def __init__(self, config=None): 77 | super().__init__() 78 | 79 | inc, ouc = config.mm_hidden_size, config.hidden_size 80 | layer_norm = partial(LayerNormAct2d, act_layer=None) 81 | se_layer = partial(SELayer, scale_activation=nn.Hardsigmoid) 82 | self.mlp = nn.Sequential( 83 | nn.Identity(), nn.Linear(inc, ouc), nn.GELU(), nn.Linear(ouc, ouc) 84 | ) 85 | self.mb_block = nn.Sequential( 86 | nn.Identity(), 87 | InvertedResidual(InvertedResidualConfig(ouc, 3, ouc, ouc, True, "HS", 1, 1, 1), layer_norm, se_layer), 88 | InvertedResidual(InvertedResidualConfig(ouc, 3, ouc, ouc, True, "HS", 2, 1, 1), layer_norm, se_layer) 89 | ) 90 | 91 | def forward(self, x): 92 | b, num_tokens, c = x.shape 93 | h = int(math.sqrt(num_tokens)) 94 | x = self.mlp(x) 95 | x = x.permute(0, 2, 1).reshape(b, -1, h, h) 96 | x = self.mb_block(x) 97 | x = 
x.flatten(2).permute(0, 2, 1) 98 | return x 99 | 100 | 101 | class LDPNetProjector(nn.Module): 102 | 103 | def __init__(self, config=None): 104 | super().__init__() 105 | self.model = LDPBlock(config) 106 | 107 | def forward(self, x): 108 | return self.model(x) 109 | 110 | 111 | class SPP(nn.Module): 112 | 113 | def __init__(self, config=None, projector_type='v1'): 114 | super().__init__() 115 | 116 | self.projector_type = projector_type 117 | 118 | inc, ouc = config.mm_hidden_size, config.hidden_size 119 | self.linear_0 = nn.Linear(inc, inc) 120 | 121 | self.linear_1 = nn.Linear(inc, ouc) 122 | 123 | self.pooling = nn.AvgPool2d(kernel_size=2) 124 | 125 | self.linear_2 = nn.Linear(ouc, ouc) 126 | 127 | def forward(self, x): 128 | b, num_tokens, c = x.shape 129 | h = int(math.sqrt(num_tokens)) 130 | if 'v1' in self.projector_type: 131 | x = self.linear_1(x) 132 | x = x.permute(0, 2, 1).reshape(b, -1, h, h) 133 | x = self.pooling(x) 134 | x = x.flatten(2).permute(0, 2, 1) 135 | x = self.linear_2(x) 136 | elif 'v2' in self.projector_type: 137 | x = self.linear_1(x) 138 | x = self.linear_2(x) 139 | x = x.permute(0, 2, 1).reshape(b, -1, h, h) 140 | x = self.pooling(x) 141 | x = x.flatten(2).permute(0, 2, 1) 142 | elif 'v3' in self.projector_type: 143 | x = self.linear_0(x) 144 | x = x.permute(0, 2, 1).reshape(b, -1, h, h) 145 | x = self.pooling(x) 146 | x = x.flatten(2).permute(0, 2, 1) 147 | x = self.linear_1(x) 148 | x = self.linear_2(x) 149 | return x 150 | 151 | 152 | def build_vision_projector(config, delay_load=False, **kwargs): 153 | projector_type = getattr(config, 'mm_projector_type', 'mlp2x_gelu') 154 | 155 | if projector_type == 'linear': 156 | return nn.Linear(config.mm_hidden_size, config.hidden_size) 157 | 158 | elif projector_type.startswith('mlp'): 159 | mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type) 160 | if mlp_gelu_match: 161 | mlp_depth = int(mlp_gelu_match.group(1)) 162 | modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)] 163 | for _ in range(1, mlp_depth): 164 | modules.append(nn.GELU()) 165 | modules.append(nn.Linear(config.hidden_size, config.hidden_size)) 166 | return nn.Sequential(*modules) 167 | 168 | elif projector_type.startswith('spp'): 169 | return SPP(config, projector_type) 170 | 171 | elif projector_type == 'ldp': 172 | return LDPNetProjector(config) 173 | 174 | elif projector_type == 'vanilla': 175 | return Vanilla(config) 176 | 177 | elif projector_type == 'minigpt': 178 | return Minigpt(config) 179 | 180 | elif projector_type == 'identity': 181 | return IdentityMap() 182 | 183 | raise ValueError(f'Unknown projector type: {projector_type}') 184 | -------------------------------------------------------------------------------- /bunny/serve/cli.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import requests 4 | 5 | from PIL import Image 6 | from io import BytesIO 7 | from transformers import TextStreamer 8 | 9 | from bunny.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN 10 | from bunny.conversation import conv_templates, SeparatorStyle 11 | from bunny.model.builder import load_pretrained_model 12 | from bunny.util.utils import disable_torch_init 13 | from bunny.util.mm_utils import process_images, tokenizer_image_token, get_model_name_from_path, \ 14 | KeywordsStoppingCriteria 15 | 16 | 17 | def load_image(image_file): 18 | if image_file.startswith('http://') or image_file.startswith('https://'): 19 | response = requests.get(image_file) 20 | image 
= Image.open(BytesIO(response.content)).convert('RGB') 21 | else: 22 | image = Image.open(image_file).convert('RGB') 23 | return image 24 | 25 | 26 | def main(args): 27 | # Model 28 | disable_torch_init() 29 | 30 | model_name = get_model_name_from_path(args.model_path) 31 | tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_path, args.model_base, model_name, 32 | args.model_type, args.load_8bit, 33 | args.load_4bit, device=args.device) 34 | 35 | conv_mode = "bunny" 36 | 37 | if args.conv_mode is not None and conv_mode != args.conv_mode: 38 | print( 39 | '[WARNING] the auto inferred conversation mode is {}, while `--conv-mode` is {}, using {}'.format(conv_mode, 40 | args.conv_mode, 41 | args.conv_mode)) 42 | else: 43 | args.conv_mode = conv_mode 44 | 45 | conv = conv_templates[args.conv_mode].copy() 46 | roles = conv.roles 47 | 48 | image = load_image(args.image_file) 49 | # Similar operation in model_worker.py 50 | image_tensor = process_images([image], image_processor, model.config) 51 | if type(image_tensor) is list: 52 | image_tensor = [image.to(model.device, dtype=model.dtype) for image in image_tensor] 53 | else: 54 | image_tensor = image_tensor.to(model.device, dtype=model.dtype) 55 | 56 | while True: 57 | try: 58 | inp = input(f"{roles[0]}: ") 59 | except EOFError: 60 | inp = "" 61 | if not inp: 62 | print("exit...") 63 | break 64 | 65 | print(f"{roles[1]}: ", end="") 66 | 67 | if image is not None: 68 | # first message 69 | inp = DEFAULT_IMAGE_TOKEN + '\n' + inp 70 | conv.append_message(conv.roles[0], inp) 71 | image = None 72 | else: 73 | conv.append_message(conv.roles[0], inp) 74 | conv.append_message(conv.roles[1], None) 75 | prompt = conv.get_prompt() 76 | 77 | input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to( 78 | model.device) 79 | stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2 80 | keywords = [stop_str] 81 | stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids) 82 | streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True) 83 | 84 | with torch.inference_mode(): 85 | output_ids = model.generate( 86 | input_ids, 87 | images=image_tensor, 88 | do_sample=True if args.temperature > 0 else False, 89 | temperature=args.temperature, 90 | max_new_tokens=args.max_new_tokens, 91 | streamer=streamer, 92 | use_cache=True, 93 | repetition_penalty=args.repetition_penalty, 94 | stopping_criteria=[stopping_criteria]) 95 | 96 | outputs = tokenizer.decode(output_ids[0, input_ids.shape[1]:]).strip() 97 | conv.messages[-1][-1] = outputs 98 | 99 | if args.debug: 100 | print("\n", {"prompt": prompt, "outputs": outputs}, "\n") 101 | 102 | 103 | if __name__ == "__main__": 104 | parser = argparse.ArgumentParser() 105 | parser.add_argument("--model-path", type=str, default=None) 106 | parser.add_argument("--model-base", type=str, default=None) 107 | parser.add_argument("--model-type", type=str, default=None) 108 | parser.add_argument("--image-file", type=str, required=True) 109 | parser.add_argument("--device", type=str, default="cuda") 110 | parser.add_argument("--conv-mode", type=str, default=None) 111 | parser.add_argument("--temperature", type=float, default=0.2) 112 | parser.add_argument("--repetition-penalty", type=float, default=1.0) 113 | parser.add_argument("--max-new-tokens", type=int, default=512) 114 | parser.add_argument("--load-8bit", action="store_true") 115 | parser.add_argument("--load-4bit", action="store_true") 116 | 
parser.add_argument("--debug", action="store_true") 117 | args = parser.parse_args() 118 | main(args) 119 | -------------------------------------------------------------------------------- /bunny/serve/examples/example_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BAAI-DCAI/Bunny/08273acbd83b6836c03dce25cc134eb6b9cedba5/bunny/serve/examples/example_1.png -------------------------------------------------------------------------------- /bunny/serve/examples/example_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BAAI-DCAI/Bunny/08273acbd83b6836c03dce25cc134eb6b9cedba5/bunny/serve/examples/example_2.png -------------------------------------------------------------------------------- /bunny/serve/examples/icon.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BAAI-DCAI/Bunny/08273acbd83b6836c03dce25cc134eb6b9cedba5/bunny/serve/examples/icon.jpg -------------------------------------------------------------------------------- /bunny/serve/examples/user.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BAAI-DCAI/Bunny/08273acbd83b6836c03dce25cc134eb6b9cedba5/bunny/serve/examples/user.png -------------------------------------------------------------------------------- /bunny/serve/register_worker.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import requests 3 | 4 | if __name__ == "__main__": 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("--controller-address", type=str) 7 | parser.add_argument("--worker-name", type=str) 8 | parser.add_argument("--check-heart-beat", action="store_true") 9 | args = parser.parse_args() 10 | 11 | url = args.controller_address + "/register_worker" 12 | data = { 13 | "worker_name": args.worker_name, 14 | "check_heart_beat": args.check_heart_beat, 15 | "worker_status": None, 16 | } 17 | r = requests.post(url, json=data) 18 | assert r.status_code == 200 19 | -------------------------------------------------------------------------------- /bunny/util/mm_utils.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import torch 3 | 4 | from PIL import Image 5 | from io import BytesIO 6 | from transformers import StoppingCriteria 7 | 8 | from bunny.constants import IMAGE_TOKEN_INDEX 9 | 10 | 11 | def load_image_from_base64(image): 12 | return Image.open(BytesIO(base64.b64decode(image))) 13 | 14 | 15 | def expand2square(pil_img, background_color): 16 | width, height = pil_img.size 17 | if width == height: 18 | return pil_img 19 | elif width > height: 20 | result = Image.new(pil_img.mode, (width, width), background_color) 21 | result.paste(pil_img, (0, (width - height) // 2)) 22 | return result 23 | else: 24 | result = Image.new(pil_img.mode, (height, height), background_color) 25 | result.paste(pil_img, ((height - width) // 2, 0)) 26 | return result 27 | 28 | 29 | def process_images(images, image_processor, model_cfg): 30 | image_aspect_ratio = getattr(model_cfg, "image_aspect_ratio", None) 31 | new_images = [] 32 | if image_aspect_ratio == 'pad': 33 | for image in images: 34 | image = expand2square(image, tuple(int(x * 255) for x in image_processor.image_mean)) 35 | image = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0] 36 | 
new_images.append(image) 37 | else: 38 | return image_processor(images, return_tensors='pt')['pixel_values'] 39 | if all(x.shape == new_images[0].shape for x in new_images): 40 | new_images = torch.stack(new_images, dim=0) 41 | return new_images 42 | 43 | 44 | def tokenizer_image_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None): 45 | prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split('')] 46 | 47 | def insert_separator(X, sep): 48 | return [ele for sublist in zip(X, [sep] * len(X)) for ele in sublist][:-1] 49 | 50 | input_ids = [] 51 | offset = 0 52 | if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id: 53 | offset = 1 54 | input_ids.append(prompt_chunks[0][0]) 55 | 56 | for x in insert_separator(prompt_chunks, [image_token_index] * (offset + 1)): 57 | input_ids.extend(x[offset:]) 58 | 59 | if return_tensors is not None: 60 | if return_tensors == 'pt': 61 | return torch.tensor(input_ids, dtype=torch.long) 62 | raise ValueError(f'Unsupported tensor type: {return_tensors}') 63 | return input_ids 64 | 65 | 66 | def get_model_name_from_path(model_path): 67 | model_path = model_path.strip("/") 68 | model_paths = model_path.split("/") 69 | if model_paths[-1].startswith('checkpoint-'): 70 | return model_paths[-2] + "_" + model_paths[-1] 71 | else: 72 | return model_paths[-1] 73 | 74 | 75 | class KeywordsStoppingCriteria(StoppingCriteria): 76 | def __init__(self, keywords, tokenizer, input_ids): 77 | self.keywords = keywords 78 | self.keyword_ids = [] 79 | self.max_keyword_len = 0 80 | for keyword in keywords: 81 | cur_keyword_ids = tokenizer(keyword).input_ids 82 | if len(cur_keyword_ids) > 1 and cur_keyword_ids[0] == tokenizer.bos_token_id: 83 | cur_keyword_ids = cur_keyword_ids[1:] 84 | if len(cur_keyword_ids) > self.max_keyword_len: 85 | self.max_keyword_len = len(cur_keyword_ids) 86 | self.keyword_ids.append(torch.tensor(cur_keyword_ids)) 87 | self.tokenizer = tokenizer 88 | self.start_len = input_ids.shape[1] 89 | 90 | def call_for_batch(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool: 91 | offset = min(output_ids.shape[1] - self.start_len, self.max_keyword_len) 92 | self.keyword_ids = [keyword_id.to(output_ids.device) for keyword_id in self.keyword_ids] 93 | for keyword_id in self.keyword_ids: 94 | truncated_output_ids = output_ids[0, -keyword_id.shape[0]:] 95 | if torch.equal(truncated_output_ids, keyword_id): 96 | return True 97 | outputs = self.tokenizer.batch_decode(output_ids[:, -offset:], skip_special_tokens=True)[0] 98 | for keyword in self.keywords: 99 | if keyword in outputs: 100 | return True 101 | return False 102 | 103 | def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool: 104 | outputs = [] 105 | for i in range(output_ids.shape[0]): 106 | outputs.append(self.call_for_batch(output_ids[i].unsqueeze(0), scores)) 107 | return all(outputs) 108 | -------------------------------------------------------------------------------- /bunny/util/s2wrapper/__init__.py: -------------------------------------------------------------------------------- 1 | from .core import * 2 | from .utils import * -------------------------------------------------------------------------------- /bunny/util/s2wrapper/core.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------ 2 | # Copyright (c) 2024 Baifeng 
Shi. 3 | # All rights reserved. 4 | # 5 | # Licensed under the MIT License (MIT). See LICENSE in the repo root for license information. 6 | # ------------------------------------------------------------------------------------------ 7 | 8 | import math 9 | import torch 10 | import torch.nn.functional as F 11 | from einops import rearrange 12 | from .utils import split_chessboard, merge_chessboard, batched_forward 13 | 14 | def forward(model, input, scales=None, img_sizes=None, max_split_size=None, resize_output_to_idx=0, num_prefix_token=0, 15 | output_shape='bnc', split_forward=False): 16 | 17 | assert input.dim() == 4, "Input image must be in the shape of BxCxHxW." 18 | assert input.shape[2] == input.shape[3], "Currently only square images are supported." 19 | assert output_shape in ['bnc', 'bchw'], "Output shape should be either BxNxC (e.g., ViT) or BxCxHxW (e.g., ConvNet)." 20 | assert output_shape == 'bnc' or num_prefix_token == 0, "For ConvNet there shouldn't be any prefix token." 21 | 22 | b, c, input_size, _ = input.shape 23 | 24 | # image size for each scale 25 | assert scales is not None or img_sizes is not None, "Please assign either scales or img_sizes." 26 | img_sizes = img_sizes or [int(input_size * scale) for scale in scales] 27 | 28 | # prepare multiscale inputs 29 | max_split_size = max_split_size or input_size # The maximum size of each split of image. Set as the input size by default 30 | num_splits = [math.ceil(size / max_split_size) for size in img_sizes] # number of splits each scale 31 | input_multiscale = [] 32 | for size, num_split in zip(img_sizes, num_splits): 33 | x = F.interpolate(input.to(torch.float32), size=size, mode='bicubic').to(input.dtype) 34 | x = split_chessboard(x, num_split=num_split) 35 | input_multiscale.append(x) 36 | 37 | # run feedforward on each scale 38 | outs_multiscale = [batched_forward(model, x, b) if split_forward else model(x) for x in input_multiscale] 39 | if num_prefix_token > 0: 40 | outs_prefix_multiscale = [out[:, :num_prefix_token] for out in outs_multiscale] 41 | outs_multiscale = [out[:, num_prefix_token:] for out in outs_multiscale] 42 | if output_shape == 'bnc': 43 | outs_multiscale = [rearrange(out, 'b (h w) c -> b c h w', h=int(out.shape[1] ** 0.5), w=int(out.shape[1] ** 0.5)) 44 | for out in outs_multiscale] 45 | 46 | # merge outputs of different splits for each scale separately 47 | outs_multiscale = [merge_chessboard(out, num_split=num_split) for num_split, out in zip(num_splits, outs_multiscale)] 48 | 49 | # interpolate outputs from different scales and concat together 50 | output_size = outs_multiscale[resize_output_to_idx].shape[-2] 51 | out = torch.cat([F.interpolate(outs_multiscale[i].to(torch.float32), size=output_size, 52 | mode='area').to(outs_multiscale[i].dtype) 53 | for i in range(len(outs_multiscale))], dim=1) 54 | if output_shape == 'bnc': 55 | out = rearrange(out, 'b c h w -> b (h w) c') 56 | if num_prefix_token > 0: 57 | # take the mean of prefix tokens from different splits for each scale 58 | outs_prefix_multiscale = [torch.stack(out.split(b, dim=0), dim=0).mean(dim=0) for out in outs_prefix_multiscale] 59 | out_prefix_multiscale = torch.cat(outs_prefix_multiscale, dim=-1) 60 | out = torch.cat([out_prefix_multiscale, out], dim=1) 61 | 62 | return out 63 | -------------------------------------------------------------------------------- /bunny/util/s2wrapper/utils.py: -------------------------------------------------------------------------------- 1 | # 
------------------------------------------------------------------------------------------ 2 | # Copyright (c) 2024 Baifeng Shi. 3 | # All rights reserved. 4 | # 5 | # Licensed under the MIT License (MIT). See LICENSE in the repo root for license information. 6 | # ------------------------------------------------------------------------------------------ 7 | 8 | import torch 9 | 10 | def split_chessboard(x, num_split): 11 | """ 12 | x: b * c * h * w 13 | Divide x into num_split**2 sub-squares and concatenate all the sub-squares along the batch dimension 14 | """ 15 | B, C, H, W = x.shape 16 | assert H % num_split == 0 and W % num_split == 0 17 | h, w = H // num_split, W // num_split 18 | x_split = torch.cat([x[:, :, i*h:(i+1)*h, j*w:(j+1)*w] for i in range(num_split) for j in range(num_split)], dim=0) 19 | return x_split 20 | 21 | def merge_chessboard(x, num_split): 22 | """ 23 | x: b * c * h * w 24 | Assuming x contains num_split**2 sub-squares concatenated along batch dimension, merge the sub-squares back to the original whole square. 25 | (inverse of split_chessboard) 26 | """ 27 | B, C, H, W = x.shape 28 | assert B % (num_split**2) == 0 29 | b = B // (num_split**2) 30 | x_merge = torch.cat([torch.cat([x[(i*num_split + j)*b:(i*num_split + j + 1)*b] for j in range(num_split)], dim=-1) 31 | for i in range(num_split)], dim=-2) 32 | return x_merge 33 | 34 | def batched_forward(model, x, batch_size=-1): 35 | if batch_size == -1: 36 | return model(x) 37 | else: 38 | x_batched = x.split(batch_size) 39 | outs = [model(x) for x in x_batched] 40 | return torch.cat(outs, dim=0) 41 | 42 | -------------------------------------------------------------------------------- /bunny/util/utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import logging.handlers 3 | import os 4 | import sys 5 | import requests  # used by violates_moderation() below 6 | from bunny.constants import LOGDIR 7 | 8 | server_error_msg = "**NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.**" 9 | moderation_msg = "YOUR INPUT VIOLATES OUR CONTENT MODERATION GUIDELINES. PLEASE TRY AGAIN." 10 | 11 | handler = None 12 | 13 | 14 | def disable_torch_init(): 15 | """ 16 | Disable the redundant torch default initialization to accelerate model creation.
17 | """ 18 | import torch 19 | setattr(torch.nn.Linear, "reset_parameters", lambda self: None) 20 | setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None) 21 | 22 | 23 | def build_logger(logger_name, logger_filename): 24 | global handler 25 | 26 | formatter = logging.Formatter( 27 | fmt="%(asctime)s | %(levelname)s | %(name)s | %(message)s", 28 | datefmt="%Y-%m-%d %H:%M:%S", 29 | ) 30 | 31 | # Set the format of root handlers 32 | if not logging.getLogger().handlers: 33 | logging.basicConfig(level=logging.INFO) 34 | logging.getLogger().handlers[0].setFormatter(formatter) 35 | 36 | # Redirect stdout and stderr to loggers 37 | stdout_logger = logging.getLogger("stdout") 38 | stdout_logger.setLevel(logging.INFO) 39 | sl = StreamToLogger(stdout_logger, logging.INFO) 40 | sys.stdout = sl 41 | 42 | stderr_logger = logging.getLogger("stderr") 43 | stderr_logger.setLevel(logging.ERROR) 44 | sl = StreamToLogger(stderr_logger, logging.ERROR) 45 | sys.stderr = sl 46 | 47 | # Get logger 48 | logger = logging.getLogger(logger_name) 49 | logger.setLevel(logging.INFO) 50 | 51 | # Add a file handler for all loggers 52 | if handler is None: 53 | os.makedirs(LOGDIR, exist_ok=True) 54 | filename = os.path.join(LOGDIR, logger_filename) 55 | handler = logging.handlers.TimedRotatingFileHandler( 56 | filename, when='D', utc=True, encoding='UTF-8') 57 | handler.setFormatter(formatter) 58 | 59 | for name, item in logging.root.manager.loggerDict.items(): 60 | if isinstance(item, logging.Logger): 61 | item.addHandler(handler) 62 | 63 | return logger 64 | 65 | 66 | class StreamToLogger(object): 67 | """ 68 | Fake file-like stream object that redirects writes to a logger instance. 69 | """ 70 | 71 | def __init__(self, logger, log_level=logging.INFO): 72 | self.terminal = sys.stdout 73 | self.logger = logger 74 | self.log_level = log_level 75 | self.linebuf = '' 76 | 77 | def __getattr__(self, attr): 78 | return getattr(self.terminal, attr) 79 | 80 | def write(self, buf): 81 | temp_linebuf = self.linebuf + buf 82 | self.linebuf = '' 83 | for line in temp_linebuf.splitlines(True): 84 | # From the io.TextIOWrapper docs: 85 | # On output, if newline is None, any '\n' characters written 86 | # are translated to the system default line separator. 87 | # By default sys.stdout.write() expects '\n' newlines and then 88 | # translates them so this is still cross platform. 89 | if line[-1] == '\n': 90 | self.logger.log(self.log_level, line.rstrip()) 91 | else: 92 | self.linebuf += line 93 | 94 | def flush(self): 95 | if self.linebuf != '': 96 | self.logger.log(self.log_level, self.linebuf.rstrip()) 97 | self.linebuf = '' 98 | 99 | 100 | def violates_moderation(text): 101 | """ 102 | Check whether the text violates OpenAI moderation API. 
103 | """ 104 | url = "https://api.openai.com/v1/moderations" 105 | headers = {"Content-Type": "application/json", 106 | "Authorization": "Bearer " + os.environ["OPENAI_API_KEY"]} 107 | text = text.replace("\n", "") 108 | data = "{" + '"input": ' + f'"{text}"' + "}" 109 | data = data.encode("utf-8") 110 | try: 111 | ret = requests.post(url, headers=headers, data=data, timeout=5) 112 | flagged = ret.json()["results"][0]["flagged"] 113 | except requests.exceptions.RequestException as e: 114 | flagged = False 115 | except KeyError as e: 116 | flagged = False 117 | 118 | return flagged 119 | 120 | 121 | def pretty_print_semaphore(semaphore): 122 | if semaphore is None: 123 | return "None" 124 | return f"Semaphore(value={semaphore._value}, locked={semaphore.locked()})" 125 | -------------------------------------------------------------------------------- /comparison_4B.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BAAI-DCAI/Bunny/08273acbd83b6836c03dce25cc134eb6b9cedba5/comparison_4B.png -------------------------------------------------------------------------------- /comparison_8B.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BAAI-DCAI/Bunny/08273acbd83b6836c03dce25cc134eb6b9cedba5/comparison_8B.png -------------------------------------------------------------------------------- /eval/cmmmu/eval_script.py: -------------------------------------------------------------------------------- 1 | import json 2 | from argparse import ArgumentParser 3 | from tabulate import tabulate 4 | from eval_utils import evaluate_answer, evaluate_response 5 | 6 | 7 | def read_jsonl_to_dict(data_path, output_path, category): 8 | with open(data_path, 'r', encoding='utf-8') as file: 9 | data = {int(parsed_line['id']): parsed_line for line in file if 10 | (parsed_line := json.loads(line)).get('category') == category} 11 | 12 | with open(output_path, 'r', encoding='utf-8') as file: 13 | output = {int(parsed_line['id']): parsed_line for line in file if 14 | int((parsed_line := json.loads(line)).get('id')) in data.keys()} 15 | 16 | return data, output 17 | 18 | 19 | def process_answer_jsonl_file(data_path, output_path, category): 20 | global global_cnt 21 | global global_correct_cnt 22 | 23 | data_dict, output_dict = read_jsonl_to_dict(data_path, output_path, category) 24 | 25 | assert set(data_dict.keys()) == set( 26 | output_dict.keys()), "The ids are not exactly the same and cannot be processed further, please check files" 27 | 28 | for data_key, data_value in data_dict.items(): 29 | output_dict[data_key]['predicted_answer'] = output_dict[data_key].get('answer') 30 | output_dict[data_key]['answer'] = data_value.get('answer') 31 | 32 | results_count = evaluate_answer(output_dict.values()) 33 | 34 | return results_count 35 | 36 | 37 | def process_response_jsonl_file(data_path, output_path, category): 38 | global global_cnt 39 | global global_correct_cnt 40 | 41 | data_dict, output_dict = read_jsonl_to_dict(data_path, output_path, category) 42 | 43 | assert set(data_dict.keys()) == set( 44 | output_dict.keys()), "The ids are not exactly the same and cannot be processed further, please check files" 45 | 46 | for data_key, data_value in data_dict.items(): 47 | if data_value.get('type') == "选择": 48 | index2ans = { 49 | 'A': data_value.get('option1', ''), 50 | 'B': data_value.get('option2', ''), 51 | 'C': data_value.get('option3', ''), 52 | 'D': data_value.get('option4', '') 53 | 
} 54 | output_dict[data_key]['index2ans'] = index2ans 55 | output_dict[data_key]['answer'] = data_value.get('answer') 56 | 57 | results_count = evaluate_response(output_dict.values()) 58 | 59 | return results_count 60 | 61 | 62 | if __name__ == '__main__': 63 | 64 | parser = ArgumentParser() 65 | parser.add_argument('--output_path', type=str, default="eval/example/Yi-VL-34B-answer.jsonl", 66 | help="The path to model output file.") 67 | parser.add_argument('--data_path', type=str, default="eval/cmmmu/cmmmu-data-val-answer.jsonl", 68 | help="Answer file path.") 69 | args = parser.parse_args() 70 | 71 | category_list = ['艺术与设计', '商业', '科学', '健康与医学', '人文社会科学', '技术与工程'] 72 | category_dict = {'艺术与设计': 'Art & Design', '商业': 'Business', '科学': 'Science', 73 | '健康与医学': 'Health & Medicine', '人文社会科学': 'Humanities & Social Sciences', 74 | '技术与工程': 'Technology & Engineering'} 75 | 76 | headers = ['Subject', 'Correct Num', 'Entries Num', 'Acc'] 77 | table = [] 78 | correct_sum = 0 79 | entries_sum = 0 80 | 81 | is_answer = True 82 | is_response = True 83 | with open(args.output_path, 'r') as file: 84 | for line in file: 85 | data = json.loads(line) 86 | if set(data.keys()) != {'id', 'type', 'answer'}: 87 | is_answer = False 88 | if set(data.keys()) != {'id', 'type', 'response'}: 89 | is_response = False 90 | assert is_answer or is_response, "The file should contain either 'answer' or 'response'" 91 | 92 | for category in category_list: 93 | if is_answer: 94 | results_count = process_answer_jsonl_file(args.data_path, args.output_path, category) 95 | elif is_response: 96 | results_count = process_response_jsonl_file(args.data_path, args.output_path, category) 97 | correct_sum += results_count['correct_num'] 98 | entries_sum += results_count['entries_num'] 99 | table.append( 100 | [category_dict[category], results_count['correct_num'], results_count['entries_num'], results_count['acc']]) 101 | 102 | table.append(['Overall', correct_sum, entries_sum, correct_sum / entries_sum]) 103 | print(tabulate(table, headers=headers, tablefmt='orgtbl')) 104 | -------------------------------------------------------------------------------- /eval/cmmmu/prompt.yaml: -------------------------------------------------------------------------------- 1 | task_instructions: 2 | - '请回答以下多项选择题,并选出正确选项。这些题目可能包括单选和多选题型。如果所提供的信息不足以确定一个明确的答案,那么请根据可用的数据和你的判断来选择最可能正确的选项。' 3 | - '请回答以下判断题,并根据题目描述和所给的信息来判断问题中陈述的对错。如果信息不完整或不足以作出绝对判断,请运用你的逻辑推理和现有信息来做出最可能的判断。' 4 | - '请回答以下填空题,并根据题目的要求和所提供的信息来给出最恰当的答案。如果信息不足以确切回答,那么请依据现有的数据和你的推理能力来填写最合理的答案。' 5 | multi_choice_example_format: 6 | - | 7 | 问题:{} 8 | 选项: 9 | {} 10 | 正确答案: 11 | 12 | T/F_example_format: 13 | - | 14 | 问题:{} 15 | 正确答案: 16 | 17 | short_ans_example_format: 18 | - | 19 | 问题:{} 20 | 正确答案: 21 | 22 | temperature: 23 | - 0 -------------------------------------------------------------------------------- /eval/gqa/convert_gqa_for_eval.py: -------------------------------------------------------------------------------- 1 | import json 2 | import argparse 3 | 4 | parser = argparse.ArgumentParser() 5 | parser.add_argument("--src", type=str) 6 | parser.add_argument("--dst", type=str) 7 | args = parser.parse_args() 8 | 9 | all_answers = [] 10 | for line_idx, line in enumerate(open(args.src)): 11 | res = json.loads(line) 12 | question_id = res['question_id'] 13 | text = res['text'].rstrip('.').lower() 14 | all_answers.append({"questionId": question_id, "prediction": text}) 15 | 16 | with open(args.dst, 'w') as f: 17 | json.dump(all_answers, f) 18 | 
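# Each input line is a JSON record with 'question_id' and 'text'; the answer text is lower-cased with any trailing period stripped, and all predictions are dumped as a single JSON list of {'questionId': ..., 'prediction': ...} entries in the format expected by the GQA evaluation script.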
-------------------------------------------------------------------------------- /eval/gqa/testdev_balanced_questions.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BAAI-DCAI/Bunny/08273acbd83b6836c03dce25cc134eb6b9cedba5/eval/gqa/testdev_balanced_questions.tar.gz -------------------------------------------------------------------------------- /eval/mm-vet/convert_mmvet_for_eval.py: -------------------------------------------------------------------------------- 1 | import json 2 | import argparse 3 | 4 | parser = argparse.ArgumentParser() 5 | parser.add_argument("--src", type=str) 6 | parser.add_argument("--dst", type=str) 7 | args = parser.parse_args() 8 | 9 | cur_result = {} 10 | 11 | for line in open(args.src): 12 | data = json.loads(line) 13 | qid = data['question_id'] 14 | cur_result[f'v1_{qid}'] = data['text'] 15 | 16 | with open(args.dst, 'w') as f: 17 | json.dump(cur_result, f, indent=2) 18 | -------------------------------------------------------------------------------- /eval/mmbench/convert_mmbench_for_submission.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | import pandas as pd 5 | 6 | 7 | def get_args(): 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument("--annotation-file", type=str, required=True) 10 | parser.add_argument("--result-dir", type=str, required=True) 11 | parser.add_argument("--upload-dir", type=str, required=True) 12 | parser.add_argument("--experiment", type=str, required=True) 13 | 14 | return parser.parse_args() 15 | 16 | 17 | if __name__ == "__main__": 18 | args = get_args() 19 | 20 | df = pd.read_table(args.annotation_file) 21 | 22 | cur_df = df.copy() 23 | cur_df = cur_df.drop(columns=['hint', 'category', 'source', 'image', 'comment', 'l2-category']) 24 | cur_df.insert(6, 'prediction', None) 25 | for pred in open(os.path.join(args.result_dir, f"{args.experiment}.jsonl")): 26 | pred = json.loads(pred) 27 | cur_df.loc[df['index'] == pred['question_id'], 'prediction'] = pred['text'] 28 | 29 | cur_df.to_excel(os.path.join(args.upload_dir, f"{args.experiment}.xlsx"), index=False, engine='openpyxl') 30 | -------------------------------------------------------------------------------- /eval/mme/calculation_mme.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument('--results_dir', default='./LaVIN', type=str) 7 | 8 | eval_type_dict = { 9 | "Perception": ["existence", "count", "position", "color", "posters", "celebrity", "scene", "landmark", "artwork", 10 | "OCR"], 11 | "Cognition": ["commonsense_reasoning", "numerical_calculation", "text_translation", "code_reasoning"] 12 | } 13 | 14 | 15 | class calculate_metrics: 16 | def divide_chunks(self, l, n=2): 17 | # looping till length l 18 | for i in range(0, len(l), n): 19 | yield l[i:i + n] 20 | 21 | return 22 | 23 | def parse_pred_ans(self, pred_ans): 24 | pred_label = None 25 | if pred_ans in ["yes", "no"]: 26 | pred_label = pred_ans 27 | else: 28 | prefix_pred_ans = pred_ans[:4] 29 | 30 | if "yes" in prefix_pred_ans: 31 | pred_label = "yes" 32 | elif "no" in prefix_pred_ans: 33 | pred_label = "no" 34 | else: 35 | pred_label = "other" 36 | 37 | return pred_label 38 | 39 | def compute_metric(self, gts, preds): 40 | 
assert len(gts) == len(preds) 41 | 42 | label_map = { 43 | "yes": 1, 44 | "no": 0, 45 | "other": -1, 46 | } 47 | 48 | gts = [label_map[x] for x in gts] 49 | preds = [label_map[x] for x in preds] 50 | 51 | acc = accuracy_score(gts, preds) 52 | 53 | clean_gts = [] 54 | clean_preds = [] 55 | other_num = 0 56 | for gt, pred in zip(gts, preds): 57 | if pred == -1: 58 | other_num += 1 59 | continue 60 | clean_gts.append(gt) 61 | clean_preds.append(pred) 62 | 63 | conf_mat = confusion_matrix(clean_gts, clean_preds, labels=[1, 0]) 64 | precision = precision_score(clean_gts, clean_preds, average='binary') 65 | recall = recall_score(clean_gts, clean_preds, average='binary') 66 | tp, fn = conf_mat[0] 67 | fp, tn = conf_mat[1] 68 | 69 | metric_dict = dict() 70 | metric_dict = { 71 | "TP": tp, 72 | "FN": fn, 73 | "TN": tn, 74 | "FP": fp, 75 | "precision": precision, 76 | "recall": recall, 77 | "other_num": other_num, 78 | "acc": acc, 79 | } 80 | 81 | return metric_dict 82 | 83 | def process_result(self, results_dir): 84 | 85 | model_score_dict = dict() 86 | for eval_type, task_name_list in eval_type_dict.items(): 87 | print("===========", eval_type, "===========") 88 | 89 | scores = 0 90 | task_score_dict = dict() 91 | 92 | for task_name in task_name_list: 93 | 94 | task_txt = os.path.join(results_dir, task_name + ".txt") 95 | lines = open(task_txt, 'r').readlines() 96 | filtered_lines = [] 97 | for line in lines: 98 | try: 99 | img_name, question, gt_ans, pred_ans = line.split("\t") 100 | filtered_lines.append(line) 101 | except: 102 | pass 103 | lines = filtered_lines[:] 104 | 105 | chunk_lines = list(self.divide_chunks(lines)) # one image corresponds to two questions 106 | 107 | img_num = len(chunk_lines) 108 | task_other_ans_num = 0 109 | task_score = 0 110 | acc_plus_correct_num = 0 111 | gts = [] 112 | preds = [] 113 | 114 | for img_items in chunk_lines: 115 | assert len(img_items) == 2 116 | img_correct_num = 0 117 | 118 | for img_item in img_items: 119 | img_name, question, gt_ans, pred_ans = img_item.split("\t") 120 | 121 | gt_ans = gt_ans.lower() 122 | pred_ans = pred_ans.lower() 123 | 124 | assert gt_ans in ["yes", "no"] # gt can only be yes or no. 125 | 126 | pred_ans = self.parse_pred_ans(pred_ans) 127 | assert pred_ans in ["yes", "no", "other"] 128 | 129 | gts.append(gt_ans) 130 | preds.append(pred_ans) 131 | 132 | if gt_ans == pred_ans: 133 | img_correct_num += 1 134 | 135 | if pred_ans not in ["yes", "no"]: 136 | task_other_ans_num += 1 137 | 138 | if img_correct_num == 2: 139 | acc_plus_correct_num += 1 140 | 141 | # cal TP precision acc, etc. 
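# The per-task score computed below adds acc and acc_plus (both in [0, 1]) scaled by 100, so each sub-task contributes at most 200 points and the Perception/Cognition totals are sums over their sub-tasks.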
142 | metric_dict = self.compute_metric(gts, preds) 143 | acc_plus = acc_plus_correct_num / img_num 144 | metric_dict["acc_plus"] = acc_plus 145 | 146 | for k, v in metric_dict.items(): 147 | if k in ["acc", "acc_plus"]: 148 | task_score += v * 100 149 | 150 | task_score_dict[task_name] = task_score 151 | 152 | scores += task_score 153 | 154 | print("total score:", scores, "\n") 155 | for task_name, score in task_score_dict.items(): 156 | print("\t", task_name, " score:", score) 157 | print("\n") 158 | 159 | return 160 | 161 | 162 | if __name__ == "__main__": 163 | cal = calculate_metrics() 164 | 165 | args = parser.parse_args() 166 | results_dir = args.results_dir 167 | cal.process_result(results_dir) 168 | -------------------------------------------------------------------------------- /eval/mme/convert_answer_to_mme.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | from collections import defaultdict 5 | 6 | 7 | def get_args(): 8 | parser = argparse.ArgumentParser() 9 | 10 | parser.add_argument('--experiment', 11 | type=str, 12 | required=True) 13 | 14 | args = parser.parse_args() 15 | return args 16 | 17 | 18 | def get_gt(data_path): 19 | GT = {} 20 | for category in os.listdir(data_path): 21 | category_dir = os.path.join(data_path, category) 22 | if not os.path.isdir(category_dir): 23 | continue 24 | if os.path.exists(os.path.join(category_dir, 'images')): 25 | image_path = os.path.join(category_dir, 'images') 26 | qa_path = os.path.join(category_dir, 'questions_answers_YN') 27 | else: 28 | image_path = qa_path = category_dir 29 | assert os.path.isdir(image_path), image_path 30 | assert os.path.isdir(qa_path), qa_path 31 | for file in os.listdir(qa_path): 32 | if not file.endswith('.txt'): 33 | continue 34 | for line in open(os.path.join(qa_path, file)): 35 | question, answer = line.strip().split('\t') 36 | GT[(category, file, question)] = answer 37 | return GT 38 | 39 | 40 | if __name__ == "__main__": 41 | 42 | args = get_args() 43 | 44 | GT = get_gt( 45 | data_path='MME_Benchmark_release_version' 46 | ) 47 | 48 | experiment = args.experiment 49 | 50 | result_dir = os.path.join('answers_upload', experiment) 51 | os.makedirs(result_dir, exist_ok=True) 52 | 53 | answers = [json.loads(line) for line in open(os.path.join('answers', f'{experiment}.jsonl'))] 54 | 55 | results = defaultdict(list) 56 | for answer in answers: 57 | category = answer['question_id'].split('/')[0] 58 | file = answer['question_id'].split('/')[-1].split('.')[0] + '.txt' 59 | question = answer['prompt'] 60 | results[category].append((file, answer['prompt'], answer['text'])) 61 | 62 | for category, cate_tups in results.items(): 63 | with open(os.path.join(result_dir, f'{category}.txt'), 'w') as fp: 64 | for file, prompt, answer in cate_tups: 65 | if 'Answer the question using a single word or phrase.' in prompt: 66 | prompt = prompt.replace('Answer the question using a single word or phrase.', '').strip() 67 | if 'Answer the question directly with a short sentence or phrase.' in prompt: 68 | prompt = prompt.replace('Answer the question directly with a short sentence or phrase.', '').strip() 69 | if 'Please answer yes or no.' not in prompt: 70 | prompt = prompt + ' Please answer yes or no.' 
71 | if (category, file, prompt) not in GT: 72 | prompt = prompt.replace(' Please answer yes or no.', ' Please answer yes or no.') 73 | gt_ans = GT[category, file, prompt] 74 | tup = file, prompt, gt_ans, answer 75 | fp.write('\t'.join(tup) + '\n') 76 | -------------------------------------------------------------------------------- /eval/mmmu/config.yaml: -------------------------------------------------------------------------------- 1 | task_instructions: 2 | - "" 3 | multi_choice_example_format: 4 | - "{} 5 | 6 | {} 7 | 8 | Answer with the option's letter from the given choices directly." 9 | 10 | short_ans_example_format: 11 | - "{} 12 | 13 | Answer the question using a single word or phrase." 14 | temperature: 15 | - 0 -------------------------------------------------------------------------------- /eval/pope/eval_pope.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | 6 | def eval_pope(answers, label_file): 7 | label_list = [json.loads(q)['label'] for q in open(label_file, 'r')] 8 | 9 | for answer in answers: 10 | text = answer['text'] 11 | 12 | # Only keep the first sentence 13 | if text.find('.') != -1: 14 | text = text.split('.')[0] 15 | 16 | text = text.replace(',', '') 17 | words = text.split(' ') 18 | if 'No' in words or 'not' in words or 'no' in words: 19 | answer['text'] = 'no' 20 | else: 21 | answer['text'] = 'yes' 22 | 23 | for i in range(len(label_list)): 24 | if label_list[i] == 'no': 25 | label_list[i] = 0 26 | else: 27 | label_list[i] = 1 28 | 29 | pred_list = [] 30 | for answer in answers: 31 | if answer['text'] == 'no': 32 | pred_list.append(0) 33 | else: 34 | pred_list.append(1) 35 | 36 | pos = 1 37 | neg = 0 38 | yes_ratio = pred_list.count(1) / len(pred_list) 39 | 40 | TP, TN, FP, FN = 0, 0, 0, 0 41 | for pred, label in zip(pred_list, label_list): 42 | if pred == pos and label == pos: 43 | TP += 1 44 | elif pred == pos and label == neg: 45 | FP += 1 46 | elif pred == neg and label == neg: 47 | TN += 1 48 | elif pred == neg and label == pos: 49 | FN += 1 50 | 51 | print('TP\tFP\tTN\tFN\t') 52 | print('{}\t{}\t{}\t{}'.format(TP, FP, TN, FN)) 53 | 54 | precision = float(TP) / float(TP + FP) 55 | recall = float(TP) / float(TP + FN) 56 | f1 = 2 * precision * recall / (precision + recall) 57 | acc = (TP + TN) / (TP + TN + FP + FN) 58 | print('Accuracy: {}'.format(acc)) 59 | print('Precision: {}'.format(precision)) 60 | print('Recall: {}'.format(recall)) 61 | print('F1 score: {}'.format(f1)) 62 | print('Yes ratio: {}'.format(yes_ratio)) 63 | print('%.3f, %.3f, %.3f, %.3f, %.3f' % (f1, acc, precision, recall, yes_ratio)) 64 | return f1 65 | 66 | 67 | if __name__ == "__main__": 68 | parser = argparse.ArgumentParser() 69 | parser.add_argument("--annotation-dir", type=str) 70 | parser.add_argument("--question-file", type=str) 71 | parser.add_argument("--result-file", type=str) 72 | args = parser.parse_args() 73 | 74 | questions = [json.loads(line) for line in open(args.question_file)] 75 | questions = {question['question_id']: question for question in questions} 76 | answers = [json.loads(q) for q in open(args.result_file)] 77 | 78 | average_f1 = 0 79 | for file in os.listdir(args.annotation_dir): 80 | assert file.startswith('coco_pope_') 81 | assert file.endswith('.json') 82 | category = file[10:-5] 83 | cur_answers = [x for x in answers if questions[x['question_id']]['category'] == category] 84 | print('Category: {}, # samples: {}'.format(category, len(cur_answers))) 85 | average_f1 
+= eval_pope(cur_answers, os.path.join(args.annotation_dir, file)) 86 | print("====================================") 87 | 88 | average_f1 /= len(os.listdir(args.annotation_dir)) 89 | 90 | print(f'Average F1 score: {average_f1}') 91 | -------------------------------------------------------------------------------- /eval/scienceqa/eval_science_qa.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | import re 5 | import random 6 | 7 | 8 | def get_args(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--base-dir', type=str) 11 | parser.add_argument('--result-file', type=str) 12 | parser.add_argument('--output-file', type=str) 13 | parser.add_argument('--output-result', type=str) 14 | parser.add_argument('--split', type=str, default='test') 15 | parser.add_argument('--options', type=list, default=["A", "B", "C", "D", "E"]) 16 | return parser.parse_args() 17 | 18 | 19 | def convert_caps(results): 20 | fakecaps = [] 21 | for result in results: 22 | image_id = result['question_id'] 23 | caption = result['text'] 24 | fakecaps.append({"image_id": int(image_id), "caption": caption}) 25 | return fakecaps 26 | 27 | 28 | def get_pred_idx(prediction, choices, options): 29 | """ 30 | Get the index (e.g. 2) from the prediction (e.g. 'C') 31 | """ 32 | if prediction in options[:len(choices)]: 33 | return options.index(prediction) 34 | else: 35 | return -1 36 | return random.choice(range(len(choices))) 37 | 38 | 39 | if __name__ == "__main__": 40 | args = get_args() 41 | 42 | base_dir = args.base_dir 43 | split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[args.split] 44 | problems = json.load(open(os.path.join(base_dir, "problems.json"))) 45 | predictions = [json.loads(line) for line in open(args.result_file)] 46 | predictions = {pred['question_id']: pred for pred in predictions} 47 | split_problems = {idx: problems[idx] for idx in split_indices} 48 | 49 | results = {'correct': [], 'incorrect': []} 50 | sqa_results = {} 51 | sqa_results['acc'] = None 52 | sqa_results['correct'] = None 53 | sqa_results['count'] = None 54 | sqa_results['results'] = {} 55 | sqa_results['outputs'] = {} 56 | 57 | for prob_id, prob in split_problems.items(): 58 | if prob_id not in predictions: 59 | pred = {'text': 'FAILED', 'prompt': 'Unknown'} 60 | pred_text = 'FAILED' 61 | else: 62 | pred = predictions[prob_id] 63 | pred_text = pred['text'] 64 | 65 | if pred_text in args.options: 66 | answer = pred_text 67 | elif len(pred_text) >= 3 and pred_text[0] in args.options and pred_text[1:3] == ". ": 68 | answer = pred_text[0] 69 | else: 70 | pattern = re.compile(r'The answer is ([A-Z]).') 71 | res = pattern.findall(pred_text) 72 | if len(res) == 1: 73 | answer = res[0] # 'A', 'B', ... 
74 | else: 75 | answer = "FAILED" 76 | 77 | pred_idx = get_pred_idx(answer, prob['choices'], args.options) 78 | 79 | analysis = { 80 | 'question_id': prob_id, 81 | 'parsed_ans': answer, 82 | 'ground_truth': args.options[prob['answer']], 83 | 'question': pred['prompt'], 84 | 'pred': pred_text, 85 | 'is_multimodal': '<image>' in pred['prompt'], 86 | } 87 | 88 | sqa_results['results'][prob_id] = get_pred_idx(answer, prob['choices'], args.options) 89 | sqa_results['outputs'][prob_id] = pred_text 90 | 91 | if pred_idx == prob['answer']: 92 | results['correct'].append(analysis) 93 | else: 94 | results['incorrect'].append(analysis) 95 | 96 | correct = len(results['correct']) 97 | total = len(results['correct']) + len(results['incorrect']) 98 | 99 | ###### IMG ###### 100 | multimodal_correct = len([x for x in results['correct'] if x['is_multimodal']]) 101 | multimodal_incorrect = len([x for x in results['incorrect'] if x['is_multimodal']]) 102 | multimodal_total = multimodal_correct + multimodal_incorrect 103 | ###### IMG ###### 104 | 105 | print( 106 | f'Total: {total}, Correct: {correct}, Accuracy: {correct / total * 100:.2f}%, IMG-Accuracy: {multimodal_correct / multimodal_total * 100:.2f}%') 107 | 108 | sqa_results['acc'] = correct / total * 100 109 | sqa_results['correct'] = correct 110 | sqa_results['count'] = total 111 | sqa_results['image_acc'] = multimodal_correct / multimodal_total * 100 112 | sqa_results['image_correct'] = multimodal_correct 113 | sqa_results['image_count'] = multimodal_total 114 | 115 | with open(args.output_file, 'w') as f: 116 | json.dump(results, f, indent=2) 117 | with open(args.output_result, 'w') as f: 118 | json.dump(sqa_results, f, indent=2) 119 | -------------------------------------------------------------------------------- /eval/seed-bench/convert_seed_for_submission.py: -------------------------------------------------------------------------------- 1 | import json 2 | import argparse 3 | 4 | 5 | def get_args(): 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument("--annotation-file", type=str) 8 | parser.add_argument("--result-file", type=str) 9 | parser.add_argument("--result-upload-file", type=str) 10 | return parser.parse_args() 11 | 12 | 13 | def eval_single(result_file, eval_only_type=None): 14 | results = {} 15 | for line in open(result_file): 16 | row = json.loads(line) 17 | results[row['question_id']] = row 18 | 19 | type_counts = {} 20 | correct_counts = {} 21 | for question_data in data['questions']: 22 | if eval_only_type is not None and question_data['data_type'] != eval_only_type: continue 23 | data_type = question_data['question_type_id'] 24 | type_counts[data_type] = type_counts.get(data_type, 0) + 1 25 | try: 26 | question_id = int(question_data['question_id']) 27 | except: 28 | question_id = question_data['question_id'] 29 | if question_id not in results: 30 | correct_counts[data_type] = correct_counts.get(data_type, 0) 31 | continue 32 | row = results[question_id] 33 | if row['text'] == question_data['answer']: 34 | correct_counts[data_type] = correct_counts.get(data_type, 0) + 1 35 | 36 | total_count = 0 37 | total_correct = 0 38 | for data_type in sorted(type_counts.keys()): 39 | accuracy = correct_counts[data_type] / type_counts[data_type] * 100 40 | if eval_only_type is None: 41 | print(f"{ques_type_id_to_name[data_type]}: {accuracy:.2f}%") 42 | 43 | total_count += type_counts[data_type] 44 | total_correct += correct_counts[data_type] 45 | 46 | total_accuracy = total_correct / total_count * 100 47 | if eval_only_type is None: 48 |
print(f"Total accuracy: {total_accuracy:.2f}%") 49 | else: 50 | print(f"{eval_only_type} accuracy: {total_accuracy:.2f}%") 51 | 52 | return results 53 | 54 | 55 | if __name__ == "__main__": 56 | args = get_args() 57 | data = json.load(open(args.annotation_file)) 58 | ques_type_id_to_name = {id: n for n, id in data['question_type'].items()} 59 | 60 | results = eval_single(args.result_file) 61 | eval_single(args.result_file, eval_only_type='image') 62 | eval_single(args.result_file, eval_only_type='video') 63 | 64 | with open(args.result_upload_file, 'w') as fp: 65 | for question in data['questions']: 66 | qid = question['question_id'] 67 | if qid in results: 68 | result = results[qid] 69 | else: 70 | result = results[int(qid)] 71 | fp.write(json.dumps({ 72 | 'question_id': qid, 73 | 'prediction': result['text'] 74 | }) + '\n') 75 | -------------------------------------------------------------------------------- /eval/seed-bench/extract_video_frames.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import numpy as np 4 | import torch 5 | import av 6 | from decord import VideoReader, cpu 7 | from PIL import Image 8 | 9 | from tqdm.auto import tqdm 10 | import concurrent.futures 11 | 12 | num_segments = 1 13 | 14 | # root directory of evaluation dimension 10 15 | dimension10_dir = "eval/seed-bench/SEED-Bench-video/20bn-something-something-v2" 16 | # root directory of evaluation dimension 11 17 | dimension11_dir = "eval/seed-bench/SEED-Bench-video/EPIC-KITCHENS" 18 | # root directory of evaluation dimension 12 19 | dimension12_dir = "eval/seed-bench/SEED-Bench-video/BreakfastII_15fps_qvga_sync" 20 | 21 | 22 | def transform_video(buffer): 23 | try: 24 | buffer = buffer.numpy() 25 | except AttributeError: 26 | try: 27 | buffer = buffer.asnumpy() 28 | except AttributeError: 29 | print("Both buffer.numpy() and buffer.asnumpy() failed.") 30 | buffer = None 31 | images_group = list() 32 | for fid in range(len(buffer)): 33 | images_group.append(Image.fromarray(buffer[fid])) 34 | return images_group 35 | 36 | 37 | def get_index(num_frames, num_segments): 38 | if num_segments > num_frames: 39 | offsets = np.array([ 40 | idx for idx in range(num_frames) 41 | ]) 42 | else: 43 | # uniform sampling 44 | seg_size = float(num_frames - 1) / num_segments 45 | start = int(seg_size / 2) 46 | offsets = np.array([ 47 | start + int(np.round(seg_size * idx)) for idx in range(num_segments) 48 | ]) 49 | return offsets 50 | 51 | 52 | def fetch_images(qa_item): 53 | use_pyav = False 54 | segment = None 55 | if qa_item['question_type_id'] == 10: 56 | data_path = os.path.join(dimension10_dir, qa_item['data_id']) 57 | start = 0.0 58 | end = 0.0 59 | elif qa_item['question_type_id'] == 11: 60 | data_path = os.path.join(dimension11_dir, qa_item['data_id'].split('/')[-1]) 61 | segment = qa_item['segment'] 62 | start, end = segment[0], segment[1] 63 | elif qa_item['question_type_id'] == 12: 64 | data_path = os.path.join(dimension12_dir, qa_item['data_id']) 65 | segment = qa_item['segment'] 66 | start, end = segment[0], segment[1] 67 | use_pyav = True 68 | 69 | if use_pyav: 70 | # using pyav for decoding videos in evaluation dimension 12 71 | reader = av.open(data_path) 72 | frames = [torch.from_numpy(f.to_rgb().to_ndarray()) for f in reader.decode(video=0)] 73 | video_len = len(frames) 74 | start_frame, end_frame = start, end 75 | end_frame = min(end_frame, video_len) 76 | offset = get_index(end_frame - start_frame, num_segments) 77 | frame_indices = offset + 
start_frame 78 | buffer = torch.stack([frames[idx] for idx in frame_indices]) 79 | else: 80 | # using decord for decoding videos in evaluation dimension 10-11 81 | vr = VideoReader(data_path, num_threads=1, ctx=cpu(0)) 82 | video_len = len(vr) 83 | fps = vr.get_avg_fps() 84 | if segment is not None: 85 | # obtain start and end frame for the video segment in evaluation dimension 11 86 | start_frame = int(min(max(start * fps, 0), video_len - 1)) 87 | end_frame = int(min(max(end * fps, 0), video_len - 1)) 88 | tot_frames = int(end_frame - start_frame) 89 | offset = get_index(tot_frames, num_segments) 90 | frame_indices = offset + start_frame 91 | else: 92 | # sample frames of the video in evaluation dimension 10 93 | frame_indices = get_index(video_len - 1, num_segments) 94 | vr.seek(0) 95 | buffer = vr.get_batch(frame_indices) 96 | return transform_video(buffer) 97 | 98 | 99 | def fetch_images_parallel(qa_item): 100 | return qa_item, fetch_images(qa_item) 101 | 102 | 103 | if __name__ == "__main__": 104 | data = json.load(open('eval/seed-bench/SEED-Bench.json')) 105 | video_img_dir = 'eval/seed-bench/SEED-Bench-video-image' 106 | os.makedirs(video_img_dir, exist_ok=True) 107 | ques_type_id_to_name = {id: n for n, id in data['question_type'].items()} 108 | 109 | video_data = [x for x in data['questions'] if x['data_type'] == 'video'] 110 | output = 'temp' 111 | with open(output, 'w') as f, concurrent.futures.ThreadPoolExecutor() as executor: 112 | future_to_images = {executor.submit(fetch_images_parallel, qa_item): qa_item for qa_item in video_data} 113 | for future in tqdm(concurrent.futures.as_completed(future_to_images), total=len(future_to_images)): 114 | qa_item = future_to_images[future] 115 | try: 116 | qa_item, images = future.result() 117 | except Exception as exc: 118 | print(f'{qa_item} generated an exception: {exc}') 119 | else: 120 | img_file = f"{qa_item['question_type_id']}_{qa_item['question_id']}.png" 121 | images[0].save(os.path.join(video_img_dir, img_file)) 122 | -------------------------------------------------------------------------------- /eval/textvqa/eval_textvqa.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import json 4 | import re 5 | 6 | from bunny.eval.m4c_evaluator import TextVQAAccuracyEvaluator 7 | 8 | 9 | def get_args(): 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--annotation-file', type=str) 12 | parser.add_argument('--result-file', type=str) 13 | parser.add_argument('--result-dir', type=str) 14 | return parser.parse_args() 15 | 16 | 17 | def prompt_processor(prompt): 18 | if prompt.startswith('OCR tokens: '): 19 | pattern = r"Question: (.*?) 
Short answer:" 20 | match = re.search(pattern, prompt, re.DOTALL) 21 | question = match.group(1) 22 | elif 'Reference OCR token: ' in prompt and len(prompt.split('\n')) == 3: 23 | if prompt.startswith('Reference OCR token:'): 24 | question = prompt.split('\n')[1] 25 | else: 26 | question = prompt.split('\n')[0] 27 | elif len(prompt.split('\n')) == 2: 28 | question = prompt.split('\n')[0] 29 | else: 30 | assert False 31 | 32 | return question.lower() 33 | 34 | 35 | def eval_single(annotation_file, result_file): 36 | experiment_name = os.path.splitext(os.path.basename(result_file))[0] 37 | print(experiment_name) 38 | annotations = json.load(open(annotation_file))['data'] 39 | annotations = {(annotation['image_id'], annotation['question'].lower()): annotation for annotation in annotations} 40 | results = [json.loads(line) for line in open(result_file)] 41 | 42 | pred_list = [] 43 | for result in results: 44 | annotation = annotations[(result['question_id'], prompt_processor(result['prompt']))] 45 | pred_list.append({ 46 | "pred_answer": result['text'], 47 | "gt_answers": annotation['answers'], 48 | }) 49 | 50 | evaluator = TextVQAAccuracyEvaluator() 51 | print('Samples: {}\nAccuracy: {:.2f}%\n'.format(len(pred_list), 100. * evaluator.eval_pred_list(pred_list))) 52 | 53 | 54 | if __name__ == "__main__": 55 | args = get_args() 56 | 57 | if args.result_file is not None: 58 | eval_single(args.annotation_file, args.result_file) 59 | 60 | if args.result_dir is not None: 61 | for result_file in sorted(os.listdir(args.result_dir)): 62 | if not result_file.endswith('.jsonl'): 63 | print(f'Skipping {result_file}') 64 | continue 65 | eval_single(args.annotation_file, os.path.join(args.result_dir, result_file)) 66 | -------------------------------------------------------------------------------- /eval/viswiz/convert_viswiz_for_submission.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import json 4 | 5 | from bunny.eval.m4c_evaluator import EvalAIAnswerProcessor 6 | 7 | 8 | def parse_args(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--annotation-file', type=str, required=True) 11 | parser.add_argument('--result-file', type=str, required=True) 12 | parser.add_argument('--result-upload-file', type=str, required=True) 13 | return parser.parse_args() 14 | 15 | 16 | if __name__ == '__main__': 17 | 18 | args = parse_args() 19 | 20 | os.makedirs(os.path.dirname(args.result_upload_file), exist_ok=True) 21 | 22 | results = [] 23 | error_line = 0 24 | for line_idx, line in enumerate(open(args.result_file)): 25 | try: 26 | results.append(json.loads(line)) 27 | except: 28 | error_line += 1 29 | results = {x['question_id']: x['text'] for x in results} 30 | test_split = [json.loads(line) for line in open(args.annotation_file)] 31 | split_ids = set([x['question_id'] for x in test_split]) 32 | 33 | print(f'total results: {len(results)}, total split: {len(test_split)}, error_line: {error_line}') 34 | 35 | all_answers = [] 36 | 37 | answer_processor = EvalAIAnswerProcessor() 38 | 39 | for x in test_split: 40 | assert x['question_id'] in results 41 | all_answers.append({ 42 | 'image': x['image'], 43 | 'answer': answer_processor(results[x['question_id']]) 44 | }) 45 | 46 | with open(args.result_upload_file, 'w') as f: 47 | json.dump(all_answers, f) 48 | -------------------------------------------------------------------------------- /eval/vqav2/bunny_vqav2_mscoco_test-dev2015.tar.gz: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/BAAI-DCAI/Bunny/08273acbd83b6836c03dce25cc134eb6b9cedba5/eval/vqav2/bunny_vqav2_mscoco_test-dev2015.tar.gz -------------------------------------------------------------------------------- /eval/vqav2/bunny_vqav2_mscoco_test2015.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BAAI-DCAI/Bunny/08273acbd83b6836c03dce25cc134eb6b9cedba5/eval/vqav2/bunny_vqav2_mscoco_test2015.tar.gz -------------------------------------------------------------------------------- /eval/vqav2/convert_vqav2_for_submission.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import json 4 | 5 | from bunny.eval.m4c_evaluator import EvalAIAnswerProcessor 6 | 7 | 8 | def parse_args(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--dir', type=str, default="./eval/vqav2") 11 | parser.add_argument('--ckpt', type=str, required=True) 12 | parser.add_argument('--split', type=str, required=True) 13 | return parser.parse_args() 14 | 15 | 16 | if __name__ == '__main__': 17 | 18 | args = parse_args() 19 | 20 | src = os.path.join(args.dir, 'answers', args.split, args.ckpt, 'merge.jsonl') 21 | test_split = os.path.join(args.dir, 'bunny_vqav2_mscoco_test2015.jsonl') 22 | dst = os.path.join(args.dir, 'answers_upload', args.split, f'{args.ckpt}.json') 23 | os.makedirs(os.path.dirname(dst), exist_ok=True) 24 | 25 | results = [] 26 | error_line = 0 27 | for line_idx, line in enumerate(open(src)): 28 | try: 29 | results.append(json.loads(line)) 30 | except: 31 | error_line += 1 32 | 33 | results = {x['question_id']: x['text'] for x in results} 34 | test_split = [json.loads(line) for line in open(test_split)] 35 | split_ids = set([x['question_id'] for x in test_split]) 36 | 37 | print(f'total results: {len(results)}, total split: {len(test_split)}, error_line: {error_line}') 38 | 39 | all_answers = [] 40 | 41 | answer_processor = EvalAIAnswerProcessor() 42 | 43 | for x in test_split: 44 | if x['question_id'] not in results: 45 | all_answers.append({ 46 | 'question_id': x['question_id'], 47 | 'answer': '' 48 | }) 49 | else: 50 | all_answers.append({ 51 | 'question_id': x['question_id'], 52 | 'answer': answer_processor(results[x['question_id']]) 53 | }) 54 | 55 | with open(dst, 'w') as f: 56 | json.dump(all_answers, open(dst, 'w')) 57 | -------------------------------------------------------------------------------- /icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BAAI-DCAI/Bunny/08273acbd83b6836c03dce25cc134eb6b9cedba5/icon.png -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "bunny" 7 | version = "1.0" 8 | description = "A family of lightweight multimodal models." 
9 | readme = "README.md" 10 | requires-python = ">=3.8" 11 | classifiers = [ 12 | "Programming Language :: Python :: 3", 13 | "License :: OSI Approved :: Apache Software License", 14 | ] 15 | dependencies = [ 16 | 'accelerate', 'apex', 'bitsandbytes', 'datasets', 'deepspeed', 'einops', 'einops-exts', 17 | 'fastapi', 'flash_attn', 'gradio', 'gradio_client', 'httpx', 'markdown2', 'numpy', 'openpyxl', 18 | 'peft', 'protobuf', 'pydantic', 'pypandoc', 'requests', 'scikit-learn', 'sentencepiece', 'shortuuid', 19 | 'tabulate', 'timm', 'tiktoken', 'tokenizers', 'torch', 'torchvision', 'transformers', 'uvicorn', 'xformers' 20 | ] 21 | 22 | 23 | [project.urls] 24 | "Homepage" = "https://github.com/BAAI-DCAI/Bunny" 25 | "Discussion" = "https://github.com/BAAI-DCAI/Bunny/issues" 26 | 27 | [tool.setuptools.packages.find] 28 | exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"] 29 | 30 | [tool.wheel] 31 | exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"] 32 | -------------------------------------------------------------------------------- /script/batch_inference.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import transformers 3 | from transformers import AutoModelForCausalLM, AutoTokenizer 4 | from PIL import Image 5 | import warnings 6 | 7 | # disable some warnings 8 | transformers.logging.set_verbosity_error() 9 | transformers.logging.disable_progress_bar() 10 | warnings.filterwarnings('ignore') 11 | 12 | # set device 13 | device = 'cuda' # or cpu 14 | torch.set_default_device(device) 15 | 16 | model_name = 'BAAI/Bunny-v1_1-Llama-3-8B-V' # or 'BAAI/Bunny-Llama-3-8B-V' or 'BAAI/Bunny-v1_1-4B' or 'BAAI/Bunny-v1_0-4B' or 'BAAI/Bunny-v1_0-3B' or 'BAAI/Bunny-v1_0-3B-zh' or 'BAAI/Bunny-v1_0-2B-zh' 17 | 18 | # create model 19 | model = AutoModelForCausalLM.from_pretrained( 20 | model_name, 21 | torch_dtype=torch.float16, # float32 for cpu 22 | device_map='auto', 23 | trust_remote_code=True) 24 | tokenizer = AutoTokenizer.from_pretrained( 25 | model_name, 26 | trust_remote_code=True) 27 | 28 | # for batch inference 29 | tokenizer.padding_side = "left" 30 | tokenizer.pad_token_id = model.generation_config.pad_token_id 31 | padding_max_length = 128 # customize for your circumstance 32 | tokenizer.add_tokens(['<image>']) 33 | image_token_id = tokenizer.convert_tokens_to_ids('<image>') 34 | 35 | # text prompts 36 | prompts = [ 37 | 'What is the astronaut holding in his hand?', 38 | 'Why is the image funny?', 39 | 'What is the occupation of the person in the picture?', 40 | 'What animal is in the picture?' 41 | ] 42 | texts = [ 43 | f"A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. 
USER: <image>\n{prompt} ASSISTANT:" 44 | for prompt in prompts] 45 | input_ids = torch.tensor( 46 | [tokenizer(text, padding='max_length', max_length=padding_max_length).input_ids for text in texts], 47 | dtype=torch.long).to(device) 48 | input_ids[input_ids == image_token_id] = -200 49 | 50 | # images, sample images can be found in https://huggingface.co/BAAI/Bunny-v1_1-Llama-3-8B-V/tree/main/images 51 | image_paths = [ 52 | 'example_1.png', 53 | 'example_2.png', 54 | 'example_1.png', 55 | 'example_2.png' 56 | ] 57 | images = [Image.open(image_path) for image_path in image_paths] 58 | image_tensor = model.process_images(images, model.config).to(dtype=model.dtype, device=device) 59 | 60 | # generate 61 | output_ids = model.generate( 62 | input_ids, 63 | images=image_tensor, 64 | max_new_tokens=100, 65 | use_cache=True, 66 | repetition_penalty=1.0 # increase this to avoid chattering 67 | ) 68 | 69 | print([ans.strip() for ans in tokenizer.batch_decode(output_ids[:, input_ids.shape[1]:], skip_special_tokens=True)]) 70 | -------------------------------------------------------------------------------- /script/deepspeed/zero2.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "train_micro_batch_size_per_gpu": "auto", 14 | "train_batch_size": "auto", 15 | "gradient_accumulation_steps": "auto", 16 | "zero_optimization": { 17 | "stage": 2, 18 | "overlap_comm": true, 19 | "contiguous_gradients": true, 20 | "sub_group_size": 1e9, 21 | "reduce_bucket_size": "auto" 22 | } 23 | } -------------------------------------------------------------------------------- /script/deepspeed/zero3.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "train_micro_batch_size_per_gpu": "auto", 14 | "train_batch_size": "auto", 15 | "gradient_accumulation_steps": "auto", 16 | "zero_optimization": { 17 | "stage": 3, 18 | "overlap_comm": true, 19 | "contiguous_gradients": true, 20 | "sub_group_size": 1e9, 21 | "reduce_bucket_size": "auto", 22 | "stage3_prefetch_bucket_size": "auto", 23 | "stage3_param_persistence_threshold": "auto", 24 | "stage3_max_live_parameters": 1e9, 25 | "stage3_max_reuse_distance": 1e9, 26 | "stage3_gather_16bit_weights_on_model_save": true 27 | } 28 | } -------------------------------------------------------------------------------- /script/eval/full/cmmmu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SPLIT="val" 4 | MODEL_TYPE=phi-2 5 | TARGET_DIR=bunny-phi-2 6 | 7 | python -m bunny.eval.model_vqa_cmmmu \ 8 | --model-path ./checkpoints-$MODEL_TYPE/$TARGET_DIR \ 9 | --model-type $MODEL_TYPE \ 10 | --data-path ./eval/cmmmu/CMMMU \ 11 | --config-path ./eval/cmmmu/prompt.yaml \ 12 | --output-path ./eval/cmmmu/answers_upload/$SPLIT/$TARGET_DIR.jsonl \ 13 | --split $SPLIT \ 14 | --conv-mode bunny 15 | -------------------------------------------------------------------------------- /script/eval/full/gqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 |
gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | MODEL_TYPE=phi-2 9 | TARGET_DIR=bunny-phi-2 10 | 11 | SPLIT="bunny_gqa_testdev_balanced" 12 | 13 | for IDX in $(seq 0 $((CHUNKS-1))); do 14 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m bunny.eval.model_vqa_loader \ 15 | --model-path ./checkpoints-$MODEL_TYPE/$TARGET_DIR \ 16 | --model-type $MODEL_TYPE \ 17 | --question-file ./eval/gqa/$SPLIT.jsonl \ 18 | --image-folder ./eval/gqa/images \ 19 | --answers-file ./eval/gqa/answers/$SPLIT/$TARGET_DIR/${CHUNKS}_${IDX}.jsonl \ 20 | --num-chunks $CHUNKS \ 21 | --chunk-idx $IDX \ 22 | --temperature 0 \ 23 | --conv-mode bunny & 24 | done 25 | 26 | wait 27 | 28 | output_file=./eval/gqa/answers/$SPLIT/$TARGET_DIR/merge.jsonl 29 | 30 | # Clear out the output file if it exists. 31 | > "$output_file" 32 | 33 | # Loop through the indices and concatenate each file. 34 | for IDX in $(seq 0 $((CHUNKS-1))); do 35 | cat ./eval/gqa/answers/$SPLIT/$TARGET_DIR/${CHUNKS}_${IDX}.jsonl >> "$output_file" 36 | done 37 | 38 | python eval/gqa/convert_gqa_for_eval.py --src $output_file --dst ./eval/gqa/testdev_balanced_predictions.json 39 | 40 | cd eval/gqa 41 | python eval_gqa.py --tier testdev_balanced 42 | -------------------------------------------------------------------------------- /script/eval/full/mmbench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SPLIT="MMBench_DEV_EN_legacy" 4 | LANG=en 5 | MODEL_TYPE=phi-2 6 | TARGET_DIR=bunny-phi-2 7 | 8 | 9 | python -m bunny.eval.model_vqa_mmbench \ 10 | --model-path ./checkpoints-$MODEL_TYPE/$TARGET_DIR \ 11 | --model-type $MODEL_TYPE \ 12 | --question-file ./eval/mmbench/$SPLIT.tsv \ 13 | --answers-file ./eval/mmbench/answers/$SPLIT/$TARGET_DIR.jsonl \ 14 | --lang $LANG \ 15 | --single-pred-prompt \ 16 | --temperature 0 \ 17 | --conv-mode bunny 18 | 19 | mkdir -p eval/mmbench/answers_upload/$SPLIT 20 | 21 | python eval/mmbench/convert_mmbench_for_submission.py \ 22 | --annotation-file ./eval/mmbench/$SPLIT.tsv \ 23 | --result-dir ./eval/mmbench/answers/$SPLIT \ 24 | --upload-dir ./eval/mmbench/answers_upload/$SPLIT \ 25 | --experiment $TARGET_DIR 26 | -------------------------------------------------------------------------------- /script/eval/full/mme.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MODEL_TYPE=phi-2 4 | TARGET_DIR=bunny-phi-2 5 | 6 | python -m bunny.eval.model_vqa_loader \ 7 | --model-path ./checkpoints-$MODEL_TYPE/$TARGET_DIR \ 8 | --model-type $MODEL_TYPE \ 9 | --image-folder ./eval/mme/MME_Benchmark_release_version \ 10 | --question-file ./eval/mme/bunny_mme.jsonl \ 11 | --answers-file ./eval/mme/answers/$TARGET_DIR.jsonl \ 12 | --temperature 0 \ 13 | --conv-mode bunny 14 | 15 | cd ./eval/mme 16 | 17 | python convert_answer_to_mme.py --experiment $TARGET_DIR 18 | 19 | python calculation_mme.py --results_dir answers_upload/$TARGET_DIR \ 20 | | tee 2>&1 answers_upload/$TARGET_DIR/res.txt 21 | -------------------------------------------------------------------------------- /script/eval/full/mmmu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SPLIT="test" 4 | MODEL_TYPE=phi-2 5 | TARGET_DIR=bunny-phi-2 6 | 7 | python -m bunny.eval.model_vqa_mmmu \ 8 | --model-path ./checkpoints-$MODEL_TYPE/$TARGET_DIR \ 9 | --model-type $MODEL_TYPE \ 10 | --data-path ./eval/mmmu/MMMU \ 11 | --config-path 
./eval/mmmu/config.yaml \ 12 | --output-path ./eval/mmmu/answers_upload/$SPLIT/$TARGET_DIR.json \ 13 | --split $SPLIT \ 14 | --conv-mode bunny 15 | -------------------------------------------------------------------------------- /script/eval/full/mmvet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MODEL_TYPE=phi-2 4 | TARGET_DIR=bunny-phi-2 5 | 6 | python -m bunny.eval.model_vqa \ 7 | --model-path ./checkpoints-$MODEL_TYPE/$TARGET_DIR \ 8 | --model-type $MODEL_TYPE \ 9 | --question-file ./eval/mm-vet/bunny-mm-vet.jsonl \ 10 | --image-folder ./eval/mm-vet/images \ 11 | --answers-file ./eval/mm-vet/answers/$TARGET_DIR.jsonl \ 12 | --temperature 0 \ 13 | --conv-mode bunny 14 | 15 | mkdir -p ./eval/mm-vet/answers_upload 16 | 17 | python ./eval/mm-vet/convert_mmvet_for_eval.py \ 18 | --src ./eval/mm-vet/answers/$TARGET_DIR.jsonl \ 19 | --dst ./eval/mm-vet/answers_upload/$TARGET_DIR.json -------------------------------------------------------------------------------- /script/eval/full/pope.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MODEL_TYPE=phi-2 4 | TARGET_DIR=bunny-phi-2 5 | 6 | python -m bunny.eval.model_vqa_loader \ 7 | --model-path ./checkpoints-$MODEL_TYPE/$TARGET_DIR \ 8 | --model-type $MODEL_TYPE \ 9 | --question-file ./eval/pope/bunny_pope_test.jsonl \ 10 | --image-folder ./eval/pope/val2014 \ 11 | --answers-file ./eval/pope/answers/$TARGET_DIR.jsonl \ 12 | --temperature 0 \ 13 | --conv-mode bunny 14 | 15 | python eval/pope/eval_pope.py \ 16 | --annotation-dir ./eval/pope/coco \ 17 | --question-file ./eval/pope/bunny_pope_test.jsonl \ 18 | --result-file ./eval/pope/answers/$TARGET_DIR.jsonl 19 | -------------------------------------------------------------------------------- /script/eval/full/scienceqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MODEL_TYPE=phi-2 4 | TARGET_DIR=bunny-phi-2 5 | 6 | python -m bunny.eval.model_vqa_science \ 7 | --model-path ./checkpoints-$MODEL_TYPE/$TARGET_DIR \ 8 | --model-type $MODEL_TYPE \ 9 | --question-file ./eval/scienceqa/bunny_test_CQM-A.json \ 10 | --image-folder ./eval/scienceqa/test \ 11 | --answers-file ./eval/scienceqa/answers/$TARGET_DIR.jsonl \ 12 | --single-pred-prompt \ 13 | --temperature 0 \ 14 | --conv-mode bunny 15 | 16 | mkdir -p ./eval/scienceqa/outputs/ 17 | mkdir -p ./eval/scienceqa/results/ 18 | 19 | python ./eval/scienceqa/eval_science_qa.py \ 20 | --base-dir ./eval/scienceqa \ 21 | --result-file ./eval/scienceqa/answers/$TARGET_DIR.jsonl \ 22 | --output-file ./eval/scienceqa/outputs/$TARGET_DIR.jsonl \ 23 | --output-result ./eval/scienceqa/results/$TARGET_DIR.json -------------------------------------------------------------------------------- /script/eval/full/seedbench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | 9 | MODEL_TYPE=phi-2 10 | TARGET_DIR=bunny-phi-2 11 | 12 | for IDX in $(seq 0 $((CHUNKS-1))); do 13 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m bunny.eval.model_vqa_loader \ 14 | --model-path ./checkpoints-$MODEL_TYPE/$TARGET_DIR \ 15 | --model-type $MODEL_TYPE \ 16 | --question-file ./eval/seed-bench/bunny-seed-bench.jsonl \ 17 | --image-folder ./eval/seed-bench \ 18 | --answers-file 
./eval/seed-bench/answers/$TARGET_DIR/${CHUNKS}_${IDX}.jsonl \ 19 | --num-chunks $CHUNKS \ 20 | --chunk-idx $IDX \ 21 | --temperature 0 \ 22 | --conv-mode bunny & 23 | done 24 | 25 | wait 26 | 27 | output_file=./eval/seed-bench/answers/$TARGET_DIR/merge.jsonl 28 | 29 | # Clear out the output file if it exists. 30 | > "$output_file" 31 | 32 | # Loop through the indices and concatenate each file. 33 | for IDX in $(seq 0 $((CHUNKS-1))); do 34 | cat ./eval/seed-bench/answers/$TARGET_DIR/${CHUNKS}_${IDX}.jsonl >> "$output_file" 35 | done 36 | 37 | mkdir -p ./eval/seed-bench/answers_upload 38 | mkdir -p ./eval/seed-bench/scores 39 | 40 | # Evaluate 41 | python ./eval/seed-bench/convert_seed_for_submission.py \ 42 | --annotation-file ./eval/seed-bench/SEED-Bench.json \ 43 | --result-file $output_file \ 44 | --result-upload-file ./eval/seed-bench/answers_upload/$TARGET_DIR.jsonl | tee 2>&1 ./eval/seed-bench/scores/$TARGET_DIR.txt -------------------------------------------------------------------------------- /script/eval/full/vqav2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | MODEL_TYPE=phi-2 9 | TARGET_DIR=bunny-phi-2 10 | 11 | SPLIT="bunny_vqav2_mscoco_test-dev2015" 12 | 13 | for IDX in $(seq 0 $((CHUNKS-1))); do 14 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m bunny.eval.model_vqa_loader \ 15 | --model-path ./checkpoints-$MODEL_TYPE/$TARGET_DIR \ 16 | --model-type $MODEL_TYPE \ 17 | --question-file ./eval/vqav2/$SPLIT.jsonl \ 18 | --image-folder ./eval/vqav2/test2015 \ 19 | --answers-file ./eval/vqav2/answers/$SPLIT/$TARGET_DIR/${CHUNKS}_${IDX}.jsonl \ 20 | --num-chunks $CHUNKS \ 21 | --chunk-idx $IDX \ 22 | --temperature 0 \ 23 | --conv-mode bunny & 24 | done 25 | 26 | wait 27 | 28 | output_file=./eval/vqav2/answers/$SPLIT/$TARGET_DIR/merge.jsonl 29 | 30 | # Clear out the output file if it exists. 31 | > "$output_file" 32 | 33 | # Loop through the indices and concatenate each file. 
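# e.g. with CUDA_VISIBLE_DEVICES=0,1,2,3 this concatenates 4_0.jsonl ... 4_3.jsonl into merge.jsonl, which convert_vqav2_for_submission.py below turns into the upload JSON.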
34 | for IDX in $(seq 0 $((CHUNKS-1))); do 35 | cat ./eval/vqav2/answers/$SPLIT/$TARGET_DIR/${CHUNKS}_${IDX}.jsonl >> "$output_file" 36 | done 37 | 38 | python eval/vqav2/convert_vqav2_for_submission.py --split $SPLIT --ckpt $TARGET_DIR 39 | 40 | -------------------------------------------------------------------------------- /script/eval/lora/cmmmu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SPLIT="val" 4 | MODEL_TYPE=phi-2 5 | MODEL_BASE=/path/to/base_llm_model 6 | TARGET_DIR=bunny-lora-phi-2 7 | 8 | python -m bunny.eval.model_vqa_cmmmu \ 9 | --model-path ./checkpoints-$MODEL_TYPE/$TARGET_DIR \ 10 | --model-base $MODEL_BASE \ 11 | --model-type $MODEL_TYPE \ 12 | --data-path ./eval/cmmmu/CMMMU \ 13 | --config-path ./eval/cmmmu/prompt.yaml \ 14 | --output-path ./eval/cmmmu/answers_upload/$SPLIT/$TARGET_DIR.jsonl \ 15 | --split $SPLIT \ 16 | --conv-mode bunny 17 | -------------------------------------------------------------------------------- /script/eval/lora/gqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | MODEL_TYPE=phi-2 9 | MODEL_BASE=/path/to/base_llm_model 10 | TARGET_DIR=bunny-lora-phi-2 11 | 12 | SPLIT="bunny_gqa_testdev_balanced" 13 | 14 | for IDX in $(seq 0 $((CHUNKS-1))); do 15 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m bunny.eval.model_vqa_loader \ 16 | --model-path ./checkpoints-$MODEL_TYPE/$TARGET_DIR \ 17 | --model-base $MODEL_BASE \ 18 | --model-type $MODEL_TYPE \ 19 | --question-file ./eval/gqa/$SPLIT.jsonl \ 20 | --image-folder ./eval/gqa/images \ 21 | --answers-file ./eval/gqa/answers/$SPLIT/$TARGET_DIR/${CHUNKS}_${IDX}.jsonl \ 22 | --num-chunks $CHUNKS \ 23 | --chunk-idx $IDX \ 24 | --temperature 0 \ 25 | --conv-mode bunny & 26 | done 27 | 28 | wait 29 | 30 | output_file=./eval/gqa/answers/$SPLIT/$TARGET_DIR/merge.jsonl 31 | 32 | # Clear out the output file if it exists. 33 | > "$output_file" 34 | 35 | # Loop through the indices and concatenate each file. 
36 | for IDX in $(seq 0 $((CHUNKS-1))); do 37 | cat ./eval/gqa/answers/$SPLIT/$TARGET_DIR/${CHUNKS}_${IDX}.jsonl >> "$output_file" 38 | done 39 | 40 | python eval/gqa/convert_gqa_for_eval.py --src $output_file --dst ./eval/gqa/testdev_balanced_predictions.json 41 | 42 | cd eval/gqa 43 | python eval_gqa.py --tier testdev_balanced 44 | -------------------------------------------------------------------------------- /script/eval/lora/mmbench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SPLIT="MMBench_DEV_EN_legacy" 4 | LANG=en 5 | MODEL_TYPE=phi-2 6 | MODEL_BASE=/path/to/base_llm_model 7 | TARGET_DIR=bunny-lora-phi-2 8 | 9 | 10 | python -m bunny.eval.model_vqa_mmbench \ 11 | --model-path ./checkpoints-$MODEL_TYPE/$TARGET_DIR \ 12 | --model-base $MODEL_BASE \ 13 | --model-type $MODEL_TYPE \ 14 | --question-file ./eval/mmbench/$SPLIT.tsv \ 15 | --answers-file ./eval/mmbench/answers/$SPLIT/$TARGET_DIR.jsonl \ 16 | --lang $LANG \ 17 | --single-pred-prompt \ 18 | --temperature 0 \ 19 | --conv-mode bunny 20 | 21 | mkdir -p eval/mmbench/answers_upload/$SPLIT 22 | 23 | python eval/mmbench/convert_mmbench_for_submission.py \ 24 | --annotation-file ./eval/mmbench/$SPLIT.tsv \ 25 | --result-dir ./eval/mmbench/answers/$SPLIT \ 26 | --upload-dir ./eval/mmbench/answers_upload/$SPLIT \ 27 | --experiment $TARGET_DIR 28 | -------------------------------------------------------------------------------- /script/eval/lora/mme.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MODEL_TYPE=phi-2 4 | MODEL_BASE=/path/to/base_llm_model 5 | TARGET_DIR=bunny-lora-phi-2 6 | 7 | python -m bunny.eval.model_vqa_loader \ 8 | --model-path ./checkpoints-$MODEL_TYPE/$TARGET_DIR \ 9 | --model-base $MODEL_BASE \ 10 | --model-type $MODEL_TYPE \ 11 | --image-folder ./eval/mme/MME_Benchmark_release_version \ 12 | --question-file ./eval/mme/bunny_mme.jsonl \ 13 | --answers-file ./eval/mme/answers/$TARGET_DIR.jsonl \ 14 | --temperature 0 \ 15 | --conv-mode bunny 16 | 17 | cd ./eval/mme 18 | 19 | python convert_answer_to_mme.py --experiment $TARGET_DIR 20 | 21 | python calculation_mme.py --results_dir answers_upload/$TARGET_DIR \ 22 | | tee 2>&1 answers_upload/$TARGET_DIR/res.txt 23 | -------------------------------------------------------------------------------- /script/eval/lora/mmmu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SPLIT="test" 4 | MODEL_TYPE=phi-2 5 | MODEL_BASE=/path/to/base_llm_model 6 | TARGET_DIR=bunny-lora-phi-2 7 | 8 | python -m bunny.eval.model_vqa_mmmu \ 9 | --model-path ./checkpoints-$MODEL_TYPE/$TARGET_DIR \ 10 | --model-base $MODEL_BASE \ 11 | --model-type $MODEL_TYPE \ 12 | --data-path ./eval/mmmu/MMMU \ 13 | --config-path ./eval/mmmu/config.yaml \ 14 | --output-path ./eval/mmmu/answers_upload/$SPLIT/$TARGET_DIR.json \ 15 | --split $SPLIT \ 16 | --conv-mode bunny 17 | -------------------------------------------------------------------------------- /script/eval/lora/mmvet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MODEL_TYPE=phi-2 4 | MODEL_BASE=/path/to/base_llm_model 5 | TARGET_DIR=bunny-lora-phi-2 6 | 7 | python -m bunny.eval.model_vqa \ 8 | --model-path ./checkpoints-$MODEL_TYPE/$TARGET_DIR \ 9 | --model-base $MODEL_BASE \ 10 | --model-type $MODEL_TYPE \ 11 | --question-file ./eval/mm-vet/bunny-mm-vet.jsonl \ 12 | --image-folder ./eval/mm-vet/images \ 13 | 
--answers-file ./eval/mm-vet/answers/$TARGET_DIR.jsonl \ 14 | --temperature 0 \ 15 | --conv-mode bunny 16 | 17 | mkdir -p ./eval/mm-vet/answers_upload 18 | 19 | python ./eval/mm-vet/convert_mmvet_for_eval.py \ 20 | --src ./eval/mm-vet/answers/$TARGET_DIR.jsonl \ 21 | --dst ./eval/mm-vet/answers_upload/$TARGET_DIR.json -------------------------------------------------------------------------------- /script/eval/lora/pope.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MODEL_TYPE=phi-2 4 | MODEL_BASE=/path/to/base_llm_model 5 | TARGET_DIR=bunny-lora-phi-2 6 | 7 | python -m bunny.eval.model_vqa_loader \ 8 | --model-path ./checkpoints-$MODEL_TYPE/$TARGET_DIR \ 9 | --model-base $MODEL_BASE \ 10 | --model-type $MODEL_TYPE \ 11 | --question-file ./eval/pope/bunny_pope_test.jsonl \ 12 | --image-folder ./eval/pope/val2014 \ 13 | --answers-file ./eval/pope/answers/$TARGET_DIR.jsonl \ 14 | --temperature 0 \ 15 | --conv-mode bunny 16 | 17 | python eval/pope/eval_pope.py \ 18 | --annotation-dir ./eval/pope/coco \ 19 | --question-file ./eval/pope/bunny_pope_test.jsonl \ 20 | --result-file ./eval/pope/answers/$TARGET_DIR.jsonl 21 | -------------------------------------------------------------------------------- /script/eval/lora/scienceqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MODEL_TYPE=phi-2 4 | MODEL_BASE=/path/to/base_llm_model 5 | TARGET_DIR=bunny-lora-phi-2 6 | 7 | python -m bunny.eval.model_vqa_science \ 8 | --model-path ./checkpoints-$MODEL_TYPE/$TARGET_DIR \ 9 | --model-base $MODEL_BASE \ 10 | --model-type $MODEL_TYPE \ 11 | --question-file ./eval/scienceqa/bunny_test_CQM-A.json \ 12 | --image-folder ./eval/scienceqa/test \ 13 | --answers-file ./eval/scienceqa/answers/$TARGET_DIR.jsonl \ 14 | --single-pred-prompt \ 15 | --temperature 0 \ 16 | --conv-mode bunny 17 | 18 | mkdir -p ./eval/scienceqa/outputs/ 19 | mkdir -p ./eval/scienceqa/results/ 20 | 21 | python ./eval/scienceqa/eval_science_qa.py \ 22 | --base-dir ./eval/scienceqa \ 23 | --result-file ./eval/scienceqa/answers/$TARGET_DIR.jsonl \ 24 | --output-file ./eval/scienceqa/outputs/$TARGET_DIR.jsonl \ 25 | --output-result ./eval/scienceqa/results/$TARGET_DIR.json -------------------------------------------------------------------------------- /script/eval/lora/seedbench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | 9 | MODEL_TYPE=phi-2 10 | MODEL_BASE=/path/to/base_llm_model 11 | TARGET_DIR=bunny-lora-phi-2 12 | 13 | for IDX in $(seq 0 $((CHUNKS-1))); do 14 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m bunny.eval.model_vqa_loader \ 15 | --model-path ./checkpoints-$MODEL_TYPE/$TARGET_DIR \ 16 | --model-base $MODEL_BASE \ 17 | --model-type $MODEL_TYPE \ 18 | --question-file ./eval/seed-bench/bunny-seed-bench.jsonl \ 19 | --image-folder ./eval/seed-bench \ 20 | --answers-file ./eval/seed-bench/answers/$TARGET_DIR/${CHUNKS}_${IDX}.jsonl \ 21 | --num-chunks $CHUNKS \ 22 | --chunk-idx $IDX \ 23 | --temperature 0 \ 24 | --conv-mode bunny & 25 | done 26 | 27 | wait 28 | 29 | output_file=./eval/seed-bench/answers/$TARGET_DIR/merge.jsonl 30 | 31 | # Clear out the output file if it exists. 32 | > "$output_file" 33 | 34 | # Loop through the indices and concatenate each file. 
35 | for IDX in $(seq 0 $((CHUNKS-1))); do 36 | cat ./eval/seed-bench/answers/$TARGET_DIR/${CHUNKS}_${IDX}.jsonl >> "$output_file" 37 | done 38 | 39 | mkdir -p ./eval/seed-bench/answers_upload 40 | mkdir -p ./eval/seed-bench/scores 41 | 42 | # Evaluate 43 | python ./eval/seed-bench/convert_seed_for_submission.py \ 44 | --annotation-file ./eval/seed-bench/SEED-Bench.json \ 45 | --result-file $output_file \ 46 | --result-upload-file ./eval/seed-bench/answers_upload/$TARGET_DIR.jsonl | tee 2>&1 ./eval/seed-bench/scores/$TARGET_DIR.txt -------------------------------------------------------------------------------- /script/eval/lora/vqav2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | MODEL_TYPE=phi-2 9 | MODEL_BASE=/path/to/base_llm_model 10 | TARGET_DIR=bunny-lora-phi-2 11 | 12 | SPLIT="bunny_vqav2_mscoco_test-dev2015" 13 | 14 | for IDX in $(seq 0 $((CHUNKS-1))); do 15 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m bunny.eval.model_vqa_loader \ 16 | --model-path ./checkpoints-$MODEL_TYPE/$TARGET_DIR \ 17 | --model-base $MODEL_BASE \ 18 | --model-type $MODEL_TYPE \ 19 | --question-file ./eval/vqav2/$SPLIT.jsonl \ 20 | --image-folder ./eval/vqav2/test2015 \ 21 | --answers-file ./eval/vqav2/answers/$SPLIT/$TARGET_DIR/${CHUNKS}_${IDX}.jsonl \ 22 | --num-chunks $CHUNKS \ 23 | --chunk-idx $IDX \ 24 | --temperature 0 \ 25 | --conv-mode bunny & 26 | done 27 | 28 | wait 29 | 30 | output_file=./eval/vqav2/answers/$SPLIT/$TARGET_DIR/merge.jsonl 31 | 32 | # Clear out the output file if it exists. 33 | > "$output_file" 34 | 35 | # Loop through the indices and concatenate each file. 
36 | for IDX in $(seq 0 $((CHUNKS-1))); do 37 | cat ./eval/vqav2/answers/$SPLIT/$TARGET_DIR/${CHUNKS}_${IDX}.jsonl >> "$output_file" 38 | done 39 | 40 | python eval/vqav2/convert_vqav2_for_submission.py --split $SPLIT --ckpt $TARGET_DIR 41 | 42 | -------------------------------------------------------------------------------- /script/merge_lora_weights.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | from bunny.model.builder import load_pretrained_model 4 | from bunny.util.mm_utils import get_model_name_from_path 5 | 6 | 7 | def merge_lora(args): 8 | model_path = os.path.expanduser(args.model_path) 9 | model_name = get_model_name_from_path(model_path) 10 | tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name, 11 | args.model_type) 12 | 13 | model.save_pretrained(args.save_model_path) 14 | tokenizer.save_pretrained(args.save_model_path) 15 | 16 | 17 | if __name__ == "__main__": 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument("--model-path", type=str, required=True) 20 | parser.add_argument("--model-base", type=str, required=True) 21 | parser.add_argument("--model-type", type=str, required=True) 22 | parser.add_argument("--save-model-path", type=str, required=True) 23 | 24 | args = parser.parse_args() 25 | 26 | merge_lora(args) 27 | -------------------------------------------------------------------------------- /script/train/finetune_full.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MODEL_TYPE=phi-2 4 | 5 | PRETRAIN_DIR=bunny-$MODEL_TYPE-pretrain 6 | OUTPUT_DIR=bunny-$MODEL_TYPE 7 | 8 | mkdir -p ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR 9 | 10 | deepspeed bunny/train/train.py \ 11 | --deepspeed ./script/deepspeed/zero3.json \ 12 | --model_name_or_path /path/to/base_llm_model \ 13 | --model_type $MODEL_TYPE \ 14 | --version bunny \ 15 | --data_path ./data/finetune/bunny_695k.json \ 16 | --image_folder ./data/finetune/images \ 17 | --vision_tower /path/to/siglip-so400m-patch14-384 \ 18 | --pretrain_mm_mlp_adapter ./checkpoints-pretrain/$PRETRAIN_DIR/mm_projector.bin \ 19 | --mm_projector_type mlp2x_gelu \ 20 | --image_aspect_ratio pad \ 21 | --group_by_modality_length False \ 22 | --bf16 True \ 23 | --output_dir ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR \ 24 | --num_train_epochs 1 \ 25 | --per_device_train_batch_size 8 \ 26 | --per_device_eval_batch_size 4 \ 27 | --gradient_accumulation_steps 2 \ 28 | --evaluation_strategy "no" \ 29 | --save_strategy "steps" \ 30 | --save_steps 500 \ 31 | --save_total_limit 1 \ 32 | --learning_rate 2e-5 \ 33 | --weight_decay 0. 
\ 34 | --warmup_ratio 0.03 \ 35 | --lr_scheduler_type "cosine" \ 36 | --logging_steps 1 \ 37 | --tf32 True \ 38 | --model_max_length 2048 \ 39 | --gradient_checkpointing True \ 40 | --dataloader_num_workers 4 \ 41 | --lazy_preprocess True \ 42 | --report_to none | tee 2>&1 ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR/log.txt 43 | -------------------------------------------------------------------------------- /script/train/finetune_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MODEL_TYPE=phi-2 4 | 5 | PRETRAIN_DIR=bunny-$MODEL_TYPE-pretrain 6 | OUTPUT_DIR=bunny-lora-$MODEL_TYPE 7 | 8 | mkdir -p ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR 9 | 10 | deepspeed bunny/train/train.py \ 11 | --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 2e-5 \ 12 | --deepspeed ./script/deepspeed/zero3.json \ 13 | --model_name_or_path /path/to/base_llm_model \ 14 | --model_type $MODEL_TYPE \ 15 | --version bunny \ 16 | --data_path ./data/finetune/bunny_695k.json \ 17 | --image_folder ./data/finetune/images \ 18 | --vision_tower /path/to/siglip-so400m-patch14-384 \ 19 | --pretrain_mm_mlp_adapter ./checkpoints-pretrain/$PRETRAIN_DIR/mm_projector.bin \ 20 | --mm_projector_type mlp2x_gelu \ 21 | --image_aspect_ratio pad \ 22 | --group_by_modality_length False \ 23 | --bf16 True \ 24 | --output_dir ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR \ 25 | --num_train_epochs 1 \ 26 | --per_device_train_batch_size 8 \ 27 | --per_device_eval_batch_size 4 \ 28 | --gradient_accumulation_steps 2 \ 29 | --evaluation_strategy "no" \ 30 | --save_strategy "steps" \ 31 | --save_steps 500 \ 32 | --save_total_limit 1 \ 33 | --learning_rate 2e-4 \ 34 | --weight_decay 0. \ 35 | --warmup_ratio 0.03 \ 36 | --lr_scheduler_type "cosine" \ 37 | --logging_steps 1 \ 38 | --tf32 True \ 39 | --model_max_length 2048 \ 40 | --gradient_checkpointing True \ 41 | --dataloader_num_workers 4 \ 42 | --lazy_preprocess True \ 43 | --report_to none | tee 2>&1 ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR/log.txt -------------------------------------------------------------------------------- /script/train/pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MODEL_TYPE=phi-2 4 | OUTPUT_DIR=bunny-$MODEL_TYPE-pretrain 5 | 6 | mkdir -p ./checkpoints-pretrain/$OUTPUT_DIR 7 | 8 | deepspeed bunny/train/train.py \ 9 | --deepspeed ./script/deepspeed/zero2.json \ 10 | --model_name_or_path /path/to/base_llm_model \ 11 | --model_type $MODEL_TYPE \ 12 | --version plain \ 13 | --data_path ./data/pretrain/bunny_pretrain_laion_2m.json \ 14 | --image_folder ./data/pretrain/images \ 15 | --vision_tower /path/to/siglip-so400m-patch14-384 \ 16 | --mm_projector_type mlp2x_gelu \ 17 | --tune_mm_mlp_adapter True \ 18 | --image_aspect_ratio square \ 19 | --bf16 True \ 20 | --output_dir ./checkpoints-pretrain/$OUTPUT_DIR \ 21 | --num_train_epochs 1 \ 22 | --per_device_train_batch_size 8 \ 23 | --per_device_eval_batch_size 4 \ 24 | --gradient_accumulation_steps 4 \ 25 | --evaluation_strategy "no" \ 26 | --save_strategy "steps" \ 27 | --save_steps 24000 \ 28 | --save_total_limit 1 \ 29 | --learning_rate 5e-4 \ 30 | --weight_decay 0. 
\ 31 | --warmup_ratio 0.03 \ 32 | --lr_scheduler_type "cosine" \ 33 | --logging_steps 1 \ 34 | --tf32 True \ 35 | --model_max_length 2048 \ 36 | --gradient_checkpointing True \ 37 | --dataloader_num_workers 4 \ 38 | --lazy_preprocess True \ 39 | --report_to none | tee 2>&1 ./checkpoints-pretrain/$OUTPUT_DIR/log.txt 40 | -------------------------------------------------------------------------------- /script/train/tutorials/Bunny-Llama-3-8B-V.md: -------------------------------------------------------------------------------- 1 | # Training Tutorial of Bunny-Llama-3-8B-V 2 | 3 | ![Bunny-Llama-3-8B-V](assets/Bunny-Llama-3-8B-V.png) 4 | 5 | ## Pretrain 6 | 7 | ```shell 8 | #!/bin/bash 9 | 10 | MODEL_TYPE=llama3-8b 11 | OUTPUT_DIR=bunny-$MODEL_TYPE-pretrain 12 | 13 | mkdir -p ./checkpoints-pretrain/$OUTPUT_DIR 14 | 15 | deepspeed bunny/train/train.py \ 16 | --deepspeed ./script/deepspeed/zero2.json \ 17 | --model_name_or_path /path/to/meta-llama/Meta-Llama-3-8B-Instruct \ 18 | --model_type $MODEL_TYPE \ 19 | --version plain \ 20 | --data_path ./data/pretrain/bunny_pretrain_laion_2m.json \ 21 | --image_folder ./data/pretrain/images \ 22 | --vision_tower /path/to/siglip-so400m-patch14-384 \ 23 | --mm_projector_type mlp2x_gelu \ 24 | --tune_mm_mlp_adapter True \ 25 | --image_aspect_ratio square \ 26 | --bf16 True \ 27 | --output_dir ./checkpoints-pretrain/$OUTPUT_DIR \ 28 | --num_train_epochs 1 \ 29 | --per_device_train_batch_size 8 \ 30 | --per_device_eval_batch_size 4 \ 31 | --gradient_accumulation_steps 4 \ 32 | --evaluation_strategy "no" \ 33 | --save_strategy "steps" \ 34 | --save_steps 24000 \ 35 | --save_total_limit 1 \ 36 | --learning_rate 1e-3 \ 37 | --weight_decay 0. \ 38 | --warmup_ratio 0.03 \ 39 | --lr_scheduler_type "cosine" \ 40 | --logging_steps 1 \ 41 | --tf32 True \ 42 | --model_max_length 2048 \ 43 | --gradient_checkpointing True \ 44 | --dataloader_num_workers 4 \ 45 | --lazy_preprocess True \ 46 | --report_to none | tee 2>&1 ./checkpoints-pretrain/$OUTPUT_DIR/log.txt 47 | ``` 48 | 49 | ## Visual Instruction Tuning 50 | 51 | ### Recipe-1 52 | 53 | ```shell 54 | #!/bin/bash 55 | 56 | MODEL_TYPE=llama3-8b 57 | 58 | PRETRAIN_DIR=bunny-$MODEL_TYPE-pretrain 59 | OUTPUT_DIR=bunny-lora-$MODEL_TYPE-recipe-1 60 | 61 | mkdir -p ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR 62 | 63 | deepspeed bunny/train/train.py \ 64 | --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 2e-5 \ 65 | --deepspeed ./script/deepspeed/zero3.json \ 66 | --model_name_or_path /path/to/meta-llama/Meta-Llama-3-8B-Instruct \ 67 | --model_type $MODEL_TYPE \ 68 | --version llama \ 69 | --data_path ./data/finetune/bunny_695k.json \ 70 | --image_folder ./data/finetune/images \ 71 | --vision_tower /path/to/siglip-so400m-patch14-384 \ 72 | --pretrain_mm_mlp_adapter ./checkpoints-pretrain/$PRETRAIN_DIR/mm_projector.bin \ 73 | --mm_projector_type mlp2x_gelu \ 74 | --image_aspect_ratio pad \ 75 | --group_by_modality_length False \ 76 | --bf16 True \ 77 | --output_dir ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR \ 78 | --num_train_epochs 1 \ 79 | --per_device_train_batch_size 8 \ 80 | --per_device_eval_batch_size 4 \ 81 | --gradient_accumulation_steps 2 \ 82 | --evaluation_strategy "no" \ 83 | --save_strategy "steps" \ 84 | --save_steps 500 \ 85 | --save_total_limit 1 \ 86 | --learning_rate 2e-4 \ 87 | --weight_decay 0. 
\ 88 | --warmup_ratio 0.03 \ 89 | --lr_scheduler_type "cosine" \ 90 | --logging_steps 1 \ 91 | --tf32 True \ 92 | --model_max_length 2048 \ 93 | --gradient_checkpointing True \ 94 | --dataloader_num_workers 4 \ 95 | --lazy_preprocess True \ 96 | --report_to none | tee 2>&1 ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR/log.txt 97 | ``` 98 | 99 | ### Recipe-2 100 | 101 | ```shell 102 | #!/bin/bash 103 | 104 | MODEL_TYPE=llama3-8b 105 | 106 | PRETRAIN_DIR=bunny-$MODEL_TYPE-pretrain 107 | OUTPUT_DIR=bunny-lora-$MODEL_TYPE-recipe-2 108 | 109 | mkdir -p ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR 110 | 111 | deepspeed bunny/train/train.py \ 112 | --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 1e-5 \ 113 | --deepspeed ./script/deepspeed/zero3.json \ 114 | --model_name_or_path /path/to/meta-llama/Meta-Llama-3-8B-Instruct \ 115 | --model_type $MODEL_TYPE \ 116 | --version llama \ 117 | --data_path ./data/finetune/bunny_llava_1.4m.json \ 118 | --image_folder ./data/finetune/images \ 119 | --vision_tower /path/to/siglip-so400m-patch14-384 \ 120 | --unfreeze_vision_tower True \ 121 | --pretrain_mm_mlp_adapter ./checkpoints-pretrain/$PRETRAIN_DIR/mm_projector.bin \ 122 | --mm_projector_type mlp2x_gelu \ 123 | --image_aspect_ratio pad \ 124 | --group_by_modality_length False \ 125 | --bf16 True \ 126 | --output_dir ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR \ 127 | --num_train_epochs 1 \ 128 | --per_device_train_batch_size 8 \ 129 | --per_device_eval_batch_size 4 \ 130 | --gradient_accumulation_steps 2 \ 131 | --evaluation_strategy "no" \ 132 | --save_strategy "steps" \ 133 | --save_steps 500 \ 134 | --save_total_limit 1 \ 135 | --learning_rate 1e-4 \ 136 | --weight_decay 0. \ 137 | --warmup_ratio 0.03 \ 138 | --lr_scheduler_type "cosine" \ 139 | --logging_steps 1 \ 140 | --tf32 True \ 141 | --model_max_length 2048 \ 142 | --gradient_checkpointing True \ 143 | --dataloader_num_workers 4 \ 144 | --lazy_preprocess True \ 145 | --report_to none | tee 2>&1 ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR/log.txt 146 | ``` 147 | 148 | ### Weight Merging 149 | 150 | * Firstly, merge the LoRA weights into the base LLM 151 | 152 | ```shell 153 | python script/merge_lora_weights.py \ 154 | --model-path ./checkpoints-llama3-8b/bunny-lora-llama3-8b-recipe-1 \ 155 | --model-base /path/to/meta-llama/Meta-Llama-3-8B-Instruct \ 156 | --model-type llama3-8b \ 157 | --save-model-path ./checkpoints-llama3-8b/bunny-llama3-8b-recipe-1 158 | ``` 159 | 160 | ```shell 161 | python script/merge_lora_weights.py \ 162 | --model-path ./checkpoints-llama3-8b/bunny-lora-llama3-8b-recipe-2 \ 163 | --model-base /path/to/meta-llama/Meta-Llama-3-8B-Instruct \ 164 | --model-type llama3-8b \ 165 | --save-model-path ./checkpoints-llama3-8b/bunny-llama3-8b-recipe-2 166 | ``` 167 | 168 | * Then, inherit the configuration files from recipe-2 by copying its merged checkpoint 169 | 170 | ```shell 171 | cp -r ./checkpoints-llama3-8b/bunny-llama3-8b-recipe-2 ./checkpoints-llama3-8b/bunny-llama3-8b-avg 172 | ``` 173 | 174 | * Lastly, linearly average the two merged models 175 | 176 | ```python 177 | from safetensors.torch import load_file, save_file 178 | 179 | total = 4 180 | for i in range(1, total + 1): 181 | model_1 = load_file(f'./checkpoints-llama3-8b/bunny-llama3-8b-recipe-1/model-{i:05d}-of-{total:05d}.safetensors') 182 | model_2 = load_file(f'./checkpoints-llama3-8b/bunny-llama3-8b-recipe-2/model-{i:05d}-of-{total:05d}.safetensors') 183 | 184 | assert model_1.keys() == model_2.keys() 185 | 186 | avg = {} 187 | for k in model_1.keys(): 188 | avg[k] = model_1[k] * 0.5 + model_2[k] * 0.5 # the 
weight factor is selected empirically 189 | 190 | save_file(avg, f'./checkpoints-llama3-8b/bunny-llama3-8b-avg/model-{i:05d}-of-{total:05d}.safetensors', {'format': 'pt'}) 191 | ``` 192 | 193 | 194 | -------------------------------------------------------------------------------- /script/train/tutorials/Bunny-v1.0-4B.md: -------------------------------------------------------------------------------- 1 | # Training Tutorial of Bunny-v1.0-4B 2 | 3 | ![Bunny-v1.0-4B](assets/Bunny-v1.0-4B.png) 4 | 5 | ## Pretrain 6 | 7 | ```shell 8 | #!/bin/bash 9 | 10 | MODEL_TYPE=phi-3 11 | OUTPUT_DIR=bunny-$MODEL_TYPE-pretrain 12 | 13 | mkdir -p ./checkpoints-pretrain/$OUTPUT_DIR 14 | 15 | deepspeed bunny/train/train.py \ 16 | --deepspeed ./script/deepspeed/zero2.json \ 17 | --model_name_or_path /path/to/microsoft/Phi-3-mini-4k-instruct \ 18 | --model_type $MODEL_TYPE \ 19 | --version plain \ 20 | --data_path ./data/pretrain/bunny_pretrain_laion_2m.json \ 21 | --image_folder ./data/pretrain/images \ 22 | --vision_tower /path/to/siglip-so400m-patch14-384 \ 23 | --mm_projector_type mlp2x_gelu \ 24 | --tune_mm_mlp_adapter True \ 25 | --image_aspect_ratio square \ 26 | --bf16 True \ 27 | --output_dir ./checkpoints-pretrain/$OUTPUT_DIR \ 28 | --num_train_epochs 1 \ 29 | --per_device_train_batch_size 8 \ 30 | --per_device_eval_batch_size 4 \ 31 | --gradient_accumulation_steps 4 \ 32 | --evaluation_strategy "no" \ 33 | --save_strategy "steps" \ 34 | --save_steps 24000 \ 35 | --save_total_limit 1 \ 36 | --learning_rate 1e-3 \ 37 | --weight_decay 0. \ 38 | --warmup_ratio 0.03 \ 39 | --lr_scheduler_type "cosine" \ 40 | --logging_steps 1 \ 41 | --tf32 True \ 42 | --model_max_length 2048 \ 43 | --gradient_checkpointing True \ 44 | --dataloader_num_workers 4 \ 45 | --lazy_preprocess True \ 46 | --report_to none | tee 2>&1 ./checkpoints-pretrain/$OUTPUT_DIR/log.txt 47 | ``` 48 | 49 | ## Visual Instruction Tuning 50 | 51 | ### Recipe-1 52 | 53 | ```shell 54 | #!/bin/bash 55 | 56 | MODEL_TYPE=phi-3 57 | 58 | PRETRAIN_DIR=bunny-$MODEL_TYPE-pretrain 59 | OUTPUT_DIR=bunny-lora-$MODEL_TYPE-recipe-1 60 | 61 | mkdir -p ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR 62 | 63 | deepspeed bunny/train/train.py \ 64 | --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 2e-5 \ 65 | --deepspeed ./script/deepspeed/zero3.json \ 66 | --model_name_or_path /path/to/microsoft/Phi-3-mini-4k-instruct \ 67 | --model_type $MODEL_TYPE \ 68 | --version phi3 \ 69 | --data_path ./data/finetune/bunny_695k.json \ 70 | --image_folder ./data/finetune/images \ 71 | --vision_tower /path/to/siglip-so400m-patch14-384 \ 72 | --pretrain_mm_mlp_adapter ./checkpoints-pretrain/$PRETRAIN_DIR/mm_projector.bin \ 73 | --mm_projector_type mlp2x_gelu \ 74 | --image_aspect_ratio pad \ 75 | --group_by_modality_length False \ 76 | --bf16 True \ 77 | --output_dir ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR \ 78 | --num_train_epochs 1 \ 79 | --per_device_train_batch_size 4 \ 80 | --per_device_eval_batch_size 4 \ 81 | --gradient_accumulation_steps 4 \ 82 | --evaluation_strategy "no" \ 83 | --save_strategy "steps" \ 84 | --save_steps 500 \ 85 | --save_total_limit 1 \ 86 | --learning_rate 2e-4 \ 87 | --weight_decay 0. 
\ 88 | --warmup_ratio 0.03 \ 89 | --lr_scheduler_type "cosine" \ 90 | --logging_steps 1 \ 91 | --tf32 True \ 92 | --model_max_length 4096 \ 93 | --gradient_checkpointing True \ 94 | --dataloader_num_workers 4 \ 95 | --lazy_preprocess True \ 96 | --report_to none | tee 2>&1 ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR/log.txt 97 | ``` 98 | 99 | ### Recipe-2 100 | 101 | ```shell 102 | #!/bin/bash 103 | 104 | MODEL_TYPE=phi-3 105 | 106 | PRETRAIN_DIR=bunny-$MODEL_TYPE-pretrain 107 | OUTPUT_DIR=bunny-lora-$MODEL_TYPE-recipe-2 108 | 109 | mkdir -p ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR 110 | 111 | deepspeed bunny/train/train.py \ 112 | --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 2e-5 \ 113 | --deepspeed ./script/deepspeed/zero3.json \ 114 | --model_name_or_path /path/to/microsoft/Phi-3-mini-4k-instruct \ 115 | --model_type $MODEL_TYPE \ 116 | --version phi3 \ 117 | --data_path ./data/finetune/bunny_llava_1.4m.json \ 118 | --image_folder ./data/finetune/images \ 119 | --vision_tower /path/to/siglip-so400m-patch14-384 \ 120 | --unfreeze_vision_tower True \ 121 | --pretrain_mm_mlp_adapter ./checkpoints-pretrain/$PRETRAIN_DIR/mm_projector.bin \ 122 | --mm_projector_type mlp2x_gelu \ 123 | --image_aspect_ratio pad \ 124 | --group_by_modality_length False \ 125 | --bf16 True \ 126 | --output_dir ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR \ 127 | --num_train_epochs 1 \ 128 | --per_device_train_batch_size 4 \ 129 | --per_device_eval_batch_size 4 \ 130 | --gradient_accumulation_steps 4 \ 131 | --evaluation_strategy "no" \ 132 | --save_strategy "steps" \ 133 | --save_steps 500 \ 134 | --save_total_limit 1 \ 135 | --learning_rate 2e-4 \ 136 | --weight_decay 0. \ 137 | --warmup_ratio 0.03 \ 138 | --lr_scheduler_type "cosine" \ 139 | --logging_steps 1 \ 140 | --tf32 True \ 141 | --model_max_length 4096 \ 142 | --gradient_checkpointing True \ 143 | --dataloader_num_workers 4 \ 144 | --lazy_preprocess True \ 145 | --report_to none | tee 2>&1 ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR/log.txt 146 | ``` 147 | 148 | ### Weight Merging 149 | 150 | * Firstly, merge the LoRA weights into the base LLM 151 | 152 | ```shell 153 | python script/merge_lora_weights.py \ 154 | --model-path ./checkpoints-phi-3/bunny-lora-phi-3-recipe-1 \ 155 | --model-base /path/to/microsoft/Phi-3-mini-4k-instruct \ 156 | --model-type phi-3 \ 157 | --save-model-path ./checkpoints-phi-3/bunny-phi-3-recipe-1 158 | ``` 159 | 160 | ```shell 161 | python script/merge_lora_weights.py \ 162 | --model-path ./checkpoints-phi-3/bunny-lora-phi-3-recipe-2 \ 163 | --model-base /path/to/microsoft/Phi-3-mini-4k-instruct \ 164 | --model-type phi-3 \ 165 | --save-model-path ./checkpoints-phi-3/bunny-phi-3-recipe-2 166 | ``` 167 | 168 | * Then, inherit the configuration files from recipe-2 by copying its merged checkpoint 169 | 170 | ```shell 171 | cp -r ./checkpoints-phi-3/bunny-phi-3-recipe-2 ./checkpoints-phi-3/bunny-phi-3-avg 172 | ``` 173 | 174 | * Lastly, linearly average the two merged models 175 | 176 | ```python 177 | from safetensors.torch import load_file, save_file 178 | 179 | total = 2 180 | for i in range(1, total + 1): 181 | model_1 = load_file(f'./checkpoints-phi-3/bunny-phi-3-recipe-1/model-{i:05d}-of-{total:05d}.safetensors') 182 | model_2 = load_file(f'./checkpoints-phi-3/bunny-phi-3-recipe-2/model-{i:05d}-of-{total:05d}.safetensors') 183 | 184 | assert model_1.keys() == model_2.keys() 185 | 186 | avg = {} 187 | for k in model_1.keys(): 188 | avg[k] = model_1[k] * 0.3 + model_2[k] * 0.7 # the weight factor is selected empirically 189 | 190 | save_file(avg, 
f'./checkpoints-phi-3/bunny-phi-3-avg/model-{i:05d}-of-{total:05d}.safetensors', {'format': 'pt'}) 191 | ``` 192 | 193 | 194 | -------------------------------------------------------------------------------- /script/train/tutorials/Bunny-v1.1-4B.md: -------------------------------------------------------------------------------- 1 | # Training Tutorial of Bunny-v1.1-4B 2 | 3 | ![Bunny-v1.1-4B](assets/Bunny-v1.1-4B.png) 4 | 5 | ## Pretrain 6 | 7 | ```shell 8 | #!/bin/bash 9 | 10 | MODEL_TYPE=phi-3 11 | OUTPUT_DIR=bunny-$MODEL_TYPE-pretrain 12 | 13 | mkdir -p ./checkpoints-pretrain/$OUTPUT_DIR 14 | 15 | deepspeed bunny/train/train.py \ 16 | --deepspeed ./script/deepspeed/zero2.json \ 17 | --model_name_or_path /path/to/microsoft/Phi-3-mini-4k-instruct \ 18 | --model_type $MODEL_TYPE \ 19 | --version plain \ 20 | --data_path ./data/pretrain/bunny_pretrain_laion_2m.json \ 21 | --image_folder ./data/pretrain/images \ 22 | --vision_tower /path/to/siglip-so400m-patch14-384 \ 23 | --use_s2 True \ 24 | --mm_projector_type mlp2x_gelu \ 25 | --tune_mm_mlp_adapter True \ 26 | --image_aspect_ratio square \ 27 | --bf16 True \ 28 | --output_dir ./checkpoints-pretrain/$OUTPUT_DIR \ 29 | --num_train_epochs 1 \ 30 | --per_device_train_batch_size 8 \ 31 | --per_device_eval_batch_size 4 \ 32 | --gradient_accumulation_steps 4 \ 33 | --evaluation_strategy "no" \ 34 | --save_strategy "steps" \ 35 | --save_steps 24000 \ 36 | --save_total_limit 1 \ 37 | --learning_rate 1e-3 \ 38 | --weight_decay 0. \ 39 | --warmup_ratio 0.03 \ 40 | --lr_scheduler_type "cosine" \ 41 | --logging_steps 1 \ 42 | --tf32 True \ 43 | --model_max_length 2048 \ 44 | --gradient_checkpointing True \ 45 | --dataloader_num_workers 4 \ 46 | --lazy_preprocess True \ 47 | --report_to none | tee 2>&1 ./checkpoints-pretrain/$OUTPUT_DIR/log.txt 48 | ``` 49 | 50 | ## Visual Instruction Tuning 51 | 52 | ### Recipe-1 53 | 54 | ```shell 55 | #!/bin/bash 56 | 57 | MODEL_TYPE=phi-3 58 | 59 | PRETRAIN_DIR=bunny-$MODEL_TYPE-pretrain 60 | OUTPUT_DIR=bunny-lora-$MODEL_TYPE-recipe-1 61 | 62 | mkdir -p ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR 63 | 64 | deepspeed bunny/train/train.py \ 65 | --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 2e-5 \ 66 | --deepspeed ./script/deepspeed/zero3.json \ 67 | --model_name_or_path /path/to/microsoft/Phi-3-mini-4k-instruct \ 68 | --model_type $MODEL_TYPE \ 69 | --version phi3 \ 70 | --data_path ./data/finetune/bunny_allava_1.3m.json \ 71 | --image_folder ./data/finetune/images \ 72 | --vision_tower /path/to/siglip-so400m-patch14-384 \ 73 | --use_s2 True \ 74 | --pretrain_mm_mlp_adapter ./checkpoints-pretrain/$PRETRAIN_DIR/mm_projector.bin \ 75 | --mm_projector_type mlp2x_gelu \ 76 | --image_aspect_ratio pad \ 77 | --group_by_modality_length False \ 78 | --bf16 True \ 79 | --output_dir ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR \ 80 | --num_train_epochs 1 \ 81 | --per_device_train_batch_size 4 \ 82 | --per_device_eval_batch_size 4 \ 83 | --gradient_accumulation_steps 4 \ 84 | --evaluation_strategy "no" \ 85 | --save_strategy "steps" \ 86 | --save_steps 500 \ 87 | --save_total_limit 1 \ 88 | --learning_rate 2e-4 \ 89 | --weight_decay 0. 
\ 90 | --warmup_ratio 0.03 \ 91 | --lr_scheduler_type "cosine" \ 92 | --logging_steps 1 \ 93 | --tf32 True \ 94 | --model_max_length 4096 \ 95 | --gradient_checkpointing True \ 96 | --dataloader_num_workers 4 \ 97 | --lazy_preprocess True \ 98 | --report_to none | tee 2>&1 ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR/log.txt 99 | ``` 100 | 101 | ### Recipe-2 102 | 103 | ```shell 104 | #!/bin/bash 105 | 106 | MODEL_TYPE=phi-3 107 | 108 | PRETRAIN_DIR=bunny-$MODEL_TYPE-pretrain 109 | OUTPUT_DIR=bunny-lora-$MODEL_TYPE-recipe-2 110 | 111 | mkdir -p ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR 112 | 113 | deepspeed bunny/train/train.py \ 114 | --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 2e-5 \ 115 | --deepspeed ./script/deepspeed/zero3.json \ 116 | --model_name_or_path /path/to/microsoft/Phi-3-mini-4k-instruct \ 117 | --model_type $MODEL_TYPE \ 118 | --version phi3 \ 119 | --data_path ./data/finetune/bunny_llava_allava_2m.json \ 120 | --image_folder ./data/finetune/images \ 121 | --vision_tower /path/to/siglip-so400m-patch14-384 \ 122 | --use_s2 True \ 123 | --unfreeze_vision_tower True \ 124 | --pretrain_mm_mlp_adapter ./checkpoints-pretrain/$PRETRAIN_DIR/mm_projector.bin \ 125 | --mm_projector_type mlp2x_gelu \ 126 | --image_aspect_ratio pad \ 127 | --group_by_modality_length False \ 128 | --bf16 True \ 129 | --output_dir ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR \ 130 | --num_train_epochs 1 \ 131 | --per_device_train_batch_size 4 \ 132 | --per_device_eval_batch_size 4 \ 133 | --gradient_accumulation_steps 4 \ 134 | --evaluation_strategy "no" \ 135 | --save_strategy "steps" \ 136 | --save_steps 500 \ 137 | --save_total_limit 1 \ 138 | --learning_rate 2e-4 \ 139 | --weight_decay 0. \ 140 | --warmup_ratio 0.03 \ 141 | --lr_scheduler_type "cosine" \ 142 | --logging_steps 1 \ 143 | --tf32 True \ 144 | --model_max_length 4096 \ 145 | --gradient_checkpointing True \ 146 | --dataloader_num_workers 4 \ 147 | --lazy_preprocess True \ 148 | --report_to none | tee 2>&1 ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR/log.txt 149 | ``` 150 | 151 | ### Weight Merging 152 | 153 | * Firstly, merge the LoRA weights into the base LLM 154 | 155 | ```shell 156 | python script/merge_lora_weights.py \ 157 | --model-path ./checkpoints-phi-3/bunny-lora-phi-3-recipe-1 \ 158 | --model-base /path/to/microsoft/Phi-3-mini-4k-instruct \ 159 | --model-type phi-3 \ 160 | --save-model-path ./checkpoints-phi-3/bunny-phi-3-recipe-1 161 | ``` 162 | 163 | ```shell 164 | python script/merge_lora_weights.py \ 165 | --model-path ./checkpoints-phi-3/bunny-lora-phi-3-recipe-2 \ 166 | --model-base /path/to/microsoft/Phi-3-mini-4k-instruct \ 167 | --model-type phi-3 \ 168 | --save-model-path ./checkpoints-phi-3/bunny-phi-3-recipe-2 169 | ``` 170 | 171 | * Then, inherit the configuration files from recipe-2 by copying its merged checkpoint 172 | 173 | ```shell 174 | cp -r ./checkpoints-phi-3/bunny-phi-3-recipe-2 ./checkpoints-phi-3/bunny-phi-3-avg 175 | ``` 176 | 177 | * Lastly, linearly average the two merged models 178 | 179 | ```python 180 | from safetensors.torch import load_file, save_file 181 | 182 | total = 2 183 | for i in range(1, total + 1): 184 | model_1 = load_file(f'./checkpoints-phi-3/bunny-phi-3-recipe-1/model-{i:05d}-of-{total:05d}.safetensors') 185 | model_2 = load_file(f'./checkpoints-phi-3/bunny-phi-3-recipe-2/model-{i:05d}-of-{total:05d}.safetensors') 186 | 187 | assert model_1.keys() == model_2.keys() 188 | 189 | avg = {} 190 | for k in model_1.keys(): 191 | avg[k] = model_1[k] * 0.3 + model_2[k] * 0.7 # the weight factor is selected empirically 192 | 193 | 
save_file(avg, f'./checkpoints-phi-3/bunny-phi-3-avg/model-{i:05d}-of-{total:05d}.safetensors', {'format': 'pt'}) 194 | ``` 195 | 196 | 197 | -------------------------------------------------------------------------------- /script/train/tutorials/assets/Bunny-Llama-3-8B-V.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BAAI-DCAI/Bunny/08273acbd83b6836c03dce25cc134eb6b9cedba5/script/train/tutorials/assets/Bunny-Llama-3-8B-V.png -------------------------------------------------------------------------------- /script/train/tutorials/assets/Bunny-v1.0-4B.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BAAI-DCAI/Bunny/08273acbd83b6836c03dce25cc134eb6b9cedba5/script/train/tutorials/assets/Bunny-v1.0-4B.png -------------------------------------------------------------------------------- /script/train/tutorials/assets/Bunny-v1.1-4B.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BAAI-DCAI/Bunny/08273acbd83b6836c03dce25cc134eb6b9cedba5/script/train/tutorials/assets/Bunny-v1.1-4B.png -------------------------------------------------------------------------------- /script/train/tutorials/assets/Bunny-v1.1-Llama-3-8B-V.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BAAI-DCAI/Bunny/08273acbd83b6836c03dce25cc134eb6b9cedba5/script/train/tutorials/assets/Bunny-v1.1-Llama-3-8B-V.png -------------------------------------------------------------------------------- /script/train/tutorials/bunny-minicpm-siglip-lora.md: -------------------------------------------------------------------------------- 1 | # Training Tutorial of bunny-minicpm-siglip-lora 2 | 3 | ## Pretrain 4 | 5 | ```shell 6 | #!/bin/bash 7 | 8 | MODEL_TYPE=minicpm 9 | OUTPUT_DIR=bunny-$MODEL_TYPE-pretrain 10 | 11 | mkdir -p ./checkpoints-pretrain/$OUTPUT_DIR 12 | 13 | deepspeed bunny/train/train.py \ 14 | --deepspeed ./script/deepspeed/zero2.json \ 15 | --model_name_or_path /path/to/openbmb/MiniCPM-2B \ 16 | --model_type $MODEL_TYPE \ 17 | --version plain \ 18 | --data_path ./data/pretrain/bunny_pretrain_laion_2m.json \ 19 | --image_folder ./data/pretrain/images \ 20 | --vision_tower /path/to/siglip-so400m-patch14-384 \ 21 | --mm_projector_type mlp2x_gelu \ 22 | --tune_mm_mlp_adapter True \ 23 | --image_aspect_ratio square \ 24 | --bf16 True \ 25 | --output_dir ./checkpoints-pretrain/$OUTPUT_DIR \ 26 | --num_train_epochs 1 \ 27 | --per_device_train_batch_size 8 \ 28 | --per_device_eval_batch_size 4 \ 29 | --gradient_accumulation_steps 4 \ 30 | --evaluation_strategy "no" \ 31 | --save_strategy "steps" \ 32 | --save_steps 24000 \ 33 | --save_total_limit 1 \ 34 | --learning_rate 5e-4 \ 35 | --weight_decay 0. 
\ 36 | --warmup_ratio 0.03 \ 37 | --lr_scheduler_type "cosine" \ 38 | --logging_steps 1 \ 39 | --tf32 True \ 40 | --model_max_length 2048 \ 41 | --gradient_checkpointing True \ 42 | --dataloader_num_workers 4 \ 43 | --lazy_preprocess True \ 44 | --report_to none | tee 2>&1 ./checkpoints-pretrain/$OUTPUT_DIR/log.txt 45 | ``` 46 | 47 | ## Visual Instruction Tuning 48 | 49 | ```shell 50 | #!/bin/bash 51 | 52 | MODEL_TYPE=minicpm 53 | 54 | PRETRAIN_DIR=bunny-$MODEL_TYPE-pretrain 55 | OUTPUT_DIR=bunny-lora-$MODEL_TYPE 56 | 57 | mkdir -p ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR 58 | 59 | deepspeed bunny/train/train.py \ 60 | --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 2e-5 \ 61 | --deepspeed ./script/deepspeed/zero3.json \ 62 | --model_name_or_path /path/to/openbmb/MiniCPM-2B \ 63 | --model_type $MODEL_TYPE \ 64 | --version minicpm \ 65 | --data_path ./data/finetune/bunny_695k.json \ 66 | --image_folder ./data/finetune/images \ 67 | --vision_tower /path/to/siglip-so400m-patch14-384 \ 68 | --pretrain_mm_mlp_adapter ./checkpoints-pretrain/$PRETRAIN_DIR/mm_projector.bin \ 69 | --mm_projector_type mlp2x_gelu \ 70 | --image_aspect_ratio pad \ 71 | --group_by_modality_length False \ 72 | --bf16 True \ 73 | --output_dir ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR \ 74 | --num_train_epochs 1 \ 75 | --per_device_train_batch_size 8 \ 76 | --per_device_eval_batch_size 4 \ 77 | --gradient_accumulation_steps 2 \ 78 | --evaluation_strategy "no" \ 79 | --save_strategy "steps" \ 80 | --save_steps 500 \ 81 | --save_total_limit 1 \ 82 | --learning_rate 2e-4 \ 83 | --weight_decay 0. \ 84 | --warmup_ratio 0.03 \ 85 | --lr_scheduler_type "cosine" \ 86 | --logging_steps 1 \ 87 | --tf32 True \ 88 | --model_max_length 2048 \ 89 | --gradient_checkpointing True \ 90 | --dataloader_num_workers 4 \ 91 | --lazy_preprocess True \ 92 | --report_to none | tee 2>&1 ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR/log.txt 93 | ``` 94 | 95 | -------------------------------------------------------------------------------- /script/train/tutorials/bunny-phi-1.5-eva-lora.md: -------------------------------------------------------------------------------- 1 | # Training Tutorial of bunny-phi-1.5-eva-lora 2 | 3 | ## Pretrain 4 | 5 | ```shell 6 | #!/bin/bash 7 | 8 | MODEL_TYPE=phi-1.5 9 | OUTPUT_DIR=bunny-$MODEL_TYPE-pretrain 10 | 11 | mkdir -p ./checkpoints-pretrain/$OUTPUT_DIR 12 | 13 | deepspeed bunny/train/train.py \ 14 | --deepspeed ./script/deepspeed/zero2.json \ 15 | --model_name_or_path /path/to/microsoft/phi-1_5 \ 16 | --model_type $MODEL_TYPE \ 17 | --version plain \ 18 | --data_path ./data/pretrain/bunny_pretrain_laion_2m.json \ 19 | --image_folder ./data/pretrain/images \ 20 | --vision_tower /path/to/EVA02_CLIP_L_336_psz14_s6B \ 21 | --mm_projector_type mlp2x_gelu \ 22 | --tune_mm_mlp_adapter True \ 23 | --image_aspect_ratio square \ 24 | --bf16 True \ 25 | --output_dir ./checkpoints-pretrain/$OUTPUT_DIR \ 26 | --num_train_epochs 1 \ 27 | --per_device_train_batch_size 8 \ 28 | --per_device_eval_batch_size 4 \ 29 | --gradient_accumulation_steps 4 \ 30 | --evaluation_strategy "no" \ 31 | --save_strategy "steps" \ 32 | --save_steps 24000 \ 33 | --save_total_limit 1 \ 34 | --learning_rate 1e-3 \ 35 | --weight_decay 0. 
\ 36 | --warmup_ratio 0.03 \ 37 | --lr_scheduler_type "cosine" \ 38 | --logging_steps 1 \ 39 | --tf32 True \ 40 | --model_max_length 2048 \ 41 | --gradient_checkpointing True \ 42 | --dataloader_num_workers 4 \ 43 | --lazy_preprocess True \ 44 | --report_to none | tee 2>&1 ./checkpoints-pretrain/$OUTPUT_DIR/log.txt 45 | ``` 46 | 47 | ## Visual Instruction Tuning 48 | 49 | ```shell 50 | #!/bin/bash 51 | 52 | MODEL_TYPE=phi-1.5 53 | 54 | PRETRAIN_DIR=bunny-$MODEL_TYPE-pretrain 55 | OUTPUT_DIR=bunny-lora-$MODEL_TYPE 56 | 57 | mkdir -p ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR 58 | 59 | deepspeed bunny/train/train.py \ 60 | --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 2e-5 \ 61 | --deepspeed ./script/deepspeed/zero3.json \ 62 | --model_name_or_path /path/to/microsoft/phi-1_5 \ 63 | --model_type $MODEL_TYPE \ 64 | --version bunny \ 65 | --data_path ./data/finetune/bunny_695k.json \ 66 | --image_folder ./data/finetune/images \ 67 | --vision_tower /path/to/EVA02_CLIP_L_336_psz14_s6B \ 68 | --pretrain_mm_mlp_adapter ./checkpoints-pretrain/$PRETRAIN_DIR/mm_projector.bin \ 69 | --mm_projector_type mlp2x_gelu \ 70 | --image_aspect_ratio pad \ 71 | --group_by_modality_length False \ 72 | --bf16 True \ 73 | --output_dir ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR \ 74 | --num_train_epochs 1 \ 75 | --per_device_train_batch_size 8 \ 76 | --per_device_eval_batch_size 4 \ 77 | --gradient_accumulation_steps 2 \ 78 | --evaluation_strategy "no" \ 79 | --save_strategy "steps" \ 80 | --save_steps 500 \ 81 | --save_total_limit 1 \ 82 | --learning_rate 2e-4 \ 83 | --weight_decay 0. \ 84 | --warmup_ratio 0.03 \ 85 | --lr_scheduler_type "cosine" \ 86 | --logging_steps 1 \ 87 | --tf32 True \ 88 | --model_max_length 2048 \ 89 | --gradient_checkpointing True \ 90 | --dataloader_num_workers 4 \ 91 | --lazy_preprocess True \ 92 | --report_to none | tee 2>&1 ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR/log.txt 93 | ``` 94 | 95 | -------------------------------------------------------------------------------- /script/train/tutorials/bunny-phi-1.5-siglip-lora.md: -------------------------------------------------------------------------------- 1 | # Training Tutorial of bunny-phi-1.5-siglip-lora 2 | 3 | ## Pretrain 4 | 5 | ```shell 6 | #!/bin/bash 7 | 8 | MODEL_TYPE=phi-1.5 9 | OUTPUT_DIR=bunny-$MODEL_TYPE-pretrain 10 | 11 | mkdir -p ./checkpoints-pretrain/$OUTPUT_DIR 12 | 13 | deepspeed bunny/train/train.py \ 14 | --deepspeed ./script/deepspeed/zero2.json \ 15 | --model_name_or_path /path/to/microsoft/phi-1_5 \ 16 | --model_type $MODEL_TYPE \ 17 | --version plain \ 18 | --data_path ./data/pretrain/bunny_pretrain_laion_2m.json \ 19 | --image_folder ./data/pretrain/images \ 20 | --vision_tower /path/to/siglip-so400m-patch14-384 \ 21 | --mm_projector_type mlp2x_gelu \ 22 | --tune_mm_mlp_adapter True \ 23 | --image_aspect_ratio square \ 24 | --bf16 True \ 25 | --output_dir ./checkpoints-pretrain/$OUTPUT_DIR \ 26 | --num_train_epochs 1 \ 27 | --per_device_train_batch_size 8 \ 28 | --per_device_eval_batch_size 4 \ 29 | --gradient_accumulation_steps 4 \ 30 | --evaluation_strategy "no" \ 31 | --save_strategy "steps" \ 32 | --save_steps 24000 \ 33 | --save_total_limit 1 \ 34 | --learning_rate 5e-4 \ 35 | --weight_decay 0. 
\ 36 | --warmup_ratio 0.03 \ 37 | --lr_scheduler_type "cosine" \ 38 | --logging_steps 1 \ 39 | --tf32 True \ 40 | --model_max_length 2048 \ 41 | --gradient_checkpointing True \ 42 | --dataloader_num_workers 4 \ 43 | --lazy_preprocess True \ 44 | --report_to none | tee 2>&1 ./checkpoints-pretrain/$OUTPUT_DIR/log.txt 45 | ``` 46 | 47 | ## Visual Instruction Tuning 48 | 49 | ```shell 50 | #!/bin/bash 51 | 52 | MODEL_TYPE=phi-1.5 53 | 54 | PRETRAIN_DIR=bunny-$MODEL_TYPE-pretrain 55 | OUTPUT_DIR=bunny-lora-$MODEL_TYPE 56 | 57 | mkdir -p ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR 58 | 59 | deepspeed bunny/train/train.py \ 60 | --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 2e-5 \ 61 | --deepspeed ./script/deepspeed/zero3.json \ 62 | --model_name_or_path /path/to/microsoft/phi-1_5 \ 63 | --model_type $MODEL_TYPE \ 64 | --version bunny \ 65 | --data_path ./data/finetune/bunny_695k.json \ 66 | --image_folder ./data/finetune/images \ 67 | --vision_tower /path/to/siglip-so400m-patch14-384 \ 68 | --pretrain_mm_mlp_adapter ./checkpoints-pretrain/$PRETRAIN_DIR/mm_projector.bin \ 69 | --mm_projector_type mlp2x_gelu \ 70 | --image_aspect_ratio pad \ 71 | --group_by_modality_length False \ 72 | --bf16 True \ 73 | --output_dir ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR \ 74 | --num_train_epochs 1 \ 75 | --per_device_train_batch_size 8 \ 76 | --per_device_eval_batch_size 4 \ 77 | --gradient_accumulation_steps 2 \ 78 | --evaluation_strategy "no" \ 79 | --save_strategy "steps" \ 80 | --save_steps 500 \ 81 | --save_total_limit 1 \ 82 | --learning_rate 2e-4 \ 83 | --weight_decay 0. \ 84 | --warmup_ratio 0.03 \ 85 | --lr_scheduler_type "cosine" \ 86 | --logging_steps 1 \ 87 | --tf32 True \ 88 | --model_max_length 2048 \ 89 | --gradient_checkpointing True \ 90 | --dataloader_num_workers 4 \ 91 | --lazy_preprocess True \ 92 | --report_to none | tee 2>&1 ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR/log.txt 93 | ``` 94 | 95 | -------------------------------------------------------------------------------- /script/train/tutorials/bunny-phi-2-eva-lora.md: -------------------------------------------------------------------------------- 1 | # Training Tutorial of bunny-phi-2-eva-lora 2 | 3 | ## Pretrain 4 | 5 | ```shell 6 | #!/bin/bash 7 | 8 | MODEL_TYPE=phi-2 9 | OUTPUT_DIR=bunny-$MODEL_TYPE-pretrain 10 | 11 | mkdir -p ./checkpoints-pretrain/$OUTPUT_DIR 12 | 13 | deepspeed bunny/train/train.py \ 14 | --deepspeed ./script/deepspeed/zero2.json \ 15 | --model_name_or_path /path/to/microsoft/phi-2 \ 16 | --model_type $MODEL_TYPE \ 17 | --version plain \ 18 | --data_path ./data/pretrain/bunny_pretrain_laion_2m.json \ 19 | --image_folder ./data/pretrain/images \ 20 | --vision_tower /path/to/EVA02_CLIP_L_336_psz14_s6B \ 21 | --mm_projector_type mlp2x_gelu \ 22 | --tune_mm_mlp_adapter True \ 23 | --image_aspect_ratio square \ 24 | --bf16 True \ 25 | --output_dir ./checkpoints-pretrain/$OUTPUT_DIR \ 26 | --num_train_epochs 1 \ 27 | --per_device_train_batch_size 8 \ 28 | --per_device_eval_batch_size 4 \ 29 | --gradient_accumulation_steps 4 \ 30 | --evaluation_strategy "no" \ 31 | --save_strategy "steps" \ 32 | --save_steps 24000 \ 33 | --save_total_limit 1 \ 34 | --learning_rate 5e-5 \ 35 | --weight_decay 0. 
\ 36 | --warmup_ratio 0.03 \ 37 | --lr_scheduler_type "cosine" \ 38 | --logging_steps 1 \ 39 | --tf32 True \ 40 | --model_max_length 2048 \ 41 | --gradient_checkpointing True \ 42 | --dataloader_num_workers 4 \ 43 | --lazy_preprocess True \ 44 | --report_to none | tee 2>&1 ./checkpoints-pretrain/$OUTPUT_DIR/log.txt 45 | ``` 46 | 47 | ## Visual Instruction Tuning 48 | 49 | ```shell 50 | #!/bin/bash 51 | 52 | MODEL_TYPE=phi-2 53 | 54 | PRETRAIN_DIR=bunny-$MODEL_TYPE-pretrain 55 | OUTPUT_DIR=bunny-lora-$MODEL_TYPE 56 | 57 | mkdir -p ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR 58 | 59 | deepspeed bunny/train/train.py \ 60 | --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 2e-5 \ 61 | --deepspeed ./script/deepspeed/zero3.json \ 62 | --model_name_or_path /path/to/microsoft/phi-2 \ 63 | --model_type $MODEL_TYPE \ 64 | --version bunny \ 65 | --data_path ./data/finetune/bunny_695k.json \ 66 | --image_folder ./data/finetune/images \ 67 | --vision_tower /path/to/EVA02_CLIP_L_336_psz14_s6B \ 68 | --pretrain_mm_mlp_adapter ./checkpoints-pretrain/$PRETRAIN_DIR/mm_projector.bin \ 69 | --mm_projector_type mlp2x_gelu \ 70 | --image_aspect_ratio pad \ 71 | --group_by_modality_length False \ 72 | --bf16 True \ 73 | --output_dir ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR \ 74 | --num_train_epochs 1 \ 75 | --per_device_train_batch_size 8 \ 76 | --per_device_eval_batch_size 4 \ 77 | --gradient_accumulation_steps 2 \ 78 | --evaluation_strategy "no" \ 79 | --save_strategy "steps" \ 80 | --save_steps 500 \ 81 | --save_total_limit 1 \ 82 | --learning_rate 2e-4 \ 83 | --weight_decay 0. \ 84 | --warmup_ratio 0.03 \ 85 | --lr_scheduler_type "cosine" \ 86 | --logging_steps 1 \ 87 | --tf32 True \ 88 | --model_max_length 2048 \ 89 | --gradient_checkpointing True \ 90 | --dataloader_num_workers 4 \ 91 | --lazy_preprocess True \ 92 | --report_to none | tee 2>&1 ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR/log.txt 93 | ``` 94 | 95 | -------------------------------------------------------------------------------- /script/train/tutorials/bunny-phi-2-siglip-lora.md: -------------------------------------------------------------------------------- 1 | # Training Tutorial of bunny-phi-2-siglip-lora 2 | 3 | ## Pretrain 4 | 5 | ```shell 6 | #!/bin/bash 7 | 8 | MODEL_TYPE=phi-2 9 | OUTPUT_DIR=bunny-$MODEL_TYPE-pretrain 10 | 11 | mkdir -p ./checkpoints-pretrain/$OUTPUT_DIR 12 | 13 | deepspeed bunny/train/train.py \ 14 | --deepspeed ./script/deepspeed/zero2.json \ 15 | --model_name_or_path /path/to/microsoft/phi-2 \ 16 | --model_type $MODEL_TYPE \ 17 | --version plain \ 18 | --data_path ./data/pretrain/bunny_pretrain_laion_2m.json \ 19 | --image_folder ./data/pretrain/images \ 20 | --vision_tower /path/to/siglip-so400m-patch14-384 \ 21 | --mm_projector_type mlp2x_gelu \ 22 | --tune_mm_mlp_adapter True \ 23 | --image_aspect_ratio square \ 24 | --bf16 True \ 25 | --output_dir ./checkpoints-pretrain/$OUTPUT_DIR \ 26 | --num_train_epochs 1 \ 27 | --per_device_train_batch_size 8 \ 28 | --per_device_eval_batch_size 4 \ 29 | --gradient_accumulation_steps 4 \ 30 | --evaluation_strategy "no" \ 31 | --save_strategy "steps" \ 32 | --save_steps 24000 \ 33 | --save_total_limit 1 \ 34 | --learning_rate 5e-4 \ 35 | --weight_decay 0. 
\ 36 | --warmup_ratio 0.03 \ 37 | --lr_scheduler_type "cosine" \ 38 | --logging_steps 1 \ 39 | --tf32 True \ 40 | --model_max_length 2048 \ 41 | --gradient_checkpointing True \ 42 | --dataloader_num_workers 4 \ 43 | --lazy_preprocess True \ 44 | --report_to none | tee 2>&1 ./checkpoints-pretrain/$OUTPUT_DIR/log.txt 45 | ``` 46 | 47 | ## Visual Instruction Tuning 48 | 49 | ```shell 50 | #!/bin/bash 51 | 52 | MODEL_TYPE=phi-2 53 | 54 | PRETRAIN_DIR=bunny-$MODEL_TYPE-pretrain 55 | OUTPUT_DIR=bunny-lora-$MODEL_TYPE 56 | 57 | mkdir -p ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR 58 | 59 | deepspeed bunny/train/train.py \ 60 | --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 2e-5 \ 61 | --deepspeed ./script/deepspeed/zero3.json \ 62 | --model_name_or_path /path/to/microsoft/phi-2 \ 63 | --model_type $MODEL_TYPE \ 64 | --version bunny \ 65 | --data_path ./data/finetune/bunny_695k.json \ 66 | --image_folder ./data/finetune/images \ 67 | --vision_tower /path/to/siglip-so400m-patch14-384 \ 68 | --pretrain_mm_mlp_adapter ./checkpoints-pretrain/$PRETRAIN_DIR/mm_projector.bin \ 69 | --mm_projector_type mlp2x_gelu \ 70 | --image_aspect_ratio pad \ 71 | --group_by_modality_length False \ 72 | --bf16 True \ 73 | --output_dir ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR \ 74 | --num_train_epochs 1 \ 75 | --per_device_train_batch_size 8 \ 76 | --per_device_eval_batch_size 4 \ 77 | --gradient_accumulation_steps 2 \ 78 | --evaluation_strategy "no" \ 79 | --save_strategy "steps" \ 80 | --save_steps 500 \ 81 | --save_total_limit 1 \ 82 | --learning_rate 2e-4 \ 83 | --weight_decay 0. \ 84 | --warmup_ratio 0.03 \ 85 | --lr_scheduler_type "cosine" \ 86 | --logging_steps 1 \ 87 | --tf32 True \ 88 | --model_max_length 2048 \ 89 | --gradient_checkpointing True \ 90 | --dataloader_num_workers 4 \ 91 | --lazy_preprocess True \ 92 | --report_to none | tee 2>&1 ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR/log.txt 93 | ``` 94 | 95 | -------------------------------------------------------------------------------- /script/train/tutorials/bunny-qwen1.5-1.8b-siglip-lora.md: -------------------------------------------------------------------------------- 1 | # Training Tutorial of bunny-qwen1.5-1.8b-siglip-lora 2 | 3 | ## Pretrain 4 | 5 | ```shell 6 | #!/bin/bash 7 | 8 | MODEL_TYPE=qwen1.5-1.8b 9 | OUTPUT_DIR=bunny-$MODEL_TYPE-pretrain 10 | 11 | mkdir -p ./checkpoints-pretrain/$OUTPUT_DIR 12 | 13 | deepspeed bunny/train/train.py \ 14 | --deepspeed ./script/deepspeed/zero2.json \ 15 | --model_name_or_path /path/to/Qwen/Qwen1.5-1.8B \ 16 | --model_type $MODEL_TYPE \ 17 | --version plain \ 18 | --data_path ./data/pretrain/bunny_pretrain_laion_2m.json \ 19 | --image_folder ./data/pretrain/images \ 20 | --vision_tower /path/to/siglip-so400m-patch14-384 \ 21 | --mm_projector_type mlp2x_gelu \ 22 | --tune_mm_mlp_adapter True \ 23 | --image_aspect_ratio square \ 24 | --bf16 True \ 25 | --output_dir ./checkpoints-pretrain/$OUTPUT_DIR \ 26 | --num_train_epochs 1 \ 27 | --per_device_train_batch_size 8 \ 28 | --per_device_eval_batch_size 4 \ 29 | --gradient_accumulation_steps 4 \ 30 | --evaluation_strategy "no" \ 31 | --save_strategy "steps" \ 32 | --save_steps 24000 \ 33 | --save_total_limit 1 \ 34 | --learning_rate 5e-4 \ 35 | --weight_decay 0. 
\ 36 | --warmup_ratio 0.03 \ 37 | --lr_scheduler_type "cosine" \ 38 | --logging_steps 1 \ 39 | --tf32 True \ 40 | --model_max_length 2048 \ 41 | --gradient_checkpointing True \ 42 | --dataloader_num_workers 4 \ 43 | --lazy_preprocess True \ 44 | --report_to none | tee 2>&1 ./checkpoints-pretrain/$OUTPUT_DIR/log.txt 45 | ``` 46 | 47 | ## Visual Instruction Tuning 48 | 49 | ```shell 50 | #!/bin/bash 51 | 52 | MODEL_TYPE=qwen1.5-1.8b 53 | 54 | PRETRAIN_DIR=bunny-$MODEL_TYPE-pretrain 55 | OUTPUT_DIR=bunny-lora-$MODEL_TYPE 56 | 57 | mkdir -p ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR 58 | 59 | deepspeed bunny/train/train.py \ 60 | --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 2e-5 \ 61 | --deepspeed ./script/deepspeed/zero3.json \ 62 | --model_name_or_path /path/to/Qwen/Qwen1.5-1.8B \ 63 | --model_type $MODEL_TYPE \ 64 | --version bunny \ 65 | --data_path ./data/finetune/bunny_695k.json \ 66 | --image_folder ./data/finetune/images \ 67 | --vision_tower /path/to/siglip-so400m-patch14-384 \ 68 | --pretrain_mm_mlp_adapter ./checkpoints-pretrain/$PRETRAIN_DIR/mm_projector.bin \ 69 | --mm_projector_type mlp2x_gelu \ 70 | --image_aspect_ratio pad \ 71 | --group_by_modality_length False \ 72 | --bf16 True \ 73 | --output_dir ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR \ 74 | --num_train_epochs 1 \ 75 | --per_device_train_batch_size 8 \ 76 | --per_device_eval_batch_size 4 \ 77 | --gradient_accumulation_steps 2 \ 78 | --evaluation_strategy "no" \ 79 | --save_strategy "steps" \ 80 | --save_steps 500 \ 81 | --save_total_limit 1 \ 82 | --learning_rate 2e-4 \ 83 | --weight_decay 0. \ 84 | --warmup_ratio 0.03 \ 85 | --lr_scheduler_type "cosine" \ 86 | --logging_steps 1 \ 87 | --tf32 True \ 88 | --model_max_length 2048 \ 89 | --gradient_checkpointing True \ 90 | --dataloader_num_workers 4 \ 91 | --lazy_preprocess True \ 92 | --report_to none | tee 2>&1 ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR/log.txt 93 | ``` 94 | 95 | -------------------------------------------------------------------------------- /script/train/tutorials/bunny-stablelm-2-eva-lora.md: -------------------------------------------------------------------------------- 1 | # Training Tutorial of bunny-stablelm-2-eva-lora 2 | 3 | ## Pretrain 4 | 5 | ```shell 6 | #!/bin/bash 7 | 8 | MODEL_TYPE=stablelm-2 9 | OUTPUT_DIR=bunny-$MODEL_TYPE-pretrain 10 | 11 | mkdir -p ./checkpoints-pretrain/$OUTPUT_DIR 12 | 13 | deepspeed bunny/train/train.py \ 14 | --deepspeed ./script/deepspeed/zero2.json \ 15 | --model_name_or_path /path/to/stabilityai/stablelm-2-1_6b \ 16 | --model_type $MODEL_TYPE \ 17 | --version plain \ 18 | --data_path ./data/pretrain/bunny_pretrain_laion_2m.json \ 19 | --image_folder ./data/pretrain/images \ 20 | --vision_tower /path/to/EVA02_CLIP_L_336_psz14_s6B \ 21 | --mm_projector_type mlp2x_gelu \ 22 | --tune_mm_mlp_adapter True \ 23 | --image_aspect_ratio square \ 24 | --bf16 True \ 25 | --output_dir ./checkpoints-pretrain/$OUTPUT_DIR \ 26 | --num_train_epochs 1 \ 27 | --per_device_train_batch_size 8 \ 28 | --per_device_eval_batch_size 4 \ 29 | --gradient_accumulation_steps 4 \ 30 | --evaluation_strategy "no" \ 31 | --save_strategy "steps" \ 32 | --save_steps 24000 \ 33 | --save_total_limit 1 \ 34 | --learning_rate 1e-3 \ 35 | --weight_decay 0. 
\ 36 | --warmup_ratio 0.03 \ 37 | --lr_scheduler_type "cosine" \ 38 | --logging_steps 1 \ 39 | --tf32 True \ 40 | --model_max_length 2048 \ 41 | --gradient_checkpointing True \ 42 | --dataloader_num_workers 4 \ 43 | --lazy_preprocess True \ 44 | --report_to none | tee 2>&1 ./checkpoints-pretrain/$OUTPUT_DIR/log.txt 45 | ``` 46 | 47 | ## Visual Instruction Tuning 48 | 49 | ```shell 50 | #!/bin/bash 51 | 52 | MODEL_TYPE=stablelm-2 53 | 54 | PRETRAIN_DIR=bunny-$MODEL_TYPE-pretrain 55 | OUTPUT_DIR=bunny-lora-$MODEL_TYPE 56 | 57 | mkdir -p ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR 58 | 59 | deepspeed bunny/train/train.py \ 60 | --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 2e-5 \ 61 | --deepspeed ./script/deepspeed/zero3.json \ 62 | --model_name_or_path /path/to/stabilityai/stablelm-2-1_6b \ 63 | --model_type $MODEL_TYPE \ 64 | --version bunny \ 65 | --data_path ./data/finetune/bunny_695k.json \ 66 | --image_folder ./data/finetune/images \ 67 | --vision_tower /path/to/EVA02_CLIP_L_336_psz14_s6B \ 68 | --pretrain_mm_mlp_adapter ./checkpoints-pretrain/$PRETRAIN_DIR/mm_projector.bin \ 69 | --mm_projector_type mlp2x_gelu \ 70 | --image_aspect_ratio pad \ 71 | --group_by_modality_length False \ 72 | --bf16 True \ 73 | --output_dir ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR \ 74 | --num_train_epochs 1 \ 75 | --per_device_train_batch_size 8 \ 76 | --per_device_eval_batch_size 4 \ 77 | --gradient_accumulation_steps 2 \ 78 | --evaluation_strategy "no" \ 79 | --save_strategy "steps" \ 80 | --save_steps 500 \ 81 | --save_total_limit 1 \ 82 | --learning_rate 2e-4 \ 83 | --weight_decay 0. \ 84 | --warmup_ratio 0.03 \ 85 | --lr_scheduler_type "cosine" \ 86 | --logging_steps 1 \ 87 | --tf32 True \ 88 | --model_max_length 2048 \ 89 | --gradient_checkpointing True \ 90 | --dataloader_num_workers 4 \ 91 | --lazy_preprocess True \ 92 | --report_to none | tee 2>&1 ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR/log.txt 93 | ``` 94 | 95 | -------------------------------------------------------------------------------- /script/train/tutorials/bunny-stablelm-2-siglip-lora.md: -------------------------------------------------------------------------------- 1 | # Training Tutorial of bunny-stablelm-2-siglip-lora 2 | 3 | ## Pretrain 4 | 5 | ```shell 6 | #!/bin/bash 7 | 8 | MODEL_TYPE=stablelm-2 9 | OUTPUT_DIR=bunny-$MODEL_TYPE-pretrain 10 | 11 | mkdir -p ./checkpoints-pretrain/$OUTPUT_DIR 12 | 13 | deepspeed bunny/train/train.py \ 14 | --deepspeed ./script/deepspeed/zero2.json \ 15 | --model_name_or_path /path/to/stabilityai/stablelm-2-1_6b \ 16 | --model_type $MODEL_TYPE \ 17 | --version plain \ 18 | --data_path ./data/pretrain/bunny_pretrain_laion_2m.json \ 19 | --image_folder ./data/pretrain/images \ 20 | --vision_tower /path/to/siglip-so400m-patch14-384 \ 21 | --mm_projector_type mlp2x_gelu \ 22 | --tune_mm_mlp_adapter True \ 23 | --image_aspect_ratio square \ 24 | --bf16 True \ 25 | --output_dir ./checkpoints-pretrain/$OUTPUT_DIR \ 26 | --num_train_epochs 1 \ 27 | --per_device_train_batch_size 8 \ 28 | --per_device_eval_batch_size 4 \ 29 | --gradient_accumulation_steps 4 \ 30 | --evaluation_strategy "no" \ 31 | --save_strategy "steps" \ 32 | --save_steps 24000 \ 33 | --save_total_limit 1 \ 34 | --learning_rate 5e-4 \ 35 | --weight_decay 0. 
\ 36 | --warmup_ratio 0.03 \ 37 | --lr_scheduler_type "cosine" \ 38 | --logging_steps 1 \ 39 | --tf32 True \ 40 | --model_max_length 2048 \ 41 | --gradient_checkpointing True \ 42 | --dataloader_num_workers 4 \ 43 | --lazy_preprocess True \ 44 | --report_to none | tee 2>&1 ./checkpoints-pretrain/$OUTPUT_DIR/log.txt 45 | ``` 46 | 47 | ## Visual Instruction Tuning 48 | 49 | ```shell 50 | #!/bin/bash 51 | 52 | MODEL_TYPE=stablelm-2 53 | 54 | PRETRAIN_DIR=bunny-$MODEL_TYPE-pretrain 55 | OUTPUT_DIR=bunny-lora-$MODEL_TYPE 56 | 57 | mkdir -p ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR 58 | 59 | deepspeed bunny/train/train.py \ 60 | --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 2e-5 \ 61 | --deepspeed ./script/deepspeed/zero3.json \ 62 | --model_name_or_path /path/to/stabilityai/stablelm-2-1_6b \ 63 | --model_type $MODEL_TYPE \ 64 | --version bunny \ 65 | --data_path ./data/finetune/bunny_695k.json \ 66 | --image_folder ./data/finetune/images \ 67 | --vision_tower /path/to/siglip-so400m-patch14-384 \ 68 | --pretrain_mm_mlp_adapter ./checkpoints-pretrain/$PRETRAIN_DIR/mm_projector.bin \ 69 | --mm_projector_type mlp2x_gelu \ 70 | --image_aspect_ratio pad \ 71 | --group_by_modality_length False \ 72 | --bf16 True \ 73 | --output_dir ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR \ 74 | --num_train_epochs 1 \ 75 | --per_device_train_batch_size 8 \ 76 | --per_device_eval_batch_size 4 \ 77 | --gradient_accumulation_steps 2 \ 78 | --evaluation_strategy "no" \ 79 | --save_strategy "steps" \ 80 | --save_steps 500 \ 81 | --save_total_limit 1 \ 82 | --learning_rate 2e-4 \ 83 | --weight_decay 0. \ 84 | --warmup_ratio 0.03 \ 85 | --lr_scheduler_type "cosine" \ 86 | --logging_steps 1 \ 87 | --tf32 True \ 88 | --model_max_length 2048 \ 89 | --gradient_checkpointing True \ 90 | --dataloader_num_workers 4 \ 91 | --lazy_preprocess True \ 92 | --report_to none | tee 2>&1 ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR/log.txt 93 | ``` 94 | 95 | --------------------------------------------------------------------------------