├── README.md ├── data_check.py ├── data_generation └── data_generation.py ├── example.png ├── figure ├── data-generation.png ├── general_benchmark.png ├── multi-agent.png ├── showcase.png ├── teaser.png └── visual_reasoning_benchmark.png ├── infer_reason_model.py ├── infer_summary_model.py ├── infer_two_models.py ├── llava ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-310.pyc │ ├── constants.cpython-310.pyc │ ├── conversation.cpython-310.pyc │ ├── mm_utils.cpython-310.pyc │ └── utils.cpython-310.pyc ├── constants.py ├── conversation.py ├── eval │ ├── evaluate_interleave.py │ └── model_vqa.py ├── mm_utils.py ├── model │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-310.pyc │ │ ├── builder.cpython-310.pyc │ │ └── llava_arch.cpython-310.pyc │ ├── apply_delta.py │ ├── builder.py │ ├── consolidate.py │ ├── language_model │ │ ├── __pycache__ │ │ │ ├── llava_llama.cpython-310.pyc │ │ │ ├── llava_mistral.cpython-310.pyc │ │ │ ├── llava_mixtral.cpython-310.pyc │ │ │ └── llava_qwen.cpython-310.pyc │ │ ├── llava_gemma.py │ │ ├── llava_llama.py │ │ ├── llava_mistral.py │ │ ├── llava_mixtral.py │ │ ├── llava_mpt.py │ │ ├── llava_qwen.py │ │ ├── llava_qwen_moe.py │ │ └── modeling_llama.py │ ├── llava_arch.py │ ├── make_delta.py │ ├── multimodal_encoder │ │ ├── __pycache__ │ │ │ ├── builder.cpython-310.pyc │ │ │ ├── clip_encoder.cpython-310.pyc │ │ │ ├── hf_vision.cpython-310.pyc │ │ │ ├── imagebind.cpython-310.pyc │ │ │ ├── open_clip_encoder.cpython-310.pyc │ │ │ └── siglip_encoder.cpython-310.pyc │ │ ├── builder.py │ │ ├── clip_encoder.py │ │ ├── dev_eva_clip │ │ │ ├── eva_clip │ │ │ │ ├── __init__.py │ │ │ │ ├── bpe_simple_vocab_16e6.txt.gz │ │ │ │ ├── constants.py │ │ │ │ ├── eva_vit_model.py │ │ │ │ ├── factory.py │ │ │ │ ├── hf_configs.py │ │ │ │ ├── hf_model.py │ │ │ │ ├── loss.py │ │ │ │ ├── model.py │ │ │ │ ├── model_configs │ │ │ │ │ ├── EVA-CLIP-18B.json │ │ │ │ │ ├── EVA-CLIP-8B-plus.json │ │ │ │ │ ├── EVA-CLIP-8B.json │ │ │ │ │ ├── EVA01-CLIP-B-16.json │ │ │ │ │ ├── EVA01-CLIP-g-14-plus.json │ │ │ │ │ ├── EVA01-CLIP-g-14.json │ │ │ │ │ ├── EVA02-CLIP-B-16.json │ │ │ │ │ ├── EVA02-CLIP-L-14-336.json │ │ │ │ │ ├── EVA02-CLIP-L-14.json │ │ │ │ │ ├── EVA02-CLIP-bigE-14-plus.json │ │ │ │ │ ├── EVA02-CLIP-bigE-14.json │ │ │ │ │ ├── Internal-EVA02-CLIP-10B-14-448.json │ │ │ │ │ └── Internal-EVA02-CLIP-10B-14.json │ │ │ │ ├── modified_resnet.py │ │ │ │ ├── openai.py │ │ │ │ ├── pretrained.py │ │ │ │ ├── rope.py │ │ │ │ ├── timm_model.py │ │ │ │ ├── tokenizer.py │ │ │ │ ├── transform.py │ │ │ │ ├── transformer.py │ │ │ │ └── utils.py │ │ │ └── eva_vit.py │ │ ├── eva_clip │ │ │ ├── eva_clip_encoder.py │ │ │ ├── eva_clip_processors.py │ │ │ ├── eva_vit.py │ │ │ ├── factory.py │ │ │ └── model_configs │ │ │ │ ├── EVA-CLIP-18B.json │ │ │ │ ├── EVA-CLIP-8B-plus.json │ │ │ │ ├── EVA-CLIP-8B.json │ │ │ │ ├── EVA01-CLIP-B-16.json │ │ │ │ ├── EVA01-CLIP-g-14-plus.json │ │ │ │ ├── EVA01-CLIP-g-14.json │ │ │ │ ├── EVA02-CLIP-B-16.json │ │ │ │ ├── EVA02-CLIP-L-14-336.json │ │ │ │ ├── EVA02-CLIP-L-14.json │ │ │ │ ├── EVA02-CLIP-bigE-14-plus.json │ │ │ │ ├── EVA02-CLIP-bigE-14.json │ │ │ │ ├── Internal-EVA02-CLIP-10B-14-448.json │ │ │ │ └── Internal-EVA02-CLIP-10B-14.json │ │ ├── hf_vision.py │ │ ├── imagebind.py │ │ ├── open_clip_encoder.py │ │ └── siglip_encoder.py │ ├── multimodal_projector │ │ ├── __pycache__ │ │ │ ├── builder.cpython-310.pyc │ │ │ └── pooler_projector.cpython-310.pyc │ │ ├── builder.py │ │ └── pooler_projector.py │ ├── multimodal_resampler │ │ ├── __pycache__ │ │ │ ├── 
builder.cpython-310.pyc │ │ │ ├── masked_drop.cpython-310.pyc │ │ │ ├── perceiver.cpython-310.pyc │ │ │ ├── qformer.cpython-310.pyc │ │ │ └── spatial_pool.cpython-310.pyc │ │ ├── builder.py │ │ ├── masked_drop.py │ │ ├── perceiver.py │ │ ├── qformer.py │ │ └── spatial_pool.py │ └── utils.py ├── serve │ ├── __init__.py │ ├── cli.py │ ├── controller.py │ ├── examples │ │ ├── extreme_ironing.jpg │ │ └── waterview.jpg │ ├── gradio_multi_image.py │ ├── gradio_web_server.py │ ├── model_worker.py │ ├── register_worker.py │ ├── sglang_worker.py │ └── test_message.py ├── train │ ├── __pycache__ │ │ ├── llava_trainer.cpython-310.pyc │ │ └── train.cpython-310.pyc │ ├── llama_flash_attn_monkey_patch.py │ ├── llava_trainer.py │ ├── llava_trainer_eval.py │ ├── train.py │ ├── train_dpo.py │ └── train_mem.py └── utils.py ├── requirements.txt └── scripts ├── insight-v ├── llava_dpo.sh ├── llava_next_reason.sh └── llava_next_summary.sh ├── zero2.json ├── zero2_fused_adamw.json ├── zero2_offload.json ├── zero3.json ├── zero3_offload.json └── zero3pp.json /README.md: -------------------------------------------------------------------------------- 1 | ## Insight-V: Exploring Long-Chain Visual Reasoning with Multimodal Large Language Models 2 | 3 |

4 | Yuhao Dong*,¹  5 | Zuyan Liu*,²,³  6 | Hai-Long Sun²,⁴  7 | Jingkang Yang¹  8 | 

9 |

10 | Winston Hu²  11 | Yongming Rao²,³,✉  12 | Ziwei Liu¹,✉  13 | 

14 | 15 |

¹S-Lab, NTU   ²Tencent   ³Tsinghua University   ⁴Nanjing University 

16 | 17 |

* Equal Contribution  ✉ Corresponding Author

18 | 19 | **arXiv Paper:** [![Static Badge](https://img.shields.io/badge/Insight_V-paper-green)](https://arxiv.org/abs/2411.14432) 20 | 21 | **Model Checkpoints**: [![Insight-V-checkpoints](https://img.shields.io/badge/Insight_V-checkpoints-blue)](https://huggingface.co/collections/THUdyh/insight-v-673f5e1dd8ab5f2d8d332035) 22 | 23 | 24 | ## 📢 News 25 | - [04/2025] Insight-V is selected as a **Highlight** paper at CVPR 2025! 26 | - [02/2025] Insight-V is accepted to CVPR 2025! 27 | - [11/2024] 🔧🔨**Training & Inference Scripts Release!** Try Insight-V on your own! 28 | - [11/2024] 🔥 **🚀Introducing Insight-V!** An early attempt to explore long-chain visual reasoning with MLLMs. 29 | * [[Paper]](https://arxiv.org/abs/2411.14432): Detailed introduction of Insight-V, including a **structured, long-chain data generation pipeline** and an **effective multi-agent system design**! 30 | * [[Checkpoints]](https://huggingface.co/collections/THUdyh/insight-v-673f5e1dd8ab5f2d8d332035): We release model checkpoints built on LLaVA-NeXT-LLaMA3 and our base model. 31 | 32 | ## 🚀 Introducing Insight-V 33 | 34 | ### Main idea of Insight-V 35 | **Insight-V is an early effort to explore long-chain visual reasoning with MLLMs.** 36 | 37 | Insight-V offers **1)** a scalable data generation pipeline for long-chain, high-quality reasoning data, **2)** a multi-agent system that decomposes visual reasoning tasks into reasoning and summarization, and **3)** a two-stage training pipeline to enhance visual reasoning capabilities. Together, these contributions address key challenges in visual reasoning, providing a solid foundation for future research in MLLM reasoning. 38 | 39 | 40 | ### Overview of Data Generation Pipeline 41 | 42 | The reasoning processes are generated progressively by a reasoning generator and then fed into a multi-granularity assessment system to ensure high-quality reasoning. 43 | 44 | 
45 | 46 |
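The generate-then-assess pipeline described above can be pictured with a short sketch. This is illustrative only: `generate_next_step`, `answer_matches_gt`, and `score_step` are hypothetical helpers standing in for the reasoning generator and the multi-granularity assessment agents, not functions from this repository.

```python
import json

def build_reasoning_sample(generator, judge, image, question, ground_truth, max_steps=8):
    """Sketch of progressive reasoning generation followed by multi-granularity filtering."""
    steps = []
    for _ in range(max_steps):
        # The reasoning generator extends the chain one structured step at a time.
        step = generator.generate_next_step(image, question, steps)  # hypothetical API
        steps.append(step)
        if step.get("next_action") == "final_answer":
            break

    # Coarse-grained assessment: keep the trace only if its final answer matches the ground truth.
    if not judge.answer_matches_gt(steps[-1]["content"], ground_truth):  # hypothetical API
        return None

    # Fine-grained assessment: score every intermediate step and drop low-quality traces.
    if any(judge.score_step(question, step) < 0.5 for step in steps):  # hypothetical API, score in [0, 1]
        return None

    return {"question": question, "answer": ground_truth, "steps": [json.dumps(s) for s in steps]}
```

The coarse, answer-level check is essentially what `data_check.py` implements with a text-only LLM judge over the parsed final reasoning step.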
47 | 48 | ### Overview of Multi-Agent System 49 | 50 | We derive a multi-agent system from a single model. By decomposing the task into reasoning and summarization, the two agents collaborate to enhance the overall reasoning capability. 51 | 52 |
53 | 54 |
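The decomposition described above can be illustrated with a short sketch of how the two agents might be chained at inference time. The prompt wording and the `run_mllm` callable are placeholders for illustration, not the exact interface of the released scripts; see `infer_reason_model.py`, `infer_summary_model.py`, and `infer_two_models.py` for the actual inference code.

```python
def answer_with_two_agents(reason_model, summary_model, image, question, run_mllm):
    """Illustrative decomposition into a reasoning agent and a summarization agent."""
    # The reasoning agent produces a detailed, step-by-step reasoning process for the question.
    reasoning = run_mllm(
        reason_model, image,
        f"{question}\nPlease provide a detailed, step-by-step reasoning process."
    )

    # The summarization agent reads the question together with the generated reasoning,
    # selectively trusts it, and produces the concise final answer.
    answer = run_mllm(
        summary_model, image,
        f"{question}\nHere is a reasoning process for reference:\n{reasoning}\n"
        "Based on the image and the reasoning above, answer the question concisely."
    )
    return reasoning, answer
```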
55 | 56 | 57 | ## ✅ TODO List 58 | 59 | - [x] Release paper on arXiv 60 | - [x] Release Insight-V models. 61 | - [ ] Demo code for generation. 62 | - [ ] All the training and inference code. 63 | - [ ] Evaluation code for visual reasoning benchmarks. 64 | - [ ] Insight-V SFT Data. 65 | - [ ] Insight-V with stronger MLLMs. 66 | 67 | ## 📃 Main Results 68 | 69 | ### Results on Visual Reasoning Benchmarks 70 | 71 |
72 | 73 |
74 | 75 | 76 | ### Results on Other Image Benchmarks 77 | 78 |
79 | 80 |
81 | 82 | ### Qualitative Results 83 | 84 |
85 | 86 |
87 | 88 | 89 | ## Citation 90 | 91 | If you find Insight-V useful for your research and applications, please cite our paper using this BibTeX: 92 | 93 | ```bibtex 94 | @article{dong2024insight, 95 | title={Insight-V: Exploring Long-Chain Visual Reasoning with Multimodal Large Language Models}, 96 | author={Dong, Yuhao and Liu, Zuyan and Sun, Hai-Long and Yang, Jingkang and Hu, Winston and Rao, Yongming and Liu, Ziwei}, 97 | journal={arXiv preprint arXiv:2411.14432}, 98 | year={2024} 99 | } 100 | 101 | ``` 102 | 103 | ## Acknowledgement 104 | 105 | - Our codebase is built on [LLaVA](https://github.com/LLaVA-VL/LLaVA-NeXT) 106 | 107 | - The data generation pipeline is adapted from [g1](https://github.com/bklieger-groq/g1) 108 | 109 | - Thanks to the [lmms-eval](https://github.com/EvolvingLMMs-Lab/lmms-eval) team for building such a useful evaluation system! 110 | -------------------------------------------------------------------------------- /data_check.py: -------------------------------------------------------------------------------- 1 | import time 2 | import json 3 | import os 4 | import glob 5 | import tqdm 6 | from transformers import AutoTokenizer 7 | from vllm import LLM, SamplingParams 8 | import argparse 9 | 10 | prompt = '''I will give you a question, the ground truth answer and one answer generated by another model. You need to judge whether the answer is correct or not based on the ground truth answer. As long as the answer has the same meaning as the ground truth answer, even if the format is not completely the same, it is correct. For each question, if the answer is correct, just return 1. If the answer is incorrect, return 0. You should only return 1 or 0 for each question. The output should be a list containing your judgement of the answer. Here is an example: 11 | ——Example 1—— 12 | Question: \nWhat company held 15 percent of the microwave oven market in the United States in 2008?\nAnswer the question using a single word or phrase. 13 | Ground Truth: Samsung. 14 | Model 1: Samsung and Sharp held 15 percent of the microwave oven market share in the United States in 2008. 15 | Your answer: [0] 16 | ——Example 2—— 17 | Question: \nWhat company held 15 percent of the microwave oven market in the United States in 2008?\nAnswer the question using a single word or phrase. 18 | Model 1: Samsung held 15 percent of the microwave oven market in the United States in 2008. 19 | Your answer: [1] 20 | 21 | Now please generate your answers based on the given questions, ground truth answers and model answers. Please note that you should only output the list of your answers, strictly following the format. 
22 | ''' 23 | 24 | data_paths = ["data.json"] 25 | 26 | LLM_PATH = "PATH/TO/LLM" 27 | tokenizer = AutoTokenizer.from_pretrained(LLM_PATH) 28 | 29 | sampling_params = SamplingParams(temperature=0.2, top_p=0.7, repetition_penalty=1.05, max_tokens=512) 30 | 31 | llm = LLM(model=LLM_PATH,tensor_parallel_size=1) 32 | 33 | batch_size = 32 34 | 35 | for data_path in data_paths: 36 | with open(data_path, "r") as f: 37 | data = json.load(f) 38 | 39 | all_inputs = [] 40 | bad_count = 0 41 | for idx, data_line in enumerate(data): 42 | question = data_line['question'] 43 | ground_truth = data_line['conversations'][1]['value'] 44 | response = [] 45 | for _ in range(1):  # one model answer is judged here; the original loop iterated over an undefined `meta` 46 | # check whether we can parse the response of reasoning steps 47 | response_ = data_line['steps'][-1] 48 | try: 49 | response.append(json.loads(response_[1])['content']) 50 | except: 51 | bad_count += 1 52 | response.append("No available answer") 53 | 54 | user_input = f"Question: {question}\nGround Truth: {ground_truth}\nModel 1: {response[0]}\nYour answer: " 55 | 56 | messages = [ 57 | {"role": "system", "content": f"You are a helpful assistant.\n{prompt}"}, 58 | {"role": "user", "content": user_input} 59 | ] 60 | all_inputs.append({ 61 | "raw": data_line, 62 | "message": messages 63 | }) 64 | 65 | print(bad_count) 66 | 67 | for i in tqdm.tqdm(range(0, len(all_inputs), batch_size)): 68 | batch = all_inputs[i:i+batch_size] 69 | conversations = [] 70 | batch_raws, batch_messages = [batch[i]["raw"] for i in range(len(batch))], [batch[i]["message"] for i in range(len(batch))] 71 | for messages in batch_messages: 72 | text = tokenizer.apply_chat_template( 73 | messages, 74 | tokenize=False, 75 | add_generation_prompt=True 76 | ) 77 | conversations.append(text) 78 | 79 | outputs = llm.generate(conversations, sampling_params, use_tqdm=False) 80 | for j, output in enumerate(outputs): 81 | generated_text = output.outputs[0].text 82 | generated_text = generated_text.split("]")[0] + ']' 83 | try: 84 | all_inputs[i+j]['answer'] = json.loads(generated_text) 85 | except: 86 | print("Error") 87 | all_inputs[i+j]['answer'] = [0] 88 | 89 | save_path = "" 90 | with open(save_path, "w") as f: 91 | f.write(json.dumps(all_inputs)) 92 | -------------------------------------------------------------------------------- /example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongyh20/Insight-V/5e9c6f934c1929e351dbff17a6c6c7bc2d8143b1/example.png -------------------------------------------------------------------------------- /figure/data-generation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongyh20/Insight-V/5e9c6f934c1929e351dbff17a6c6c7bc2d8143b1/figure/data-generation.png -------------------------------------------------------------------------------- /figure/general_benchmark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongyh20/Insight-V/5e9c6f934c1929e351dbff17a6c6c7bc2d8143b1/figure/general_benchmark.png -------------------------------------------------------------------------------- /figure/multi-agent.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongyh20/Insight-V/5e9c6f934c1929e351dbff17a6c6c7bc2d8143b1/figure/multi-agent.png -------------------------------------------------------------------------------- /figure/showcase.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongyh20/Insight-V/5e9c6f934c1929e351dbff17a6c6c7bc2d8143b1/figure/showcase.png -------------------------------------------------------------------------------- /figure/teaser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongyh20/Insight-V/5e9c6f934c1929e351dbff17a6c6c7bc2d8143b1/figure/teaser.png -------------------------------------------------------------------------------- /figure/visual_reasoning_benchmark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongyh20/Insight-V/5e9c6f934c1929e351dbff17a6c6c7bc2d8143b1/figure/visual_reasoning_benchmark.png -------------------------------------------------------------------------------- /llava/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import LlavaLlamaForCausalLM 2 | -------------------------------------------------------------------------------- /llava/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongyh20/Insight-V/5e9c6f934c1929e351dbff17a6c6c7bc2d8143b1/llava/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /llava/__pycache__/constants.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongyh20/Insight-V/5e9c6f934c1929e351dbff17a6c6c7bc2d8143b1/llava/__pycache__/constants.cpython-310.pyc -------------------------------------------------------------------------------- /llava/__pycache__/conversation.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongyh20/Insight-V/5e9c6f934c1929e351dbff17a6c6c7bc2d8143b1/llava/__pycache__/conversation.cpython-310.pyc -------------------------------------------------------------------------------- /llava/__pycache__/mm_utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongyh20/Insight-V/5e9c6f934c1929e351dbff17a6c6c7bc2d8143b1/llava/__pycache__/mm_utils.cpython-310.pyc -------------------------------------------------------------------------------- /llava/__pycache__/utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongyh20/Insight-V/5e9c6f934c1929e351dbff17a6c6c7bc2d8143b1/llava/__pycache__/utils.cpython-310.pyc -------------------------------------------------------------------------------- /llava/constants.py: -------------------------------------------------------------------------------- 1 | CONTROLLER_HEART_BEAT_EXPIRATION = 30 2 | WORKER_HEART_BEAT_INTERVAL = 15 3 | 4 | LOGDIR = "." 
5 | 6 | # Model Constants 7 | IGNORE_INDEX = -100 8 | IMAGE_TOKEN_INDEX = -200 9 | DEFAULT_IMAGE_TOKEN = "" 10 | DEFAULT_IMAGE_PATCH_TOKEN = "" 11 | DEFAULT_IM_START_TOKEN = "" 12 | DEFAULT_IM_END_TOKEN = "" 13 | 14 | THINK_START_TOKEN = "" 15 | THINK_END_TOKEN = "" 16 | OUTPUT_START_TOKEN = "" 17 | OUTPUT_END_TOKEN = "" -------------------------------------------------------------------------------- /llava/model/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | AVAILABLE_MODELS = { 4 | "llava_llama": "LlavaLlamaForCausalLM, LlavaConfig", 5 | "llava_qwen": "LlavaQwenForCausalLM, LlavaQwenConfig", 6 | "llava_mistral": "LlavaMistralForCausalLM, LlavaMistralConfig", 7 | "llava_mixtral": "LlavaMixtralForCausalLM, LlavaMixtralConfig", 8 | # "llava_qwen_moe": "LlavaQwenMoeForCausalLM, LlavaQwenMoeConfig", 9 | # Add other models as needed 10 | } 11 | 12 | for model_name, model_classes in AVAILABLE_MODELS.items(): 13 | try: 14 | exec(f"from .language_model.{model_name} import {model_classes}") 15 | except Exception as e: 16 | print(f"Failed to import {model_name} from llava.language_model.{model_name}. Error: {e}") 17 | -------------------------------------------------------------------------------- /llava/model/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongyh20/Insight-V/5e9c6f934c1929e351dbff17a6c6c7bc2d8143b1/llava/model/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/__pycache__/builder.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongyh20/Insight-V/5e9c6f934c1929e351dbff17a6c6c7bc2d8143b1/llava/model/__pycache__/builder.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/__pycache__/llava_arch.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongyh20/Insight-V/5e9c6f934c1929e351dbff17a6c6c7bc2d8143b1/llava/model/__pycache__/llava_arch.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/apply_delta.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m fastchat.model.apply_delta --base ~/model_weights/llama-7b --target ~/model_weights/vicuna-7b --delta lmsys/vicuna-7b-delta 4 | """ 5 | 6 | import argparse 7 | 8 | import torch 9 | from tqdm import tqdm 10 | from transformers import AutoTokenizer, AutoModelForCausalLM 11 | from llava import LlavaLlamaForCausalLM 12 | 13 | 14 | def apply_delta(base_model_path, target_model_path, delta_path): 15 | print("Loading base model") 16 | base = AutoModelForCausalLM.from_pretrained(base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 17 | 18 | print("Loading delta") 19 | delta = LlavaLlamaForCausalLM.from_pretrained(delta_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 20 | delta_tokenizer = AutoTokenizer.from_pretrained(delta_path) 21 | 22 | print("Applying delta") 23 | for name, param in tqdm(delta.state_dict().items(), desc="Applying delta"): 24 | if name not in base.state_dict(): 25 | assert name in ["model.mm_projector.weight", "model.mm_projector.bias"], f"{name} not in base model" 26 | continue 27 | if param.data.shape == 
base.state_dict()[name].shape: 28 | param.data += base.state_dict()[name] 29 | else: 30 | assert name in ["model.embed_tokens.weight", "lm_head.weight"], f"{name} dimension mismatch: {param.data.shape} vs {base.state_dict()[name].shape}" 31 | bparam = base.state_dict()[name] 32 | param.data[: bparam.shape[0], : bparam.shape[1]] += bparam 33 | 34 | print("Saving target model") 35 | delta.save_pretrained(target_model_path) 36 | delta_tokenizer.save_pretrained(target_model_path) 37 | 38 | 39 | if __name__ == "__main__": 40 | parser = argparse.ArgumentParser() 41 | parser.add_argument("--base-model-path", type=str, required=True) 42 | parser.add_argument("--target-model-path", type=str, required=True) 43 | parser.add_argument("--delta-path", type=str, required=True) 44 | 45 | args = parser.parse_args() 46 | 47 | apply_delta(args.base_model_path, args.target_model_path, args.delta_path) 48 | -------------------------------------------------------------------------------- /llava/model/consolidate.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m llava.model.consolidate --src ~/model_weights/llava-7b --dst ~/model_weights/llava-7b_consolidate 4 | """ 5 | 6 | import argparse 7 | 8 | import torch 9 | from transformers import AutoTokenizer, AutoModelForCausalLM 10 | from llava.model import * 11 | from llava.model.utils import auto_upgrade 12 | 13 | 14 | def consolidate_ckpt(src_path, dst_path): 15 | print("Loading model") 16 | auto_upgrade(src_path) 17 | src_model = AutoModelForCausalLM.from_pretrained(src_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 18 | src_tokenizer = AutoTokenizer.from_pretrained(src_path, use_fast=False) 19 | src_model.save_pretrained(dst_path) 20 | src_tokenizer.save_pretrained(dst_path) 21 | 22 | 23 | if __name__ == "__main__": 24 | parser = argparse.ArgumentParser() 25 | parser.add_argument("--src", type=str, required=True) 26 | parser.add_argument("--dst", type=str, required=True) 27 | 28 | args = parser.parse_args() 29 | 30 | consolidate_ckpt(args.src, args.dst) 31 | -------------------------------------------------------------------------------- /llava/model/language_model/__pycache__/llava_llama.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongyh20/Insight-V/5e9c6f934c1929e351dbff17a6c6c7bc2d8143b1/llava/model/language_model/__pycache__/llava_llama.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/language_model/__pycache__/llava_mistral.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongyh20/Insight-V/5e9c6f934c1929e351dbff17a6c6c7bc2d8143b1/llava/model/language_model/__pycache__/llava_mistral.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/language_model/__pycache__/llava_mixtral.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongyh20/Insight-V/5e9c6f934c1929e351dbff17a6c6c7bc2d8143b1/llava/model/language_model/__pycache__/llava_mixtral.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/language_model/__pycache__/llava_qwen.cpython-310.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/dongyh20/Insight-V/5e9c6f934c1929e351dbff17a6c6c7bc2d8143b1/llava/model/language_model/__pycache__/llava_qwen.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/language_model/llava_gemma.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Duc Q. Nguyen, Haotian Liu and Bo Li 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | from typing import List, Optional, Tuple, Union 17 | 18 | import torch 19 | import torch.nn as nn 20 | from torch.nn import CrossEntropyLoss 21 | 22 | from transformers import AutoConfig, AutoModelForCausalLM, GemmaConfig, GemmaModel, GemmaForCausalLM 23 | 24 | from transformers.modeling_outputs import CausalLMOutputWithPast 25 | from transformers.generation.utils import GenerateOutput 26 | 27 | from ..llava_arch import LlavaMetaModel, LlavaMetaForCausalLM 28 | 29 | 30 | class LlavaGemmaConfig(GemmaConfig): 31 | model_type = "llava_gemma" 32 | 33 | 34 | class LlavaGemmaModel(LlavaMetaModel, GemmaModel): 35 | config_class = LlavaGemmaConfig 36 | 37 | def __init__(self, config: GemmaConfig): 38 | super(LlavaGemmaModel, self).__init__(config) 39 | 40 | 41 | class LlavaGemmaForCausalLM(GemmaForCausalLM, LlavaMetaForCausalLM): 42 | config_class = LlavaGemmaConfig 43 | 44 | def __init__(self, config): 45 | super(GemmaForCausalLM, self).__init__(config) 46 | self.model = LlavaGemmaModel(config) 47 | 48 | self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) 49 | 50 | # Initialize weights and apply final processing 51 | self.post_init() 52 | 53 | def get_model(self): 54 | return self.model 55 | 56 | def forward( 57 | self, 58 | input_ids: torch.LongTensor = None, 59 | attention_mask: Optional[torch.Tensor] = None, 60 | position_ids: Optional[torch.LongTensor] = None, 61 | past_key_values: Optional[List[torch.FloatTensor]] = None, 62 | inputs_embeds: Optional[torch.FloatTensor] = None, 63 | labels: Optional[torch.LongTensor] = None, 64 | use_cache: Optional[bool] = None, 65 | output_attentions: Optional[bool] = None, 66 | output_hidden_states: Optional[bool] = None, 67 | images: Optional[torch.FloatTensor] = None, 68 | image_sizes: Optional[List[List[int]]] = None, 69 | return_dict: Optional[bool] = None, 70 | cache_position: Optional[torch.LongTensor] = None, 71 | ) -> Union[Tuple, CausalLMOutputWithPast]: 72 | 73 | if inputs_embeds is None: 74 | (input_ids, position_ids, attention_mask, past_key_values, inputs_embeds, labels) = self.prepare_inputs_labels_for_multimodal(input_ids, position_ids, attention_mask, past_key_values, labels, images, image_sizes) 75 | 76 | return super().forward( 77 | input_ids=input_ids, 78 | attention_mask=attention_mask, 79 | position_ids=position_ids, 80 | past_key_values=past_key_values, 81 | inputs_embeds=inputs_embeds, 82 | labels=labels, 83 | use_cache=use_cache, 84 | output_attentions=output_attentions, 85 | 
output_hidden_states=output_hidden_states, 86 | return_dict=return_dict, 87 | cache_position=cache_position, 88 | ) 89 | 90 | @torch.no_grad() 91 | def generate( 92 | self, 93 | inputs: Optional[torch.Tensor] = None, 94 | images: Optional[torch.Tensor] = None, 95 | image_sizes: Optional[torch.Tensor] = None, 96 | **kwargs, 97 | ) -> Union[GenerateOutput, torch.LongTensor]: 98 | position_ids = kwargs.pop("position_ids", None) 99 | attention_mask = kwargs.pop("attention_mask", None) 100 | if "inputs_embeds" in kwargs: 101 | raise NotImplementedError("`inputs_embeds` is not supported") 102 | 103 | if images is not None: 104 | (inputs, position_ids, attention_mask, _, inputs_embeds, _) = self.prepare_inputs_labels_for_multimodal(inputs, position_ids, attention_mask, None, None, images, image_sizes=image_sizes) 105 | else: 106 | inputs_embeds = self.get_model().embed_tokens(inputs) 107 | 108 | return super().generate(position_ids=position_ids, attention_mask=attention_mask, inputs_embeds=inputs_embeds, **kwargs) 109 | 110 | def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs): 111 | images = kwargs.pop("images", None) 112 | image_sizes = kwargs.pop("image_sizes", None) 113 | inputs = super().prepare_inputs_for_generation(input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs) 114 | if images is not None: 115 | inputs["images"] = images 116 | if image_sizes is not None: 117 | inputs["image_sizes"] = image_sizes 118 | return inputs 119 | 120 | 121 | AutoConfig.register("llava_gemma", LlavaGemmaConfig) 122 | AutoModelForCausalLM.register(LlavaGemmaConfig, LlavaGemmaForCausalLM) 123 | -------------------------------------------------------------------------------- /llava/model/language_model/llava_llama.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Haotian Liu 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | 16 | from typing import List, Optional, Tuple, Union 17 | 18 | import torch 19 | import torch.nn as nn 20 | 21 | from transformers import AutoConfig, AutoModelForCausalLM, LlamaConfig 22 | 23 | from torch.nn import CrossEntropyLoss 24 | 25 | 26 | # , LlamaModel, LlamaForCausalLM, GenerationConfig 27 | # from .modeling_llama import LlamaModel, LlamaForCausalLM 28 | from transformers import LlamaModel, LlamaForCausalLM 29 | from transformers.modeling_outputs import CausalLMOutputWithPast 30 | from transformers.generation.utils import GenerateOutput 31 | 32 | from llava.model.llava_arch import LlavaMetaModel, LlavaMetaForCausalLM 33 | 34 | 35 | class LlavaConfig(LlamaConfig): 36 | model_type = "llava_llama" 37 | temperature: float = 0.0 # reset to 0.0, previously 0.9 for Vicuna 38 | max_new_tokens: int = 1024 39 | do_sample: bool = False 40 | top_p: Optional[float] = None 41 | # rope_scaling: Optional[dict] = {} 42 | 43 | 44 | class LlavaLlamaModel(LlavaMetaModel, LlamaModel): 45 | config_class = LlavaConfig 46 | 47 | def __init__(self, config: LlamaConfig): 48 | super(LlavaLlamaModel, self).__init__(config) 49 | 50 | 51 | class LlavaLlamaForCausalLM(LlamaForCausalLM, LlavaMetaForCausalLM): 52 | config_class = LlavaConfig 53 | 54 | def __init__(self, config): 55 | LlamaForCausalLM.__init__(self, config) 56 | 57 | # configure default generation settings 58 | config.model_type = "llava_llama" 59 | # config.rope_scaling = None 60 | 61 | self.model = LlavaLlamaModel(config) 62 | self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) 63 | # Initialize weights and apply final processing 64 | self.post_init() 65 | 66 | def get_model(self): 67 | return self.model 68 | 69 | def forward( 70 | self, 71 | input_ids: torch.LongTensor = None, 72 | attention_mask: Optional[torch.Tensor] = None, 73 | position_ids: Optional[torch.LongTensor] = None, 74 | past_key_values: Optional[List[torch.FloatTensor]] = None, 75 | inputs_embeds: Optional[torch.FloatTensor] = None, 76 | labels: Optional[torch.LongTensor] = None, 77 | use_cache: Optional[bool] = None, 78 | output_attentions: Optional[bool] = None, 79 | output_hidden_states: Optional[bool] = None, 80 | images: Optional[torch.FloatTensor] = None, 81 | image_sizes: Optional[List[List[int]]] = None, 82 | return_dict: Optional[bool] = None, 83 | modalities: Optional[List[str]] = ["image"], 84 | dpo_forward: Optional[bool] = None, 85 | cache_position=None, 86 | ) -> Union[Tuple, CausalLMOutputWithPast]: 87 | 88 | if inputs_embeds is None: 89 | (input_ids, position_ids, attention_mask, past_key_values, inputs_embeds, labels) = self.prepare_inputs_labels_for_multimodal(input_ids, position_ids, attention_mask, past_key_values, labels, images, modalities, image_sizes) 90 | 91 | if dpo_forward: 92 | outputs = self.model( 93 | input_ids=input_ids, 94 | attention_mask=attention_mask, 95 | position_ids=position_ids, 96 | past_key_values=past_key_values, 97 | inputs_embeds=inputs_embeds, 98 | use_cache=use_cache, 99 | output_attentions=output_attentions, 100 | output_hidden_states=output_hidden_states, 101 | return_dict=return_dict, 102 | ) 103 | 104 | hidden_states = outputs[0] 105 | logits = self.lm_head(hidden_states) 106 | return logits, labels 107 | 108 | else: 109 | return super().forward( 110 | input_ids=input_ids, 111 | attention_mask=attention_mask, 112 | position_ids=position_ids, 113 | past_key_values=past_key_values, 114 | inputs_embeds=inputs_embeds, 115 | labels=labels, 116 | use_cache=use_cache, 117 | 
output_attentions=output_attentions, 118 | output_hidden_states=output_hidden_states, 119 | return_dict=return_dict, 120 | ) 121 | 122 | @torch.no_grad() 123 | def generate( 124 | self, 125 | inputs: Optional[torch.Tensor] = None, 126 | images: Optional[torch.Tensor] = None, 127 | image_sizes: Optional[torch.Tensor] = None, 128 | modalities: Optional[List[str]] = ["image"], 129 | **kwargs, 130 | ) -> Union[GenerateOutput, torch.LongTensor]: 131 | modalities = kwargs.pop("modalities", None) if "modalities" in kwargs and modalities is None else modalities 132 | position_ids = kwargs.pop("position_ids", None) 133 | attention_mask = kwargs.pop("attention_mask", None) 134 | if "inputs_embeds" in kwargs: 135 | raise NotImplementedError("`inputs_embeds` is not supported") 136 | 137 | if images is not None: 138 | (inputs, position_ids, attention_mask, _, inputs_embeds, _) = self.prepare_inputs_labels_for_multimodal(inputs, position_ids, attention_mask, None, None, images, modalities, image_sizes=image_sizes) 139 | else: 140 | inputs_embeds = self.get_model().embed_tokens(inputs) 141 | 142 | return super().generate(position_ids=position_ids, attention_mask=attention_mask, inputs_embeds=inputs_embeds, **kwargs) 143 | 144 | def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs): 145 | images = kwargs.pop("images", None) 146 | image_sizes = kwargs.pop("image_sizes", None) 147 | inputs = super().prepare_inputs_for_generation(input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs) 148 | if images is not None: 149 | inputs["images"] = images 150 | if image_sizes is not None: 151 | inputs["image_sizes"] = image_sizes 152 | return inputs 153 | 154 | 155 | AutoConfig.register("llava_llama", LlavaConfig) 156 | AutoModelForCausalLM.register(LlavaConfig, LlavaLlamaForCausalLM) 157 | -------------------------------------------------------------------------------- /llava/model/language_model/llava_mistral.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Haotian Liu 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | 16 | from typing import List, Optional, Tuple, Union 17 | 18 | import torch 19 | import torch.nn as nn 20 | from torch.nn import CrossEntropyLoss 21 | 22 | from transformers import AutoConfig, AutoModelForCausalLM, MistralConfig, MistralModel, MistralForCausalLM, GenerationConfig 23 | 24 | from transformers.modeling_outputs import CausalLMOutputWithPast 25 | from transformers.generation.utils import GenerateOutput 26 | 27 | from ..llava_arch import LlavaMetaModel, LlavaMetaForCausalLM 28 | 29 | 30 | class LlavaMistralConfig(MistralConfig): 31 | model_type = "llava_mistral" 32 | temperature: float = 0.0 # reset to 0.0, previously 0.9 for Vicuna 33 | max_new_tokens: int = 1024 34 | do_sample: bool = False 35 | top_p: Optional[float] = None 36 | 37 | 38 | class LlavaMistralModel(LlavaMetaModel, MistralModel): 39 | config_class = LlavaMistralConfig 40 | 41 | def __init__(self, config: MistralConfig): 42 | super(LlavaMistralModel, self).__init__(config) 43 | 44 | 45 | class LlavaMistralForCausalLM(MistralForCausalLM, LlavaMetaForCausalLM): 46 | config_class = LlavaMistralConfig 47 | 48 | def __init__(self, config): 49 | super(MistralForCausalLM, self).__init__(config) 50 | 51 | config.model_type = "llava_mistral" 52 | config.rope_scaling = None 53 | 54 | self.model = LlavaMistralModel(config) 55 | self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) 56 | # Initialize weights and apply final processing 57 | self.post_init() 58 | 59 | def get_model(self): 60 | return self.model 61 | 62 | def forward( 63 | self, 64 | input_ids: torch.LongTensor = None, 65 | attention_mask: Optional[torch.Tensor] = None, 66 | position_ids: Optional[torch.LongTensor] = None, 67 | past_key_values: Optional[List[torch.FloatTensor]] = None, 68 | inputs_embeds: Optional[torch.FloatTensor] = None, 69 | labels: Optional[torch.LongTensor] = None, 70 | use_cache: Optional[bool] = None, 71 | output_attentions: Optional[bool] = None, 72 | output_hidden_states: Optional[bool] = None, 73 | images: Optional[torch.FloatTensor] = None, 74 | image_sizes: Optional[List[List[int]]] = None, 75 | return_dict: Optional[bool] = None, 76 | cache_position=None, 77 | ) -> Union[Tuple, CausalLMOutputWithPast]: 78 | 79 | if inputs_embeds is None: 80 | (input_ids, position_ids, attention_mask, past_key_values, inputs_embeds, labels) = self.prepare_inputs_labels_for_multimodal(input_ids, position_ids, attention_mask, past_key_values, labels, images, image_sizes) 81 | 82 | return super().forward( 83 | input_ids=input_ids, 84 | attention_mask=attention_mask, 85 | position_ids=position_ids, 86 | past_key_values=past_key_values, 87 | inputs_embeds=inputs_embeds, 88 | labels=labels, 89 | use_cache=use_cache, 90 | output_attentions=output_attentions, 91 | output_hidden_states=output_hidden_states, 92 | return_dict=return_dict, 93 | ) 94 | 95 | @torch.no_grad() 96 | def generate( 97 | self, 98 | inputs: Optional[torch.Tensor] = None, 99 | images: Optional[torch.Tensor] = None, 100 | image_sizes: Optional[torch.Tensor] = None, 101 | **kwargs, 102 | ) -> Union[GenerateOutput, torch.LongTensor]: 103 | position_ids = kwargs.pop("position_ids", None) 104 | attention_mask = kwargs.pop("attention_mask", None) 105 | if "inputs_embeds" in kwargs: 106 | raise NotImplementedError("`inputs_embeds` is not supported") 107 | 108 | if images is not None: 109 | (inputs, position_ids, attention_mask, _, inputs_embeds, _) = self.prepare_inputs_labels_for_multimodal(inputs, position_ids, attention_mask, None, None, images, 
image_sizes=image_sizes) 110 | else: 111 | inputs_embeds = self.get_model().embed_tokens(inputs) 112 | 113 | return super().generate(position_ids=position_ids, attention_mask=attention_mask, inputs_embeds=inputs_embeds, **kwargs) 114 | 115 | def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs): 116 | images = kwargs.pop("images", None) 117 | image_sizes = kwargs.pop("image_sizes", None) 118 | inputs = super().prepare_inputs_for_generation(input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs) 119 | if images is not None: 120 | inputs["images"] = images 121 | if image_sizes is not None: 122 | inputs["image_sizes"] = image_sizes 123 | return inputs 124 | 125 | 126 | AutoConfig.register("llava_mistral", LlavaMistralConfig) 127 | AutoModelForCausalLM.register(LlavaMistralConfig, LlavaMistralForCausalLM) 128 | -------------------------------------------------------------------------------- /llava/model/language_model/llava_mixtral.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Haotian Liu 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | from typing import List, Optional, Tuple, Union 17 | 18 | import torch 19 | import torch.nn as nn 20 | from torch.nn import CrossEntropyLoss 21 | 22 | from transformers import AutoConfig, AutoModelForCausalLM, MixtralConfig, MixtralModel, MixtralForCausalLM, GenerationConfig 23 | 24 | from transformers.modeling_outputs import CausalLMOutputWithPast 25 | from transformers.generation.utils import GenerateOutput 26 | 27 | from ..llava_arch import LlavaMetaModel, LlavaMetaForCausalLM 28 | 29 | 30 | class LlavaMixtralConfig(MixtralConfig): 31 | model_type = "llava_mixtral" 32 | 33 | 34 | class LlavaMixtralModel(LlavaMetaModel, MixtralModel): 35 | config_class = LlavaMixtralConfig 36 | 37 | def __init__(self, config: MixtralConfig): 38 | super(LlavaMixtralModel, self).__init__(config) 39 | 40 | 41 | class LlavaMixtralForCausalLM(MixtralForCausalLM, LlavaMetaForCausalLM): 42 | config_class = LlavaMixtralConfig 43 | 44 | def __init__(self, config): 45 | super(MixtralForCausalLM, self).__init__(config) 46 | 47 | config.model_type = "llava_mixtral" 48 | config.rope_scaling = None 49 | self.model = LlavaMixtralModel(config) 50 | self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) 51 | # Initialize weights and apply final processing 52 | self.post_init() 53 | 54 | def get_model(self): 55 | return self.model 56 | 57 | def forward( 58 | self, 59 | input_ids: torch.LongTensor = None, 60 | attention_mask: Optional[torch.Tensor] = None, 61 | position_ids: Optional[torch.LongTensor] = None, 62 | past_key_values: Optional[List[torch.FloatTensor]] = None, 63 | inputs_embeds: Optional[torch.FloatTensor] = None, 64 | labels: Optional[torch.LongTensor] = None, 65 | use_cache: Optional[bool] = None, 66 | output_attentions: Optional[bool] = None, 67 | output_hidden_states: 
Optional[bool] = None, 68 | images: Optional[torch.FloatTensor] = None, 69 | image_sizes: Optional[List[List[int]]] = None, 70 | return_dict: Optional[bool] = None, 71 | modalities: Optional[List[str]] = ["image"], 72 | dpo_forward: Optional[bool] = None, 73 | cache_position=None, 74 | ) -> Union[Tuple, CausalLMOutputWithPast]: 75 | 76 | if inputs_embeds is None: 77 | (input_ids, position_ids, attention_mask, past_key_values, inputs_embeds, labels) = self.prepare_inputs_labels_for_multimodal(input_ids, position_ids, attention_mask, past_key_values, labels, images, modalities, image_sizes) 78 | 79 | if dpo_forward: 80 | outputs = self.model( 81 | input_ids=input_ids, 82 | attention_mask=attention_mask, 83 | position_ids=position_ids, 84 | past_key_values=past_key_values, 85 | inputs_embeds=inputs_embeds, 86 | use_cache=use_cache, 87 | output_attentions=output_attentions, 88 | output_hidden_states=output_hidden_states, 89 | return_dict=return_dict, 90 | ) 91 | 92 | hidden_states = outputs[0] 93 | logits = self.lm_head(hidden_states) 94 | return logits, labels 95 | 96 | else: 97 | return super().forward( 98 | input_ids=input_ids, 99 | attention_mask=attention_mask, 100 | position_ids=position_ids, 101 | past_key_values=past_key_values, 102 | inputs_embeds=inputs_embeds, 103 | labels=labels, 104 | use_cache=use_cache, 105 | output_attentions=output_attentions, 106 | output_hidden_states=output_hidden_states, 107 | return_dict=return_dict, 108 | ) 109 | 110 | @torch.no_grad() 111 | def generate( 112 | self, 113 | inputs: Optional[torch.Tensor] = None, 114 | images: Optional[torch.Tensor] = None, 115 | image_sizes: Optional[torch.Tensor] = None, 116 | modalities: Optional[List[str]] = ["image"], 117 | **kwargs, 118 | ) -> Union[GenerateOutput, torch.LongTensor]: 119 | position_ids = kwargs.pop("position_ids", None) 120 | attention_mask = kwargs.pop("attention_mask", None) 121 | if "inputs_embeds" in kwargs: 122 | raise NotImplementedError("`inputs_embeds` is not supported") 123 | 124 | if images is not None: 125 | (inputs, position_ids, attention_mask, _, inputs_embeds, _) = self.prepare_inputs_labels_for_multimodal(inputs, position_ids, attention_mask, None, None, images, modalities, image_sizes=image_sizes) 126 | else: 127 | inputs_embeds = self.get_model().embed_tokens(inputs) 128 | 129 | return super().generate(position_ids=position_ids, attention_mask=attention_mask, inputs_embeds=inputs_embeds, **kwargs) 130 | 131 | def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs): 132 | images = kwargs.pop("images", None) 133 | image_sizes = kwargs.pop("image_sizes", None) 134 | inputs = super().prepare_inputs_for_generation(input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs) 135 | if images is not None: 136 | inputs["images"] = images 137 | if image_sizes is not None: 138 | inputs["image_sizes"] = image_sizes 139 | return inputs 140 | 141 | 142 | AutoConfig.register("llava_mixtral", LlavaMixtralConfig) 143 | AutoModelForCausalLM.register(LlavaMixtralConfig, LlavaMixtralForCausalLM) 144 | -------------------------------------------------------------------------------- /llava/model/language_model/llava_mpt.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Haotian Liu 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | from typing import Optional, Tuple 17 | 18 | import torch 19 | 20 | from transformers import AutoConfig, AutoModelForCausalLM, MptConfig, MptForCausalLM, MptModel, GenerationConfig 21 | from llava.model.llava_arch import LlavaMetaModel, LlavaMetaForCausalLM 22 | 23 | 24 | class LlavaMptConfig(MptConfig): 25 | model_type = "llava_mpt" 26 | 27 | 28 | class LlavaMptModel(LlavaMetaModel, MptModel): 29 | config_class = LlavaMptConfig 30 | 31 | def __init__(self, config: MptConfig): 32 | config.hidden_size = config.d_model 33 | super(LlavaMptModel, self).__init__(config) 34 | 35 | def embed_tokens(self, x): 36 | return self.wte(x) 37 | 38 | 39 | class LlavaMptForCausalLM(MptForCausalLM, LlavaMetaForCausalLM): 40 | config_class = LlavaMptConfig 41 | supports_gradient_checkpointing = True 42 | 43 | def __init__(self, config): 44 | super(MptForCausalLM, self).__init__(config) 45 | 46 | config.model_type = "llava_mpt" 47 | config.rope_scaling = None 48 | self.generation_config = GenerationConfig( 49 | temperature=0.0, 50 | max_new_tokens=1024, 51 | do_sample=False, 52 | top_p=None, 53 | ) 54 | 55 | self.transformer = LlavaMptModel(config) 56 | self.lm_head = torch.nn.Linear(config.hidden_size, config.vocab_size, bias=False) 57 | 58 | # Initialize weights and apply final processing 59 | self.post_init() 60 | 61 | def get_model(self): 62 | return self.transformer 63 | 64 | def _set_gradient_checkpointing(self, module, value=False): 65 | if isinstance(module, LlavaMptModel): 66 | module.gradient_checkpointing = value 67 | 68 | def forward( 69 | self, 70 | input_ids: Optional[torch.LongTensor] = None, 71 | past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, 72 | attention_mask: Optional[torch.Tensor] = None, 73 | inputs_embeds: Optional[torch.Tensor] = None, 74 | labels: Optional[torch.Tensor] = None, 75 | use_cache: Optional[bool] = None, 76 | output_attentions: Optional[bool] = None, 77 | output_hidden_states: Optional[bool] = None, 78 | return_dict: Optional[bool] = None, 79 | cache_position=None, 80 | images=None, 81 | ): 82 | 83 | input_ids, attention_mask, past_key_values, inputs_embeds, labels = self.prepare_inputs_labels_for_multimodal(input_ids, attention_mask, past_key_values, labels, images) 84 | 85 | return super().forward( 86 | input_ids, 87 | past_key_values=past_key_values, 88 | attention_mask=attention_mask, 89 | inputs_embeds=inputs_embeds, 90 | labels=labels, 91 | use_cache=use_cache, 92 | output_attentions=output_attentions, 93 | output_hidden_states=output_hidden_states, 94 | return_dict=return_dict, 95 | ) 96 | 97 | def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs): 98 | images = kwargs.pop("images", None) 99 | _inputs = super().prepare_inputs_for_generation(input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs) 100 | _inputs["images"] = images 101 | return _inputs 102 | 103 | 104 | AutoConfig.register("llava_mpt", LlavaMptConfig) 105 | AutoModelForCausalLM.register(LlavaMptConfig, LlavaMptForCausalLM) 106 | 
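Each of the `language_model` wrappers above (and `llava_qwen.py` below) follows the same pattern: it mixes a Hugging Face backbone with `LlavaMetaModel`/`LlavaMetaForCausalLM`, routes `images` through `prepare_inputs_labels_for_multimodal` inside `forward`/`generate`, and registers the new config and model with `AutoConfig`/`AutoModelForCausalLM`. A minimal inference sketch using these classes might look as follows; it assumes the usual LLaVA-NeXT helper signatures (`load_pretrained_model`, `process_images`, `tokenizer_image_token`) and uses placeholder checkpoint and conversation-template names rather than values confirmed by this repository.

```python
import torch
from PIL import Image

from llava.model.builder import load_pretrained_model
from llava.mm_utils import process_images, tokenizer_image_token
from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
from llava.conversation import conv_templates

# Placeholder path and template name; substitute a released Insight-V checkpoint and its template.
tokenizer, model, image_processor, _ = load_pretrained_model("PATH/TO/CHECKPOINT", None, "llava_llama")
conv = conv_templates["llava_llama_3"].copy()

image = Image.open("example.png").convert("RGB")
image_tensor = process_images([image], image_processor, model.config)
if isinstance(image_tensor, list):
    image_tensor = [t.to(model.device, dtype=torch.float16) for t in image_tensor]
else:
    image_tensor = image_tensor.to(model.device, dtype=torch.float16)

# Insert the image token into the prompt, then tokenize with the special IMAGE_TOKEN_INDEX.
conv.append_message(conv.roles[0], DEFAULT_IMAGE_TOKEN + "\nDescribe this image step by step.")
conv.append_message(conv.roles[1], None)
input_ids = tokenizer_image_token(conv.get_prompt(), tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(model.device)

with torch.inference_mode():
    output_ids = model.generate(input_ids, images=image_tensor, image_sizes=[image.size], do_sample=False, max_new_tokens=512)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```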
-------------------------------------------------------------------------------- /llava/model/language_model/llava_qwen.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Hao Zhang 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | from typing import List, Optional, Tuple, Union, Dict 17 | import torch 18 | import torch.nn as nn 19 | from torch.nn import CrossEntropyLoss 20 | 21 | import transformers 22 | from transformers import AutoConfig, AutoModelForCausalLM, LlamaConfig, LlamaModel, LlamaForCausalLM 23 | 24 | from transformers.modeling_outputs import CausalLMOutputWithPast 25 | from transformers.generation.utils import GenerateOutput 26 | 27 | # from ...constants import IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN 28 | from llava.model.llava_arch import LlavaMetaModel, LlavaMetaForCausalLM 29 | from transformers import Qwen2Config, Qwen2Model, Qwen2ForCausalLM 30 | 31 | # from .qwen.modeling_qwen import QWenLMHeadModel, QWenModel 32 | # from .qwen.configuration_qwen import QWenConfig 33 | 34 | import os 35 | if 'DPO_FORWARD' in os.environ: 36 | print("DPO_FORWARD is set") 37 | DPO_FORWARD = True 38 | else: 39 | DPO_FORWARD = False 40 | 41 | 42 | class LlavaQwenConfig(Qwen2Config): 43 | model_type = "llava_qwen" 44 | 45 | 46 | class LlavaQwenModel(LlavaMetaModel, Qwen2Model): 47 | config_class = LlavaQwenConfig 48 | 49 | def __init__(self, config: Qwen2Config): 50 | super(LlavaQwenModel, self).__init__(config) 51 | 52 | 53 | class LlavaQwenForCausalLM(Qwen2ForCausalLM, LlavaMetaForCausalLM): 54 | config_class = LlavaQwenConfig 55 | 56 | def __init__(self, config): 57 | # super(Qwen2ForCausalLM, self).__init__(config) 58 | Qwen2ForCausalLM.__init__(self, config) 59 | config.model_type = "llava_qwen" 60 | config.rope_scaling = None 61 | 62 | self.model = LlavaQwenModel(config) 63 | self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) 64 | # Initialize weights and apply final processing 65 | self.post_init() 66 | 67 | def get_model(self): 68 | return self.model 69 | 70 | def forward( 71 | self, 72 | input_ids: torch.LongTensor = None, 73 | attention_mask: Optional[torch.Tensor] = None, 74 | position_ids: Optional[torch.LongTensor] = None, 75 | past_key_values: Optional[List[torch.FloatTensor]] = None, 76 | inputs_embeds: Optional[torch.FloatTensor] = None, 77 | labels: Optional[torch.LongTensor] = None, 78 | use_cache: Optional[bool] = None, 79 | output_attentions: Optional[bool] = None, 80 | output_hidden_states: Optional[bool] = None, 81 | images: Optional[torch.FloatTensor] = None, 82 | image_sizes: Optional[List[List[int]]] = None, 83 | return_dict: Optional[bool] = None, 84 | modalities: Optional[List[str]] = ["image"], 85 | dpo_forward: Optional[bool] = False, 86 | cache_position=None, 87 | ) -> Union[Tuple, CausalLMOutputWithPast]: 88 | 89 | if inputs_embeds is None: 90 | (input_ids, position_ids, 
attention_mask, past_key_values, inputs_embeds, labels) = self.prepare_inputs_labels_for_multimodal(input_ids, position_ids, attention_mask, past_key_values, labels, images, modalities, image_sizes) 91 | 92 | if DPO_FORWARD: 93 | outputs = self.model( 94 | input_ids=input_ids, 95 | attention_mask=attention_mask, 96 | position_ids=position_ids, 97 | past_key_values=past_key_values, 98 | inputs_embeds=inputs_embeds, 99 | use_cache=use_cache, 100 | output_attentions=output_attentions, 101 | output_hidden_states=output_hidden_states, 102 | return_dict=return_dict, 103 | ) 104 | 105 | hidden_states = outputs[0] 106 | logits = self.lm_head(hidden_states) 107 | return logits, labels 108 | 109 | else: 110 | return super().forward( 111 | input_ids=input_ids, 112 | attention_mask=attention_mask, 113 | position_ids=position_ids, 114 | past_key_values=past_key_values, 115 | inputs_embeds=inputs_embeds, 116 | labels=labels, 117 | use_cache=use_cache, 118 | output_attentions=output_attentions, 119 | output_hidden_states=output_hidden_states, 120 | return_dict=return_dict, 121 | ) 122 | 123 | @torch.no_grad() 124 | def generate( 125 | self, 126 | inputs: Optional[torch.Tensor] = None, 127 | images: Optional[torch.Tensor] = None, 128 | image_sizes: Optional[torch.Tensor] = None, 129 | modalities: Optional[List[str]] = ["image"], 130 | **kwargs, 131 | ) -> Union[GenerateOutput, torch.LongTensor]: 132 | position_ids = kwargs.pop("position_ids", None) 133 | attention_mask = kwargs.pop("attention_mask", None) 134 | if "inputs_embeds" in kwargs: 135 | raise NotImplementedError("`inputs_embeds` is not supported") 136 | 137 | if images is not None: 138 | (inputs, position_ids, attention_mask, _, inputs_embeds, _) = self.prepare_inputs_labels_for_multimodal(inputs, position_ids, attention_mask, None, None, images, modalities, image_sizes=image_sizes) 139 | else: 140 | inputs_embeds = self.get_model().embed_tokens(inputs) 141 | 142 | return super().generate(position_ids=position_ids, attention_mask=attention_mask, inputs_embeds=inputs_embeds, **kwargs) 143 | 144 | def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs): 145 | images = kwargs.pop("images", None) 146 | image_sizes = kwargs.pop("image_sizes", None) 147 | inputs = super().prepare_inputs_for_generation(input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs) 148 | if images is not None: 149 | inputs["images"] = images 150 | if image_sizes is not None: 151 | inputs["image_sizes"] = image_sizes 152 | return inputs 153 | 154 | 155 | AutoConfig.register("llava_qwen", LlavaQwenConfig) 156 | AutoModelForCausalLM.register(LlavaQwenConfig, LlavaQwenForCausalLM) 157 | -------------------------------------------------------------------------------- /llava/model/language_model/llava_qwen_moe.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Hao Zhang 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | 16 | from typing import List, Optional, Tuple, Union, Dict 17 | import torch 18 | import torch.nn as nn 19 | from torch.nn import CrossEntropyLoss 20 | 21 | import transformers 22 | from transformers import AutoConfig, AutoModelForCausalLM 23 | 24 | from transformers.modeling_outputs import CausalLMOutputWithPast 25 | from transformers.generation.utils import GenerateOutput 26 | 27 | # from ...constants import IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN 28 | from llava.model.llava_arch import LlavaMetaModel, LlavaMetaForCausalLM 29 | from transformers import Qwen2MoeConfig, Qwen2MoeModel, Qwen2MoeForCausalLM 30 | 31 | # from .qwen.modeling_qwen import QWenLMHeadModel, QWenModel 32 | # from .qwen.configuration_qwen import QWenConfig 33 | 34 | 35 | class LlavaQwenMoeConfig(Qwen2MoeConfig): 36 | model_type = "llava_qwen_moe" 37 | 38 | 39 | class LlavaQwenMoeModel(LlavaMetaModel, Qwen2MoeModel): 40 | config_class = LlavaQwenMoeConfig 41 | 42 | def __init__(self, config: Qwen2MoeConfig): 43 | super(LlavaQwenMoeModel, self).__init__(config) 44 | 45 | 46 | class LlavaQwenMoeForCausalLM(Qwen2MoeForCausalLM, LlavaMetaForCausalLM): 47 | config_class = LlavaQwenMoeConfig 48 | 49 | def __init__(self, config): 50 | # super(Qwen2MoeForCausalLM, self).__init__(config) 51 | Qwen2MoeForCausalLM.__init__(self, config) 52 | config.model_type = "llava_qwen_moe" 53 | config.rope_scaling = None 54 | 55 | self.model = LlavaQwenMoeModel(config) 56 | self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) 57 | # Initialize weights and apply final processing 58 | self.post_init() 59 | 60 | def get_model(self): 61 | return self.model 62 | 63 | def forward( 64 | self, 65 | input_ids: torch.LongTensor = None, 66 | attention_mask: Optional[torch.Tensor] = None, 67 | position_ids: Optional[torch.LongTensor] = None, 68 | past_key_values: Optional[List[torch.FloatTensor]] = None, 69 | inputs_embeds: Optional[torch.FloatTensor] = None, 70 | labels: Optional[torch.LongTensor] = None, 71 | use_cache: Optional[bool] = None, 72 | output_attentions: Optional[bool] = None, 73 | output_hidden_states: Optional[bool] = None, 74 | images: Optional[torch.FloatTensor] = None, 75 | image_sizes: Optional[List[List[int]]] = None, 76 | return_dict: Optional[bool] = None, 77 | modalities: Optional[List[str]] = ["image"], 78 | dpo_forward: Optional[bool] = False, 79 | cache_position=None, 80 | ) -> Union[Tuple, CausalLMOutputWithPast]: 81 | 82 | if inputs_embeds is None: 83 | (input_ids, position_ids, attention_mask, past_key_values, inputs_embeds, labels) = self.prepare_inputs_labels_for_multimodal(input_ids, position_ids, attention_mask, past_key_values, labels, images, modalities, image_sizes) 84 | 85 | if dpo_forward: 86 | outputs = self.model( 87 | input_ids=input_ids, 88 | attention_mask=attention_mask, 89 | position_ids=position_ids, 90 | past_key_values=past_key_values, 91 | inputs_embeds=inputs_embeds, 92 | use_cache=use_cache, 93 | output_attentions=output_attentions, 94 | output_hidden_states=output_hidden_states, 95 | return_dict=return_dict, 96 | ) 97 | 98 | hidden_states = outputs[0] 99 | logits = self.lm_head(hidden_states) 100 | return logits, labels 101 | 102 | else: 103 | return super().forward( 104 | input_ids=input_ids, 105 | attention_mask=attention_mask, 106 | position_ids=position_ids, 107 | past_key_values=past_key_values, 108 | inputs_embeds=inputs_embeds, 109 | labels=labels, 110 | use_cache=use_cache, 111 | 
output_attentions=output_attentions, 112 | output_hidden_states=output_hidden_states, 113 | return_dict=return_dict, 114 | ) 115 | 116 | @torch.no_grad() 117 | def generate( 118 | self, 119 | inputs: Optional[torch.Tensor] = None, 120 | images: Optional[torch.Tensor] = None, 121 | image_sizes: Optional[torch.Tensor] = None, 122 | modalities: Optional[List[str]] = ["image"], 123 | **kwargs, 124 | ) -> Union[GenerateOutput, torch.LongTensor]: 125 | position_ids = kwargs.pop("position_ids", None) 126 | attention_mask = kwargs.pop("attention_mask", None) 127 | if "inputs_embeds" in kwargs: 128 | raise NotImplementedError("`inputs_embeds` is not supported") 129 | 130 | if images is not None: 131 | (inputs, position_ids, attention_mask, _, inputs_embeds, _) = self.prepare_inputs_labels_for_multimodal(inputs, position_ids, attention_mask, None, None, images, modalities, image_sizes=image_sizes) 132 | else: 133 | inputs_embeds = self.get_model().embed_tokens(inputs) 134 | 135 | return super().generate(position_ids=position_ids, attention_mask=attention_mask, inputs_embeds=inputs_embeds, **kwargs) 136 | 137 | def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs): 138 | images = kwargs.pop("images", None) 139 | image_sizes = kwargs.pop("image_sizes", None) 140 | inputs = super().prepare_inputs_for_generation(input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs) 141 | if images is not None: 142 | inputs["images"] = images 143 | if image_sizes is not None: 144 | inputs["image_sizes"] = image_sizes 145 | return inputs 146 | 147 | 148 | AutoConfig.register("llava_qwen_moe", LlavaQwenMoeConfig) 149 | AutoModelForCausalLM.register(LlavaQwenMoeConfig, LlavaQwenMoeForCausalLM) 150 | -------------------------------------------------------------------------------- /llava/model/make_delta.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m llava.model.make_delta --base ~/model_weights/llama-7b --target ~/model_weights/llava-7b --delta ~/model_weights/llava-7b-delta --hub-repo-id liuhaotian/llava-7b-delta 4 | """ 5 | 6 | import argparse 7 | 8 | import torch 9 | from tqdm import tqdm 10 | from transformers import AutoTokenizer, AutoModelForCausalLM 11 | from llava.model.utils import auto_upgrade 12 | 13 | 14 | def make_delta(base_model_path, target_model_path, delta_path, hub_repo_id): 15 | print("Loading base model") 16 | base = AutoModelForCausalLM.from_pretrained(base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 17 | 18 | print("Loading target model") 19 | auto_upgrade(target_model_path) 20 | target = AutoModelForCausalLM.from_pretrained(target_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 21 | 22 | print("Calculating delta") 23 | for name, param in tqdm(target.state_dict().items(), desc="Calculating delta"): 24 | if name not in base.state_dict(): 25 | assert name in ["model.mm_projector.weight", "model.mm_projector.bias"], f"{name} not in base model" 26 | continue 27 | if param.data.shape == base.state_dict()[name].shape: 28 | param.data -= base.state_dict()[name] 29 | else: 30 | assert name in ["model.embed_tokens.weight", "lm_head.weight"], f"{name} dimension mismatch: {param.data.shape} vs {base.state_dict()[name].shape}" 31 | bparam = base.state_dict()[name] 32 | param.data[: bparam.shape[0], : bparam.shape[1]] -= bparam 33 | 34 | print("Saving delta") 35 | if hub_repo_id: 36 | kwargs = {"push_to_hub": True, "repo_id": 
hub_repo_id} 37 | else: 38 | kwargs = {} 39 | target.save_pretrained(delta_path, **kwargs) 40 | target_tokenizer = AutoTokenizer.from_pretrained(target_model_path) 41 | target_tokenizer.save_pretrained(delta_path, **kwargs) 42 | 43 | 44 | if __name__ == "__main__": 45 | parser = argparse.ArgumentParser() 46 | parser.add_argument("--base-model-path", type=str, required=True) 47 | parser.add_argument("--target-model-path", type=str, required=True) 48 | parser.add_argument("--delta-path", type=str, required=True) 49 | parser.add_argument("--hub-repo-id", type=str, default=None) 50 | args = parser.parse_args() 51 | 52 | make_delta(args.base_model_path, args.target_model_path, args.delta_path, args.hub_repo_id) 53 | -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/__pycache__/builder.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongyh20/Insight-V/5e9c6f934c1929e351dbff17a6c6c7bc2d8143b1/llava/model/multimodal_encoder/__pycache__/builder.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/__pycache__/clip_encoder.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongyh20/Insight-V/5e9c6f934c1929e351dbff17a6c6c7bc2d8143b1/llava/model/multimodal_encoder/__pycache__/clip_encoder.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/__pycache__/hf_vision.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongyh20/Insight-V/5e9c6f934c1929e351dbff17a6c6c7bc2d8143b1/llava/model/multimodal_encoder/__pycache__/hf_vision.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/__pycache__/imagebind.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongyh20/Insight-V/5e9c6f934c1929e351dbff17a6c6c7bc2d8143b1/llava/model/multimodal_encoder/__pycache__/imagebind.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/__pycache__/open_clip_encoder.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongyh20/Insight-V/5e9c6f934c1929e351dbff17a6c6c7bc2d8143b1/llava/model/multimodal_encoder/__pycache__/open_clip_encoder.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/__pycache__/siglip_encoder.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongyh20/Insight-V/5e9c6f934c1929e351dbff17a6c6c7bc2d8143b1/llava/model/multimodal_encoder/__pycache__/siglip_encoder.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/builder.py: -------------------------------------------------------------------------------- 1 | import os 2 | from .clip_encoder import CLIPVisionTower 3 | from .imagebind import ImageBindWrapper 4 | from .open_clip_encoder import OpenCLIPVisionTower 5 | from .hf_vision import HFVisionTower 6 | from 
.siglip_encoder import SigLipVisionTower 7 | from .clip_encoder import CLIPVisionTower, CLIPVisionTowerS2 8 | 9 | # from .eva_clip.eva_clip_encoder import EvaClipVisionTower 10 | # from .dev_eva_clip.eva_vit import EvaViTWrapper 11 | 12 | 13 | def build_vision_tower(vision_tower_cfg, **kwargs): 14 | vision_tower = getattr(vision_tower_cfg, "mm_vision_tower", getattr(vision_tower_cfg, "vision_tower", None)) 15 | is_absolute_path_exists = os.path.exists(vision_tower) 16 | use_s2 = getattr(vision_tower_cfg, "s2", False) 17 | if is_absolute_path_exists or vision_tower.startswith("openai") or vision_tower.startswith("laion") or "ShareGPT4V" in vision_tower: 18 | if use_s2: 19 | return CLIPVisionTowerS2(vision_tower, args=vision_tower_cfg, **kwargs) 20 | else: 21 | return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 22 | elif "siglip" in vision_tower: 23 | return SigLipVisionTower(vision_tower, vision_tower_cfg=vision_tower_cfg, **kwargs) 24 | elif vision_tower.startswith("hf:"): 25 | return HFVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 26 | elif vision_tower in ["imagebind_huge"]: 27 | return ImageBindWrapper(vision_tower, args=vision_tower_cfg, **kwargs) 28 | elif vision_tower.startswith("open_clip_hub"): 29 | return OpenCLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 30 | # elif "internal-eva" in vision_tower.lower() or "eva02" in vision_tower.lower(): 31 | # return EvaClipVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 32 | # elif vision_tower in ["EVA-CLIP-8B", "EVA-CLIP-8B-plus"]: 33 | # return EvaViTWrapper(vision_tower, args=vision_tower_cfg, **kwargs) 34 | 35 | raise ValueError(f"Unknown vision tower: {vision_tower}") 36 | -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/__init__.py: -------------------------------------------------------------------------------- 1 | from .constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD 2 | from .factory import create_model, create_model_and_transforms, create_model_from_pretrained, get_tokenizer 3 | from .factory import list_models, add_model_config, get_model_config, load_checkpoint 4 | from .loss import ClipLoss 5 | from .model import CLIP, CustomCLIP, CLIPTextCfg, CLIPVisionCfg, convert_weights_to_lp, convert_weights_to_fp16, trace_model, get_cast_dtype 6 | from .openai import load_openai_model, list_openai_models 7 | from .pretrained import list_pretrained, list_pretrained_models_by_tag, list_pretrained_tags_by_model, get_pretrained_url, download_pretrained_from_url, is_pretrained_cfg, get_pretrained_cfg, download_pretrained 8 | from .tokenizer import SimpleTokenizer, tokenize 9 | from .transform import image_transform 10 | -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongyh20/Insight-V/5e9c6f934c1929e351dbff17a6c6c7bc2d8143b1/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/constants.py: -------------------------------------------------------------------------------- 1 | OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073) 2 | OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711) 3 | 
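OPENAI_DATASET_MEAN and OPENAI_DATASET_STD above are the per-channel RGB statistics used for CLIP-style preprocessing; image_transform() in transform.py further down in this dump falls back to them whenever mean/std are not supplied. A minimal sketch of the equivalent manual normalization, assuming torchvision is available and using a random tensor as a stand-in for an RGB image already scaled to [0, 1]:

import torch
from torchvision.transforms import Normalize

OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073)
OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711)

normalize = Normalize(mean=OPENAI_DATASET_MEAN, std=OPENAI_DATASET_STD)

image = torch.rand(3, 224, 224)      # stand-in for an RGB image in [0, 1]
pixel_values = normalize(image)      # per-channel (x - mean) / std, as the CLIP towers expect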
-------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/hf_configs.py: -------------------------------------------------------------------------------- 1 | # HF architecture dict: 2 | arch_dict = { 3 | # https://huggingface.co/docs/transformers/model_doc/roberta#roberta 4 | "roberta": { 5 | "config_names": { 6 | "context_length": "max_position_embeddings", 7 | "vocab_size": "vocab_size", 8 | "width": "hidden_size", 9 | "heads": "num_attention_heads", 10 | "layers": "num_hidden_layers", 11 | "layer_attr": "layer", 12 | "token_embeddings_attr": "embeddings", 13 | }, 14 | "pooler": "mean_pooler", 15 | }, 16 | # https://huggingface.co/docs/transformers/model_doc/xlm-roberta#transformers.XLMRobertaConfig 17 | "xlm-roberta": { 18 | "config_names": { 19 | "context_length": "max_position_embeddings", 20 | "vocab_size": "vocab_size", 21 | "width": "hidden_size", 22 | "heads": "num_attention_heads", 23 | "layers": "num_hidden_layers", 24 | "layer_attr": "layer", 25 | "token_embeddings_attr": "embeddings", 26 | }, 27 | "pooler": "mean_pooler", 28 | }, 29 | # https://huggingface.co/docs/transformers/model_doc/mt5#mt5 30 | "mt5": { 31 | "config_names": { 32 | # unlimited seqlen 33 | # https://github.com/google-research/text-to-text-transfer-transformer/issues/273 34 | # https://github.com/huggingface/transformers/blob/v4.24.0/src/transformers/models/t5/modeling_t5.py#L374 35 | "context_length": "", 36 | "vocab_size": "vocab_size", 37 | "width": "d_model", 38 | "heads": "num_heads", 39 | "layers": "num_layers", 40 | "layer_attr": "block", 41 | "token_embeddings_attr": "embed_tokens", 42 | }, 43 | "pooler": "mean_pooler", 44 | }, 45 | "bert": { 46 | "config_names": { 47 | "context_length": "max_position_embeddings", 48 | "vocab_size": "vocab_size", 49 | "width": "hidden_size", 50 | "heads": "num_attention_heads", 51 | "layers": "num_hidden_layers", 52 | "layer_attr": "layer", 53 | "token_embeddings_attr": "embeddings", 54 | }, 55 | "pooler": "mean_pooler", 56 | }, 57 | } 58 | -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/loss.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | from torch.nn import functional as F 5 | 6 | try: 7 | import torch.distributed.nn 8 | from torch import distributed as dist 9 | 10 | has_distributed = True 11 | except ImportError: 12 | has_distributed = False 13 | 14 | try: 15 | import horovod.torch as hvd 16 | except ImportError: 17 | hvd = None 18 | 19 | from timm.loss import LabelSmoothingCrossEntropy 20 | 21 | 22 | def gather_features(image_features, text_features, local_loss=False, gather_with_grad=False, rank=0, world_size=1, use_horovod=False): 23 | assert has_distributed, "torch.distributed did not import correctly, please use a PyTorch version with support." 
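# gather_features() collects the image and text embeddings from every rank so that
# ClipLoss can score the contrastive objective over the global batch: with Horovod it
# uses hvd.allgather, otherwise torch.distributed; gather_with_grad switches to the
# differentiable torch.distributed.nn.all_gather, and in the non-differentiable path,
# when local_loss is False, the local rank's tensors are spliced back into the gathered
# buffers so gradients still flow for the local shard. ClipLoss then pairs row i with
# column i of the similarity matrix (condensed from ClipLoss.forward below):
#
#     labels = torch.arange(num_logits, device=device)
#     if local_loss and world_size > 1:
#         labels = labels + num_logits * rank      # offset into the global similarity matrix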
24 | if use_horovod: 25 | assert hvd is not None, "Please install horovod" 26 | if gather_with_grad: 27 | all_image_features = hvd.allgather(image_features) 28 | all_text_features = hvd.allgather(text_features) 29 | else: 30 | with torch.no_grad(): 31 | all_image_features = hvd.allgather(image_features) 32 | all_text_features = hvd.allgather(text_features) 33 | if not local_loss: 34 | # ensure grads for local rank when all_* features don't have a gradient 35 | gathered_image_features = list(all_image_features.chunk(world_size, dim=0)) 36 | gathered_text_features = list(all_text_features.chunk(world_size, dim=0)) 37 | gathered_image_features[rank] = image_features 38 | gathered_text_features[rank] = text_features 39 | all_image_features = torch.cat(gathered_image_features, dim=0) 40 | all_text_features = torch.cat(gathered_text_features, dim=0) 41 | else: 42 | # We gather tensors from all gpus 43 | if gather_with_grad: 44 | all_image_features = torch.cat(torch.distributed.nn.all_gather(image_features), dim=0) 45 | all_text_features = torch.cat(torch.distributed.nn.all_gather(text_features), dim=0) 46 | # all_image_features = torch.cat(torch.distributed.nn.all_gather(image_features, async_op=True), dim=0) 47 | # all_text_features = torch.cat(torch.distributed.nn.all_gather(text_features, async_op=True), dim=0) 48 | else: 49 | gathered_image_features = [torch.zeros_like(image_features) for _ in range(world_size)] 50 | gathered_text_features = [torch.zeros_like(text_features) for _ in range(world_size)] 51 | dist.all_gather(gathered_image_features, image_features) 52 | dist.all_gather(gathered_text_features, text_features) 53 | if not local_loss: 54 | # ensure grads for local rank when all_* features don't have a gradient 55 | gathered_image_features[rank] = image_features 56 | gathered_text_features[rank] = text_features 57 | all_image_features = torch.cat(gathered_image_features, dim=0) 58 | all_text_features = torch.cat(gathered_text_features, dim=0) 59 | 60 | return all_image_features, all_text_features 61 | 62 | 63 | class ClipLoss(nn.Module): 64 | 65 | def __init__( 66 | self, 67 | local_loss=False, 68 | gather_with_grad=False, 69 | cache_labels=False, 70 | rank=0, 71 | world_size=1, 72 | use_horovod=False, 73 | smoothing=0.0, 74 | ): 75 | super().__init__() 76 | self.local_loss = local_loss 77 | self.gather_with_grad = gather_with_grad 78 | self.cache_labels = cache_labels 79 | self.rank = rank 80 | self.world_size = world_size 81 | self.use_horovod = use_horovod 82 | self.label_smoothing_cross_entropy = LabelSmoothingCrossEntropy(smoothing=smoothing) if smoothing > 0 else None 83 | 84 | # cache state 85 | self.prev_num_logits = 0 86 | self.labels = {} 87 | 88 | def forward(self, image_features, text_features, logit_scale=1.0): 89 | device = image_features.device 90 | if self.world_size > 1: 91 | all_image_features, all_text_features = gather_features(image_features, text_features, self.local_loss, self.gather_with_grad, self.rank, self.world_size, self.use_horovod) 92 | 93 | if self.local_loss: 94 | logits_per_image = logit_scale * image_features @ all_text_features.T 95 | logits_per_text = logit_scale * text_features @ all_image_features.T 96 | else: 97 | logits_per_image = logit_scale * all_image_features @ all_text_features.T 98 | logits_per_text = logits_per_image.T 99 | else: 100 | logits_per_image = logit_scale * image_features @ text_features.T 101 | logits_per_text = logit_scale * text_features @ image_features.T 102 | # calculated ground-truth and cache if enabled 103 | 
num_logits = logits_per_image.shape[0] 104 | if self.prev_num_logits != num_logits or device not in self.labels: 105 | labels = torch.arange(num_logits, device=device, dtype=torch.long) 106 | if self.world_size > 1 and self.local_loss: 107 | labels = labels + num_logits * self.rank 108 | if self.cache_labels: 109 | self.labels[device] = labels 110 | self.prev_num_logits = num_logits 111 | else: 112 | labels = self.labels[device] 113 | 114 | if self.label_smoothing_cross_entropy: 115 | total_loss = (self.label_smoothing_cross_entropy(logits_per_image, labels) + self.label_smoothing_cross_entropy(logits_per_text, labels)) / 2 116 | else: 117 | total_loss = (F.cross_entropy(logits_per_image, labels) + F.cross_entropy(logits_per_text, labels)) / 2 118 | 119 | acc = None 120 | i2t_acc = (logits_per_image.argmax(-1) == labels).sum() / len(logits_per_image) 121 | t2i_acc = (logits_per_text.argmax(-1) == labels).sum() / len(logits_per_text) 122 | acc = {"i2t": i2t_acc, "t2i": t2i_acc} 123 | return total_loss, acc 124 | -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA-CLIP-18B.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1536, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 48, 6 | "width": 5120, 7 | "head_width": 128, 8 | "mlp_ratio": 5, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-18b-14-x", 11 | "drop_path_rate": 0, 12 | "qkv_bias": false, 13 | "xattn": true, 14 | "postnorm": true, 15 | "fusedLN": false, 16 | "use_rms_norm": true 17 | }, 18 | "text_cfg": { 19 | "context_length": 77, 20 | "vocab_size": 49408, 21 | "width": 1280, 22 | "heads": 20, 23 | "layers": 32, 24 | "xattn": false, 25 | "fusedLN": false 26 | } 27 | } -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA-CLIP-8B-plus.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1280, 3 | "vision_cfg": { 4 | "image_size": 448, 5 | "layers": 32, 6 | "width": 4096, 7 | "head_width": 128, 8 | "mlp_ratio": 5, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-8b-14-plus-x", 11 | "drop_path_rate": 0, 12 | "qkv_bias": false, 13 | "xattn": true, 14 | "postnorm": false, 15 | "fusedLN": false, 16 | "use_rms_norm": true 17 | }, 18 | "text_cfg": { 19 | "context_length": 77, 20 | "vocab_size": 49408, 21 | "width": 1280, 22 | "heads": 20, 23 | "layers": 32, 24 | "xattn": false, 25 | "fusedLN": false 26 | } 27 | } -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA-CLIP-8B.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1280, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 4096, 7 | "head_width": 128, 8 | "mlp_ratio": 5, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-8b-14-x", 11 | "drop_path_rate": 0, 12 | "qkv_bias": false, 13 | "xattn": true, 14 | "postnorm": false, 15 | "fusedLN": false, 16 | "use_rms_norm": true 17 | }, 18 | "text_cfg": { 19 | "context_length": 77, 20 | "vocab_size": 49408, 21 | "width": 1280, 22 | "heads": 20, 23 | "layers": 32, 24 | "xattn": false, 25 | "fusedLN": false 26 | } 27 | } -------------------------------------------------------------------------------- 
/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA01-CLIP-B-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 16, 8 | "eva_model_name": "eva-clip-b-16", 9 | "ls_init_value": 0.1, 10 | "drop_path_rate": 0.0 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 512, 16 | "heads": 8, 17 | "layers": 12 18 | } 19 | } -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA01-CLIP-g-14-plus.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 40, 6 | "width": 1408, 7 | "head_width": 88, 8 | "mlp_ratio": 4.3637, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-g-14-x", 11 | "drop_path_rate": 0, 12 | "xattn": true, 13 | "fusedLN": true 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 1024, 19 | "heads": 16, 20 | "layers": 24, 21 | "xattn": false, 22 | "fusedLN": true 23 | } 24 | } -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA01-CLIP-g-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 40, 6 | "width": 1408, 7 | "head_width": 88, 8 | "mlp_ratio": 4.3637, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-g-14-x", 11 | "drop_path_rate": 0.4, 12 | "xattn": true, 13 | "fusedLN": true 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 768, 19 | "heads": 12, 20 | "layers": 12, 21 | "xattn": false, 22 | "fusedLN": true 23 | } 24 | } -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA02-CLIP-B-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "head_width": 64, 8 | "patch_size": 16, 9 | "mlp_ratio": 2.6667, 10 | "eva_model_name": "eva-clip-b-16-X", 11 | "drop_path_rate": 0.0, 12 | "xattn": true, 13 | "fusedLN": true, 14 | "rope": true, 15 | "pt_hw_seq_len": 16, 16 | "intp_freq": true, 17 | "naiveswiglu": true, 18 | "subln": true 19 | }, 20 | "text_cfg": { 21 | "context_length": 77, 22 | "vocab_size": 49408, 23 | "width": 512, 24 | "heads": 8, 25 | "layers": 12, 26 | "xattn": true, 27 | "fusedLN": true 28 | } 29 | } -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA02-CLIP-L-14-336.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 336, 5 | "layers": 24, 6 | "width": 1024, 7 | "drop_path_rate": 0, 8 | "head_width": 64, 9 | "mlp_ratio": 2.6667, 10 | "patch_size": 14, 11 | "eva_model_name": "eva-clip-l-14-336", 12 | "xattn": true, 13 | "fusedLN": true, 14 | "rope": true, 15 | "pt_hw_seq_len": 16, 16 | "intp_freq": true, 17 | "naiveswiglu": true, 18 | "subln": true 19 | }, 20 | "text_cfg": { 21 | 
"context_length": 77, 22 | "vocab_size": 49408, 23 | "width": 768, 24 | "heads": 12, 25 | "layers": 12, 26 | "xattn": false, 27 | "fusedLN": true 28 | } 29 | } -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA02-CLIP-L-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | "drop_path_rate": 0, 8 | "head_width": 64, 9 | "mlp_ratio": 2.6667, 10 | "patch_size": 14, 11 | "eva_model_name": "eva-clip-l-14", 12 | "xattn": true, 13 | "fusedLN": true, 14 | "rope": true, 15 | "pt_hw_seq_len": 16, 16 | "intp_freq": true, 17 | "naiveswiglu": true, 18 | "subln": true 19 | }, 20 | "text_cfg": { 21 | "context_length": 77, 22 | "vocab_size": 49408, 23 | "width": 768, 24 | "heads": 12, 25 | "layers": 12, 26 | "xattn": false, 27 | "fusedLN": true 28 | } 29 | } -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA02-CLIP-bigE-14-plus.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 64, 6 | "width": 1792, 7 | "head_width": 112, 8 | "mlp_ratio": 8.571428571428571, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-4b-14-x", 11 | "drop_path_rate": 0, 12 | "xattn": true, 13 | "postnorm": true, 14 | "fusedLN": true 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 1280, 20 | "heads": 20, 21 | "layers": 32, 22 | "xattn": false, 23 | "fusedLN": true 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA02-CLIP-bigE-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 64, 6 | "width": 1792, 7 | "head_width": 112, 8 | "mlp_ratio": 8.571428571428571, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-4b-14-x", 11 | "drop_path_rate": 0, 12 | "xattn": true, 13 | "postnorm": true, 14 | "fusedLN": true 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 1024, 20 | "heads": 16, 21 | "layers": 24, 22 | "xattn": false, 23 | "fusedLN": true 24 | } 25 | } -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/Internal-EVA02-CLIP-10B-14-448.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 448, 5 | "layers": 77, 6 | "width": 2304, 7 | "head_width": 144, 8 | "mlp_ratio": 10.9722, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-10b-14-x", 11 | "drop_path_rate": 0, 12 | "xattn": true, 13 | "postnorm": false, 14 | "fusedLN": true 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 1280, 20 | "heads": 20, 21 | "layers": 32, 22 | "xattn": false, 23 | "fusedLN": true 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/Internal-EVA02-CLIP-10B-14.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 77, 6 | "width": 2304, 7 | "head_width": 144, 8 | "mlp_ratio": 10.9722, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-10b-14-x", 11 | "drop_path_rate": 0, 12 | "xattn": true, 13 | "postnorm": false, 14 | "fusedLN": true 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 1280, 20 | "heads": 20, 21 | "layers": 32, 22 | "xattn": false, 23 | "fusedLN": true 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/modified_resnet.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | import torch 4 | from torch import nn 5 | from torch.nn import functional as F 6 | 7 | from .utils import freeze_batch_norm_2d 8 | 9 | 10 | class Bottleneck(nn.Module): 11 | expansion = 4 12 | 13 | def __init__(self, inplanes, planes, stride=1): 14 | super().__init__() 15 | 16 | # all conv layers have stride 1. an avgpool is performed after the second convolution when stride > 1 17 | self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False) 18 | self.bn1 = nn.BatchNorm2d(planes) 19 | self.act1 = nn.ReLU(inplace=True) 20 | 21 | self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias=False) 22 | self.bn2 = nn.BatchNorm2d(planes) 23 | self.act2 = nn.ReLU(inplace=True) 24 | 25 | self.avgpool = nn.AvgPool2d(stride) if stride > 1 else nn.Identity() 26 | 27 | self.conv3 = nn.Conv2d(planes, planes * self.expansion, 1, bias=False) 28 | self.bn3 = nn.BatchNorm2d(planes * self.expansion) 29 | self.act3 = nn.ReLU(inplace=True) 30 | 31 | self.downsample = None 32 | self.stride = stride 33 | 34 | if stride > 1 or inplanes != planes * Bottleneck.expansion: 35 | # downsampling layer is prepended with an avgpool, and the subsequent convolution has stride 1 36 | self.downsample = nn.Sequential(OrderedDict([("-1", nn.AvgPool2d(stride)), ("0", nn.Conv2d(inplanes, planes * self.expansion, 1, stride=1, bias=False)), ("1", nn.BatchNorm2d(planes * self.expansion))])) 37 | 38 | def forward(self, x: torch.Tensor): 39 | identity = x 40 | 41 | out = self.act1(self.bn1(self.conv1(x))) 42 | out = self.act2(self.bn2(self.conv2(out))) 43 | out = self.avgpool(out) 44 | out = self.bn3(self.conv3(out)) 45 | 46 | if self.downsample is not None: 47 | identity = self.downsample(x) 48 | 49 | out += identity 50 | out = self.act3(out) 51 | return out 52 | 53 | 54 | class AttentionPool2d(nn.Module): 55 | def __init__(self, spacial_dim: int, embed_dim: int, num_heads: int, output_dim: int = None): 56 | super().__init__() 57 | self.positional_embedding = nn.Parameter(torch.randn(spacial_dim**2 + 1, embed_dim) / embed_dim**0.5) 58 | self.k_proj = nn.Linear(embed_dim, embed_dim) 59 | self.q_proj = nn.Linear(embed_dim, embed_dim) 60 | self.v_proj = nn.Linear(embed_dim, embed_dim) 61 | self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim) 62 | self.num_heads = num_heads 63 | 64 | def forward(self, x): 65 | x = x.reshape(x.shape[0], x.shape[1], x.shape[2] * x.shape[3]).permute(2, 0, 1) # NCHW -> (HW)NC 66 | x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0) # (HW+1)NC 67 | x = x + self.positional_embedding[:, None, :].to(x.dtype) # (HW+1)NC 68 | x, _ = F.multi_head_attention_forward( 69 | query=x, 70 | key=x, 71 | value=x, 72 | embed_dim_to_check=x.shape[-1], 73 | 
num_heads=self.num_heads, 74 | q_proj_weight=self.q_proj.weight, 75 | k_proj_weight=self.k_proj.weight, 76 | v_proj_weight=self.v_proj.weight, 77 | in_proj_weight=None, 78 | in_proj_bias=torch.cat([self.q_proj.bias, self.k_proj.bias, self.v_proj.bias]), 79 | bias_k=None, 80 | bias_v=None, 81 | add_zero_attn=False, 82 | dropout_p=0.0, 83 | out_proj_weight=self.c_proj.weight, 84 | out_proj_bias=self.c_proj.bias, 85 | use_separate_proj_weight=True, 86 | training=self.training, 87 | need_weights=False, 88 | ) 89 | 90 | return x[0] 91 | 92 | 93 | class ModifiedResNet(nn.Module): 94 | """ 95 | A ResNet class that is similar to torchvision's but contains the following changes: 96 | - There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool. 97 | - Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1 98 | - The final pooling layer is a QKV attention instead of an average pool 99 | """ 100 | 101 | def __init__(self, layers, output_dim, heads, image_size=224, width=64): 102 | super().__init__() 103 | self.output_dim = output_dim 104 | self.image_size = image_size 105 | 106 | # the 3-layer stem 107 | self.conv1 = nn.Conv2d(3, width // 2, kernel_size=3, stride=2, padding=1, bias=False) 108 | self.bn1 = nn.BatchNorm2d(width // 2) 109 | self.act1 = nn.ReLU(inplace=True) 110 | self.conv2 = nn.Conv2d(width // 2, width // 2, kernel_size=3, padding=1, bias=False) 111 | self.bn2 = nn.BatchNorm2d(width // 2) 112 | self.act2 = nn.ReLU(inplace=True) 113 | self.conv3 = nn.Conv2d(width // 2, width, kernel_size=3, padding=1, bias=False) 114 | self.bn3 = nn.BatchNorm2d(width) 115 | self.act3 = nn.ReLU(inplace=True) 116 | self.avgpool = nn.AvgPool2d(2) 117 | 118 | # residual layers 119 | self._inplanes = width # this is a *mutable* variable used during construction 120 | self.layer1 = self._make_layer(width, layers[0]) 121 | self.layer2 = self._make_layer(width * 2, layers[1], stride=2) 122 | self.layer3 = self._make_layer(width * 4, layers[2], stride=2) 123 | self.layer4 = self._make_layer(width * 8, layers[3], stride=2) 124 | 125 | embed_dim = width * 32 # the ResNet feature dimension 126 | self.attnpool = AttentionPool2d(image_size // 32, embed_dim, heads, output_dim) 127 | 128 | self.init_parameters() 129 | 130 | def _make_layer(self, planes, blocks, stride=1): 131 | layers = [Bottleneck(self._inplanes, planes, stride)] 132 | 133 | self._inplanes = planes * Bottleneck.expansion 134 | for _ in range(1, blocks): 135 | layers.append(Bottleneck(self._inplanes, planes)) 136 | 137 | return nn.Sequential(*layers) 138 | 139 | def init_parameters(self): 140 | if self.attnpool is not None: 141 | std = self.attnpool.c_proj.in_features**-0.5 142 | nn.init.normal_(self.attnpool.q_proj.weight, std=std) 143 | nn.init.normal_(self.attnpool.k_proj.weight, std=std) 144 | nn.init.normal_(self.attnpool.v_proj.weight, std=std) 145 | nn.init.normal_(self.attnpool.c_proj.weight, std=std) 146 | 147 | for resnet_block in [self.layer1, self.layer2, self.layer3, self.layer4]: 148 | for name, param in resnet_block.named_parameters(): 149 | if name.endswith("bn3.weight"): 150 | nn.init.zeros_(param) 151 | 152 | def lock(self, unlocked_groups=0, freeze_bn_stats=False): 153 | assert unlocked_groups == 0, "partial locking not currently supported for this model" 154 | for param in self.parameters(): 155 | param.requires_grad = False 156 | if freeze_bn_stats: 157 | freeze_batch_norm_2d(self) 158 | 159 | @torch.jit.ignore 160 | def 
set_grad_checkpointing(self, enable=True): 161 | # FIXME support for non-transformer 162 | pass 163 | 164 | def stem(self, x): 165 | x = self.act1(self.bn1(self.conv1(x))) 166 | x = self.act2(self.bn2(self.conv2(x))) 167 | x = self.act3(self.bn3(self.conv3(x))) 168 | x = self.avgpool(x) 169 | return x 170 | 171 | def forward(self, x): 172 | x = self.stem(x) 173 | x = self.layer1(x) 174 | x = self.layer2(x) 175 | x = self.layer3(x) 176 | x = self.layer4(x) 177 | x = self.attnpool(x) 178 | 179 | return x 180 | -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/openai.py: -------------------------------------------------------------------------------- 1 | """ OpenAI pretrained model functions 2 | 3 | Adapted from https://github.com/openai/CLIP. Originally MIT License, Copyright (c) 2021 OpenAI. 4 | """ 5 | 6 | import os 7 | import warnings 8 | from typing import List, Optional, Union 9 | 10 | import torch 11 | 12 | from .model import build_model_from_openai_state_dict, convert_weights_to_lp, get_cast_dtype 13 | from .pretrained import get_pretrained_url, list_pretrained_models_by_tag, download_pretrained_from_url 14 | 15 | __all__ = ["list_openai_models", "load_openai_model"] 16 | 17 | 18 | def list_openai_models() -> List[str]: 19 | """Returns the names of available CLIP models""" 20 | return list_pretrained_models_by_tag("openai") 21 | 22 | 23 | def load_openai_model( 24 | name: str, 25 | precision: Optional[str] = None, 26 | device: Optional[Union[str, torch.device]] = None, 27 | jit: bool = True, 28 | cache_dir: Optional[str] = None, 29 | ): 30 | """Load a CLIP model 31 | 32 | Parameters 33 | ---------- 34 | name : str 35 | A model name listed by `clip.available_models()`, or the path to a model checkpoint containing the state_dict 36 | precision: str 37 | Model precision, if None defaults to 'fp32' if device == 'cpu' else 'fp16'. 38 | device : Union[str, torch.device] 39 | The device to put the loaded model 40 | jit : bool 41 | Whether to load the optimized JIT model (default) or more hackable non-JIT model. 42 | cache_dir : Optional[str] 43 | The directory to cache the downloaded model weights 44 | 45 | Returns 46 | ------- 47 | model : torch.nn.Module 48 | The CLIP model 49 | preprocess : Callable[[PIL.Image], torch.Tensor] 50 | A torchvision transform that converts a PIL image into a tensor that the returned model can take as its input 51 | """ 52 | if device is None: 53 | device = "cuda" if torch.cuda.is_available() else "cpu" 54 | if precision is None: 55 | precision = "fp32" if device == "cpu" else "fp16" 56 | 57 | if get_pretrained_url(name, "openai"): 58 | model_path = download_pretrained_from_url(get_pretrained_url(name, "openai"), cache_dir=cache_dir) 59 | elif os.path.isfile(name): 60 | model_path = name 61 | else: 62 | raise RuntimeError(f"Model {name} not found; available models = {list_openai_models()}") 63 | 64 | try: 65 | # loading JIT archive 66 | model = torch.jit.load(model_path, map_location=device if jit else "cpu").eval() 67 | state_dict = None 68 | except RuntimeError: 69 | # loading saved state dict 70 | if jit: 71 | warnings.warn(f"File {model_path} is not a JIT archive. 
Loading as a state dict instead") 72 | jit = False 73 | state_dict = torch.load(model_path, map_location="cpu") 74 | 75 | if not jit: 76 | # Build a non-jit model from the OpenAI jitted model state dict 77 | cast_dtype = get_cast_dtype(precision) 78 | try: 79 | model = build_model_from_openai_state_dict(state_dict or model.state_dict(), cast_dtype=cast_dtype) 80 | except KeyError: 81 | sd = {k[7:]: v for k, v in state_dict["state_dict"].items()} 82 | model = build_model_from_openai_state_dict(sd, cast_dtype=cast_dtype) 83 | 84 | # model from OpenAI state dict is in manually cast fp16 mode, must be converted for AMP/fp32/bf16 use 85 | model = model.to(device) 86 | if precision.startswith("amp") or precision == "fp32": 87 | model.float() 88 | elif precision == "bf16": 89 | convert_weights_to_lp(model, dtype=torch.bfloat16) 90 | 91 | return model 92 | 93 | # patch the device names 94 | device_holder = torch.jit.trace(lambda: torch.ones([]).to(torch.device(device)), example_inputs=[]) 95 | device_node = [n for n in device_holder.graph.findAllNodes("prim::Constant") if "Device" in repr(n)][-1] 96 | 97 | def patch_device(module): 98 | try: 99 | graphs = [module.graph] if hasattr(module, "graph") else [] 100 | except RuntimeError: 101 | graphs = [] 102 | 103 | if hasattr(module, "forward1"): 104 | graphs.append(module.forward1.graph) 105 | 106 | for graph in graphs: 107 | for node in graph.findAllNodes("prim::Constant"): 108 | if "value" in node.attributeNames() and str(node["value"]).startswith("cuda"): 109 | node.copyAttributes(device_node) 110 | 111 | model.apply(patch_device) 112 | patch_device(model.encode_image) 113 | patch_device(model.encode_text) 114 | 115 | # patch dtype to float32 (typically for CPU) 116 | if precision == "fp32": 117 | float_holder = torch.jit.trace(lambda: torch.ones([]).float(), example_inputs=[]) 118 | float_input = list(float_holder.graph.findNode("aten::to").inputs())[1] 119 | float_node = float_input.node() 120 | 121 | def patch_float(module): 122 | try: 123 | graphs = [module.graph] if hasattr(module, "graph") else [] 124 | except RuntimeError: 125 | graphs = [] 126 | 127 | if hasattr(module, "forward1"): 128 | graphs.append(module.forward1.graph) 129 | 130 | for graph in graphs: 131 | for node in graph.findAllNodes("aten::to"): 132 | inputs = list(node.inputs()) 133 | for i in [1, 2]: # dtype can be the second or third argument to aten::to() 134 | if inputs[i].node()["value"] == 5: 135 | inputs[i].node().copyAttributes(float_node) 136 | 137 | model.apply(patch_float) 138 | patch_float(model.encode_image) 139 | patch_float(model.encode_text) 140 | model.float() 141 | 142 | # ensure image_size attr available at consistent location for both jit and non-jit 143 | model.visual.image_size = model.input_resolution.item() 144 | return model 145 | -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/rope.py: -------------------------------------------------------------------------------- 1 | from math import pi 2 | import torch 3 | from torch import nn 4 | from einops import rearrange, repeat 5 | import logging 6 | 7 | 8 | def broadcat(tensors, dim=-1): 9 | num_tensors = len(tensors) 10 | shape_lens = set(list(map(lambda t: len(t.shape), tensors))) 11 | assert len(shape_lens) == 1, "tensors must all have the same number of dimensions" 12 | shape_len = list(shape_lens)[0] 13 | dim = (dim + shape_len) if dim < 0 else dim 14 | dims = list(zip(*map(lambda t: list(t.shape), tensors))) 15 | 
expandable_dims = [(i, val) for i, val in enumerate(dims) if i != dim] 16 | assert all([*map(lambda t: len(set(t[1])) <= 2, expandable_dims)]), "invalid dimensions for broadcastable concatentation" 17 | max_dims = list(map(lambda t: (t[0], max(t[1])), expandable_dims)) 18 | expanded_dims = list(map(lambda t: (t[0], (t[1],) * num_tensors), max_dims)) 19 | expanded_dims.insert(dim, (dim, dims[dim])) 20 | expandable_shapes = list(zip(*map(lambda t: t[1], expanded_dims))) 21 | tensors = list(map(lambda t: t[0].expand(*t[1]), zip(tensors, expandable_shapes))) 22 | return torch.cat(tensors, dim=dim) 23 | 24 | 25 | def rotate_half(x): 26 | x = rearrange(x, "... (d r) -> ... d r", r=2) 27 | x1, x2 = x.unbind(dim=-1) 28 | x = torch.stack((-x2, x1), dim=-1) 29 | return rearrange(x, "... d r -> ... (d r)") 30 | 31 | 32 | class VisionRotaryEmbedding(nn.Module): 33 | def __init__( 34 | self, 35 | dim, 36 | pt_seq_len, 37 | ft_seq_len=None, 38 | custom_freqs=None, 39 | freqs_for="lang", 40 | theta=10000, 41 | max_freq=10, 42 | num_freqs=1, 43 | ): 44 | super().__init__() 45 | if custom_freqs: 46 | freqs = custom_freqs 47 | elif freqs_for == "lang": 48 | freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim)) 49 | elif freqs_for == "pixel": 50 | freqs = torch.linspace(1.0, max_freq / 2, dim // 2) * pi 51 | elif freqs_for == "constant": 52 | freqs = torch.ones(num_freqs).float() 53 | else: 54 | raise ValueError(f"unknown modality {freqs_for}") 55 | 56 | if ft_seq_len is None: 57 | ft_seq_len = pt_seq_len 58 | t = torch.arange(ft_seq_len) / ft_seq_len * pt_seq_len 59 | 60 | freqs_h = torch.einsum("..., f -> ... f", t, freqs) 61 | freqs_h = repeat(freqs_h, "... n -> ... (n r)", r=2) 62 | 63 | freqs_w = torch.einsum("..., f -> ... f", t, freqs) 64 | freqs_w = repeat(freqs_w, "... n -> ... (n r)", r=2) 65 | 66 | freqs = broadcat((freqs_h[:, None, :], freqs_w[None, :, :]), dim=-1) 67 | 68 | self.register_buffer("freqs_cos", freqs.cos()) 69 | self.register_buffer("freqs_sin", freqs.sin()) 70 | 71 | logging.info(f"Shape of rope freq: {self.freqs_cos.shape}") 72 | 73 | def forward(self, t, start_index=0): 74 | rot_dim = self.freqs_cos.shape[-1] 75 | end_index = start_index + rot_dim 76 | assert rot_dim <= t.shape[-1], f"feature dimension {t.shape[-1]} is not of sufficient size to rotate in all the positions {rot_dim}" 77 | t_left, t, t_right = t[..., :start_index], t[..., start_index:end_index], t[..., end_index:] 78 | t = (t * self.freqs_cos) + (rotate_half(t) * self.freqs_sin) 79 | 80 | return torch.cat((t_left, t, t_right), dim=-1) 81 | 82 | 83 | class VisionRotaryEmbeddingFast(nn.Module): 84 | def __init__(self, dim, pt_seq_len, ft_seq_len=None, custom_freqs=None, freqs_for="lang", theta=10000, max_freq=10, num_freqs=1, patch_dropout=0.0): 85 | super().__init__() 86 | if custom_freqs: 87 | freqs = custom_freqs 88 | elif freqs_for == "lang": 89 | freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim)) 90 | elif freqs_for == "pixel": 91 | freqs = torch.linspace(1.0, max_freq / 2, dim // 2) * pi 92 | elif freqs_for == "constant": 93 | freqs = torch.ones(num_freqs).float() 94 | else: 95 | raise ValueError(f"unknown modality {freqs_for}") 96 | 97 | if ft_seq_len is None: 98 | ft_seq_len = pt_seq_len 99 | t = torch.arange(ft_seq_len) / ft_seq_len * pt_seq_len 100 | 101 | freqs = torch.einsum("..., f -> ... f", t, freqs) 102 | freqs = repeat(freqs, "... n -> ... 
(n r)", r=2) 103 | freqs = broadcat((freqs[:, None, :], freqs[None, :, :]), dim=-1) 104 | 105 | freqs_cos = freqs.cos().view(-1, freqs.shape[-1]) 106 | freqs_sin = freqs.sin().view(-1, freqs.shape[-1]) 107 | 108 | self.patch_dropout = patch_dropout 109 | 110 | self.register_buffer("freqs_cos", freqs_cos) 111 | self.register_buffer("freqs_sin", freqs_sin) 112 | 113 | logging.info(f"Shape of rope freq: {self.freqs_cos.shape}") 114 | 115 | def forward(self, t, patch_indices_keep=None): 116 | if patch_indices_keep is not None: 117 | batch = t.size()[0] 118 | batch_indices = torch.arange(batch) 119 | batch_indices = batch_indices[..., None] 120 | 121 | freqs_cos = repeat(self.freqs_cos, "i j -> n i m j", n=t.shape[0], m=t.shape[1]) 122 | freqs_sin = repeat(self.freqs_sin, "i j -> n i m j", n=t.shape[0], m=t.shape[1]) 123 | 124 | freqs_cos = freqs_cos[batch_indices, patch_indices_keep] 125 | freqs_cos = rearrange(freqs_cos, "n i m j -> n m i j") 126 | freqs_sin = freqs_sin[batch_indices, patch_indices_keep] 127 | freqs_sin = rearrange(freqs_sin, "n i m j -> n m i j") 128 | 129 | return t * freqs_cos + rotate_half(t) * freqs_sin 130 | 131 | return t * self.freqs_cos + rotate_half(t) * self.freqs_sin 132 | -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/timm_model.py: -------------------------------------------------------------------------------- 1 | """ timm model adapter 2 | 3 | Wraps timm (https://github.com/rwightman/pytorch-image-models) models for use as a vision tower in CLIP model. 4 | """ 5 | 6 | import logging 7 | from collections import OrderedDict 8 | 9 | import torch 10 | import torch.nn as nn 11 | 12 | try: 13 | import timm 14 | from timm.models.layers import Mlp, to_2tuple 15 | 16 | try: 17 | # old timm imports < 0.8.1 18 | from timm.models.layers.attention_pool2d import RotAttentionPool2d 19 | from timm.models.layers.attention_pool2d import AttentionPool2d as AbsAttentionPool2d 20 | except ImportError: 21 | # new timm imports >= 0.8.1 22 | from timm.layers import RotAttentionPool2d 23 | from timm.layers import AttentionPool2d as AbsAttentionPool2d 24 | except ImportError: 25 | timm = None 26 | 27 | from .utils import freeze_batch_norm_2d 28 | 29 | 30 | class TimmModel(nn.Module): 31 | """timm model adapter 32 | # FIXME this adapter is a work in progress, may change in ways that break weight compat 33 | """ 34 | 35 | def __init__(self, model_name, embed_dim, image_size=224, pool="avg", proj="linear", proj_bias=False, drop=0.0, pretrained=False): 36 | super().__init__() 37 | if timm is None: 38 | raise RuntimeError("Please `pip install timm` to use timm models.") 39 | 40 | self.image_size = to_2tuple(image_size) 41 | self.trunk = timm.create_model(model_name, pretrained=pretrained) 42 | feat_size = self.trunk.default_cfg.get("pool_size", None) 43 | feature_ndim = 1 if not feat_size else 2 44 | if pool in ("abs_attn", "rot_attn"): 45 | assert feature_ndim == 2 46 | # if attn pooling used, remove both classifier and default pool 47 | self.trunk.reset_classifier(0, global_pool="") 48 | else: 49 | # reset global pool if pool config set, otherwise leave as network default 50 | reset_kwargs = dict(global_pool=pool) if pool else {} 51 | self.trunk.reset_classifier(0, **reset_kwargs) 52 | prev_chs = self.trunk.num_features 53 | 54 | head_layers = OrderedDict() 55 | if pool == "abs_attn": 56 | head_layers["pool"] = AbsAttentionPool2d(prev_chs, feat_size=feat_size, out_features=embed_dim) 57 | prev_chs = 
embed_dim 58 | elif pool == "rot_attn": 59 | head_layers["pool"] = RotAttentionPool2d(prev_chs, out_features=embed_dim) 60 | prev_chs = embed_dim 61 | else: 62 | assert proj, "projection layer needed if non-attention pooling is used." 63 | 64 | # NOTE attention pool ends with a projection layer, so proj should usually be set to '' if such pooling is used 65 | if proj == "linear": 66 | head_layers["drop"] = nn.Dropout(drop) 67 | head_layers["proj"] = nn.Linear(prev_chs, embed_dim, bias=proj_bias) 68 | elif proj == "mlp": 69 | head_layers["mlp"] = Mlp(prev_chs, 2 * embed_dim, embed_dim, drop=drop, bias=(True, proj_bias)) 70 | 71 | self.head = nn.Sequential(head_layers) 72 | 73 | def lock(self, unlocked_groups=0, freeze_bn_stats=False): 74 | """lock modules 75 | Args: 76 | unlocked_groups (int): leave last n layer groups unlocked (default: 0) 77 | """ 78 | if not unlocked_groups: 79 | # lock full model 80 | for param in self.trunk.parameters(): 81 | param.requires_grad = False 82 | if freeze_bn_stats: 83 | freeze_batch_norm_2d(self.trunk) 84 | else: 85 | # NOTE: partial freeze requires latest timm (master) branch and is subject to change 86 | try: 87 | # FIXME import here until API stable and in an official release 88 | from timm.models.helpers import group_parameters, group_modules 89 | except ImportError: 90 | raise RuntimeError("Please install latest timm `pip install git+https://github.com/rwightman/pytorch-image-models`") 91 | matcher = self.trunk.group_matcher() 92 | gparams = group_parameters(self.trunk, matcher) 93 | max_layer_id = max(gparams.keys()) 94 | max_layer_id = max_layer_id - unlocked_groups 95 | for group_idx in range(max_layer_id + 1): 96 | group = gparams[group_idx] 97 | for param in group: 98 | self.trunk.get_parameter(param).requires_grad = False 99 | if freeze_bn_stats: 100 | gmodules = group_modules(self.trunk, matcher, reverse=True) 101 | gmodules = {k for k, v in gmodules.items() if v <= max_layer_id} 102 | freeze_batch_norm_2d(self.trunk, gmodules) 103 | 104 | @torch.jit.ignore 105 | def set_grad_checkpointing(self, enable=True): 106 | try: 107 | self.trunk.set_grad_checkpointing(enable) 108 | except Exception as e: 109 | logging.warning("grad checkpointing not supported for this timm image tower, continuing without...") 110 | 111 | def forward(self, x): 112 | x = self.trunk(x) 113 | x = self.head(x) 114 | return x 115 | -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/transform.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Sequence, Tuple 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torchvision.transforms.functional as F 6 | 7 | from torchvision.transforms import Normalize, Compose, RandomResizedCrop, InterpolationMode, ToTensor, Resize, CenterCrop 8 | 9 | from .constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD 10 | 11 | 12 | class ResizeMaxSize(nn.Module): 13 | 14 | def __init__(self, max_size, interpolation=InterpolationMode.BICUBIC, fn="max", fill=0): 15 | super().__init__() 16 | if not isinstance(max_size, int): 17 | raise TypeError(f"Size should be int. 
Got {type(max_size)}") 18 | self.max_size = max_size 19 | self.interpolation = interpolation 20 | self.fn = min if fn == "min" else min 21 | self.fill = fill 22 | 23 | def forward(self, img): 24 | if isinstance(img, torch.Tensor): 25 | height, width = img.shape[:2] 26 | else: 27 | width, height = img.size 28 | scale = self.max_size / float(max(height, width)) 29 | if scale != 1.0: 30 | new_size = tuple(round(dim * scale) for dim in (height, width)) 31 | img = F.resize(img, new_size, self.interpolation) 32 | pad_h = self.max_size - new_size[0] 33 | pad_w = self.max_size - new_size[1] 34 | img = F.pad(img, padding=[pad_w // 2, pad_h // 2, pad_w - pad_w // 2, pad_h - pad_h // 2], fill=self.fill) 35 | return img 36 | 37 | 38 | def _convert_to_rgb(image): 39 | return image.convert("RGB") 40 | 41 | 42 | # class CatGen(nn.Module): 43 | # def __init__(self, num=4): 44 | # self.num = num 45 | # def mixgen_batch(image, text): 46 | # batch_size = image.shape[0] 47 | # index = np.random.permutation(batch_size) 48 | 49 | # cat_images = [] 50 | # for i in range(batch_size): 51 | # # image mixup 52 | # image[i,:] = lam * image[i,:] + (1 - lam) * image[index[i],:] 53 | # # text concat 54 | # text[i] = tokenizer((str(text[i]) + " " + str(text[index[i]])))[0] 55 | # text = torch.stack(text) 56 | # return image, text 57 | 58 | 59 | def image_transform( 60 | image_size: int, 61 | is_train: bool, 62 | mean: Optional[Tuple[float, ...]] = None, 63 | std: Optional[Tuple[float, ...]] = None, 64 | resize_longest_max: bool = False, 65 | fill_color: int = 0, 66 | ): 67 | mean = mean or OPENAI_DATASET_MEAN 68 | if not isinstance(mean, (list, tuple)): 69 | mean = (mean,) * 3 70 | 71 | std = std or OPENAI_DATASET_STD 72 | if not isinstance(std, (list, tuple)): 73 | std = (std,) * 3 74 | 75 | if isinstance(image_size, (list, tuple)) and image_size[0] == image_size[1]: 76 | # for square size, pass size as int so that Resize() uses aspect preserving shortest edge 77 | image_size = image_size[0] 78 | 79 | normalize = Normalize(mean=mean, std=std) 80 | if is_train: 81 | return Compose( 82 | [ 83 | RandomResizedCrop(image_size, scale=(0.9, 1.0), interpolation=InterpolationMode.BICUBIC), 84 | _convert_to_rgb, 85 | ToTensor(), 86 | normalize, 87 | ] 88 | ) 89 | else: 90 | if resize_longest_max: 91 | transforms = [ResizeMaxSize(image_size, fill=fill_color)] 92 | else: 93 | transforms = [ 94 | Resize(image_size, interpolation=InterpolationMode.BICUBIC), 95 | CenterCrop(image_size), 96 | ] 97 | transforms.extend( 98 | [ 99 | _convert_to_rgb, 100 | ToTensor(), 101 | normalize, 102 | ] 103 | ) 104 | return Compose(transforms) 105 | -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_vit.py: -------------------------------------------------------------------------------- 1 | # Based on EVA, BEIT, timm and DeiT code bases 2 | # https://github.com/baaivision/EVA 3 | # https://github.com/rwightman/pytorch-image-models/tree/master/timm 4 | # https://github.com/microsoft/unilm/tree/master/beit 5 | # https://github.com/facebookresearch/deit/ 6 | # https://github.com/facebookresearch/dino 7 | # --------------------------------------------------------' 8 | # not tested yet 9 | import math 10 | from transformers import CLIPImageProcessor 11 | 12 | import torch 13 | import torch.nn as nn 14 | import torch.nn.functional as F 15 | import torch.utils.checkpoint as checkpoint 16 | from timm.models.layers import drop_path, to_2tuple, trunc_normal_ 17 | from .eva_clip 
import create_model_and_transforms, get_model_config 18 | import torch 19 | import torchvision 20 | import time 21 | 22 | from llava.utils import rank0_print 23 | 24 | 25 | class EvaViTWrapper(nn.Module): 26 | def __init__(self, vision_tower, args, delay_load=False): 27 | super().__init__() 28 | 29 | self.is_loaded = False 30 | self.vision_tower_name = vision_tower 31 | self.pretrained = args.vision_tower_pretrained 32 | self.args = args 33 | 34 | self.select_layer = args.mm_vision_select_layer 35 | if self.select_layer < -1: 36 | self.select_layer += 1 37 | self.select_feature = getattr(args, "mm_vision_select_feature", "patch") 38 | 39 | self.model_config = get_model_config(self.vision_tower_name) 40 | 41 | if not delay_load: 42 | rank0_print(f"Loading vision tower: {vision_tower}") 43 | self.load_model() 44 | elif getattr(args, "unfreeze_mm_vision_tower", False): 45 | # TODO: better detector is needed. 46 | rank0_print(f"The checkpoint seems to contain `vision_tower` weights: `unfreeze_mm_vision_tower`: True.") 47 | self.load_model() 48 | elif hasattr(args, "mm_tunable_parts") and "mm_vision_tower" in args.mm_tunable_parts: 49 | rank0_print(f"The checkpoint seems to contain `vision_tower` weights: `mm_tunable_parts` contains `mm_vision_tower`.") 50 | self.load_model() 51 | 52 | def load_model(self): 53 | rank0_print(f"Loading: {self.vision_tower_name}") 54 | rank0_print(f"Pretrained: {self.pretrained}") 55 | time_start = time.time() 56 | model, _, image_processor = create_model_and_transforms(self.vision_tower_name, self.pretrained, force_custom_clip=True, precision="fp16") 57 | time_end = time.time() 58 | rank0_print(f"Loaded: {self.vision_tower_name} in {time_end - time_start:.2f}s") 59 | self.device = next(model.parameters()).device 60 | self.dtype = next(model.parameters()).dtype 61 | if self.device.type != "meta": 62 | model = model.to("cuda") 63 | self.vision_tower = model.visual 64 | resize_transform = [t for t in image_processor.transforms if isinstance(t, torchvision.transforms.Resize)][0] 65 | normalize_transform = [t for t in image_processor.transforms if isinstance(t, torchvision.transforms.Normalize)][0] 66 | self.resize_transform_size = resize_transform.size 67 | self.image_processor = CLIPImageProcessor.from_pretrained( 68 | "openai/clip-vit-large-patch14", 69 | crop_size=resize_transform.size, 70 | size={"shortest_edge": resize_transform.size}, 71 | image_mean=list(normalize_transform.mean), 72 | image_std=list(normalize_transform.std), 73 | ) 74 | rank0_print(f"Loaded image processor: {self.image_processor}") 75 | self.vision_tower.requires_grad_(False) 76 | self.is_loaded = True 77 | 78 | def feature_select(self, image_features): 79 | select_feature_type = self.select_feature 80 | 81 | # if self.select_feature in ["slicefour_patch", "slicefour_cls_patch"]: 82 | # select_every_k_layer = len(image_features) // 4 83 | # image_features = torch.cat([image_features[i] for i in range(select_every_k_layer + self.select_layer, len(image_features), select_every_k_layer)], dim=-1) 84 | # select_feature_type = select_feature_type.replace("slicefour_", "") 85 | # elif self.select_feature in ["slice_m25811_f6_patch", "slice_m25811_f6_cls_patch"]: 86 | # select_layers = [-1, -4, -7, -10, 6] 87 | # image_features = torch.cat([image_features[i] for i in select_layers], dim=-1) 88 | # select_feature_type = select_feature_type.replace("slice_m25811_f6_", "") 89 | # else: 90 | # image_features = image_features[self.select_layer] 91 | 92 | if select_feature_type == "patch": 93 | 
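            # "patch": drop the leading CLS token and keep only the patch embeddings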
image_features = image_features[:, 1:] 94 | elif select_feature_type == "cls_patch": 95 | image_features = image_features 96 | else: 97 | raise ValueError(f"Unexpected select feature: {select_feature_type}") 98 | return image_features 99 | 100 | def train(self, mode=True): 101 | self.training = mode 102 | 103 | if self.is_loaded: 104 | self.vision_tower.eval() 105 | 106 | def forward(self, images): 107 | if type(images) is list: 108 | image_features = [] 109 | for image in images: 110 | image_features = self.vision_tower.forward_features(image.to(self.dtype), return_all_features=True) 111 | image_features = self.feature_select(image_features).to(self.dtype) 112 | image_features.append(image_features) 113 | else: 114 | image_features = self.vision_tower.forward_features(images.to(self.dtype), return_all_features=True) 115 | image_features = self.feature_select(image_features).to(self.dtype) 116 | 117 | return image_features 118 | 119 | @property 120 | def dummy_feature(self): 121 | return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype) 122 | 123 | @property 124 | def hidden_size(self): 125 | return self.model_config["vision_cfg"]["width"] 126 | 127 | @property 128 | def num_patches(self): 129 | return (self.model_config["vision_cfg"]["image_size"] // self.model_config["vision_cfg"]["patch_size"]) ** 2 130 | 131 | @property 132 | def num_patches_per_side(self): 133 | return self.model_config["vision_cfg"]["image_size"] // self.model_config["vision_cfg"]["patch_size"] 134 | 135 | @property 136 | def config(self): 137 | return self.model_config 138 | 139 | @property 140 | def image_size(self): 141 | return self.model_config["vision_cfg"]["image_size"] 142 | -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/eva_clip/eva_clip_encoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from .eva_clip_processors import EvaClipImageTrainProcessor 5 | from .eva_vit import EVAEncoderWrapper 6 | from .factory import list_models, add_model_config, get_model_config 7 | 8 | from llava.utils import rank0_print 9 | 10 | 11 | class EvaClipVisionTower(nn.Module): 12 | def __init__(self, vision_tower, args, delay_load=False): 13 | super().__init__() 14 | 15 | self.is_loaded = False 16 | self.vision_tower_name = vision_tower 17 | self.vision_tower_pretrained = args.vision_tower_pretrained 18 | self.config = get_model_config(vision_tower) 19 | 20 | if not delay_load: 21 | rank0_print(f"Loading EVA ViT: {self.vision_tower_name}") 22 | self.load_model() 23 | elif getattr(args, "unfreeze_mm_vision_tower", False): 24 | # TODO: better detector is needed. 
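            # Heuristic: when the vision tower is marked trainable, the checkpoint most likely
            # stores `vision_tower` weights, so the tower is loaded eagerly even if delay_load=True.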
25 | rank0_print(f"The checkpoint seems to contain `vision_tower` weights: `unfreeze_mm_vision_tower`: True.") 26 | self.load_model() 27 | elif hasattr(args, "mm_tunable_parts") and "mm_vision_tower" in args.mm_tunable_parts: 28 | rank0_print(f"The checkpoint seems to contain `vision_tower` weights: `mm_tunable_parts` contains `mm_vision_tower`.") 29 | self.load_model() 30 | else: 31 | self.cfg_only = self.config 32 | 33 | def load_model(self, device_map=None): 34 | rank0_print(f"Pretrained: {self.vision_tower_pretrained}") 35 | self.image_processor = EvaClipImageTrainProcessor(self.config["vision_cfg"]["image_size"]) 36 | self.vision_tower = EVAEncoderWrapper(self.vision_tower_pretrained, self.config) 37 | rank0_print(f"Loaded image processor: {self.image_processor}") 38 | self.vision_tower.requires_grad_(False) 39 | self.is_loaded = True 40 | 41 | def forward(self, images): 42 | if type(images) is list: 43 | image_features = [] 44 | for image in images: 45 | image_feature = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0)).to(image.dtype) 46 | image_features.append(image_feature) 47 | else: 48 | image_features = self.vision_tower(images.to(device=self.device, dtype=self.dtype)).to(images.dtype) 49 | 50 | return image_features 51 | 52 | @property 53 | def dtype(self): 54 | return self.vision_tower.dtype 55 | 56 | @property 57 | def device(self): 58 | return self.vision_tower.device 59 | 60 | @property 61 | def hidden_size(self): 62 | return self.config["vision_cfg"]["width"] 63 | 64 | @property 65 | def num_patches(self): 66 | return (self.config["vision_cfg"]["image_size"] // self.config["vision_cfg"]["patch_size"]) ** 2 67 | 68 | @property 69 | def num_patches_per_side(self): 70 | return self.config["vision_cfg"]["image_size"] // self.config["vision_cfg"]["patch_size"] 71 | 72 | @property 73 | def image_size(self): 74 | return self.config["vision_cfg"]["image_size"] 75 | -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/eva_clip/eva_clip_processors.py: -------------------------------------------------------------------------------- 1 | """ 2 | # Adapted from https://github.com/baaivision/EVA/tree/master/EVA-CLIP 3 | """ 4 | 5 | from torchvision import transforms 6 | from torchvision.transforms.functional import InterpolationMode 7 | from transformers.image_processing_utils import BatchFeature 8 | from PIL import Image 9 | from transformers.image_transforms import convert_to_rgb 10 | 11 | 12 | class BaseProcessor: 13 | def __init__(self): 14 | self.transform = lambda x: x 15 | return 16 | 17 | def __call__(self, item): 18 | return self.transform(item) 19 | 20 | 21 | class EvaClipImageBaseProcessor(BaseProcessor): 22 | def __init__(self, mean=None, std=None): 23 | self.mean = (0.48145466, 0.4578275, 0.40821073) if mean is None else mean 24 | self.std = (0.26862954, 0.26130258, 0.27577711) if std is None else std 25 | 26 | self.normalize = transforms.Normalize(self.mean, self.std) 27 | 28 | @property 29 | def image_mean(self): 30 | return self.mean 31 | 32 | 33 | class EvaClipImageTrainProcessor(EvaClipImageBaseProcessor): 34 | def __init__(self, image_size=224, mean=None, std=None, min_scale=0.5, max_scale=1.0): 35 | super().__init__(mean=mean, std=std) 36 | 37 | self.transform = transforms.Compose( 38 | [ 39 | convert_to_rgb, 40 | transforms.Resize( 41 | image_size, 42 | interpolation=InterpolationMode.BICUBIC, 43 | ), 44 | transforms.CenterCrop(image_size), 45 | transforms.ToTensor(), 46 | 
self.normalize, 47 | ] 48 | ) 49 | 50 | self.image_size = image_size 51 | 52 | def preprocess(self, images, return_tensors): 53 | if isinstance(images, Image.Image): 54 | images = [images] 55 | else: 56 | assert isinstance(images, list) 57 | 58 | transformed_images = [self.transform(image).numpy() for image in images] 59 | data = {"pixel_values": transformed_images} 60 | 61 | return BatchFeature(data=data, tensor_type=return_tensors) 62 | 63 | def __call__(self, item): 64 | return self.transform(item) 65 | 66 | @property 67 | def crop_size(self): 68 | return {"height": self.image_size, "width": self.image_size} 69 | 70 | @property 71 | def size(self): 72 | return {"shortest_edge": self.image_size} 73 | -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/eva_clip/factory.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | import pathlib 5 | import re 6 | from copy import deepcopy 7 | from pathlib import Path 8 | from typing import Optional, Tuple, Union, Dict, Any 9 | import torch 10 | 11 | _MODEL_CONFIG_PATHS = [Path(__file__).parent / f"model_configs/"] 12 | _MODEL_CONFIGS = {} # directory (model_name: config) of model architecture configs 13 | 14 | 15 | def _natural_key(string_): 16 | return [int(s) if s.isdigit() else s for s in re.split(r"(\d+)", string_.lower())] 17 | 18 | 19 | def _rescan_model_configs(): 20 | global _MODEL_CONFIGS 21 | 22 | config_ext = (".json",) 23 | config_files = [] 24 | for config_path in _MODEL_CONFIG_PATHS: 25 | if config_path.is_file() and config_path.suffix in config_ext: 26 | config_files.append(config_path) 27 | elif config_path.is_dir(): 28 | for ext in config_ext: 29 | config_files.extend(config_path.glob(f"*{ext}")) 30 | 31 | for cf in config_files: 32 | with open(cf, "r", encoding="utf8") as f: 33 | model_cfg = json.load(f) 34 | if all(a in model_cfg for a in ("embed_dim", "vision_cfg", "text_cfg")): 35 | _MODEL_CONFIGS[cf.stem] = model_cfg 36 | 37 | _MODEL_CONFIGS = dict(sorted(_MODEL_CONFIGS.items(), key=lambda x: _natural_key(x[0]))) 38 | 39 | 40 | _rescan_model_configs() # initial populate of model config registry 41 | 42 | 43 | def list_models(): 44 | """enumerate available model architectures based on config files""" 45 | return list(_MODEL_CONFIGS.keys()) 46 | 47 | 48 | def add_model_config(path): 49 | """add model config path or file and update registry""" 50 | if not isinstance(path, Path): 51 | path = Path(path) 52 | _MODEL_CONFIG_PATHS.append(path) 53 | _rescan_model_configs() 54 | 55 | 56 | def get_model_config(model_name): 57 | if model_name in _MODEL_CONFIGS: 58 | return deepcopy(_MODEL_CONFIGS[model_name]) 59 | else: 60 | return None 61 | -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/eva_clip/model_configs/EVA-CLIP-18B.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1536, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 48, 6 | "width": 5120, 7 | "head_width": 128, 8 | "mlp_ratio": 5, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-18b-14-x", 11 | "drop_path_rate": 0, 12 | "qkv_bias": false, 13 | "xattn": true, 14 | "postnorm": true, 15 | "fusedLN": false, 16 | "use_rms_norm": true 17 | }, 18 | "text_cfg": { 19 | "context_length": 77, 20 | "vocab_size": 49408, 21 | "width": 1280, 22 | "heads": 20, 23 | "layers": 32, 24 | "xattn": false, 25 | 
"fusedLN": false 26 | } 27 | } -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/eva_clip/model_configs/EVA-CLIP-8B-plus.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1280, 3 | "vision_cfg": { 4 | "image_size": 448, 5 | "layers": 32, 6 | "width": 4096, 7 | "head_width": 128, 8 | "mlp_ratio": 5, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-8b-14-plus-x", 11 | "drop_path_rate": 0, 12 | "qkv_bias": false, 13 | "xattn": true, 14 | "postnorm": false, 15 | "fusedLN": false, 16 | "use_rms_norm": true 17 | }, 18 | "text_cfg": { 19 | "context_length": 77, 20 | "vocab_size": 49408, 21 | "width": 1280, 22 | "heads": 20, 23 | "layers": 32, 24 | "xattn": false, 25 | "fusedLN": false 26 | } 27 | } -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/eva_clip/model_configs/EVA-CLIP-8B.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1280, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 4096, 7 | "head_width": 128, 8 | "mlp_ratio": 5, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-8b-14-x", 11 | "drop_path_rate": 0, 12 | "qkv_bias": false, 13 | "xattn": true, 14 | "postnorm": false, 15 | "fusedLN": false, 16 | "use_rms_norm": true 17 | }, 18 | "text_cfg": { 19 | "context_length": 77, 20 | "vocab_size": 49408, 21 | "width": 1280, 22 | "heads": 20, 23 | "layers": 32, 24 | "xattn": false, 25 | "fusedLN": false 26 | } 27 | } -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/eva_clip/model_configs/EVA01-CLIP-B-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 16, 8 | "eva_model_name": "eva-clip-b-16", 9 | "ls_init_value": 0.1, 10 | "drop_path_rate": 0.0 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 512, 16 | "heads": 8, 17 | "layers": 12 18 | } 19 | } -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/eva_clip/model_configs/EVA01-CLIP-g-14-plus.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 40, 6 | "width": 1408, 7 | "head_width": 88, 8 | "mlp_ratio": 4.3637, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-g-14-x", 11 | "drop_path_rate": 0, 12 | "xattn": true, 13 | "fusedLN": true 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 1024, 19 | "heads": 16, 20 | "layers": 24, 21 | "xattn": false, 22 | "fusedLN": true 23 | } 24 | } -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/eva_clip/model_configs/EVA01-CLIP-g-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 40, 6 | "width": 1408, 7 | "head_width": 88, 8 | "mlp_ratio": 4.3637, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-g-14-x", 11 | "drop_path_rate": 0.4, 12 | "xattn": true, 13 | "fusedLN": true 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 
49408, 18 | "width": 768, 19 | "heads": 12, 20 | "layers": 12, 21 | "xattn": false, 22 | "fusedLN": true 23 | } 24 | } -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/eva_clip/model_configs/EVA02-CLIP-B-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "head_width": 64, 8 | "patch_size": 16, 9 | "mlp_ratio": 2.6667, 10 | "eva_model_name": "eva-clip-b-16-X", 11 | "drop_path_rate": 0.0, 12 | "xattn": true, 13 | "fusedLN": true, 14 | "rope": true, 15 | "pt_hw_seq_len": 16, 16 | "intp_freq": true, 17 | "naiveswiglu": true, 18 | "subln": true 19 | }, 20 | "text_cfg": { 21 | "context_length": 77, 22 | "vocab_size": 49408, 23 | "width": 512, 24 | "heads": 8, 25 | "layers": 12, 26 | "xattn": true, 27 | "fusedLN": true 28 | } 29 | } -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/eva_clip/model_configs/EVA02-CLIP-L-14-336.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 336, 5 | "layers": 24, 6 | "width": 1024, 7 | "drop_path_rate": 0, 8 | "head_width": 64, 9 | "mlp_ratio": 2.6667, 10 | "patch_size": 14, 11 | "eva_model_name": "eva-clip-l-14-336", 12 | "xattn": true, 13 | "fusedLN": true, 14 | "rope": true, 15 | "pt_hw_seq_len": 16, 16 | "intp_freq": true, 17 | "naiveswiglu": true, 18 | "subln": true 19 | }, 20 | "text_cfg": { 21 | "context_length": 77, 22 | "vocab_size": 49408, 23 | "width": 768, 24 | "heads": 12, 25 | "layers": 12, 26 | "xattn": false, 27 | "fusedLN": true 28 | } 29 | } -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/eva_clip/model_configs/EVA02-CLIP-L-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | "drop_path_rate": 0, 8 | "head_width": 64, 9 | "mlp_ratio": 2.6667, 10 | "patch_size": 14, 11 | "eva_model_name": "eva-clip-l-14", 12 | "xattn": true, 13 | "fusedLN": true, 14 | "rope": true, 15 | "pt_hw_seq_len": 16, 16 | "intp_freq": true, 17 | "naiveswiglu": true, 18 | "subln": true 19 | }, 20 | "text_cfg": { 21 | "context_length": 77, 22 | "vocab_size": 49408, 23 | "width": 768, 24 | "heads": 12, 25 | "layers": 12, 26 | "xattn": false, 27 | "fusedLN": true 28 | } 29 | } -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/eva_clip/model_configs/EVA02-CLIP-bigE-14-plus.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 64, 6 | "width": 1792, 7 | "head_width": 112, 8 | "mlp_ratio": 8.571428571428571, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-4b-14-x", 11 | "drop_path_rate": 0, 12 | "xattn": true, 13 | "postnorm": true, 14 | "fusedLN": true 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 1280, 20 | "heads": 20, 21 | "layers": 32, 22 | "xattn": false, 23 | "fusedLN": true 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/eva_clip/model_configs/EVA02-CLIP-bigE-14.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 64, 6 | "width": 1792, 7 | "head_width": 112, 8 | "mlp_ratio": 8.571428571428571, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-4b-14-x", 11 | "drop_path_rate": 0, 12 | "xattn": true, 13 | "postnorm": true, 14 | "fusedLN": true 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 1024, 20 | "heads": 16, 21 | "layers": 24, 22 | "xattn": false, 23 | "fusedLN": true 24 | } 25 | } -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/eva_clip/model_configs/Internal-EVA02-CLIP-10B-14-448.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 448, 5 | "layers": 77, 6 | "width": 2304, 7 | "head_width": 144, 8 | "mlp_ratio": 10.9722, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-10b-14-x", 11 | "drop_path_rate": 0, 12 | "xattn": true, 13 | "postnorm": false, 14 | "fusedLN": true 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 1280, 20 | "heads": 20, 21 | "layers": 32, 22 | "xattn": false, 23 | "fusedLN": true 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/eva_clip/model_configs/Internal-EVA02-CLIP-10B-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 77, 6 | "width": 2304, 7 | "head_width": 144, 8 | "mlp_ratio": 10.9722, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-10b-14-x", 11 | "drop_path_rate": 0, 12 | "xattn": true, 13 | "postnorm": false, 14 | "fusedLN": true 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 1280, 20 | "heads": 20, 21 | "layers": 32, 22 | "xattn": false, 23 | "fusedLN": true 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/hf_vision.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from transformers import AutoModel, AutoImageProcessor, AutoConfig, CLIPImageProcessor 5 | from llava.utils import rank0_print 6 | 7 | 8 | class HFVisionTower(nn.Module): 9 | def __init__(self, vision_tower, args, delay_load=False): 10 | super().__init__() 11 | 12 | self.is_loaded = False 13 | 14 | self.vision_tower_name = vision_tower.replace("hf:", "", 1) 15 | self.select_layer = args.mm_vision_select_layer 16 | self.select_feature = getattr(args, "mm_vision_select_feature", "patch") 17 | 18 | if not delay_load: 19 | self.load_model() 20 | else: 21 | self.cfg_only = AutoConfig.from_pretrained(self.vision_tower_name) 22 | 23 | def load_model(self): 24 | try: 25 | self.image_processor = AutoImageProcessor.from_pretrained(self.vision_tower_name) 26 | except Exception as e: 27 | if "448" in self.vision_tower_name: 28 | image_size = 448 29 | # use image processor with conig 30 | self.image_processor = CLIPImageProcessor(size={"shortest_edge": image_size}, do_center_crop=True, crop_size=image_size) 31 | else: 32 | self.image_processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14") 33 | rank0_print(f"Loaded image processor: 
{self.image_processor}") 34 | self.vision_tower = AutoModel.from_pretrained(self.vision_tower_name, torch_dtype=torch.bfloat16, trust_remote_code=True).to("cuda") 35 | self.device = self.vision_tower.device 36 | self.dtype = self.vision_tower.dtype 37 | self.config = self.vision_tower.config 38 | 39 | if hasattr(self.vision_tower, "vision_model"): 40 | self.vision_tower = self.vision_tower.vision_model 41 | self.vision_tower.requires_grad_(False) 42 | # self.vision_tower.eval() 43 | self.is_loaded = True 44 | 45 | def feature_select(self, image_forward_outs): 46 | select_feature_type = self.select_feature 47 | 48 | if self.select_feature in ["slicefour_patch", "slicefour_cls_patch"]: 49 | select_every_k_layer = len(image_forward_outs.hidden_states) // 4 50 | image_features = torch.cat([image_forward_outs.hidden_states[i] for i in range(select_every_k_layer + self.select_layer, len(image_forward_outs.hidden_states), select_every_k_layer)], dim=-1) 51 | select_feature_type = select_feature_type.replace("slicefour_", "") 52 | else: 53 | image_features = image_forward_outs.hidden_states[self.select_layer] 54 | 55 | if select_feature_type == "patch": 56 | image_features = image_features[:, 1:] 57 | elif select_feature_type == "cls_patch": 58 | image_features = image_features 59 | else: 60 | raise ValueError(f"Unexpected select feature: {select_feature_type}") 61 | return image_features 62 | 63 | def forward(self, images): 64 | if type(images) is list: 65 | image_features = [] 66 | for image in images: 67 | image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True) 68 | image_feature = self.feature_select(image_forward_out).to(image.dtype) 69 | image_features.append(image_feature) 70 | else: 71 | image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True) 72 | image_features = self.feature_select(image_forward_outs).to(images.dtype) 73 | 74 | return image_features 75 | 76 | @property 77 | def dummy_feature(self): 78 | return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype) 79 | 80 | # @property 81 | # def dtype(self): 82 | # return self.vision_tower.dtype 83 | 84 | # @property 85 | # def device(self): 86 | # return self.vision_tower.device 87 | 88 | @property 89 | def hidden_size(self): 90 | try: 91 | _hidden_size = self.config.hidden_size 92 | except: 93 | _hidden_size = self.config.vision_config.hidden_size 94 | if "slicefour" in self.select_feature: 95 | _hidden_size *= 4 96 | return _hidden_size 97 | 98 | @property 99 | def num_patches(self): 100 | _num_patches = (self.config.image_size // self.config.patch_size) ** 2 101 | if "cls_patch" in self.select_feature: 102 | _num_patches += 1 103 | return _num_patches 104 | 105 | @property 106 | def num_patches_per_side(self): 107 | return self.config.image_size // self.config.patch_size 108 | 109 | @property 110 | def image_size(self): 111 | return self.config.image_size 112 | -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/imagebind.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from transformers import CLIPImageProcessor 5 | 6 | try: 7 | from imagebind.models import imagebind_model 8 | from imagebind.models.imagebind_model import ModalityType 9 | from imagebind.data import load_and_transform_audio_data 10 | except ImportError: 11 | pass 12 | 13 | 14 | 
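# `imagebind` is an optional dependency: the guarded import above lets this module be imported
# even when the package is missing; loading only fails if the wrapper below is actually used.
# Minimal usage sketch, kept as comments and assuming the `imagebind` package is installed
# (the tower name is just a label here, and the input shapes are illustrative only):
#   tower = ImageBindWrapper("imagebind_huge", select_layer=-2)  # downloads/loads imagebind_huge
#   vision_feats = tower(images)                                 # images: (B, 3, 224, 224) batch
#   audio_feats = tower({"audios": ["clip.wav"]})                # or audio file paths via the dict path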
class ImageBindWrapper(nn.Module): 15 | def __init__(self, vision_tower, select_layer, select_feature="patch", delay_load=False): 16 | super().__init__() 17 | 18 | self.is_loaded = False 19 | 20 | self.vision_tower_name = vision_tower 21 | self.select_layer = select_layer 22 | self.select_feature = select_feature 23 | 24 | if not delay_load: 25 | self.load_model() 26 | 27 | def load_model(self): 28 | self.image_processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14") 29 | self.vision_tower = imagebind_model.imagebind_huge(pretrained=True) 30 | for p in self.vision_tower.parameters(): 31 | p.requires_grad = False 32 | self.vision_tower.eval() 33 | self.is_loaded = True 34 | 35 | def train(self, mode=True): 36 | self.training = mode 37 | 38 | if self.is_loaded: 39 | self.vision_tower.eval() 40 | 41 | @torch.no_grad() 42 | def forward(self, x): 43 | if type(x) == dict: 44 | if x["audios"] is not None: 45 | inputs = {ModalityType.AUDIO: load_and_transform_audio_data(x["audios"], device=self.device).half()} 46 | embeddings = self.vision_tower(inputs) 47 | audio_embedding = embeddings[ModalityType.AUDIO] 48 | return audio_embedding.unsqueeze(1) 49 | else: 50 | inputs = {ModalityType.VISION: x.to(dtype=self.dtype)} 51 | embeddings = self.vision_tower(inputs) 52 | vision_embedding = embeddings[ModalityType.VISION] 53 | if vision_embedding.ndim == 2: 54 | return vision_embedding.unsqueeze(1) 55 | if vision_embedding.shape[1] == 257: 56 | return vision_embedding[:, 1:] 57 | raise ValueError(f"Unexpected shape: {vision_embedding.shape}") 58 | 59 | @property 60 | def dummy_feature(self): 61 | return torch.zeros(1, 1024, device=self.device, dtype=self.dtype) 62 | 63 | @property 64 | def dtype(self): 65 | return self.vision_tower.modality_preprocessors.vision.cls_token.dtype 66 | 67 | @property 68 | def device(self): 69 | return self.vision_tower.modality_preprocessors.vision.cls_token.device 70 | 71 | @property 72 | def hidden_size(self): 73 | return 1024 74 | -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/open_clip_encoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from transformers import CLIPImageProcessor 4 | from llava.utils import rank0_print 5 | 6 | try: 7 | import open_clip 8 | import torchvision 9 | from open_clip.transformer import _expand_token 10 | except ImportError: 11 | print("OpenCLIP not installed") 12 | open_clip = None 13 | 14 | HIDDEN_SIZE_DICT = { 15 | "ViT-H-14-378-quickgelu": 1280, 16 | } 17 | 18 | 19 | class OpenCLIPVisionTower(nn.Module): 20 | def __init__(self, vision_tower, args, delay_load=False): 21 | super().__init__() 22 | 23 | self.is_loaded = False 24 | self.model_name = vision_tower.replace("open_clip_hub:", "") 25 | self.pretrained = args.vision_tower_pretrained 26 | self.select_layer = args.mm_vision_select_layer 27 | self.select_feature = getattr(args, "mm_vision_select_feature", "patch") 28 | 29 | if not delay_load: 30 | rank0_print(f"Loading vision tower: {vision_tower}") 31 | self.load_model() 32 | elif getattr(args, "unfreeze_mm_vision_tower", False): 33 | # TODO: better detector is needed. 
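            # Same best-effort check as in the EVA towers: string-matching the training args to decide
            # whether the checkpoint is expected to carry vision-tower weights, in which case lazy
            # loading is skipped.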
34 | rank0_print(f"The checkpoint seems to contain `vision_tower` weights: `unfreeze_mm_vision_tower`: True.") 35 | self.load_model() 36 | elif hasattr(args, "mm_tunable_parts") and "mm_vision_tower" in args.mm_tunable_parts: 37 | rank0_print(f"The checkpoint seems to contain `vision_tower` weights: `mm_tunable_parts` contains `mm_vision_tower`.") 38 | self.load_model() 39 | 40 | def load_model(self, device_map="auto"): 41 | rank0_print(f"Loading OpenCLIP model: {self.model_name}") 42 | rank0_print(f"Pretrained: {self.pretrained}") 43 | vision_tower, _, image_processor = open_clip.create_model_and_transforms(model_name=self.model_name, pretrained=self.pretrained, precision="fp32", device="cuda") 44 | 45 | resize_transform = [t for t in image_processor.transforms if isinstance(t, torchvision.transforms.Resize)][0] 46 | normalize_transform = [t for t in image_processor.transforms if isinstance(t, torchvision.transforms.Normalize)][0] 47 | self.resize_transform_size = resize_transform.size # 224 or 384 48 | self.patch_size = vision_tower.visual.conv1.kernel_size[0] # 14 or 16 49 | 50 | self.image_processor = CLIPImageProcessor.from_pretrained( 51 | "openai/clip-vit-large-patch14", 52 | crop_size=resize_transform.size, 53 | size={"shortest_edge": resize_transform.size}, 54 | image_mean=list(normalize_transform.mean), 55 | image_std=list(normalize_transform.std), 56 | ) 57 | rank0_print(f"Loaded image processor: {self.image_processor}") 58 | self.vision_tower = vision_tower.visual 59 | self.vision_tower.requires_grad_(False) 60 | 61 | self.is_loaded = True 62 | 63 | def feature_select(self, image_forward_outs): 64 | image_features = image_forward_outs[self.select_layer] 65 | if self.select_feature == "patch": 66 | image_features = image_features[:, 1:] 67 | elif self.select_feature == "cls_patch": 68 | image_features = image_features 69 | elif self.select_feature == "conv_flatten": 70 | image_features = image_features.flatten(2).transpose(1, 2) 71 | else: 72 | raise ValueError(f"Unexpected select feature: {self.select_feature}") 73 | return image_features 74 | 75 | def forward_visual(self, x, output_hidden_states=False): 76 | if hasattr(self.vision_tower, "trunk") and hasattr(self.vision_tower.trunk, "_intermediate_layers"): 77 | return self.vision_tower.trunk._intermediate_layers(x, abs(self.select_layer)) 78 | else: 79 | 80 | def forward_openclip(self, x: torch.Tensor): 81 | features = [] 82 | x = self.conv1(x) # shape = [*, width, grid, grid] 83 | x = x.reshape(x.shape[0], x.shape[1], -1) # shape = [*, width, grid ** 2] 84 | x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width] 85 | 86 | # class embeddings and positional embeddings 87 | x = torch.cat( 88 | [_expand_token(self.class_embedding, x.shape[0]).to(x.dtype), x], 89 | dim=1, 90 | ) 91 | # shape = [*, grid ** 2 + 1, width] 92 | x = x + self.positional_embedding.to(x.dtype) 93 | 94 | x = self.patch_dropout(x) 95 | x = self.ln_pre(x) 96 | 97 | x = x.permute(1, 0, 2) # NLD -> LND 98 | for r in self.transformer.resblocks: 99 | x = r(x, attn_mask=None) 100 | features.append(x) 101 | return features 102 | 103 | return forward_openclip(self.vision_tower, x) 104 | 105 | def forward(self, images): 106 | if type(images) is list: 107 | image_features = [] 108 | for image in images: 109 | image_forward_out = self.forward_visual(image.to(self.dtype).unsqueeze(0), output_hidden_states=True) 110 | image_feature = self.feature_select(image_forward_out).to(image.dtype) 111 | image_features.append(image_feature) 112 | else: 113 | image_forward_outs = 
self.forward_visual(images.to(self.dtype), output_hidden_states=True) 114 | image_features = self.feature_select(image_forward_outs).to(images.dtype) 115 | 116 | return image_features 117 | 118 | @property 119 | def dummy_feature(self): 120 | return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype) 121 | 122 | @property 123 | def dtype(self): 124 | if hasattr(self.vision_tower, "conv1"): 125 | return self.vision_tower.conv1.weight.dtype 126 | if hasattr(self.vision_tower, "trunk"): 127 | return self.vision_tower.trunk.patch_embed.proj.weight.dtype 128 | raise NotImplementedError 129 | 130 | @property 131 | def device(self): 132 | if hasattr(self.vision_tower, "conv1"): 133 | return self.vision_tower.conv1.weight.device 134 | if hasattr(self.vision_tower, "trunk"): 135 | return self.vision_tower.trunk.patch_embed.proj.weight.device 136 | raise NotImplementedError 137 | 138 | @property 139 | def config(self): 140 | return None 141 | 142 | @property 143 | def hidden_size(self): 144 | if self.model_name in HIDDEN_SIZE_DICT: 145 | return HIDDEN_SIZE_DICT[self.model_name] 146 | else: 147 | raise NotImplementedError 148 | 149 | @property 150 | def num_patches(self): 151 | image_size = self.resize_transform_size if isinstance(self.resize_transform_size, int) else self.resize_transform_size[0] 152 | _num_patches = (image_size // self.patch_size) ** 2 153 | if "cls_patch" in self.select_feature: 154 | _num_patches += 1 155 | return _num_patches 156 | 157 | @property 158 | def image_size(self): 159 | return self.resize_transform_size 160 | 161 | @property 162 | def num_patches_per_side(self): 163 | return self.resize_transform_size // self.patch_size 164 | -------------------------------------------------------------------------------- /llava/model/multimodal_projector/__pycache__/builder.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongyh20/Insight-V/5e9c6f934c1929e351dbff17a6c6c7bc2d8143b1/llava/model/multimodal_projector/__pycache__/builder.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/multimodal_projector/__pycache__/pooler_projector.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongyh20/Insight-V/5e9c6f934c1929e351dbff17a6c6c7bc2d8143b1/llava/model/multimodal_projector/__pycache__/pooler_projector.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/multimodal_projector/builder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import re 4 | 5 | from .pooler_projector import PoolerProjector 6 | 7 | 8 | class IdentityMap(nn.Module): 9 | def __init__(self): 10 | super().__init__() 11 | 12 | def forward(self, x, *args, **kwargs): 13 | return x 14 | 15 | @property 16 | def config(self): 17 | return {"mm_projector_type": "identity"} 18 | 19 | 20 | class SimpleResBlock(nn.Module): 21 | def __init__(self, channels): 22 | super().__init__() 23 | self.pre_norm = nn.LayerNorm(channels) 24 | 25 | self.proj = nn.Sequential(nn.Linear(channels, channels), nn.GELU(), nn.Linear(channels, channels)) 26 | 27 | def forward(self, x): 28 | x = self.pre_norm(x) 29 | return x + self.proj(x) 30 | 31 | 32 | def build_vision_projector(config, delay_load=False, **kwargs): 33 | projector_type = getattr(config, "mm_projector_type", 
"linear") 34 | 35 | if projector_type == "linear": 36 | return nn.Linear(config.mm_hidden_size, config.hidden_size) 37 | 38 | if projector_type == "pooler": 39 | return PoolerProjector(config, kwargs["vision_cfg"]) 40 | 41 | mlp_gelu_match = re.match(r"^mlp(\d+)x_gelu$", projector_type) 42 | if mlp_gelu_match: 43 | mlp_depth = int(mlp_gelu_match.group(1)) 44 | modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)] 45 | for _ in range(1, mlp_depth): 46 | modules.append(nn.GELU()) 47 | modules.append(nn.Linear(config.hidden_size, config.hidden_size)) 48 | return nn.Sequential(*modules) 49 | 50 | mlp_gelu_resnet_match = re.match(r"^mlp(\d+)x_res(\d+)x_gelu$", projector_type) 51 | if mlp_gelu_resnet_match: 52 | mlp_depth = int(mlp_gelu_resnet_match.group(1)) 53 | res_depth = int(mlp_gelu_resnet_match.group(2)) 54 | modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)] 55 | for _ in range(1, mlp_depth): 56 | modules.append(nn.GELU()) 57 | modules.append(nn.Linear(config.hidden_size, config.hidden_size)) 58 | for _ in range(res_depth): 59 | modules.append(SimpleResBlock(config.hidden_size)) 60 | return nn.Sequential(*modules) 61 | 62 | if projector_type == "identity": 63 | return IdentityMap() 64 | 65 | raise ValueError(f"Unknown projector type: {projector_type}") 66 | -------------------------------------------------------------------------------- /llava/model/multimodal_projector/pooler_projector.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | import math 5 | 6 | from transformers.models.clip.modeling_clip import CLIPVisionModel 7 | 8 | 9 | class PoolerProjector(nn.Module): 10 | def __init__(self, config, vision_cfg): 11 | super().__init__() 12 | self._config = config 13 | self.hw = vision_cfg.image_size // vision_cfg.patch_size 14 | 15 | self.conv_pool = nn.Conv2d(config.mm_hidden_size, config.hidden_size, kernel_size=2, stride=2) 16 | 17 | self.proj = nn.Sequential( 18 | nn.GELU(), 19 | nn.Linear(config.hidden_size, config.hidden_size), 20 | ) 21 | 22 | def forward(self, x, *args, **kwargs): 23 | height = width = self.hw 24 | assert height * width == x.shape[1] 25 | x = x.view(x.shape[0], height, width, -1).permute(0, 3, 1, 2) 26 | x = self.conv_pool(x) 27 | x = x.flatten(2).transpose(1, 2) 28 | x = self.proj(x) 29 | return x 30 | 31 | @property 32 | def config(self): 33 | return {"mm_projector_type": "pooler"} 34 | -------------------------------------------------------------------------------- /llava/model/multimodal_resampler/__pycache__/builder.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongyh20/Insight-V/5e9c6f934c1929e351dbff17a6c6c7bc2d8143b1/llava/model/multimodal_resampler/__pycache__/builder.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/multimodal_resampler/__pycache__/masked_drop.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongyh20/Insight-V/5e9c6f934c1929e351dbff17a6c6c7bc2d8143b1/llava/model/multimodal_resampler/__pycache__/masked_drop.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/multimodal_resampler/__pycache__/perceiver.cpython-310.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/dongyh20/Insight-V/5e9c6f934c1929e351dbff17a6c6c7bc2d8143b1/llava/model/multimodal_resampler/__pycache__/perceiver.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/multimodal_resampler/__pycache__/qformer.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongyh20/Insight-V/5e9c6f934c1929e351dbff17a6c6c7bc2d8143b1/llava/model/multimodal_resampler/__pycache__/qformer.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/multimodal_resampler/__pycache__/spatial_pool.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongyh20/Insight-V/5e9c6f934c1929e351dbff17a6c6c7bc2d8143b1/llava/model/multimodal_resampler/__pycache__/spatial_pool.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/multimodal_resampler/builder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from .masked_drop import MaskedDrop 4 | from .spatial_pool import SpatialPool 5 | from .perceiver import PerceiverResampler 6 | from .qformer import Qformer 7 | 8 | 9 | class IdentityMap(torch.nn.Module): 10 | def __init__(self): 11 | super().__init__() 12 | 13 | def forward(self, x, *args, **kwargs): 14 | return x 15 | 16 | @property 17 | def config(self): 18 | return {"mm_resampler_type": None} 19 | 20 | 21 | def build_vision_resampler(model_args, delay_load=False, **kwargs): 22 | resampler_type = getattr(model_args, "mm_resampler_type", None) 23 | if resampler_type == "masked_drop": 24 | return MaskedDrop(model_args) 25 | elif resampler_type == "spatial_pool": 26 | return SpatialPool(model_args, **kwargs) 27 | elif resampler_type == "perceiver": 28 | return PerceiverResampler(model_args, **kwargs) 29 | elif resampler_type == "qformer": 30 | return Qformer(model_args, **kwargs) 31 | elif resampler_type is None: 32 | return IdentityMap() 33 | 34 | raise ValueError(f"Unknown resampler type: {resampler_type}") 35 | -------------------------------------------------------------------------------- /llava/model/multimodal_resampler/masked_drop.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | import random 5 | 6 | 7 | class MaskedDrop(nn.Module): 8 | def __init__(self, model_args): 9 | super().__init__() 10 | 11 | self.mode = model_args.mm_mask_drop_mode 12 | self.skip_percentage = model_args.mm_mask_drop_skip_percentage 13 | self.ratio = model_args.mm_mask_drop_ratio 14 | self.ratio_upper = model_args.mm_mask_drop_ratio_upper 15 | self.ratio_lower = model_args.mm_mask_drop_ratio_lower 16 | 17 | def forward(self, image_features, *args, **kwargs): 18 | 19 | if not self.training: 20 | return image_features 21 | 22 | if self.skip_percentage > random.random(): 23 | return image_features 24 | 25 | masked_features = [] 26 | 27 | for image_feature in image_features: 28 | num_tokens = image_feature.shape[0] 29 | if self.mode == "fixed": 30 | num_keep = int(num_tokens * self.ratio) 31 | masked_features.append(self.random_masking(image_feature.unsqueeze(0), num_keep)[0][0]) 32 | elif self.mode == "range": 33 | num_keep = int(num_tokens * random.uniform(self.ratio_lower, self.ratio_upper)) 34 | 
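                # random_masking returns (x_masked, mask, ids_restore); [0] keeps only the masked tokens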
masked_features.append(self.random_masking(image_feature.unsqueeze(0), num_keep)[0]) 35 | elif self.mode == "cls_only": 36 | masked_features.append(image_feature[0:1]) 37 | else: 38 | raise ValueError(f"Unexpected masked drop mode: {self.mode}") 39 | 40 | if self.mode not in ["range"] and (type(image_features) is not list or self.mode in ["cls_only"]): 41 | masked_features = torch.stack(masked_features, dim=0) 42 | 43 | return masked_features 44 | 45 | @property 46 | def config(self): 47 | return { 48 | "mm_resampler_type": "masked_drop", 49 | "mm_mask_drop_mode": self.mode, 50 | "mm_mask_drop_skip_percentage": self.skip_percentage, 51 | "mm_mask_drop_ratio": self.ratio, 52 | "mm_mask_drop_ratio_upper": self.ratio_upper, 53 | "mm_mask_drop_ratio_lower": self.ratio_lower, 54 | } 55 | 56 | def random_masking(self, x, len_keep): 57 | """ 58 | Perform per-sample random masking by per-sample shuffling. 59 | Per-sample shuffling is done by argsort random noise. 60 | x: [N, L, D], sequence 61 | """ 62 | N, L, D = x.shape # batch, length, dim 63 | 64 | noise = torch.rand(N, L, device=x.device) # noise in [0, 1] 65 | 66 | # sort noise for each sample 67 | ids_shuffle = torch.argsort(noise, dim=1) # ascend: small is keep, large is remove 68 | ids_restore = torch.argsort(ids_shuffle, dim=1) 69 | 70 | # keep the first subset 71 | ids_keep = ids_shuffle[:, :len_keep] 72 | x_masked = torch.gather(x, dim=1, index=ids_keep.unsqueeze(-1).repeat(1, 1, D)) 73 | 74 | # generate the binary mask: 0 is keep, 1 is remove 75 | mask = torch.ones([N, L], device=x.device) 76 | mask[:, :len_keep] = 0 77 | # unshuffle to get the binary mask 78 | mask = torch.gather(mask, dim=1, index=ids_restore) 79 | 80 | return x_masked, mask, ids_restore 81 | -------------------------------------------------------------------------------- /llava/model/multimodal_resampler/perceiver.py: -------------------------------------------------------------------------------- 1 | """ 2 | Taken from https://github.com/lucidrains/flamingo-pytorch 3 | """ 4 | 5 | import torch 6 | from einops import rearrange, repeat 7 | 8 | try: 9 | from einops_exts import rearrange_many 10 | except: 11 | pass 12 | 13 | from torch import einsum, nn 14 | 15 | 16 | def exists(val): 17 | return val is not None 18 | 19 | 20 | def FeedForward(dim, mult=4): 21 | inner_dim = int(dim * mult) 22 | return nn.Sequential( 23 | nn.LayerNorm(dim), 24 | nn.Linear(dim, inner_dim, bias=False), 25 | nn.GELU(), 26 | nn.Linear(inner_dim, dim, bias=False), 27 | ) 28 | 29 | 30 | class PerceiverAttention(nn.Module): 31 | def __init__(self, *, dim, dim_head=64, heads=8): 32 | super().__init__() 33 | self.scale = dim_head**-0.5 34 | self.heads = heads 35 | inner_dim = dim_head * heads 36 | 37 | self.norm_media = nn.LayerNorm(dim) 38 | self.norm_latents = nn.LayerNorm(dim) 39 | 40 | self.to_q = nn.Linear(dim, inner_dim, bias=False) 41 | self.to_kv = nn.Linear(dim, inner_dim * 2, bias=False) 42 | self.to_out = nn.Linear(inner_dim, dim, bias=False) 43 | 44 | def forward(self, x, latents): 45 | """ 46 | Args: 47 | x (torch.Tensor): image features 48 | shape (b, T, n1, D) 49 | latent (torch.Tensor): latent features 50 | shape (b, T, n2, D) 51 | """ 52 | x = self.norm_media(x) 53 | latents = self.norm_latents(latents) 54 | 55 | h = self.heads 56 | 57 | q = self.to_q(latents) 58 | kv_input = torch.cat((x, latents), dim=-2) 59 | k, v = self.to_kv(kv_input).chunk(2, dim=-1) 60 | q, k, v = rearrange_many((q, k, v), "b t n (h d) -> b h t n d", h=h) 61 | q = q * self.scale 62 | 63 | # attention 64 | 
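        # latents act as queries while keys/values cover the media tokens concatenated with the
        # latents themselves (Flamingo-style); the row-wise max is subtracted before softmax for
        # numerical stability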
sim = einsum("... i d, ... j d -> ... i j", q, k) 65 | sim = sim - sim.amax(dim=-1, keepdim=True).detach() 66 | attn = sim.softmax(dim=-1) 67 | 68 | out = einsum("... i j, ... j d -> ... i d", attn, v) 69 | out = rearrange(out, "b h t n d -> b t n (h d)", h=h) 70 | return self.to_out(out) 71 | 72 | 73 | class PerceiverResamplerModule(nn.Module): 74 | def __init__( 75 | self, 76 | *, 77 | dim, 78 | depth=6, 79 | dim_head=64, 80 | heads=8, 81 | num_latents=64, 82 | max_num_media=None, 83 | max_num_frames=None, 84 | ff_mult=4, 85 | ): 86 | super().__init__() 87 | self.latents = nn.Parameter(torch.randn(num_latents, dim)) 88 | self.frame_embs = nn.Parameter(torch.randn(max_num_frames, dim)) if exists(max_num_frames) else None 89 | self.media_time_embs = nn.Parameter(torch.randn(max_num_media, 1, dim)) if exists(max_num_media) else None 90 | 91 | self.layers = nn.ModuleList([]) 92 | for _ in range(depth): 93 | self.layers.append( 94 | nn.ModuleList( 95 | [ 96 | PerceiverAttention(dim=dim, dim_head=dim_head, heads=heads), 97 | FeedForward(dim=dim, mult=ff_mult) if ff_mult > 0 else nn.Identity(), 98 | ] 99 | ) 100 | ) 101 | 102 | self.norm = nn.LayerNorm(dim) 103 | 104 | def forward(self, x): 105 | """ 106 | Args: 107 | x (torch.Tensor): image features 108 | shape (b, T, F, v, D) 109 | Returns: 110 | shape (b, T, n, D) where n is self.num_latents 111 | """ 112 | b, T, F, v = x.shape[:4] 113 | 114 | # frame and media time embeddings 115 | if exists(self.frame_embs): 116 | frame_embs = repeat(self.frame_embs[:F], "F d -> b T F v d", b=b, T=T, v=v) 117 | x = x + frame_embs 118 | x = rearrange(x, "b T F v d -> b T (F v) d") # flatten the frame and spatial dimensions 119 | if exists(self.media_time_embs): 120 | x = x + self.media_time_embs[:T] 121 | 122 | # blocks 123 | latents = repeat(self.latents, "n d -> b T n d", b=b, T=T) 124 | for attn, ff in self.layers: 125 | latents = attn(x, latents) + latents 126 | latents = ff(latents) + latents 127 | return self.norm(latents) 128 | 129 | 130 | class PerceiverResampler(nn.Module): 131 | def __init__(self, model_args, vision_tower): 132 | super().__init__() 133 | 134 | self.depth = model_args.mm_perceiver_depth 135 | self.num_latents = model_args.mm_perceiver_latents 136 | self.ff_mult = model_args.mm_perceiver_ff_mult 137 | self.pretrained = model_args.mm_perceiver_pretrained 138 | 139 | self.perceiver = PerceiverResamplerModule(dim=vision_tower.hidden_size, depth=self.depth, num_latents=self.num_latents, ff_mult=self.ff_mult) 140 | 141 | if self.pretrained is not None: 142 | self.load_state_dict(torch.load(self.pretrained)) 143 | 144 | def forward(self, image_features, *args, **kwargs): 145 | return self.perceiver(image_features[:, None, None]).squeeze(1) 146 | 147 | @property 148 | def config(self): 149 | return { 150 | "mm_resampler_type": "perceiver", 151 | "mm_perceiver_depth": self.depth, 152 | "mm_perceiver_latents": self.num_latents, 153 | "mm_perceiver_ff_mult": self.ff_mult, 154 | "mm_perceiver_pretrained": self.pretrained, 155 | } 156 | -------------------------------------------------------------------------------- /llava/model/multimodal_resampler/spatial_pool.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import math 4 | 5 | 6 | class SpatialPool(nn.Module): 7 | def __init__(self, model_args, vision_tower): 8 | super().__init__() 9 | 10 | self.mode = model_args.mm_spatial_pool_mode 11 | self.stride = model_args.mm_spatial_pool_stride 12 | self.out_channels = 
getattr(model_args, "mm_spatial_pool_out_channels", vision_tower.hidden_size) 13 | 14 | if self.mode == "average": 15 | self.pool = nn.AvgPool2d(kernel_size=self.stride, stride=self.stride) 16 | elif self.mode == "max": 17 | self.pool = nn.MaxPool2d(kernel_size=self.stride, stride=self.stride) 18 | elif self.mode == "conv": 19 | self.pool = nn.Conv2d(in_channels=vision_tower.hidden_size, out_channels=self.out_channels, kernel_size=self.stride, stride=self.stride) 20 | else: 21 | raise ValueError(f"Unknown pooling mode: {self.pool}.") 22 | 23 | def forward(self, image_features, images, *args, **kwargs): 24 | ori_W = int(math.sqrt(image_features.shape[1] * images.shape[3] // images.shape[2])) 25 | ori_H = int(ori_W * images.shape[2] // images.shape[3]) 26 | 27 | B, _, F = image_features.shape 28 | 29 | image_features_spatial = image_features.view(B, ori_H, ori_H, F).permute(0, 3, 1, 2) 30 | image_features_spatial_pool = self.pool(image_features_spatial) 31 | 32 | return image_features_spatial_pool.flatten(2).transpose(1, 2).contiguous() 33 | 34 | @property 35 | def config(self): 36 | return { 37 | "mm_resampler_type": "spatial_pool", 38 | "mm_spatial_pool_stride": self.stride, 39 | "mm_spatial_pool_mode": self.mode, 40 | "mm_spatial_pool_out_channels": self.out_channels, 41 | } 42 | 43 | @property 44 | def hidden_size(self): 45 | return self.out_channels 46 | -------------------------------------------------------------------------------- /llava/model/utils.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoConfig 2 | 3 | 4 | def auto_upgrade(config): 5 | cfg = AutoConfig.from_pretrained(config) 6 | if "llava" in config and "llava" not in cfg.model_type: 7 | assert cfg.model_type == "llama" 8 | print("You are using newer LLaVA code base, while the checkpoint of v0 is from older code base.") 9 | print("You must upgrade the checkpoint to the new code base (this can be done automatically).") 10 | confirm = input("Please confirm that you want to upgrade the checkpoint. 
[Y/N]") 11 | if confirm.lower() in ["y", "yes"]: 12 | print("Upgrading checkpoint...") 13 | assert len(cfg.architectures) == 1 14 | setattr(cfg.__class__, "model_type", "llava") 15 | cfg.architectures[0] = "LlavaLlamaForCausalLM" 16 | cfg.save_pretrained(config) 17 | print("Checkpoint upgraded.") 18 | else: 19 | print("Checkpoint upgrade aborted.") 20 | exit(1) 21 | -------------------------------------------------------------------------------- /llava/serve/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongyh20/Insight-V/5e9c6f934c1929e351dbff17a6c6c7bc2d8143b1/llava/serve/__init__.py -------------------------------------------------------------------------------- /llava/serve/cli.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | 4 | from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN 5 | from llava.conversation import conv_templates, SeparatorStyle 6 | from llava.model.builder import load_pretrained_model 7 | from llava.utils import disable_torch_init 8 | from llava.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria 9 | 10 | from PIL import Image 11 | 12 | import requests 13 | from PIL import Image 14 | from io import BytesIO 15 | from transformers import TextStreamer 16 | 17 | 18 | def load_image(image_file): 19 | if image_file.startswith("http") or image_file.startswith("https"): 20 | response = requests.get(image_file) 21 | image = Image.open(BytesIO(response.content)).convert("RGB") 22 | else: 23 | image = Image.open(image_file).convert("RGB") 24 | return image 25 | 26 | 27 | def main(args): 28 | # Model 29 | disable_torch_init() 30 | 31 | model_name = get_model_name_from_path(args.model_path) 32 | tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_path, args.model_base, model_name, args.load_8bit, args.load_4bit) 33 | 34 | if "llama-2" in model_name.lower(): 35 | conv_mode = "llava_llama_2" 36 | elif "v1" in model_name.lower(): 37 | conv_mode = "llava_v1" 38 | elif "mpt" in model_name.lower(): 39 | conv_mode = "mpt" 40 | else: 41 | conv_mode = "llava_v0" 42 | 43 | if args.conv_mode is not None and conv_mode != args.conv_mode: 44 | print("[WARNING] the auto inferred conversation mode is {}, while `--conv-mode` is {}, using {}".format(conv_mode, args.conv_mode, args.conv_mode)) 45 | else: 46 | args.conv_mode = conv_mode 47 | 48 | conv = conv_templates[args.conv_mode].copy() 49 | if "mpt" in model_name.lower(): 50 | roles = ("user", "assistant") 51 | else: 52 | roles = conv.roles 53 | 54 | image = load_image(args.image_file) 55 | image_tensor = image_processor.preprocess(image, return_tensors="pt")["pixel_values"].half().cuda() 56 | 57 | while True: 58 | try: 59 | inp = input(f"{roles[0]}: ") 60 | except EOFError: 61 | inp = "" 62 | if not inp: 63 | print("exit...") 64 | break 65 | 66 | print(f"{roles[1]}: ", end="") 67 | 68 | if image is not None: 69 | # first message 70 | if model.config.mm_use_im_start_end: 71 | inp = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + "\n" + inp 72 | else: 73 | inp = DEFAULT_IMAGE_TOKEN + "\n" + inp 74 | conv.append_message(conv.roles[0], inp) 75 | image = None 76 | else: 77 | # later messages 78 | conv.append_message(conv.roles[0], inp) 79 | conv.append_message(conv.roles[1], None) 80 | prompt = conv.get_prompt() 81 | 82 | input_ids = 
tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).cuda() 83 | stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2 84 | keywords = [stop_str] 85 | stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids) 86 | streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True) 87 | 88 | with torch.inference_mode(): 89 | output_ids = model.generate(input_ids, images=image_tensor, do_sample=True if args.temperature > 0 else False, temperature=args.temperature, max_new_tokens=args.max_new_tokens, streamer=streamer, use_cache=True, stopping_criteria=[stopping_criteria]) 90 | 91 | outputs = tokenizer.decode(output_ids[0, input_ids.shape[1] :]).strip() 92 | conv.messages[-1][-1] = outputs 93 | 94 | if args.debug: 95 | print("\n", {"prompt": prompt, "outputs": outputs}, "\n") 96 | 97 | 98 | if __name__ == "__main__": 99 | parser = argparse.ArgumentParser() 100 | parser.add_argument("--model-path", type=str, default="facebook/opt-350m") 101 | parser.add_argument("--model-base", type=str, default=None) 102 | parser.add_argument("--image-file", type=str, required=True) 103 | parser.add_argument("--num-gpus", type=int, default=1) 104 | parser.add_argument("--conv-mode", type=str, default=None) 105 | parser.add_argument("--temperature", type=float, default=0.2) 106 | parser.add_argument("--max-new-tokens", type=int, default=512) 107 | parser.add_argument("--load-8bit", action="store_true") 108 | parser.add_argument("--load-4bit", action="store_true") 109 | parser.add_argument("--debug", action="store_true") 110 | args = parser.parse_args() 111 | main(args) 112 | -------------------------------------------------------------------------------- /llava/serve/examples/extreme_ironing.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongyh20/Insight-V/5e9c6f934c1929e351dbff17a6c6c7bc2d8143b1/llava/serve/examples/extreme_ironing.jpg -------------------------------------------------------------------------------- /llava/serve/examples/waterview.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongyh20/Insight-V/5e9c6f934c1929e351dbff17a6c6c7bc2d8143b1/llava/serve/examples/waterview.jpg -------------------------------------------------------------------------------- /llava/serve/register_worker.py: -------------------------------------------------------------------------------- 1 | """ 2 | Manually register workers. 
3 | 4 | Usage: 5 | python3 -m fastchat.serve.register_worker --controller http://localhost:21001 --worker-name http://localhost:21002 6 | """ 7 | 8 | import argparse 9 | 10 | import requests 11 | 12 | if __name__ == "__main__": 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("--controller-address", type=str) 15 | parser.add_argument("--worker-name", type=str) 16 | parser.add_argument("--check-heart-beat", action="store_true") 17 | args = parser.parse_args() 18 | 19 | url = args.controller_address + "/register_worker" 20 | data = { 21 | "worker_name": args.worker_name, 22 | "check_heart_beat": args.check_heart_beat, 23 | "worker_status": None, 24 | } 25 | r = requests.post(url, json=data) 26 | assert r.status_code == 200 27 | -------------------------------------------------------------------------------- /llava/serve/test_message.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | import requests 5 | 6 | from llava.conversation import default_conversation 7 | 8 | 9 | def main(): 10 | if args.worker_address: 11 | worker_addr = args.worker_address 12 | else: 13 | controller_addr = args.controller_address 14 | ret = requests.post(controller_addr + "/refresh_all_workers") 15 | ret = requests.post(controller_addr + "/list_models") 16 | models = ret.json()["models"] 17 | models.sort() 18 | print(f"Models: {models}") 19 | 20 | ret = requests.post(controller_addr + "/get_worker_address", json={"model": args.model_name}) 21 | worker_addr = ret.json()["address"] 22 | print(f"worker_addr: {worker_addr}") 23 | 24 | if worker_addr == "": 25 | return 26 | 27 | conv = default_conversation.copy() 28 | conv.append_message(conv.roles[0], args.message) 29 | prompt = conv.get_prompt() 30 | 31 | headers = {"User-Agent": "LLaVA Client"} 32 | pload = { 33 | "model": args.model_name, 34 | "prompt": prompt, 35 | "max_new_tokens": args.max_new_tokens, 36 | "temperature": 0.7, 37 | "stop": conv.sep, 38 | } 39 | response = requests.post(worker_addr + "/worker_generate_stream", headers=headers, json=pload, stream=True) 40 | 41 | print(prompt.replace(conv.sep, "\n"), end="") 42 | for chunk in response.iter_lines(chunk_size=8192, decode_unicode=False, delimiter=b"\0"): 43 | if chunk: 44 | data = json.loads(chunk.decode("utf-8")) 45 | output = data["text"].split(conv.sep)[-1] 46 | print(output, end="\r") 47 | print("") 48 | 49 | 50 | if __name__ == "__main__": 51 | parser = argparse.ArgumentParser() 52 | parser.add_argument("--controller-address", type=str, default="http://localhost:21001") 53 | parser.add_argument("--worker-address", type=str) 54 | parser.add_argument("--model-name", type=str, default="facebook/opt-350m") 55 | parser.add_argument("--max-new-tokens", type=int, default=32) 56 | parser.add_argument("--message", type=str, default="Tell me a story with more than 1000 words.") 57 | args = parser.parse_args() 58 | 59 | main() 60 | -------------------------------------------------------------------------------- /llava/train/__pycache__/llava_trainer.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongyh20/Insight-V/5e9c6f934c1929e351dbff17a6c6c7bc2d8143b1/llava/train/__pycache__/llava_trainer.cpython-310.pyc -------------------------------------------------------------------------------- /llava/train/__pycache__/train.cpython-310.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/dongyh20/Insight-V/5e9c6f934c1929e351dbff17a6c6c7bc2d8143b1/llava/train/__pycache__/train.cpython-310.pyc -------------------------------------------------------------------------------- /llava/train/llama_flash_attn_monkey_patch.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Tuple 2 | import warnings 3 | 4 | import torch 5 | 6 | import transformers 7 | from transformers.models.llama.modeling_llama import apply_rotary_pos_emb, repeat_kv 8 | 9 | try: 10 | from flash_attn.flash_attn_interface import flash_attn_unpadded_qkvpacked_func 11 | except ImportError: 12 | from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func as flash_attn_unpadded_qkvpacked_func 13 | from flash_attn.bert_padding import unpad_input, pad_input 14 | 15 | 16 | def forward( 17 | self, 18 | hidden_states: torch.Tensor, 19 | attention_mask: Optional[torch.Tensor] = None, 20 | position_ids: Optional[torch.Tensor] = None, 21 | past_key_value: Optional[Tuple[torch.Tensor]] = None, 22 | output_attentions: bool = False, 23 | use_cache: bool = False, 24 | padding_mask: Optional[torch.Tensor] = None, 25 | ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: 26 | if output_attentions: 27 | warnings.warn("Output attentions is not supported for patched `LlamaAttention`, returning `None` instead.") 28 | 29 | bsz, q_len, _ = hidden_states.size() 30 | 31 | query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) 32 | key_states = self.k_proj(hidden_states).view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) 33 | value_states = self.v_proj(hidden_states).view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) # shape: (b, num_heads, s, head_dim) 34 | 35 | kv_seq_len = key_states.shape[-2] 36 | if past_key_value is not None: 37 | kv_seq_len += past_key_value[0].shape[-2] 38 | 39 | cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) 40 | query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) 41 | 42 | if past_key_value is not None: 43 | # reuse k, v 44 | key_states = torch.cat([past_key_value[0], key_states], dim=2) 45 | value_states = torch.cat([past_key_value[1], value_states], dim=2) 46 | 47 | past_key_value = (key_states, value_states) if use_cache else None 48 | 49 | # repeat k/v heads if n_kv_heads < n_heads 50 | key_states = repeat_kv(key_states, self.num_key_value_groups) 51 | value_states = repeat_kv(value_states, self.num_key_value_groups) 52 | 53 | # Transform the data into the format required by flash attention 54 | qkv = torch.stack([query_states, key_states, value_states], dim=2) 55 | qkv = qkv.transpose(1, 3) # shape: [b, s, 3, num_heads, head_dim] 56 | key_padding_mask = attention_mask 57 | 58 | if key_padding_mask is None: 59 | qkv = qkv.reshape(-1, 3, self.num_heads, self.head_dim) 60 | cu_q_lens = torch.arange(0, (bsz + 1) * q_len, step=q_len, dtype=torch.int32, device=qkv.device) 61 | max_s = q_len 62 | output = flash_attn_unpadded_qkvpacked_func(qkv, cu_q_lens, max_s, 0.0, softmax_scale=None, causal=True) 63 | output = output.view(bsz, q_len, -1) 64 | else: 65 | qkv = qkv.reshape(bsz, q_len, -1) 66 | qkv, indices, cu_q_lens, max_s = unpad_input(qkv, key_padding_mask) 67 | qkv = qkv.view(-1, 3, self.num_heads, self.head_dim) 68 | output_unpad = flash_attn_unpadded_qkvpacked_func(qkv, cu_q_lens, max_s, 0.0, softmax_scale=None, 
causal=True) 69 | output_unpad = output_unpad.reshape(-1, self.num_heads * self.head_dim) 70 | output = pad_input(output_unpad, indices, bsz, q_len) 71 | 72 | return self.o_proj(output), None, past_key_value 73 | 74 | 75 | # Disable the transformation of the attention mask in LlamaModel as the flash attention 76 | # requires the attention mask to be the same as the key_padding_mask 77 | def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length): 78 | # [bsz, seq_len] 79 | return attention_mask 80 | 81 | 82 | def replace_llama_attn_with_flash_attn(): 83 | cuda_major, cuda_minor = torch.cuda.get_device_capability() 84 | if cuda_major < 8: 85 | warnings.warn("Flash attention is only supported on A100 or H100 GPU during training due to head dim > 64 backward." "ref: https://github.com/HazyResearch/flash-attention/issues/190#issuecomment-1523359593") 86 | transformers.models.llama.modeling_llama.LlamaModel._prepare_decoder_attention_mask = _prepare_decoder_attention_mask 87 | transformers.models.llama.modeling_llama.LlamaAttention.forward = forward 88 | -------------------------------------------------------------------------------- /llava/train/llava_trainer_eval.py: -------------------------------------------------------------------------------- 1 | import json 2 | import subprocess 3 | 4 | from llava.train.llava_trainer import LLaVATrainer 5 | 6 | 7 | class LLaVAEvalTrainer(LLaVATrainer): 8 | def evaluate(self, evaluate_args): 9 | cmd = f"accelerate launch --num_processes {evaluate_args.eval_num_processes} -m lmms_eval \ 10 | --model {evaluate_args.model} \ 11 | --model_args {evaluate_args.model_args} \ 12 | --tasks {evaluate_args.task_names} \ 13 | --batch_size {evaluate_args.batch_size} \ 14 | --log_samples_suffix {evaluate_args.log_samples_suffix} \ 15 | --output_path {evaluate_args.output_path}" 16 | if evaluate_args.limit: 17 | cmd += f" --limit {evaluate_args.limit}" 18 | if evaluate_args.num_fewshot: 19 | cmd += f" --num_fewshot {evaluate_args.num_fewshot}" 20 | if evaluate_args.gen_kwargs != "": 21 | cmd += f" --gen_kwargs {evaluate_args.gen_kwargs}" 22 | if evaluate_args.log_samples: 23 | cmd += f" --log_samples" 24 | else: 25 | assert False, "Please log samples so that the result can be parsed" 26 | results = subprocess.run([cmd], shell=True, capture_output=True, text=True) 27 | try: 28 | result_file_index_start = results.stdout.index("Saved samples to ") 29 | result_file_index_end = results.stdout.index(f".json") 30 | result_file_index_start += len("Saved samples to ") 31 | file = results.stdout[result_file_index_start:result_file_index_end] 32 | except: 33 | result_file_index_start = results.stderr.index("Saved samples to ") 34 | result_file_index_end = results.stderr.index(f".json") 35 | result_file_index_start += len("Saved samples to ") 36 | file = results.stderr[result_file_index_start:result_file_index_end] 37 | file = file.split("/")[:-1] 38 | file = "/".join(file) + "/results.json" 39 | with open(file, "r") as f: 40 | lmms_eval_results = json.load(f) 41 | result_dict = {} 42 | tasks_list = evaluate_args.task_names.split(",") 43 | for task in tasks_list: 44 | task_results = lmms_eval_results["results"][task] 45 | for k, v in task_results.items(): 46 | if k != "alias" and "stderr" not in k: 47 | metric = k.split(",")[0] 48 | result_dict[f"{task}_{metric}"] = v 49 | return result_dict 50 | 51 | """def evaluate(self, evaluate_args): 52 | initialize_tasks() 53 | tasks_list = evaluate_args.task_names.split(",") 54 | result_dict 
= {} 55 | results = evaluator.simple_evaluate( 56 | model=evaluate_args.model, 57 | model_args=evaluate_args.model_args, 58 | tasks=tasks_list, 59 | num_fewshot=evaluate_args.num_fewshot, 60 | batch_size=evaluate_args.batch_size, 61 | device=evaluate_args.device, 62 | limit=evaluate_args.limit, 63 | check_integrity=evaluate_args.check_integrity, 64 | show_task_to_terminal=evaluate_args.show_task_to_terminal, 65 | log_samples=evaluate_args.log_samples, 66 | gen_kwargs=evaluate_args.gen_kwargs, 67 | cli_args=evaluate_args, 68 | ) 69 | for task in tasks_list: 70 | task_results = results["results"][task] 71 | for k,v in task_results.items(): 72 | if k != "alias" and "stderr" not in k: 73 | metric = k.split(",")[0] 74 | result_dict[f"{task}_{metric}"] = v 75 | 76 | return result_dict""" 77 | -------------------------------------------------------------------------------- /llava/train/train_mem.py: -------------------------------------------------------------------------------- 1 | from llava.train.train import train 2 | 3 | if __name__ == "__main__": 4 | train() 5 | -------------------------------------------------------------------------------- /llava/utils.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import logging 3 | import logging.handlers 4 | import os 5 | import sys 6 | import numpy as np 7 | 8 | import requests 9 | 10 | from llava.constants import LOGDIR 11 | 12 | server_error_msg = "**NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.**" 13 | moderation_msg = "I am sorry. Your input may violate our content moderation guidelines. Please avoid using harmful or offensive content." 14 | 15 | handler = None 16 | 17 | import torch.distributed as dist 18 | 19 | try: 20 | import av 21 | from decord import VideoReader, cpu 22 | except ImportError: 23 | print("Please install pyav to use video processing functions.") 24 | 25 | def process_video_with_decord(video_file, data_args): 26 | vr = VideoReader(video_file, ctx=cpu(0), num_threads=1) 27 | total_frame_num = len(vr) 28 | video_time = total_frame_num / vr.get_avg_fps() 29 | avg_fps = round(vr.get_avg_fps() / data_args.video_fps) 30 | frame_idx = [i for i in range(0, total_frame_num, avg_fps)] 31 | frame_time = [i/avg_fps for i in frame_idx] 32 | 33 | 34 | if data_args.frames_upbound > 0: 35 | if len(frame_idx) > data_args.frames_upbound or data_args.force_sample: 36 | uniform_sampled_frames = np.linspace(0, total_frame_num - 1, data_args.frames_upbound, dtype=int) 37 | frame_idx = uniform_sampled_frames.tolist() 38 | frame_time = [i/vr.get_avg_fps() for i in frame_idx] 39 | 40 | video = vr.get_batch(frame_idx).asnumpy() 41 | frame_time = ",".join([f"{i:.2f}s" for i in frame_time]) 42 | 43 | num_frames_to_sample = num_frames = len(frame_idx) 44 | # https://github.com/dmlc/decord/issues/208 45 | vr.seek(0) 46 | return video, video_time, frame_time, num_frames_to_sample 47 | 48 | def process_video_with_pyav(video_file, data_args): 49 | container = av.open(video_file) 50 | # !!! This is the only difference. 
Using auto threading 51 | container.streams.video[0].thread_type = "AUTO" 52 | 53 | video_frames = [] 54 | for packet in container.demux(): 55 | if packet.stream.type == 'video': 56 | for frame in packet.decode(): 57 | video_frames.append(frame) 58 | total_frame_num = len(video_frames) 59 | video_time = video_frames[-1].time 60 | avg_fps = round(total_frame_num / video_time / data_args.video_fps) 61 | frame_idx = [i for i in range(0, total_frame_num, avg_fps)] 62 | 63 | if data_args.frames_upbound > 0: 64 | if len(frame_idx) > data_args.frames_upbound: 65 | uniform_sampled_frames = np.linspace(0, total_frame_num - 1, data_args.frames_upbound, dtype=int) 66 | frame_idx = uniform_sampled_frames.tolist() 67 | 68 | 69 | frames = [video_frames[i] for i in frame_idx] 70 | return np.stack([x.to_ndarray(format="rgb24") for x in frames]) 71 | 72 | 73 | def rank0_print(*args): 74 | if dist.is_initialized(): 75 | if dist.get_rank() == 0: 76 | print(f"Rank {dist.get_rank()}: ", *args) 77 | else: 78 | print(*args) 79 | 80 | 81 | def rank_print(*args): 82 | if dist.is_initialized(): 83 | print(f"Rank {dist.get_rank()}: ", *args) 84 | else: 85 | print(*args) 86 | 87 | def build_logger(logger_name, logger_filename): 88 | global handler 89 | 90 | formatter = logging.Formatter( 91 | fmt="%(asctime)s | %(levelname)s | %(name)s | %(message)s", 92 | datefmt="%Y-%m-%d %H:%M:%S", 93 | ) 94 | 95 | # Set the format of root handlers 96 | if not logging.getLogger().handlers: 97 | logging.basicConfig(level=logging.INFO) 98 | logging.getLogger().handlers[0].setFormatter(formatter) 99 | 100 | # Redirect stdout and stderr to loggers 101 | stdout_logger = logging.getLogger("stdout") 102 | stdout_logger.setLevel(logging.INFO) 103 | sl = StreamToLogger(stdout_logger, logging.INFO) 104 | sys.stdout = sl 105 | 106 | stderr_logger = logging.getLogger("stderr") 107 | stderr_logger.setLevel(logging.ERROR) 108 | sl = StreamToLogger(stderr_logger, logging.ERROR) 109 | sys.stderr = sl 110 | 111 | # Get logger 112 | logger = logging.getLogger(logger_name) 113 | logger.setLevel(logging.INFO) 114 | 115 | # Add a file handler for all loggers 116 | if handler is None: 117 | os.makedirs(LOGDIR, exist_ok=True) 118 | filename = os.path.join(LOGDIR, logger_filename) 119 | handler = logging.handlers.TimedRotatingFileHandler(filename, when="D", utc=True) 120 | handler.setFormatter(formatter) 121 | 122 | for name, item in logging.root.manager.loggerDict.items(): 123 | if isinstance(item, logging.Logger): 124 | item.addHandler(handler) 125 | 126 | return logger 127 | 128 | 129 | class StreamToLogger(object): 130 | """ 131 | Fake file-like stream object that redirects writes to a logger instance. 132 | """ 133 | 134 | def __init__(self, logger, log_level=logging.INFO): 135 | self.terminal = sys.stdout 136 | self.logger = logger 137 | self.log_level = log_level 138 | self.linebuf = "" 139 | 140 | def __getattr__(self, attr): 141 | return getattr(self.terminal, attr) 142 | 143 | def write(self, buf): 144 | temp_linebuf = self.linebuf + buf 145 | self.linebuf = "" 146 | for line in temp_linebuf.splitlines(True): 147 | # From the io.TextIOWrapper docs: 148 | # On output, if newline is None, any '\n' characters written 149 | # are translated to the system default line separator. 150 | # By default sys.stdout.write() expects '\n' newlines and then 151 | # translates them so this is still cross platform. 
152 | if line[-1] == "\n": 153 | self.logger.log(self.log_level, line.rstrip()) 154 | else: 155 | self.linebuf += line 156 | 157 | def flush(self): 158 | if self.linebuf != "": 159 | self.logger.log(self.log_level, self.linebuf.rstrip()) 160 | self.linebuf = "" 161 | 162 | 163 | def disable_torch_init(): 164 | """ 165 | Disable the redundant torch default initialization to accelerate model creation. 166 | """ 167 | import torch 168 | 169 | setattr(torch.nn.Linear, "reset_parameters", lambda self: None) 170 | setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None) 171 | 172 | 173 | def violates_moderation(text): 174 | """ 175 | Check whether the text violates OpenAI moderation API. 176 | """ 177 | url = "https://api.openai.com/v1/moderations" 178 | headers = {"Content-Type": "application/json", "Authorization": "Bearer " + os.environ["OPENAI_API_KEY"]} 179 | text = text.replace("\n", "") 180 | data = "{" + '"input": ' + f'"{text}"' + "}" 181 | data = data.encode("utf-8") 182 | try: 183 | ret = requests.post(url, headers=headers, data=data, timeout=5) 184 | flagged = ret.json()["results"][0]["flagged"] 185 | except requests.exceptions.RequestException as e: 186 | print(f"######################### Moderation Error: {e} #########################") 187 | flagged = False 188 | except KeyError as e: 189 | print(f"######################### Moderation Error: {e} #########################") 190 | flagged = False 191 | 192 | return flagged 193 | 194 | 195 | def pretty_print_semaphore(semaphore): 196 | if semaphore is None: 197 | return "None" 198 | return f"Semaphore(value={semaphore._value}, locked={semaphore.locked()})" 199 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Babel==2.14.0 2 | DataProperty==1.0.1 3 | Deprecated==1.2.14 4 | GitPython==3.1.43 5 | Jinja2==3.1.3 6 | Levenshtein==0.25.1 7 | MarkupSafe==2.1.5 8 | PyJWT==2.8.0 9 | PyYAML==6.0.1 10 | Pygments==2.17.2 11 | QtPy==2.4.1 12 | Send2Trash==1.8.3 13 | absl-py==2.1.0 14 | accelerate==0.29.3 15 | aiofiles==22.1.0 16 | aiohttp==3.9.5 17 | aiosignal==1.3.1 18 | aiosqlite==0.20.0 19 | altair==5.3.0 20 | anyio==4.3.0 21 | appdirs==1.4.4 22 | argon2-cffi-bindings==21.2.0 23 | argon2-cffi==23.1.0 24 | arrow==1.3.0 25 | asttokens==2.4.1 26 | async-timeout==4.0.3 27 | attrs==23.1.0 28 | beautifulsoup4==4.12.3 29 | bidict==0.23.1 30 | bitsandbytes==0.41.0 31 | black==24.1.0 32 | bleach==6.1.0 33 | byted-remote-ikernel==0.4.8 34 | byted-torch-monitor==0.0.1 35 | byted-wandb==0.13.72 36 | bytedance-context==0.7.1 37 | bytedance-metrics==0.5.1 38 | bytedance.modelhub==0.0.64 39 | bytedance.servicediscovery==0.1.2 40 | bytedbackgrounds==0.0.6 41 | byteddatabus==1.0.6 42 | byteddps==0.1.2 43 | bytedenv==0.6.2 44 | bytedlogger==0.15.1 45 | bytedmemfd==0.2 46 | bytedmetrics==0.10.2 47 | bytedpymongo==2.0.5 48 | bytedrh2==1.18.7a2 49 | bytedservicediscovery==0.17.4 50 | bytedtcc==1.4.2 51 | bytedtos==1.1.16 52 | bytedtrace==0.3.0 53 | bytedztijwthelper==0.0.22 54 | bytedztispiffe==0.0.11 55 | certifi==2024.2.2 56 | cffi==1.16.0 57 | cfgv==3.4.0 58 | chardet==5.2.0 59 | charset-normalizer==3.3.2 60 | click==8.1.7 61 | colorama==0.4.6 62 | comm==0.2.2 63 | contourpy==1.2.1 64 | crcmod==1.7 65 | cryptography==38.0.4 66 | cycler==0.12.1 67 | datasets==2.16.1 68 | debugpy==1.8.1 69 | decorator==5.1.1 70 | decord==0.6.0 71 | deepspeed==0.12.2 72 | defusedxml==0.7.1 73 | dill==0.3.7 74 | distlib==0.3.8 75 | 
distro==1.9.0 76 | dnspython==2.6.1 77 | docker-pycreds==0.4.0 78 | docstring_parser==0.16 79 | einops-exts==0.0.4 80 | einops==0.6.1 81 | entrypoints==0.4 82 | et-xmlfile==1.1.0 83 | eval_type_backport==0.2.0 84 | evaluate==0.4.1 85 | exceptiongroup==1.2.1 86 | executing==2.0.1 87 | fastapi==0.110.2 88 | fastjsonschema==2.19.1 89 | ffmpy==0.3.2 90 | filelock==3.13.4 91 | flash-attn==2.5.7 92 | fonttools==4.51.0 93 | fqdn==1.5.1 94 | frozenlist==1.4.1 95 | fsspec==2023.10.0 96 | ftfy==6.2.0 97 | gitdb==4.0.11 98 | gradio==3.35.2 99 | gradio_client==0.2.9 100 | grpcio==1.62.2 101 | h11==0.14.0 102 | hf_transfer==0.1.6 103 | hjson==3.1.0 104 | httpcore==0.17.3 105 | httpx==0.24.0 106 | huggingface-hub==0.22.2 107 | identify==2.5.36 108 | idna==3.7 109 | importlib_metadata==7.1.0 110 | importlib_resources==6.4.0 111 | iniconfig==2.0.0 112 | ipaddress==1.0.23 113 | ipykernel==6.29.4 114 | ipython-genutils==0.2.0 115 | ipython==8.18.1 116 | ipywidgets==8.1.2 117 | isoduration==20.11.0 118 | jedi==0.19.1 119 | joblib==1.4.0 120 | json5==0.9.25 121 | jsonlines==4.0.0 122 | jsonpointer==2.4 123 | jsonschema-specifications==2023.12.1 124 | jsonschema==4.21.1 125 | jupyter-client==7.0.0 126 | jupyter-console==6.6.3 127 | jupyter-events==0.10.0 128 | jupyter-ydoc==0.2.5 129 | jupyter==1.0.0 130 | jupyter_core==5.7.2 131 | jupyter_server==2.14.0 132 | jupyter_server_fileid==0.9.2 133 | jupyter_server_terminals==0.5.3 134 | jupyter_server_ydoc==0.8.0 135 | jupyterlab==3.6.4 136 | jupyterlab_pygments==0.3.0 137 | jupyterlab_server==2.27.1 138 | jupyterlab_widgets==3.0.10 139 | kiwisolver==1.4.5 140 | linkify-it-py==2.0.3 141 | llava==1.7.0.dev0 142 | llava==1.7.0.dev0 143 | lmms_eval==0.1.1 144 | lxml==5.2.1 145 | markdown-it-py==2.2.0 146 | markdown2==2.4.13 147 | matplotlib-inline==0.1.7 148 | matplotlib==3.8.4 149 | mbstrdecoder==1.1.3 150 | mdit-py-plugins==0.3.3 151 | mdurl==0.1.2 152 | mistune==3.0.2 153 | mpmath==1.3.0 154 | msgpack==1.0.8 155 | multidict==6.0.5 156 | multiprocess==0.70.15 157 | mypy-extensions==1.0.0 158 | nbclassic==1.0.0 159 | nbclient==0.10.0 160 | nbconvert==7.16.3 161 | nbformat==5.10.4 162 | nest-asyncio==1.6.0 163 | networkx==3.2.1 164 | ninja==1.11.1.1 165 | nltk==3.8.1 166 | nodeenv==1.8.0 167 | notebook==6.5.6 168 | notebook_shim==0.2.4 169 | numexpr==2.10.0 170 | numpy==1.26.4 171 | nvidia-cublas-cu12==12.1.3.1 172 | nvidia-cuda-cupti-cu12==12.1.105 173 | nvidia-cuda-nvrtc-cu12==12.1.105 174 | nvidia-cuda-runtime-cu12==12.1.105 175 | nvidia-cudnn-cu12==8.9.2.26 176 | nvidia-cufft-cu12==11.0.2.54 177 | nvidia-curand-cu12==10.3.2.106 178 | nvidia-cusolver-cu12==11.4.5.107 179 | nvidia-cusparse-cu12==12.1.0.106 180 | nvidia-nccl-cu12==2.18.1 181 | nvidia-nvjitlink-cu12==12.4.127 182 | nvidia-nvtx-cu12==12.1.105 183 | open-clip-torch==2.24.0 184 | openai==1.23.6 185 | opencv-python-headless==4.9.0.80 186 | openpyxl==3.1.2 187 | orjson==3.10.1 188 | overrides==7.7.0 189 | packaging==24.0 190 | pandas==2.2.2 191 | pandocfilters==1.5.1 192 | parso==0.8.4 193 | pathlib2==2.3.7.post1 194 | pathspec==0.12.1 195 | pathtools==0.1.2 196 | pathvalidate==3.2.0 197 | peft==0.4.0 198 | pexpect==4.8.0 199 | pillow==10.3.0 200 | pip==23.3.1 201 | pip==24.0 202 | platformdirs==4.2.1 203 | pluggy==1.5.0 204 | ply==3.11 205 | portalocker==2.8.2 206 | pre-commit==3.7.0 207 | prometheus_client==0.20.0 208 | promise==2.3 209 | prompt-toolkit==3.0.43 210 | protobuf==3.20.3 211 | psutil==5.9.8 212 | ptyprocess==0.7.0 213 | pure-eval==0.2.2 214 | py-cpuinfo==9.0.0 215 | py-spy==0.3.14 216 | 
py==1.11.0 217 | pyOpenSSL==22.1.0 218 | pyarrow-hotfix==0.6 219 | pyarrow==16.0.0 220 | pybind11==2.12.0 221 | pycocoevalcap==1.2 222 | pycocotools==2.0.7 223 | pycparser==2.22 224 | pycryptodomex==3.20.0 225 | pydantic==1.10.8 226 | pydub==0.25.1 227 | pynvml==11.5.0 228 | pyparsing==3.1.2 229 | pytablewriter==1.2.0 230 | pytest==6.2.5 231 | python-consul==1.1.0 232 | python-dateutil==2.9.0.post0 233 | python-engineio==4.9.0 234 | python-etcd==0.4.5 235 | python-json-logger==2.0.7 236 | python-multipart==0.0.9 237 | python-socketio==5.11.2 238 | pytz==2024.1 239 | pyzmq==24.0.1 240 | qtconsole==5.5.1 241 | rapidfuzz==3.8.1 242 | referencing==0.35.0 243 | regex==2024.4.16 244 | requests==2.31.0 245 | responses==0.18.0 246 | rfc3339-validator==0.1.4 247 | rfc3986-validator==0.1.1 248 | rich==13.7.1 249 | rouge-score==0.1.2 250 | rpds-py==0.18.0 251 | sacrebleu==2.4.2 252 | safetensors==0.4.3 253 | schedule==1.2.1 254 | scikit-learn==1.2.2 255 | scipy==1.13.0 256 | semantic-version==2.10.0 257 | sentencepiece==0.1.99 258 | sentry-sdk==2.0.0 259 | setproctitle==1.3.3 260 | setuptools==68.2.2 261 | shortuuid==1.0.13 262 | shtab==1.7.1 263 | simple-websocket==1.0.0 264 | six==1.16.0 265 | smmap==5.0.1 266 | sniffio==1.3.1 267 | soupsieve==2.5 268 | sqlitedict==2.1.0 269 | stack-data==0.6.3 270 | starlette==0.37.2 271 | svgwrite==1.4.3 272 | sympy==1.12 273 | tabledata==1.3.3 274 | tabulate==0.9.0 275 | tcolorpy==0.1.4 276 | tenacity==8.2.3 277 | terminado==0.18.1 278 | threadpoolctl==3.4.0 279 | thriftpy2==0.4.20 280 | tiktoken==0.6.0 281 | timm==0.9.16 282 | tinycss2==1.3.0 283 | tokenizers==0.15.2 284 | toml==0.10.2 285 | tomli==2.0.1 286 | toolz==0.12.1 287 | torch==2.1.2 288 | torchvision==0.16.2 289 | tornado==6.4 290 | tox==3.28.0 291 | tqdm-multiprocess==0.0.11 292 | tqdm==4.66.2 293 | traitlets==5.14.3 294 | transformers-stream-generator==0.0.5 295 | transformers==4.40.0.dev0 296 | triton==2.1.0 297 | typepy==1.3.2 298 | types-python-dateutil==2.9.0.20240316 299 | typing_extensions==4.11.0 300 | tyro==0.8.3 301 | tzdata==2024.1 302 | uc-micro-py==1.0.3 303 | uri-template==1.3.0 304 | urllib3==2.2.1 305 | uvicorn==0.29.0 306 | virtualenv==20.26.0 307 | wandb==0.16.5 308 | watchdog==4.0.0 309 | wavedrom==2.0.3.post3 310 | wcwidth==0.2.13 311 | webcolors==1.13 312 | webencodings==0.5.1 313 | websocket-client==1.8.0 314 | websockets==12.0 315 | wheel==0.41.2 316 | widgetsnbextension==4.0.10 317 | wrapt==1.16.0 318 | wsproto==1.2.0 319 | xxhash==3.4.1 320 | y-py==0.6.2 321 | yarl==1.9.4 322 | ypy-websocket==0.8.4 323 | zipp==3.18.1 324 | zstandard==0.22.0 -------------------------------------------------------------------------------- /scripts/insight-v/llava_dpo.sh: -------------------------------------------------------------------------------- 1 | EXP_NAME="llama3-llava-next-reason-dpo" 2 | VISION_TOWER='/home/models/clip-vit-large-patch14-336' 3 | 4 | echo $MASTER_ADDR; echo $nnode; echo $nrank 5 | 6 | # ----------- finetune mlp+llm ------------- 7 | 8 | export DPO_FORWARD=1 9 | 10 | torchrun --nproc_per_node 8 --nnodes=$nnode --node_rank=$nrank --master_addr=$MASTER_ADDR --master_port=23333 \ 11 | llava/train/train_dpo.py \ 12 | --deepspeed ./scripts/zero3.json \ 13 | --model_name_or_path /path/to/reason_models \ 14 | --version llava_llama_3 \ 15 | --data_path /path/to/dpo_data \ 16 | --image_folder ./playground/data \ 17 | --mm_tunable_parts="mm_mlp_adapter,mm_language_model" \ 18 | --vision_tower $VISION_TOWER \ 19 | --mm_projector_type mlp2x_gelu \ 20 | --mm_vision_select_layer -1 \ 
21 | --mm_use_im_start_end False \ 22 | --mm_use_im_patch_token False \ 23 | --mm_use_think_token True \ 24 | --group_by_modality_length True \ 25 | --image_aspect_ratio anyres \ 26 | --image_grid_pinpoints "(1x1),...,(3x3)" \ 27 | --mm_patch_merge_type spatial_unpad \ 28 | --bf16 True \ 29 | --output_dir ./checkpoints_new/$EXP_NAME \ 30 | --num_train_epochs 2 \ 31 | --per_device_train_batch_size 2 \ 32 | --per_device_eval_batch_size 4 \ 33 | --gradient_accumulation_steps 4 \ 34 | --evaluation_strategy "no" \ 35 | --save_strategy "steps" \ 36 | --save_steps 500 \ 37 | --save_total_limit 1 \ 38 | --learning_rate 5e-6 \ 39 | --weight_decay 0. \ 40 | --warmup_ratio 0.05 \ 41 | --lr_scheduler_type "cosine" \ 42 | --logging_steps 1 \ 43 | --tf32 True \ 44 | --model_max_length 12288 \ 45 | --gradient_checkpointing True \ 46 | --dataloader_num_workers 4 \ 47 | --lazy_preprocess True \ 48 | --report_to none 49 | -------------------------------------------------------------------------------- /scripts/insight-v/llava_next_reason.sh: -------------------------------------------------------------------------------- 1 | EXP_NAME="llama3-llava-next-reason" 2 | VISION_TOWER='/home/models/clip-vit-large-patch14-336' 3 | 4 | echo $MASTER_ADDR; echo $nnode; echo $nrank 5 | 6 | # ----------- finetune mlp+llm ------------- 7 | 8 | torchrun --nproc_per_node 8 --nnodes=$nnode --node_rank=$nrank --master_addr=$MASTER_ADDR --master_port=23333 \ 9 | llava/train/train_mem.py \ 10 | --deepspeed ./scripts/zero3.json \ 11 | --model_name_or_path /path/to/models \ 12 | --version llava_llama_3 \ 13 | --data_path /path/to/reason_data \ 14 | --image_folder ./playground/data \ 15 | --mm_tunable_parts="mm_mlp_adapter,mm_language_model" \ 16 | --vision_tower $VISION_TOWER \ 17 | --mm_projector_type mlp2x_gelu \ 18 | --mm_vision_select_layer -1 \ 19 | --mm_use_im_start_end False \ 20 | --mm_use_im_patch_token False \ 21 | --mm_use_think_token True \ 22 | --group_by_modality_length True \ 23 | --image_aspect_ratio anyres \ 24 | --image_grid_pinpoints "(1x1),...,(3x3)" \ 25 | --mm_patch_merge_type spatial_unpad \ 26 | --bf16 True \ 27 | --output_dir ./checkpoints_new/$EXP_NAME \ 28 | --num_train_epochs 2 \ 29 | --per_device_train_batch_size 2 \ 30 | --per_device_eval_batch_size 4 \ 31 | --gradient_accumulation_steps 4 \ 32 | --evaluation_strategy "no" \ 33 | --save_strategy "steps" \ 34 | --save_steps 500 \ 35 | --save_total_limit 1 \ 36 | --learning_rate 5e-6 \ 37 | --weight_decay 0. 
\ 38 | --warmup_ratio 0.05 \ 39 | --lr_scheduler_type "cosine" \ 40 | --logging_steps 1 \ 41 | --tf32 True \ 42 | --model_max_length 12288 \ 43 | --gradient_checkpointing True \ 44 | --dataloader_num_workers 4 \ 45 | --lazy_preprocess True \ 46 | --report_to none 47 | -------------------------------------------------------------------------------- /scripts/insight-v/llava_next_summary.sh: -------------------------------------------------------------------------------- 1 | EXP_NAME="llama3-llava-next-summary" 2 | VISION_TOWER='/home/models/clip-vit-large-patch14-336' 3 | 4 | echo $MASTER_ADDR; echo $nnode; echo $nrank 5 | 6 | torchrun --nproc_per_node 8 --nnodes=$nnode --node_rank=$nrank --master_addr=$MASTER_ADDR --master_port=23333 \ 7 | llava/train/train_mem.py \ 8 | --deepspeed ./scripts/zero3.json \ 9 | --model_name_or_path /path/to/model \ 10 | --version llava_llama_3 \ 11 | --data_path /path/to/data \ 12 | --image_folder ./playground/data \ 13 | --mm_tunable_parts="mm_mlp_adapter,mm_language_model" \ 14 | --vision_tower $VISION_TOWER \ 15 | --mm_projector_type mlp2x_gelu \ 16 | --mm_vision_select_layer -2 \ 17 | --mm_use_im_start_end False \ 18 | --mm_use_im_patch_token False \ 19 | --mm_use_think_token True \ 20 | --group_by_modality_length True \ 21 | --image_aspect_ratio anyres \ 22 | --image_grid_pinpoints "(1x1),...,(3x3)" \ 23 | --mm_patch_merge_type spatial_unpad \ 24 | --bf16 True \ 25 | --output_dir ./checkpoints_new/$EXP_NAME \ 26 | --num_train_epochs 1 \ 27 | --per_device_train_batch_size 2 \ 28 | --per_device_eval_batch_size 4 \ 29 | --gradient_accumulation_steps 1 \ 30 | --evaluation_strategy "no" \ 31 | --save_strategy "steps" \ 32 | --save_steps 1000 \ 33 | --save_total_limit 1 \ 34 | --learning_rate 1e-5 \ 35 | --weight_decay 0. 
\ 36 | --warmup_ratio 0.05 \ 37 | --lr_scheduler_type "cosine" \ 38 | --logging_steps 1 \ 39 | --tf32 True \ 40 | --model_max_length 12288 \ 41 | --gradient_checkpointing True \ 42 | --dataloader_num_workers 4 \ 43 | --lazy_preprocess True \ 44 | --report_to none 45 | -------------------------------------------------------------------------------- /scripts/zero2.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "optimizer": { 14 | "type": "AdamW", 15 | "params": { 16 | "lr": "auto", 17 | "betas": "auto", 18 | "eps": "auto", 19 | "weight_decay": "auto" 20 | } 21 | }, 22 | "zero_optimization": { 23 | "stage": 2, 24 | "offload_optimizer": { 25 | "device": "none", 26 | "pin_memory": true 27 | }, 28 | "allgather_partitions": true, 29 | "allgather_bucket_size": 2e8, 30 | "overlap_comm": false, 31 | "reduce_scatter": true, 32 | "reduce_bucket_size": 2e8, 33 | "contiguous_gradients": true 34 | }, 35 | "gradient_accumulation_steps": "auto", 36 | "gradient_clipping": "auto", 37 | "steps_per_print": 100, 38 | "train_batch_size": "auto", 39 | "train_micro_batch_size_per_gpu": "auto", 40 | "wall_clock_breakdown": false 41 | } -------------------------------------------------------------------------------- /scripts/zero2_fused_adamw.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "optimizer": { 14 | "type": "AdamW", 15 | "params": { 16 | "lr": "auto", 17 | "betas": "auto", 18 | "eps": "auto", 19 | "weight_decay": "auto" 20 | } 21 | }, 22 | "zero_optimization": { 23 | "stage": 2, 24 | "offload_optimizer": { 25 | "device": "none", 26 | "pin_memory": true 27 | }, 28 | "allgather_partitions": true, 29 | "allgather_bucket_size": 2e8, 30 | "overlap_comm": true, 31 | "reduce_scatter": true, 32 | "reduce_bucket_size": 2e8, 33 | "contiguous_gradients": true 34 | }, 35 | "gradient_accumulation_steps": "auto", 36 | "gradient_clipping": "auto", 37 | "steps_per_print": 100, 38 | "train_batch_size": "auto", 39 | "train_micro_batch_size_per_gpu": "auto", 40 | "wall_clock_breakdown": false 41 | } -------------------------------------------------------------------------------- /scripts/zero2_offload.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "train_micro_batch_size_per_gpu": "auto", 14 | "train_batch_size": "auto", 15 | "gradient_accumulation_steps": "auto", 16 | "zero_optimization": { 17 | "stage": 2, 18 | "offload_optimizer": { 19 | "device": "cpu", 20 | "pin_memory": true 21 | }, 22 | "offload_param": { 23 | "device": "cpu", 24 | "pin_memory": true 25 | }, 26 | "overlap_comm": true, 27 | "contiguous_gradients": true, 28 | "sub_group_size": 1e9, 29 | "reduce_bucket_size": "auto" 30 | } 31 | } -------------------------------------------------------------------------------- /scripts/zero3.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | 14 | "zero_optimization": { 15 | "stage": 3, 16 | "offload_optimizer": { 17 | "device": "none", 18 | "pin_memory": true 19 | }, 20 | "offload_param": { 21 | "device": "none", 22 | "pin_memory": true 23 | }, 24 | "overlap_comm": true, 25 | "contiguous_gradients": true, 26 | "sub_group_size": 1e9, 27 | "reduce_bucket_size": "auto", 28 | "stage3_prefetch_bucket_size": "auto", 29 | "stage3_param_persistence_threshold": "auto", 30 | "stage3_max_live_parameters": 1e9, 31 | "stage3_max_reuse_distance": 1e9, 32 | "stage3_gather_16bit_weights_on_model_save": true 33 | }, 34 | 35 | "gradient_accumulation_steps": "auto", 36 | "gradient_clipping": "auto", 37 | "steps_per_print": 100, 38 | "train_batch_size": "auto", 39 | "train_micro_batch_size_per_gpu": "auto", 40 | "wall_clock_breakdown": false 41 | } -------------------------------------------------------------------------------- /scripts/zero3_offload.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "optimizer": { 14 | "type": "AdamW", 15 | "params": { 16 | "lr": "auto", 17 | "betas": "auto", 18 | "eps": "auto", 19 | "weight_decay": "auto" 20 | } 21 | }, 22 | "zero_optimization": { 23 | "stage": 3, 24 | "offload_optimizer": { 25 | "device": "cpu", 26 | "pin_memory": true 27 | }, 28 | "offload_param": { 29 | "device": "cpu", 30 | "pin_memory": true 31 | }, 32 | "overlap_comm": true, 33 | "contiguous_gradients": true, 34 | "sub_group_size": 1e9, 35 | "reduce_bucket_size": "auto", 36 | "stage3_prefetch_bucket_size": "auto", 37 | "stage3_param_persistence_threshold": "auto", 38 | "stage3_max_live_parameters": 1e9, 39 | "stage3_max_reuse_distance": 1e9, 40 | "gather_16bit_weights_on_model_save": true 41 | }, 42 | "gradient_accumulation_steps": "auto", 43 | "gradient_clipping": "auto", 44 | "train_batch_size": "auto", 45 | "train_micro_batch_size_per_gpu": "auto", 46 | "steps_per_print": 1e5, 47 | "wall_clock_breakdown": false 48 | } -------------------------------------------------------------------------------- /scripts/zero3pp.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "optimizer": { 14 | "type": "AdamW", 15 | "params": { 16 | "lr": "auto", 17 | "betas": "auto", 18 | "eps": "auto", 19 | "weight_decay": "auto" 20 | } 21 | }, 22 | 23 | "zero_optimization": { 24 | "stage": 3, 25 | "offload_optimizer": { 26 | "device": "none", 27 | "pin_memory": true 28 | }, 29 | "offload_param": { 30 | "device": "none", 31 | "pin_memory": true 32 | }, 33 | "overlap_comm": true, 34 | "contiguous_gradients": true, 35 | "zero_quantized_weights": true, 36 | "zero_hpz_partition_size": 16, 37 | "zero_quantized_gradients": true, 38 | "sub_group_size": 1e9, 39 | "reduce_bucket_size": "auto", 40 | "stage3_prefetch_bucket_size": "auto", 41 | 
"stage3_param_persistence_threshold": "auto", 42 | "stage3_max_live_parameters": 1e9, 43 | "stage3_max_reuse_distance": 1e9, 44 | "stage3_gather_16bit_weights_on_model_save": true 45 | }, 46 | 47 | "gradient_accumulation_steps": "auto", 48 | "gradient_clipping": "auto", 49 | "steps_per_print": 100, 50 | "train_batch_size": "auto", 51 | "train_micro_batch_size_per_gpu": "auto", 52 | "wall_clock_breakdown": false 53 | } --------------------------------------------------------------------------------