├── README.md ├── data_check.py ├── data_generation └── data_generation.py ├── example.png ├── figure ├── data-generation.png ├── general_benchmark.png ├── multi-agent.png ├── showcase.png ├── teaser.png └── visual_reasoning_benchmark.png ├── infer_reason_model.py ├── infer_summary_model.py ├── infer_two_models.py ├── llava ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-310.pyc │ ├── constants.cpython-310.pyc │ ├── conversation.cpython-310.pyc │ ├── mm_utils.cpython-310.pyc │ └── utils.cpython-310.pyc ├── constants.py ├── conversation.py ├── eval │ ├── evaluate_interleave.py │ └── model_vqa.py ├── mm_utils.py ├── model │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-310.pyc │ │ ├── builder.cpython-310.pyc │ │ └── llava_arch.cpython-310.pyc │ ├── apply_delta.py │ ├── builder.py │ ├── consolidate.py │ ├── language_model │ │ ├── __pycache__ │ │ │ ├── llava_llama.cpython-310.pyc │ │ │ ├── llava_mistral.cpython-310.pyc │ │ │ ├── llava_mixtral.cpython-310.pyc │ │ │ └── llava_qwen.cpython-310.pyc │ │ ├── llava_gemma.py │ │ ├── llava_llama.py │ │ ├── llava_mistral.py │ │ ├── llava_mixtral.py │ │ ├── llava_mpt.py │ │ ├── llava_qwen.py │ │ ├── llava_qwen_moe.py │ │ └── modeling_llama.py │ ├── llava_arch.py │ ├── make_delta.py │ ├── multimodal_encoder │ │ ├── __pycache__ │ │ │ ├── builder.cpython-310.pyc │ │ │ ├── clip_encoder.cpython-310.pyc │ │ │ ├── hf_vision.cpython-310.pyc │ │ │ ├── imagebind.cpython-310.pyc │ │ │ ├── open_clip_encoder.cpython-310.pyc │ │ │ └── siglip_encoder.cpython-310.pyc │ │ ├── builder.py │ │ ├── clip_encoder.py │ │ ├── dev_eva_clip │ │ │ ├── eva_clip │ │ │ │ ├── __init__.py │ │ │ │ ├── bpe_simple_vocab_16e6.txt.gz │ │ │ │ ├── constants.py │ │ │ │ ├── eva_vit_model.py │ │ │ │ ├── factory.py │ │ │ │ ├── hf_configs.py │ │ │ │ ├── hf_model.py │ │ │ │ ├── loss.py │ │ │ │ ├── model.py │ │ │ │ ├── model_configs │ │ │ │ │ ├── EVA-CLIP-18B.json │ │ │ │ │ ├── EVA-CLIP-8B-plus.json │ │ │ │ │ ├── EVA-CLIP-8B.json │ │ │ │ │ ├── EVA01-CLIP-B-16.json │ │ │ │ │ ├── EVA01-CLIP-g-14-plus.json │ │ │ │ │ ├── EVA01-CLIP-g-14.json │ │ │ │ │ ├── EVA02-CLIP-B-16.json │ │ │ │ │ ├── EVA02-CLIP-L-14-336.json │ │ │ │ │ ├── EVA02-CLIP-L-14.json │ │ │ │ │ ├── EVA02-CLIP-bigE-14-plus.json │ │ │ │ │ ├── EVA02-CLIP-bigE-14.json │ │ │ │ │ ├── Internal-EVA02-CLIP-10B-14-448.json │ │ │ │ │ └── Internal-EVA02-CLIP-10B-14.json │ │ │ │ ├── modified_resnet.py │ │ │ │ ├── openai.py │ │ │ │ ├── pretrained.py │ │ │ │ ├── rope.py │ │ │ │ ├── timm_model.py │ │ │ │ ├── tokenizer.py │ │ │ │ ├── transform.py │ │ │ │ ├── transformer.py │ │ │ │ └── utils.py │ │ │ └── eva_vit.py │ │ ├── eva_clip │ │ │ ├── eva_clip_encoder.py │ │ │ ├── eva_clip_processors.py │ │ │ ├── eva_vit.py │ │ │ ├── factory.py │ │ │ └── model_configs │ │ │ │ ├── EVA-CLIP-18B.json │ │ │ │ ├── EVA-CLIP-8B-plus.json │ │ │ │ ├── EVA-CLIP-8B.json │ │ │ │ ├── EVA01-CLIP-B-16.json │ │ │ │ ├── EVA01-CLIP-g-14-plus.json │ │ │ │ ├── EVA01-CLIP-g-14.json │ │ │ │ ├── EVA02-CLIP-B-16.json │ │ │ │ ├── EVA02-CLIP-L-14-336.json │ │ │ │ ├── EVA02-CLIP-L-14.json │ │ │ │ ├── EVA02-CLIP-bigE-14-plus.json │ │ │ │ ├── EVA02-CLIP-bigE-14.json │ │ │ │ ├── Internal-EVA02-CLIP-10B-14-448.json │ │ │ │ └── Internal-EVA02-CLIP-10B-14.json │ │ ├── hf_vision.py │ │ ├── imagebind.py │ │ ├── open_clip_encoder.py │ │ └── siglip_encoder.py │ ├── multimodal_projector │ │ ├── __pycache__ │ │ │ ├── builder.cpython-310.pyc │ │ │ └── pooler_projector.cpython-310.pyc │ │ ├── builder.py │ │ └── pooler_projector.py │ ├── multimodal_resampler │ │ ├── __pycache__ │ │ │ ├── 
builder.cpython-310.pyc │ │ │ ├── masked_drop.cpython-310.pyc │ │ │ ├── perceiver.cpython-310.pyc │ │ │ ├── qformer.cpython-310.pyc │ │ │ └── spatial_pool.cpython-310.pyc │ │ ├── builder.py │ │ ├── masked_drop.py │ │ ├── perceiver.py │ │ ├── qformer.py │ │ └── spatial_pool.py │ └── utils.py ├── serve │ ├── __init__.py │ ├── cli.py │ ├── controller.py │ ├── examples │ │ ├── extreme_ironing.jpg │ │ └── waterview.jpg │ ├── gradio_multi_image.py │ ├── gradio_web_server.py │ ├── model_worker.py │ ├── register_worker.py │ ├── sglang_worker.py │ └── test_message.py ├── train │ ├── __pycache__ │ │ ├── llava_trainer.cpython-310.pyc │ │ └── train.cpython-310.pyc │ ├── llama_flash_attn_monkey_patch.py │ ├── llava_trainer.py │ ├── llava_trainer_eval.py │ ├── train.py │ ├── train_dpo.py │ └── train_mem.py └── utils.py ├── requirements.txt └── scripts ├── insight-v ├── llava_dpo.sh ├── llava_next_reason.sh └── llava_next_summary.sh ├── zero2.json ├── zero2_fused_adamw.json ├── zero2_offload.json ├── zero3.json ├── zero3_offload.json └── zero3pp.json /README.md: -------------------------------------------------------------------------------- 1 | ## Insight-V: Exploring Long-Chain Visual Reasoning with Multimodal Large Language Models 2 | 3 |

4 | Yuhao Dong*,¹  5 | Zuyan Liu*,²,³  6 | Hai-Long Sun²,⁴  7 | Jingkang Yang¹  8 | 

9 |

10 | Winston Hu²  11 | Yongming Rao²,³,✉  12 | Ziwei Liu¹,✉  13 | 

14 | 15 |

¹S-Lab, NTU   ²Tencent   ³Tsinghua University   ⁴Nanjing University 

16 | 17 |

* Equal Contribution  ✉ Corresponding Author

18 | 19 | **arXiv Paper:** [![Static Badge](https://img.shields.io/badge/Insight_V-paper-green)](https://arxiv.org/abs/2411.14432) 20 | 21 | **Model Checkpoints**: [![Insight-V-checkpoints](https://img.shields.io/badge/Insight_V-checkpoints-blue)](https://huggingface.co/collections/THUdyh/insight-v-673f5e1dd8ab5f2d8d332035) 22 | 23 | 24 | ## 📢 News 25 | - [04/2025] Insight-V is selected as a **Highlight** paper at CVPR 2025! 26 | - [02/2025] Insight-V is accepted to CVPR 2025! 27 | - [11/2024] 🔧🔨**Training & Inference Scripts Release!** Try Insight-V on your own! 28 | - [11/2024] 🔥 **🚀Introducing Insight-V!** An early attempt to explore long-chain visual reasoning with MLLMs. 29 | * [[Paper]](https://arxiv.org/abs/2411.14432): Detailed introduction of Insight-V, including a **structured, long-chain data generation pipeline** and an **effective multi-agent system design**! 30 | * [[Checkpoints]](https://huggingface.co/collections/THUdyh/insight-v-673f5e1dd8ab5f2d8d332035): We release model checkpoints built on LLaVA-NeXT-LLaMA3 and our base model. 31 | 32 | ## 🚀 Introducing Insight-V 33 | 34 | ### Main idea of Insight-V 35 | **Insight-V is an early effort to explore long-chain visual reasoning with MLLMs.** 36 | 37 | Insight-V offers **1)** a scalable data generation pipeline for long-chain, high-quality reasoning data, **2)** a multi-agent system that decomposes visual reasoning tasks into reasoning and summarization, and **3)** a two-stage training pipeline to enhance visual reasoning capabilities. Together, these contributions address key challenges in visual reasoning, providing a solid foundation for future research in MLLM reasoning. 38 | 39 | 40 | ### Overview of Data Generation Pipeline 41 | 42 | The reasoning processes are generated progressively by a reasoning generator and then fed into a multi-granularity assessment system to ensure high-quality reasoning. 43 | 44 | 
45 | 46 |
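The generate-then-assess pipeline described above can be pictured with a short sketch. This is illustrative only: `generate_next_step`, `answer_matches_gt`, and `score_step` are hypothetical helpers standing in for the reasoning generator and the multi-granularity assessment agents, not functions from this repository.

```python
import json

def build_reasoning_sample(generator, judge, image, question, ground_truth, max_steps=8):
    """Sketch of progressive reasoning generation followed by multi-granularity filtering."""
    steps = []
    for _ in range(max_steps):
        # The reasoning generator extends the chain one structured step at a time.
        step = generator.generate_next_step(image, question, steps)  # hypothetical API
        steps.append(step)
        if step.get("next_action") == "final_answer":
            break

    # Coarse-grained assessment: keep the trace only if its final answer matches the ground truth.
    if not judge.answer_matches_gt(steps[-1]["content"], ground_truth):  # hypothetical API
        return None

    # Fine-grained assessment: score every intermediate step and drop low-quality traces.
    if any(judge.score_step(question, step) < 0.5 for step in steps):  # hypothetical API, score in [0, 1]
        return None

    return {"question": question, "answer": ground_truth, "steps": [json.dumps(s) for s in steps]}
```

The coarse, answer-level check is essentially what `data_check.py` implements with a text-only LLM judge over the parsed final reasoning step.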
47 | 48 | ### Overview of Multi-Agent System 49 | 50 | We derive a multi-agent system from a single model. By decomposing the task into reasoning and summarization, the two agents collaborate to enhance the overall reasoning capability. 51 | 52 |
53 | 54 |
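The decomposition described above can be illustrated with a short sketch of how the two agents might be chained at inference time. The prompt wording and the `run_mllm` callable are placeholders for illustration, not the exact interface of the released scripts; see `infer_reason_model.py`, `infer_summary_model.py`, and `infer_two_models.py` for the actual inference code.

```python
def answer_with_two_agents(reason_model, summary_model, image, question, run_mllm):
    """Illustrative decomposition into a reasoning agent and a summarization agent."""
    # The reasoning agent produces a detailed, step-by-step reasoning process for the question.
    reasoning = run_mllm(
        reason_model, image,
        f"{question}\nPlease provide a detailed, step-by-step reasoning process."
    )

    # The summarization agent reads the question together with the generated reasoning,
    # selectively trusts it, and produces the concise final answer.
    answer = run_mllm(
        summary_model, image,
        f"{question}\nHere is a reasoning process for reference:\n{reasoning}\n"
        "Based on the image and the reasoning above, answer the question concisely."
    )
    return reasoning, answer
```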
55 | 56 | 57 | ## ✅ TODO List 58 | 59 | - [x] Release paper on arXiv 60 | - [x] Release Insight-V models. 61 | - [ ] Demo code for generation. 62 | - [ ] All the training and inference code. 63 | - [ ] Evaluation code for visual reasoning benchmarks. 64 | - [ ] Insight-V SFT Data. 65 | - [ ] Insight-V with stronger MLLMs. 66 | 67 | ## 📃 Main Results 68 | 69 | ### Results on Visual Reasoning Benchmarks 70 | 71 |
72 | 73 |
74 | 75 | 76 | ### Results on Other Image Benchmarks 77 | 78 |
79 | 80 |
81 | 82 | ### Qualitative Results 83 | 84 |
85 | 86 |
87 | 88 | 89 | ## Citation 90 | 91 | If you find Insight-V useful for your research and applications, please cite our paper using this BibTeX: 92 | 93 | ```bibtex 94 | @article{dong2024insight, 95 | title={Insight-V: Exploring Long-Chain Visual Reasoning with Multimodal Large Language Models}, 96 | author={Dong, Yuhao and Liu, Zuyan and Sun, Hai-Long and Yang, Jingkang and Hu, Winston and Rao, Yongming and Liu, Ziwei}, 97 | journal={arXiv preprint arXiv:2411.14432}, 98 | year={2024} 99 | } 100 | 101 | ``` 102 | 103 | ## Acknowledgement 104 | 105 | - Our codebase is built on [LLaVA](https://github.com/LLaVA-VL/LLaVA-NeXT) 106 | 107 | - The data generation pipeline is adapted from [g1](https://github.com/bklieger-groq/g1) 108 | 109 | - Thanks to the [lmms-eval](https://github.com/EvolvingLMMs-Lab/lmms-eval) team for building such a useful evaluation system! 110 | -------------------------------------------------------------------------------- /data_check.py: -------------------------------------------------------------------------------- 1 | import time 2 | import json 3 | import os 4 | import glob 5 | import tqdm 6 | from transformers import AutoTokenizer 7 | from vllm import LLM, SamplingParams 8 | import argparse 9 | 10 | prompt = '''I will give you a question, the ground truth answer and one answer generated by another model. You need to judge whether the answer is correct or not based on the ground truth answer. As long as the answer has the same meaning as the ground truth answer, even if the format is not completely the same, it is correct. For each question, if the answer is correct, just return 1. If the answer is incorrect, return 0. You should only return 1 or 0 for each question. The output should be a list containing your judgement of the answer. Here is an example: 11 | ——Example 1—— 12 | Question: \nWhat company held 15 percent of the microwave oven market in the United States in 2008?\nAnswer the question using a single word or phrase. 13 | Ground Truth: Samsung. 14 | Model 1: Samsung and Sharp held 15 percent of the microwave oven market share in the United States in 2008. 15 | Your answer: [0] 16 | ——Example 2—— 17 | Question: \nWhat company held 15 percent of the microwave oven market in the United States in 2008?\nAnswer the question using a single word or phrase. 18 | Model 1: Samsung held 15 percent of the microwave oven market in the United States in 2008. 19 | Your answer: [1] 20 | 21 | Now please generate your answers based on the given questions, ground truth answers and model answers. Please note that you should only output the list of your answers, strictly following the format. 
22 | ''' 23 | 24 | data_paths = ["data.json"] 25 | 26 | LLM_PATH = "PATH/TO/LLM" 27 | tokenizer = AutoTokenizer.from_pretrained(LLM_PATH) 28 | 29 | sampling_params = SamplingParams(temperature=0.2, top_p=0.7, repetition_penalty=1.05, max_tokens=512) 30 | 31 | llm = LLM(model=LLM_PATH,tensor_parallel_size=1) 32 | 33 | batch_size = 32 34 | 35 | for data_path in data_paths: 36 | with open(data_path, "r") as f: 37 | data = json.load(f) 38 | 39 | all_inputs = [] 40 | bad_count = 0 41 | for idx, data_line in enumerate(data): 42 | question = data_line['question'] 43 | ground_truth = data_line['conversations'][1]['value'] 44 | response = [] 45 | for _ in range(1):  # one model answer is judged here; the original loop iterated over an undefined `meta` 46 | # check whether we can parse the response of reasoning steps 47 | response_ = data_line['steps'][-1] 48 | try: 49 | response.append(json.loads(response_[1])['content']) 50 | except: 51 | bad_count += 1 52 | response.append("No available answer") 53 | 54 | user_input = f"Question: {question}\nGround Truth: {ground_truth}\nModel 1: {response[0]}\nYour answer: " 55 | 56 | messages = [ 57 | {"role": "system", "content": f"You are a helpful assistant.\n{prompt}"}, 58 | {"role": "user", "content": user_input} 59 | ] 60 | all_inputs.append({ 61 | "raw": data_line, 62 | "message": messages 63 | }) 64 | 65 | print(bad_count) 66 | 67 | for i in tqdm.tqdm(range(0, len(all_inputs), batch_size)): 68 | batch = all_inputs[i:i+batch_size] 69 | conversations = [] 70 | batch_raws, batch_messages = [batch[i]["raw"] for i in range(len(batch))], [batch[i]["message"] for i in range(len(batch))] 71 | for messages in batch_messages: 72 | text = tokenizer.apply_chat_template( 73 | messages, 74 | tokenize=False, 75 | add_generation_prompt=True 76 | ) 77 | conversations.append(text) 78 | 79 | outputs = llm.generate(conversations, sampling_params, use_tqdm=False) 80 | for j, output in enumerate(outputs): 81 | generated_text = output.outputs[0].text 82 | generated_text = generated_text.split("]")[0] + ']' 83 | try: 84 | all_inputs[i+j]['answer'] = json.loads(generated_text) 85 | except: 86 | print("Error") 87 | all_inputs[i+j]['answer'] = [0] 88 | 89 | save_path = "" 90 | with open(save_path, "w") as f: 91 | f.write(json.dumps(all_inputs)) 92 | -------------------------------------------------------------------------------- /example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongyh20/Insight-V/5e9c6f934c1929e351dbff17a6c6c7bc2d8143b1/example.png -------------------------------------------------------------------------------- /figure/data-generation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongyh20/Insight-V/5e9c6f934c1929e351dbff17a6c6c7bc2d8143b1/figure/data-generation.png -------------------------------------------------------------------------------- /figure/general_benchmark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongyh20/Insight-V/5e9c6f934c1929e351dbff17a6c6c7bc2d8143b1/figure/general_benchmark.png -------------------------------------------------------------------------------- /figure/multi-agent.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongyh20/Insight-V/5e9c6f934c1929e351dbff17a6c6c7bc2d8143b1/figure/multi-agent.png -------------------------------------------------------------------------------- /figure/showcase.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongyh20/Insight-V/5e9c6f934c1929e351dbff17a6c6c7bc2d8143b1/figure/showcase.png -------------------------------------------------------------------------------- /figure/teaser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongyh20/Insight-V/5e9c6f934c1929e351dbff17a6c6c7bc2d8143b1/figure/teaser.png -------------------------------------------------------------------------------- /figure/visual_reasoning_benchmark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongyh20/Insight-V/5e9c6f934c1929e351dbff17a6c6c7bc2d8143b1/figure/visual_reasoning_benchmark.png -------------------------------------------------------------------------------- /llava/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import LlavaLlamaForCausalLM 2 | -------------------------------------------------------------------------------- /llava/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongyh20/Insight-V/5e9c6f934c1929e351dbff17a6c6c7bc2d8143b1/llava/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /llava/__pycache__/constants.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongyh20/Insight-V/5e9c6f934c1929e351dbff17a6c6c7bc2d8143b1/llava/__pycache__/constants.cpython-310.pyc -------------------------------------------------------------------------------- /llava/__pycache__/conversation.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongyh20/Insight-V/5e9c6f934c1929e351dbff17a6c6c7bc2d8143b1/llava/__pycache__/conversation.cpython-310.pyc -------------------------------------------------------------------------------- /llava/__pycache__/mm_utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongyh20/Insight-V/5e9c6f934c1929e351dbff17a6c6c7bc2d8143b1/llava/__pycache__/mm_utils.cpython-310.pyc -------------------------------------------------------------------------------- /llava/__pycache__/utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongyh20/Insight-V/5e9c6f934c1929e351dbff17a6c6c7bc2d8143b1/llava/__pycache__/utils.cpython-310.pyc -------------------------------------------------------------------------------- /llava/constants.py: -------------------------------------------------------------------------------- 1 | CONTROLLER_HEART_BEAT_EXPIRATION = 30 2 | WORKER_HEART_BEAT_INTERVAL = 15 3 | 4 | LOGDIR = "." 
5 | 6 | # Model Constants 7 | IGNORE_INDEX = -100 8 | IMAGE_TOKEN_INDEX = -200 9 | DEFAULT_IMAGE_TOKEN = "" 10 | DEFAULT_IMAGE_PATCH_TOKEN = "" 11 | DEFAULT_IM_START_TOKEN = "" 12 | DEFAULT_IM_END_TOKEN = "" 13 | 14 | THINK_START_TOKEN = "" 15 | THINK_END_TOKEN = "" 16 | OUTPUT_START_TOKEN = "" 17 | OUTPUT_END_TOKEN = "" -------------------------------------------------------------------------------- /llava/model/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | AVAILABLE_MODELS = { 4 | "llava_llama": "LlavaLlamaForCausalLM, LlavaConfig", 5 | "llava_qwen": "LlavaQwenForCausalLM, LlavaQwenConfig", 6 | "llava_mistral": "LlavaMistralForCausalLM, LlavaMistralConfig", 7 | "llava_mixtral": "LlavaMixtralForCausalLM, LlavaMixtralConfig", 8 | # "llava_qwen_moe": "LlavaQwenMoeForCausalLM, LlavaQwenMoeConfig", 9 | # Add other models as needed 10 | } 11 | 12 | for model_name, model_classes in AVAILABLE_MODELS.items(): 13 | try: 14 | exec(f"from .language_model.{model_name} import {model_classes}") 15 | except Exception as e: 16 | print(f"Failed to import {model_name} from llava.language_model.{model_name}. Error: {e}") 17 | -------------------------------------------------------------------------------- /llava/model/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongyh20/Insight-V/5e9c6f934c1929e351dbff17a6c6c7bc2d8143b1/llava/model/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/__pycache__/builder.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongyh20/Insight-V/5e9c6f934c1929e351dbff17a6c6c7bc2d8143b1/llava/model/__pycache__/builder.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/__pycache__/llava_arch.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongyh20/Insight-V/5e9c6f934c1929e351dbff17a6c6c7bc2d8143b1/llava/model/__pycache__/llava_arch.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/apply_delta.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m fastchat.model.apply_delta --base ~/model_weights/llama-7b --target ~/model_weights/vicuna-7b --delta lmsys/vicuna-7b-delta 4 | """ 5 | 6 | import argparse 7 | 8 | import torch 9 | from tqdm import tqdm 10 | from transformers import AutoTokenizer, AutoModelForCausalLM 11 | from llava import LlavaLlamaForCausalLM 12 | 13 | 14 | def apply_delta(base_model_path, target_model_path, delta_path): 15 | print("Loading base model") 16 | base = AutoModelForCausalLM.from_pretrained(base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 17 | 18 | print("Loading delta") 19 | delta = LlavaLlamaForCausalLM.from_pretrained(delta_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 20 | delta_tokenizer = AutoTokenizer.from_pretrained(delta_path) 21 | 22 | print("Applying delta") 23 | for name, param in tqdm(delta.state_dict().items(), desc="Applying delta"): 24 | if name not in base.state_dict(): 25 | assert name in ["model.mm_projector.weight", "model.mm_projector.bias"], f"{name} not in base model" 26 | continue 27 | if param.data.shape == 
base.state_dict()[name].shape: 28 | param.data += base.state_dict()[name] 29 | else: 30 | assert name in ["model.embed_tokens.weight", "lm_head.weight"], f"{name} dimension mismatch: {param.data.shape} vs {base.state_dict()[name].shape}" 31 | bparam = base.state_dict()[name] 32 | param.data[: bparam.shape[0], : bparam.shape[1]] += bparam 33 | 34 | print("Saving target model") 35 | delta.save_pretrained(target_model_path) 36 | delta_tokenizer.save_pretrained(target_model_path) 37 | 38 | 39 | if __name__ == "__main__": 40 | parser = argparse.ArgumentParser() 41 | parser.add_argument("--base-model-path", type=str, required=True) 42 | parser.add_argument("--target-model-path", type=str, required=True) 43 | parser.add_argument("--delta-path", type=str, required=True) 44 | 45 | args = parser.parse_args() 46 | 47 | apply_delta(args.base_model_path, args.target_model_path, args.delta_path) 48 | -------------------------------------------------------------------------------- /llava/model/consolidate.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m llava.model.consolidate --src ~/model_weights/llava-7b --dst ~/model_weights/llava-7b_consolidate 4 | """ 5 | 6 | import argparse 7 | 8 | import torch 9 | from transformers import AutoTokenizer, AutoModelForCausalLM 10 | from llava.model import * 11 | from llava.model.utils import auto_upgrade 12 | 13 | 14 | def consolidate_ckpt(src_path, dst_path): 15 | print("Loading model") 16 | auto_upgrade(src_path) 17 | src_model = AutoModelForCausalLM.from_pretrained(src_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 18 | src_tokenizer = AutoTokenizer.from_pretrained(src_path, use_fast=False) 19 | src_model.save_pretrained(dst_path) 20 | src_tokenizer.save_pretrained(dst_path) 21 | 22 | 23 | if __name__ == "__main__": 24 | parser = argparse.ArgumentParser() 25 | parser.add_argument("--src", type=str, required=True) 26 | parser.add_argument("--dst", type=str, required=True) 27 | 28 | args = parser.parse_args() 29 | 30 | consolidate_ckpt(args.src, args.dst) 31 | -------------------------------------------------------------------------------- /llava/model/language_model/__pycache__/llava_llama.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongyh20/Insight-V/5e9c6f934c1929e351dbff17a6c6c7bc2d8143b1/llava/model/language_model/__pycache__/llava_llama.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/language_model/__pycache__/llava_mistral.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongyh20/Insight-V/5e9c6f934c1929e351dbff17a6c6c7bc2d8143b1/llava/model/language_model/__pycache__/llava_mistral.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/language_model/__pycache__/llava_mixtral.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongyh20/Insight-V/5e9c6f934c1929e351dbff17a6c6c7bc2d8143b1/llava/model/language_model/__pycache__/llava_mixtral.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/language_model/__pycache__/llava_qwen.cpython-310.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/dongyh20/Insight-V/5e9c6f934c1929e351dbff17a6c6c7bc2d8143b1/llava/model/language_model/__pycache__/llava_qwen.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/language_model/llava_gemma.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Duc Q. Nguyen, Haotian Liu and Bo Li 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | from typing import List, Optional, Tuple, Union 17 | 18 | import torch 19 | import torch.nn as nn 20 | from torch.nn import CrossEntropyLoss 21 | 22 | from transformers import AutoConfig, AutoModelForCausalLM, GemmaConfig, GemmaModel, GemmaForCausalLM 23 | 24 | from transformers.modeling_outputs import CausalLMOutputWithPast 25 | from transformers.generation.utils import GenerateOutput 26 | 27 | from ..llava_arch import LlavaMetaModel, LlavaMetaForCausalLM 28 | 29 | 30 | class LlavaGemmaConfig(GemmaConfig): 31 | model_type = "llava_gemma" 32 | 33 | 34 | class LlavaGemmaModel(LlavaMetaModel, GemmaModel): 35 | config_class = LlavaGemmaConfig 36 | 37 | def __init__(self, config: GemmaConfig): 38 | super(LlavaGemmaModel, self).__init__(config) 39 | 40 | 41 | class LlavaGemmaForCausalLM(GemmaForCausalLM, LlavaMetaForCausalLM): 42 | config_class = LlavaGemmaConfig 43 | 44 | def __init__(self, config): 45 | super(GemmaForCausalLM, self).__init__(config) 46 | self.model = LlavaGemmaModel(config) 47 | 48 | self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) 49 | 50 | # Initialize weights and apply final processing 51 | self.post_init() 52 | 53 | def get_model(self): 54 | return self.model 55 | 56 | def forward( 57 | self, 58 | input_ids: torch.LongTensor = None, 59 | attention_mask: Optional[torch.Tensor] = None, 60 | position_ids: Optional[torch.LongTensor] = None, 61 | past_key_values: Optional[List[torch.FloatTensor]] = None, 62 | inputs_embeds: Optional[torch.FloatTensor] = None, 63 | labels: Optional[torch.LongTensor] = None, 64 | use_cache: Optional[bool] = None, 65 | output_attentions: Optional[bool] = None, 66 | output_hidden_states: Optional[bool] = None, 67 | images: Optional[torch.FloatTensor] = None, 68 | image_sizes: Optional[List[List[int]]] = None, 69 | return_dict: Optional[bool] = None, 70 | cache_position: Optional[torch.LongTensor] = None, 71 | ) -> Union[Tuple, CausalLMOutputWithPast]: 72 | 73 | if inputs_embeds is None: 74 | (input_ids, position_ids, attention_mask, past_key_values, inputs_embeds, labels) = self.prepare_inputs_labels_for_multimodal(input_ids, position_ids, attention_mask, past_key_values, labels, images, image_sizes) 75 | 76 | return super().forward( 77 | input_ids=input_ids, 78 | attention_mask=attention_mask, 79 | position_ids=position_ids, 80 | past_key_values=past_key_values, 81 | inputs_embeds=inputs_embeds, 82 | labels=labels, 83 | use_cache=use_cache, 84 | output_attentions=output_attentions, 85 | 
output_hidden_states=output_hidden_states, 86 | return_dict=return_dict, 87 | cache_position=cache_position, 88 | ) 89 | 90 | @torch.no_grad() 91 | def generate( 92 | self, 93 | inputs: Optional[torch.Tensor] = None, 94 | images: Optional[torch.Tensor] = None, 95 | image_sizes: Optional[torch.Tensor] = None, 96 | **kwargs, 97 | ) -> Union[GenerateOutput, torch.LongTensor]: 98 | position_ids = kwargs.pop("position_ids", None) 99 | attention_mask = kwargs.pop("attention_mask", None) 100 | if "inputs_embeds" in kwargs: 101 | raise NotImplementedError("`inputs_embeds` is not supported") 102 | 103 | if images is not None: 104 | (inputs, position_ids, attention_mask, _, inputs_embeds, _) = self.prepare_inputs_labels_for_multimodal(inputs, position_ids, attention_mask, None, None, images, image_sizes=image_sizes) 105 | else: 106 | inputs_embeds = self.get_model().embed_tokens(inputs) 107 | 108 | return super().generate(position_ids=position_ids, attention_mask=attention_mask, inputs_embeds=inputs_embeds, **kwargs) 109 | 110 | def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs): 111 | images = kwargs.pop("images", None) 112 | image_sizes = kwargs.pop("image_sizes", None) 113 | inputs = super().prepare_inputs_for_generation(input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs) 114 | if images is not None: 115 | inputs["images"] = images 116 | if image_sizes is not None: 117 | inputs["image_sizes"] = image_sizes 118 | return inputs 119 | 120 | 121 | AutoConfig.register("llava_gemma", LlavaGemmaConfig) 122 | AutoModelForCausalLM.register(LlavaGemmaConfig, LlavaGemmaForCausalLM) 123 | -------------------------------------------------------------------------------- /llava/model/language_model/llava_llama.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Haotian Liu 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | 16 | from typing import List, Optional, Tuple, Union 17 | 18 | import torch 19 | import torch.nn as nn 20 | 21 | from transformers import AutoConfig, AutoModelForCausalLM, LlamaConfig 22 | 23 | from torch.nn import CrossEntropyLoss 24 | 25 | 26 | # , LlamaModel, LlamaForCausalLM, GenerationConfig 27 | # from .modeling_llama import LlamaModel, LlamaForCausalLM 28 | from transformers import LlamaModel, LlamaForCausalLM 29 | from transformers.modeling_outputs import CausalLMOutputWithPast 30 | from transformers.generation.utils import GenerateOutput 31 | 32 | from llava.model.llava_arch import LlavaMetaModel, LlavaMetaForCausalLM 33 | 34 | 35 | class LlavaConfig(LlamaConfig): 36 | model_type = "llava_llama" 37 | temperature: float = 0.0 # reset to 0.0, previously 0.9 for Vicuna 38 | max_new_tokens: int = 1024 39 | do_sample: bool = False 40 | top_p: Optional[float] = None 41 | # rope_scaling: Optional[dict] = {} 42 | 43 | 44 | class LlavaLlamaModel(LlavaMetaModel, LlamaModel): 45 | config_class = LlavaConfig 46 | 47 | def __init__(self, config: LlamaConfig): 48 | super(LlavaLlamaModel, self).__init__(config) 49 | 50 | 51 | class LlavaLlamaForCausalLM(LlamaForCausalLM, LlavaMetaForCausalLM): 52 | config_class = LlavaConfig 53 | 54 | def __init__(self, config): 55 | LlamaForCausalLM.__init__(self, config) 56 | 57 | # configure default generation settings 58 | config.model_type = "llava_llama" 59 | # config.rope_scaling = None 60 | 61 | self.model = LlavaLlamaModel(config) 62 | self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) 63 | # Initialize weights and apply final processing 64 | self.post_init() 65 | 66 | def get_model(self): 67 | return self.model 68 | 69 | def forward( 70 | self, 71 | input_ids: torch.LongTensor = None, 72 | attention_mask: Optional[torch.Tensor] = None, 73 | position_ids: Optional[torch.LongTensor] = None, 74 | past_key_values: Optional[List[torch.FloatTensor]] = None, 75 | inputs_embeds: Optional[torch.FloatTensor] = None, 76 | labels: Optional[torch.LongTensor] = None, 77 | use_cache: Optional[bool] = None, 78 | output_attentions: Optional[bool] = None, 79 | output_hidden_states: Optional[bool] = None, 80 | images: Optional[torch.FloatTensor] = None, 81 | image_sizes: Optional[List[List[int]]] = None, 82 | return_dict: Optional[bool] = None, 83 | modalities: Optional[List[str]] = ["image"], 84 | dpo_forward: Optional[bool] = None, 85 | cache_position=None, 86 | ) -> Union[Tuple, CausalLMOutputWithPast]: 87 | 88 | if inputs_embeds is None: 89 | (input_ids, position_ids, attention_mask, past_key_values, inputs_embeds, labels) = self.prepare_inputs_labels_for_multimodal(input_ids, position_ids, attention_mask, past_key_values, labels, images, modalities, image_sizes) 90 | 91 | if dpo_forward: 92 | outputs = self.model( 93 | input_ids=input_ids, 94 | attention_mask=attention_mask, 95 | position_ids=position_ids, 96 | past_key_values=past_key_values, 97 | inputs_embeds=inputs_embeds, 98 | use_cache=use_cache, 99 | output_attentions=output_attentions, 100 | output_hidden_states=output_hidden_states, 101 | return_dict=return_dict, 102 | ) 103 | 104 | hidden_states = outputs[0] 105 | logits = self.lm_head(hidden_states) 106 | return logits, labels 107 | 108 | else: 109 | return super().forward( 110 | input_ids=input_ids, 111 | attention_mask=attention_mask, 112 | position_ids=position_ids, 113 | past_key_values=past_key_values, 114 | inputs_embeds=inputs_embeds, 115 | labels=labels, 116 | use_cache=use_cache, 117 | 
output_attentions=output_attentions, 118 | output_hidden_states=output_hidden_states, 119 | return_dict=return_dict, 120 | ) 121 | 122 | @torch.no_grad() 123 | def generate( 124 | self, 125 | inputs: Optional[torch.Tensor] = None, 126 | images: Optional[torch.Tensor] = None, 127 | image_sizes: Optional[torch.Tensor] = None, 128 | modalities: Optional[List[str]] = ["image"], 129 | **kwargs, 130 | ) -> Union[GenerateOutput, torch.LongTensor]: 131 | modalities = kwargs.pop("modalities", None) if "modalities" in kwargs and modalities is None else modalities 132 | position_ids = kwargs.pop("position_ids", None) 133 | attention_mask = kwargs.pop("attention_mask", None) 134 | if "inputs_embeds" in kwargs: 135 | raise NotImplementedError("`inputs_embeds` is not supported") 136 | 137 | if images is not None: 138 | (inputs, position_ids, attention_mask, _, inputs_embeds, _) = self.prepare_inputs_labels_for_multimodal(inputs, position_ids, attention_mask, None, None, images, modalities, image_sizes=image_sizes) 139 | else: 140 | inputs_embeds = self.get_model().embed_tokens(inputs) 141 | 142 | return super().generate(position_ids=position_ids, attention_mask=attention_mask, inputs_embeds=inputs_embeds, **kwargs) 143 | 144 | def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs): 145 | images = kwargs.pop("images", None) 146 | image_sizes = kwargs.pop("image_sizes", None) 147 | inputs = super().prepare_inputs_for_generation(input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs) 148 | if images is not None: 149 | inputs["images"] = images 150 | if image_sizes is not None: 151 | inputs["image_sizes"] = image_sizes 152 | return inputs 153 | 154 | 155 | AutoConfig.register("llava_llama", LlavaConfig) 156 | AutoModelForCausalLM.register(LlavaConfig, LlavaLlamaForCausalLM) 157 | -------------------------------------------------------------------------------- /llava/model/language_model/llava_mistral.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Haotian Liu 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | 16 | from typing import List, Optional, Tuple, Union 17 | 18 | import torch 19 | import torch.nn as nn 20 | from torch.nn import CrossEntropyLoss 21 | 22 | from transformers import AutoConfig, AutoModelForCausalLM, MistralConfig, MistralModel, MistralForCausalLM, GenerationConfig 23 | 24 | from transformers.modeling_outputs import CausalLMOutputWithPast 25 | from transformers.generation.utils import GenerateOutput 26 | 27 | from ..llava_arch import LlavaMetaModel, LlavaMetaForCausalLM 28 | 29 | 30 | class LlavaMistralConfig(MistralConfig): 31 | model_type = "llava_mistral" 32 | temperature: float = 0.0 # reset to 0.0, previously 0.9 for Vicuna 33 | max_new_tokens: int = 1024 34 | do_sample: bool = False 35 | top_p: Optional[float] = None 36 | 37 | 38 | class LlavaMistralModel(LlavaMetaModel, MistralModel): 39 | config_class = LlavaMistralConfig 40 | 41 | def __init__(self, config: MistralConfig): 42 | super(LlavaMistralModel, self).__init__(config) 43 | 44 | 45 | class LlavaMistralForCausalLM(MistralForCausalLM, LlavaMetaForCausalLM): 46 | config_class = LlavaMistralConfig 47 | 48 | def __init__(self, config): 49 | super(MistralForCausalLM, self).__init__(config) 50 | 51 | config.model_type = "llava_mistral" 52 | config.rope_scaling = None 53 | 54 | self.model = LlavaMistralModel(config) 55 | self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) 56 | # Initialize weights and apply final processing 57 | self.post_init() 58 | 59 | def get_model(self): 60 | return self.model 61 | 62 | def forward( 63 | self, 64 | input_ids: torch.LongTensor = None, 65 | attention_mask: Optional[torch.Tensor] = None, 66 | position_ids: Optional[torch.LongTensor] = None, 67 | past_key_values: Optional[List[torch.FloatTensor]] = None, 68 | inputs_embeds: Optional[torch.FloatTensor] = None, 69 | labels: Optional[torch.LongTensor] = None, 70 | use_cache: Optional[bool] = None, 71 | output_attentions: Optional[bool] = None, 72 | output_hidden_states: Optional[bool] = None, 73 | images: Optional[torch.FloatTensor] = None, 74 | image_sizes: Optional[List[List[int]]] = None, 75 | return_dict: Optional[bool] = None, 76 | cache_position=None, 77 | ) -> Union[Tuple, CausalLMOutputWithPast]: 78 | 79 | if inputs_embeds is None: 80 | (input_ids, position_ids, attention_mask, past_key_values, inputs_embeds, labels) = self.prepare_inputs_labels_for_multimodal(input_ids, position_ids, attention_mask, past_key_values, labels, images, image_sizes) 81 | 82 | return super().forward( 83 | input_ids=input_ids, 84 | attention_mask=attention_mask, 85 | position_ids=position_ids, 86 | past_key_values=past_key_values, 87 | inputs_embeds=inputs_embeds, 88 | labels=labels, 89 | use_cache=use_cache, 90 | output_attentions=output_attentions, 91 | output_hidden_states=output_hidden_states, 92 | return_dict=return_dict, 93 | ) 94 | 95 | @torch.no_grad() 96 | def generate( 97 | self, 98 | inputs: Optional[torch.Tensor] = None, 99 | images: Optional[torch.Tensor] = None, 100 | image_sizes: Optional[torch.Tensor] = None, 101 | **kwargs, 102 | ) -> Union[GenerateOutput, torch.LongTensor]: 103 | position_ids = kwargs.pop("position_ids", None) 104 | attention_mask = kwargs.pop("attention_mask", None) 105 | if "inputs_embeds" in kwargs: 106 | raise NotImplementedError("`inputs_embeds` is not supported") 107 | 108 | if images is not None: 109 | (inputs, position_ids, attention_mask, _, inputs_embeds, _) = self.prepare_inputs_labels_for_multimodal(inputs, position_ids, attention_mask, None, None, images, 
image_sizes=image_sizes) 110 | else: 111 | inputs_embeds = self.get_model().embed_tokens(inputs) 112 | 113 | return super().generate(position_ids=position_ids, attention_mask=attention_mask, inputs_embeds=inputs_embeds, **kwargs) 114 | 115 | def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs): 116 | images = kwargs.pop("images", None) 117 | image_sizes = kwargs.pop("image_sizes", None) 118 | inputs = super().prepare_inputs_for_generation(input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs) 119 | if images is not None: 120 | inputs["images"] = images 121 | if image_sizes is not None: 122 | inputs["image_sizes"] = image_sizes 123 | return inputs 124 | 125 | 126 | AutoConfig.register("llava_mistral", LlavaMistralConfig) 127 | AutoModelForCausalLM.register(LlavaMistralConfig, LlavaMistralForCausalLM) 128 | -------------------------------------------------------------------------------- /llava/model/language_model/llava_mixtral.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Haotian Liu 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | from typing import List, Optional, Tuple, Union 17 | 18 | import torch 19 | import torch.nn as nn 20 | from torch.nn import CrossEntropyLoss 21 | 22 | from transformers import AutoConfig, AutoModelForCausalLM, MixtralConfig, MixtralModel, MixtralForCausalLM, GenerationConfig 23 | 24 | from transformers.modeling_outputs import CausalLMOutputWithPast 25 | from transformers.generation.utils import GenerateOutput 26 | 27 | from ..llava_arch import LlavaMetaModel, LlavaMetaForCausalLM 28 | 29 | 30 | class LlavaMixtralConfig(MixtralConfig): 31 | model_type = "llava_mixtral" 32 | 33 | 34 | class LlavaMixtralModel(LlavaMetaModel, MixtralModel): 35 | config_class = LlavaMixtralConfig 36 | 37 | def __init__(self, config: MixtralConfig): 38 | super(LlavaMixtralModel, self).__init__(config) 39 | 40 | 41 | class LlavaMixtralForCausalLM(MixtralForCausalLM, LlavaMetaForCausalLM): 42 | config_class = LlavaMixtralConfig 43 | 44 | def __init__(self, config): 45 | super(MixtralForCausalLM, self).__init__(config) 46 | 47 | config.model_type = "llava_mixtral" 48 | config.rope_scaling = None 49 | self.model = LlavaMixtralModel(config) 50 | self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) 51 | # Initialize weights and apply final processing 52 | self.post_init() 53 | 54 | def get_model(self): 55 | return self.model 56 | 57 | def forward( 58 | self, 59 | input_ids: torch.LongTensor = None, 60 | attention_mask: Optional[torch.Tensor] = None, 61 | position_ids: Optional[torch.LongTensor] = None, 62 | past_key_values: Optional[List[torch.FloatTensor]] = None, 63 | inputs_embeds: Optional[torch.FloatTensor] = None, 64 | labels: Optional[torch.LongTensor] = None, 65 | use_cache: Optional[bool] = None, 66 | output_attentions: Optional[bool] = None, 67 | output_hidden_states: 
Optional[bool] = None, 68 | images: Optional[torch.FloatTensor] = None, 69 | image_sizes: Optional[List[List[int]]] = None, 70 | return_dict: Optional[bool] = None, 71 | modalities: Optional[List[str]] = ["image"], 72 | dpo_forward: Optional[bool] = None, 73 | cache_position=None, 74 | ) -> Union[Tuple, CausalLMOutputWithPast]: 75 | 76 | if inputs_embeds is None: 77 | (input_ids, position_ids, attention_mask, past_key_values, inputs_embeds, labels) = self.prepare_inputs_labels_for_multimodal(input_ids, position_ids, attention_mask, past_key_values, labels, images, modalities, image_sizes) 78 | 79 | if dpo_forward: 80 | outputs = self.model( 81 | input_ids=input_ids, 82 | attention_mask=attention_mask, 83 | position_ids=position_ids, 84 | past_key_values=past_key_values, 85 | inputs_embeds=inputs_embeds, 86 | use_cache=use_cache, 87 | output_attentions=output_attentions, 88 | output_hidden_states=output_hidden_states, 89 | return_dict=return_dict, 90 | ) 91 | 92 | hidden_states = outputs[0] 93 | logits = self.lm_head(hidden_states) 94 | return logits, labels 95 | 96 | else: 97 | return super().forward( 98 | input_ids=input_ids, 99 | attention_mask=attention_mask, 100 | position_ids=position_ids, 101 | past_key_values=past_key_values, 102 | inputs_embeds=inputs_embeds, 103 | labels=labels, 104 | use_cache=use_cache, 105 | output_attentions=output_attentions, 106 | output_hidden_states=output_hidden_states, 107 | return_dict=return_dict, 108 | ) 109 | 110 | @torch.no_grad() 111 | def generate( 112 | self, 113 | inputs: Optional[torch.Tensor] = None, 114 | images: Optional[torch.Tensor] = None, 115 | image_sizes: Optional[torch.Tensor] = None, 116 | modalities: Optional[List[str]] = ["image"], 117 | **kwargs, 118 | ) -> Union[GenerateOutput, torch.LongTensor]: 119 | position_ids = kwargs.pop("position_ids", None) 120 | attention_mask = kwargs.pop("attention_mask", None) 121 | if "inputs_embeds" in kwargs: 122 | raise NotImplementedError("`inputs_embeds` is not supported") 123 | 124 | if images is not None: 125 | (inputs, position_ids, attention_mask, _, inputs_embeds, _) = self.prepare_inputs_labels_for_multimodal(inputs, position_ids, attention_mask, None, None, images, modalities, image_sizes=image_sizes) 126 | else: 127 | inputs_embeds = self.get_model().embed_tokens(inputs) 128 | 129 | return super().generate(position_ids=position_ids, attention_mask=attention_mask, inputs_embeds=inputs_embeds, **kwargs) 130 | 131 | def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs): 132 | images = kwargs.pop("images", None) 133 | image_sizes = kwargs.pop("image_sizes", None) 134 | inputs = super().prepare_inputs_for_generation(input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs) 135 | if images is not None: 136 | inputs["images"] = images 137 | if image_sizes is not None: 138 | inputs["image_sizes"] = image_sizes 139 | return inputs 140 | 141 | 142 | AutoConfig.register("llava_mixtral", LlavaMixtralConfig) 143 | AutoModelForCausalLM.register(LlavaMixtralConfig, LlavaMixtralForCausalLM) 144 | -------------------------------------------------------------------------------- /llava/model/language_model/llava_mpt.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Haotian Liu 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | from typing import Optional, Tuple 17 | 18 | import torch 19 | 20 | from transformers import AutoConfig, AutoModelForCausalLM, MptConfig, MptForCausalLM, MptModel, GenerationConfig 21 | from llava.model.llava_arch import LlavaMetaModel, LlavaMetaForCausalLM 22 | 23 | 24 | class LlavaMptConfig(MptConfig): 25 | model_type = "llava_mpt" 26 | 27 | 28 | class LlavaMptModel(LlavaMetaModel, MptModel): 29 | config_class = LlavaMptConfig 30 | 31 | def __init__(self, config: MptConfig): 32 | config.hidden_size = config.d_model 33 | super(LlavaMptModel, self).__init__(config) 34 | 35 | def embed_tokens(self, x): 36 | return self.wte(x) 37 | 38 | 39 | class LlavaMptForCausalLM(MptForCausalLM, LlavaMetaForCausalLM): 40 | config_class = LlavaMptConfig 41 | supports_gradient_checkpointing = True 42 | 43 | def __init__(self, config): 44 | super(MptForCausalLM, self).__init__(config) 45 | 46 | config.model_type = "llava_mpt" 47 | config.rope_scaling = None 48 | self.generation_config = GenerationConfig( 49 | temperature=0.0, 50 | max_new_tokens=1024, 51 | do_sample=False, 52 | top_p=None, 53 | ) 54 | 55 | self.transformer = LlavaMptModel(config) 56 | self.lm_head = torch.nn.Linear(config.hidden_size, config.vocab_size, bias=False) 57 | 58 | # Initialize weights and apply final processing 59 | self.post_init() 60 | 61 | def get_model(self): 62 | return self.transformer 63 | 64 | def _set_gradient_checkpointing(self, module, value=False): 65 | if isinstance(module, LlavaMptModel): 66 | module.gradient_checkpointing = value 67 | 68 | def forward( 69 | self, 70 | input_ids: Optional[torch.LongTensor] = None, 71 | past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, 72 | attention_mask: Optional[torch.Tensor] = None, 73 | inputs_embeds: Optional[torch.Tensor] = None, 74 | labels: Optional[torch.Tensor] = None, 75 | use_cache: Optional[bool] = None, 76 | output_attentions: Optional[bool] = None, 77 | output_hidden_states: Optional[bool] = None, 78 | return_dict: Optional[bool] = None, 79 | cache_position=None, 80 | images=None, 81 | ): 82 | 83 | input_ids, attention_mask, past_key_values, inputs_embeds, labels = self.prepare_inputs_labels_for_multimodal(input_ids, attention_mask, past_key_values, labels, images) 84 | 85 | return super().forward( 86 | input_ids, 87 | past_key_values=past_key_values, 88 | attention_mask=attention_mask, 89 | inputs_embeds=inputs_embeds, 90 | labels=labels, 91 | use_cache=use_cache, 92 | output_attentions=output_attentions, 93 | output_hidden_states=output_hidden_states, 94 | return_dict=return_dict, 95 | ) 96 | 97 | def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs): 98 | images = kwargs.pop("images", None) 99 | _inputs = super().prepare_inputs_for_generation(input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs) 100 | _inputs["images"] = images 101 | return _inputs 102 | 103 | 104 | AutoConfig.register("llava_mpt", LlavaMptConfig) 105 | AutoModelForCausalLM.register(LlavaMptConfig, LlavaMptForCausalLM) 106 | 
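Each of the `language_model` wrappers above (and `llava_qwen.py` below) follows the same pattern: it mixes a Hugging Face backbone with `LlavaMetaModel`/`LlavaMetaForCausalLM`, routes `images` through `prepare_inputs_labels_for_multimodal` inside `forward`/`generate`, and registers the new config and model with `AutoConfig`/`AutoModelForCausalLM`. A minimal inference sketch using these classes might look as follows; it assumes the usual LLaVA-NeXT helper signatures (`load_pretrained_model`, `process_images`, `tokenizer_image_token`) and uses placeholder checkpoint and conversation-template names rather than values confirmed by this repository.

```python
import torch
from PIL import Image

from llava.model.builder import load_pretrained_model
from llava.mm_utils import process_images, tokenizer_image_token
from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
from llava.conversation import conv_templates

# Placeholder path and template name; substitute a released Insight-V checkpoint and its template.
tokenizer, model, image_processor, _ = load_pretrained_model("PATH/TO/CHECKPOINT", None, "llava_llama")
conv = conv_templates["llava_llama_3"].copy()

image = Image.open("example.png").convert("RGB")
image_tensor = process_images([image], image_processor, model.config)
if isinstance(image_tensor, list):
    image_tensor = [t.to(model.device, dtype=torch.float16) for t in image_tensor]
else:
    image_tensor = image_tensor.to(model.device, dtype=torch.float16)

# Insert the image token into the prompt, then tokenize with the special IMAGE_TOKEN_INDEX.
conv.append_message(conv.roles[0], DEFAULT_IMAGE_TOKEN + "\nDescribe this image step by step.")
conv.append_message(conv.roles[1], None)
input_ids = tokenizer_image_token(conv.get_prompt(), tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(model.device)

with torch.inference_mode():
    output_ids = model.generate(input_ids, images=image_tensor, image_sizes=[image.size], do_sample=False, max_new_tokens=512)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```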
-------------------------------------------------------------------------------- /llava/model/language_model/llava_qwen.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Hao Zhang 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | from typing import List, Optional, Tuple, Union, Dict 17 | import torch 18 | import torch.nn as nn 19 | from torch.nn import CrossEntropyLoss 20 | 21 | import transformers 22 | from transformers import AutoConfig, AutoModelForCausalLM, LlamaConfig, LlamaModel, LlamaForCausalLM 23 | 24 | from transformers.modeling_outputs import CausalLMOutputWithPast 25 | from transformers.generation.utils import GenerateOutput 26 | 27 | # from ...constants import IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN 28 | from llava.model.llava_arch import LlavaMetaModel, LlavaMetaForCausalLM 29 | from transformers import Qwen2Config, Qwen2Model, Qwen2ForCausalLM 30 | 31 | # from .qwen.modeling_qwen import QWenLMHeadModel, QWenModel 32 | # from .qwen.configuration_qwen import QWenConfig 33 | 34 | import os 35 | if 'DPO_FORWARD' in os.environ: 36 | print("DPO_FORWARD is set") 37 | DPO_FORWARD = True 38 | else: 39 | DPO_FORWARD = False 40 | 41 | 42 | class LlavaQwenConfig(Qwen2Config): 43 | model_type = "llava_qwen" 44 | 45 | 46 | class LlavaQwenModel(LlavaMetaModel, Qwen2Model): 47 | config_class = LlavaQwenConfig 48 | 49 | def __init__(self, config: Qwen2Config): 50 | super(LlavaQwenModel, self).__init__(config) 51 | 52 | 53 | class LlavaQwenForCausalLM(Qwen2ForCausalLM, LlavaMetaForCausalLM): 54 | config_class = LlavaQwenConfig 55 | 56 | def __init__(self, config): 57 | # super(Qwen2ForCausalLM, self).__init__(config) 58 | Qwen2ForCausalLM.__init__(self, config) 59 | config.model_type = "llava_qwen" 60 | config.rope_scaling = None 61 | 62 | self.model = LlavaQwenModel(config) 63 | self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) 64 | # Initialize weights and apply final processing 65 | self.post_init() 66 | 67 | def get_model(self): 68 | return self.model 69 | 70 | def forward( 71 | self, 72 | input_ids: torch.LongTensor = None, 73 | attention_mask: Optional[torch.Tensor] = None, 74 | position_ids: Optional[torch.LongTensor] = None, 75 | past_key_values: Optional[List[torch.FloatTensor]] = None, 76 | inputs_embeds: Optional[torch.FloatTensor] = None, 77 | labels: Optional[torch.LongTensor] = None, 78 | use_cache: Optional[bool] = None, 79 | output_attentions: Optional[bool] = None, 80 | output_hidden_states: Optional[bool] = None, 81 | images: Optional[torch.FloatTensor] = None, 82 | image_sizes: Optional[List[List[int]]] = None, 83 | return_dict: Optional[bool] = None, 84 | modalities: Optional[List[str]] = ["image"], 85 | dpo_forward: Optional[bool] = False, 86 | cache_position=None, 87 | ) -> Union[Tuple, CausalLMOutputWithPast]: 88 | 89 | if inputs_embeds is None: 90 | (input_ids, position_ids, 
attention_mask, past_key_values, inputs_embeds, labels) = self.prepare_inputs_labels_for_multimodal(input_ids, position_ids, attention_mask, past_key_values, labels, images, modalities, image_sizes) 91 | 92 | if DPO_FORWARD: 93 | outputs = self.model( 94 | input_ids=input_ids, 95 | attention_mask=attention_mask, 96 | position_ids=position_ids, 97 | past_key_values=past_key_values, 98 | inputs_embeds=inputs_embeds, 99 | use_cache=use_cache, 100 | output_attentions=output_attentions, 101 | output_hidden_states=output_hidden_states, 102 | return_dict=return_dict, 103 | ) 104 | 105 | hidden_states = outputs[0] 106 | logits = self.lm_head(hidden_states) 107 | return logits, labels 108 | 109 | else: 110 | return super().forward( 111 | input_ids=input_ids, 112 | attention_mask=attention_mask, 113 | position_ids=position_ids, 114 | past_key_values=past_key_values, 115 | inputs_embeds=inputs_embeds, 116 | labels=labels, 117 | use_cache=use_cache, 118 | output_attentions=output_attentions, 119 | output_hidden_states=output_hidden_states, 120 | return_dict=return_dict, 121 | ) 122 | 123 | @torch.no_grad() 124 | def generate( 125 | self, 126 | inputs: Optional[torch.Tensor] = None, 127 | images: Optional[torch.Tensor] = None, 128 | image_sizes: Optional[torch.Tensor] = None, 129 | modalities: Optional[List[str]] = ["image"], 130 | **kwargs, 131 | ) -> Union[GenerateOutput, torch.LongTensor]: 132 | position_ids = kwargs.pop("position_ids", None) 133 | attention_mask = kwargs.pop("attention_mask", None) 134 | if "inputs_embeds" in kwargs: 135 | raise NotImplementedError("`inputs_embeds` is not supported") 136 | 137 | if images is not None: 138 | (inputs, position_ids, attention_mask, _, inputs_embeds, _) = self.prepare_inputs_labels_for_multimodal(inputs, position_ids, attention_mask, None, None, images, modalities, image_sizes=image_sizes) 139 | else: 140 | inputs_embeds = self.get_model().embed_tokens(inputs) 141 | 142 | return super().generate(position_ids=position_ids, attention_mask=attention_mask, inputs_embeds=inputs_embeds, **kwargs) 143 | 144 | def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs): 145 | images = kwargs.pop("images", None) 146 | image_sizes = kwargs.pop("image_sizes", None) 147 | inputs = super().prepare_inputs_for_generation(input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs) 148 | if images is not None: 149 | inputs["images"] = images 150 | if image_sizes is not None: 151 | inputs["image_sizes"] = image_sizes 152 | return inputs 153 | 154 | 155 | AutoConfig.register("llava_qwen", LlavaQwenConfig) 156 | AutoModelForCausalLM.register(LlavaQwenConfig, LlavaQwenForCausalLM) 157 | -------------------------------------------------------------------------------- /llava/model/language_model/llava_qwen_moe.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Hao Zhang 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | 16 | from typing import List, Optional, Tuple, Union, Dict 17 | import torch 18 | import torch.nn as nn 19 | from torch.nn import CrossEntropyLoss 20 | 21 | import transformers 22 | from transformers import AutoConfig, AutoModelForCausalLM 23 | 24 | from transformers.modeling_outputs import CausalLMOutputWithPast 25 | from transformers.generation.utils import GenerateOutput 26 | 27 | # from ...constants import IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN 28 | from llava.model.llava_arch import LlavaMetaModel, LlavaMetaForCausalLM 29 | from transformers import Qwen2MoeConfig, Qwen2MoeModel, Qwen2MoeForCausalLM 30 | 31 | # from .qwen.modeling_qwen import QWenLMHeadModel, QWenModel 32 | # from .qwen.configuration_qwen import QWenConfig 33 | 34 | 35 | class LlavaQwenMoeConfig(Qwen2MoeConfig): 36 | model_type = "llava_qwen_moe" 37 | 38 | 39 | class LlavaQwenMoeModel(LlavaMetaModel, Qwen2MoeModel): 40 | config_class = LlavaQwenMoeConfig 41 | 42 | def __init__(self, config: Qwen2MoeConfig): 43 | super(LlavaQwenMoeModel, self).__init__(config) 44 | 45 | 46 | class LlavaQwenMoeForCausalLM(Qwen2MoeForCausalLM, LlavaMetaForCausalLM): 47 | config_class = LlavaQwenMoeConfig 48 | 49 | def __init__(self, config): 50 | # super(Qwen2MoeForCausalLM, self).__init__(config) 51 | Qwen2MoeForCausalLM.__init__(self, config) 52 | config.model_type = "llava_qwen_moe" 53 | config.rope_scaling = None 54 | 55 | self.model = LlavaQwenMoeModel(config) 56 | self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) 57 | # Initialize weights and apply final processing 58 | self.post_init() 59 | 60 | def get_model(self): 61 | return self.model 62 | 63 | def forward( 64 | self, 65 | input_ids: torch.LongTensor = None, 66 | attention_mask: Optional[torch.Tensor] = None, 67 | position_ids: Optional[torch.LongTensor] = None, 68 | past_key_values: Optional[List[torch.FloatTensor]] = None, 69 | inputs_embeds: Optional[torch.FloatTensor] = None, 70 | labels: Optional[torch.LongTensor] = None, 71 | use_cache: Optional[bool] = None, 72 | output_attentions: Optional[bool] = None, 73 | output_hidden_states: Optional[bool] = None, 74 | images: Optional[torch.FloatTensor] = None, 75 | image_sizes: Optional[List[List[int]]] = None, 76 | return_dict: Optional[bool] = None, 77 | modalities: Optional[List[str]] = ["image"], 78 | dpo_forward: Optional[bool] = False, 79 | cache_position=None, 80 | ) -> Union[Tuple, CausalLMOutputWithPast]: 81 | 82 | if inputs_embeds is None: 83 | (input_ids, position_ids, attention_mask, past_key_values, inputs_embeds, labels) = self.prepare_inputs_labels_for_multimodal(input_ids, position_ids, attention_mask, past_key_values, labels, images, modalities, image_sizes) 84 | 85 | if dpo_forward: 86 | outputs = self.model( 87 | input_ids=input_ids, 88 | attention_mask=attention_mask, 89 | position_ids=position_ids, 90 | past_key_values=past_key_values, 91 | inputs_embeds=inputs_embeds, 92 | use_cache=use_cache, 93 | output_attentions=output_attentions, 94 | output_hidden_states=output_hidden_states, 95 | return_dict=return_dict, 96 | ) 97 | 98 | hidden_states = outputs[0] 99 | logits = self.lm_head(hidden_states) 100 | return logits, labels 101 | 102 | else: 103 | return super().forward( 104 | input_ids=input_ids, 105 | attention_mask=attention_mask, 106 | position_ids=position_ids, 107 | past_key_values=past_key_values, 108 | inputs_embeds=inputs_embeds, 109 | labels=labels, 110 | use_cache=use_cache, 111 | 
output_attentions=output_attentions, 112 | output_hidden_states=output_hidden_states, 113 | return_dict=return_dict, 114 | ) 115 | 116 | @torch.no_grad() 117 | def generate( 118 | self, 119 | inputs: Optional[torch.Tensor] = None, 120 | images: Optional[torch.Tensor] = None, 121 | image_sizes: Optional[torch.Tensor] = None, 122 | modalities: Optional[List[str]] = ["image"], 123 | **kwargs, 124 | ) -> Union[GenerateOutput, torch.LongTensor]: 125 | position_ids = kwargs.pop("position_ids", None) 126 | attention_mask = kwargs.pop("attention_mask", None) 127 | if "inputs_embeds" in kwargs: 128 | raise NotImplementedError("`inputs_embeds` is not supported") 129 | 130 | if images is not None: 131 | (inputs, position_ids, attention_mask, _, inputs_embeds, _) = self.prepare_inputs_labels_for_multimodal(inputs, position_ids, attention_mask, None, None, images, modalities, image_sizes=image_sizes) 132 | else: 133 | inputs_embeds = self.get_model().embed_tokens(inputs) 134 | 135 | return super().generate(position_ids=position_ids, attention_mask=attention_mask, inputs_embeds=inputs_embeds, **kwargs) 136 | 137 | def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs): 138 | images = kwargs.pop("images", None) 139 | image_sizes = kwargs.pop("image_sizes", None) 140 | inputs = super().prepare_inputs_for_generation(input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs) 141 | if images is not None: 142 | inputs["images"] = images 143 | if image_sizes is not None: 144 | inputs["image_sizes"] = image_sizes 145 | return inputs 146 | 147 | 148 | AutoConfig.register("llava_qwen_moe", LlavaQwenMoeConfig) 149 | AutoModelForCausalLM.register(LlavaQwenMoeConfig, LlavaQwenMoeForCausalLM) 150 | -------------------------------------------------------------------------------- /llava/model/make_delta.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m llava.model.make_delta --base ~/model_weights/llama-7b --target ~/model_weights/llava-7b --delta ~/model_weights/llava-7b-delta --hub-repo-id liuhaotian/llava-7b-delta 4 | """ 5 | 6 | import argparse 7 | 8 | import torch 9 | from tqdm import tqdm 10 | from transformers import AutoTokenizer, AutoModelForCausalLM 11 | from llava.model.utils import auto_upgrade 12 | 13 | 14 | def make_delta(base_model_path, target_model_path, delta_path, hub_repo_id): 15 | print("Loading base model") 16 | base = AutoModelForCausalLM.from_pretrained(base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 17 | 18 | print("Loading target model") 19 | auto_upgrade(target_model_path) 20 | target = AutoModelForCausalLM.from_pretrained(target_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 21 | 22 | print("Calculating delta") 23 | for name, param in tqdm(target.state_dict().items(), desc="Calculating delta"): 24 | if name not in base.state_dict(): 25 | assert name in ["model.mm_projector.weight", "model.mm_projector.bias"], f"{name} not in base model" 26 | continue 27 | if param.data.shape == base.state_dict()[name].shape: 28 | param.data -= base.state_dict()[name] 29 | else: 30 | assert name in ["model.embed_tokens.weight", "lm_head.weight"], f"{name} dimension mismatch: {param.data.shape} vs {base.state_dict()[name].shape}" 31 | bparam = base.state_dict()[name] 32 | param.data[: bparam.shape[0], : bparam.shape[1]] -= bparam 33 | 34 | print("Saving delta") 35 | if hub_repo_id: 36 | kwargs = {"push_to_hub": True, "repo_id": 
hub_repo_id} 37 | else: 38 | kwargs = {} 39 | target.save_pretrained(delta_path, **kwargs) 40 | target_tokenizer = AutoTokenizer.from_pretrained(target_model_path) 41 | target_tokenizer.save_pretrained(delta_path, **kwargs) 42 | 43 | 44 | if __name__ == "__main__": 45 | parser = argparse.ArgumentParser() 46 | parser.add_argument("--base-model-path", type=str, required=True) 47 | parser.add_argument("--target-model-path", type=str, required=True) 48 | parser.add_argument("--delta-path", type=str, required=True) 49 | parser.add_argument("--hub-repo-id", type=str, default=None) 50 | args = parser.parse_args() 51 | 52 | make_delta(args.base_model_path, args.target_model_path, args.delta_path, args.hub_repo_id) 53 | -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/__pycache__/builder.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongyh20/Insight-V/5e9c6f934c1929e351dbff17a6c6c7bc2d8143b1/llava/model/multimodal_encoder/__pycache__/builder.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/__pycache__/clip_encoder.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongyh20/Insight-V/5e9c6f934c1929e351dbff17a6c6c7bc2d8143b1/llava/model/multimodal_encoder/__pycache__/clip_encoder.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/__pycache__/hf_vision.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongyh20/Insight-V/5e9c6f934c1929e351dbff17a6c6c7bc2d8143b1/llava/model/multimodal_encoder/__pycache__/hf_vision.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/__pycache__/imagebind.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongyh20/Insight-V/5e9c6f934c1929e351dbff17a6c6c7bc2d8143b1/llava/model/multimodal_encoder/__pycache__/imagebind.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/__pycache__/open_clip_encoder.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongyh20/Insight-V/5e9c6f934c1929e351dbff17a6c6c7bc2d8143b1/llava/model/multimodal_encoder/__pycache__/open_clip_encoder.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/__pycache__/siglip_encoder.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongyh20/Insight-V/5e9c6f934c1929e351dbff17a6c6c7bc2d8143b1/llava/model/multimodal_encoder/__pycache__/siglip_encoder.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/builder.py: -------------------------------------------------------------------------------- 1 | import os 2 | from .clip_encoder import CLIPVisionTower 3 | from .imagebind import ImageBindWrapper 4 | from .open_clip_encoder import OpenCLIPVisionTower 5 | from .hf_vision import HFVisionTower 6 | from 
.siglip_encoder import SigLipVisionTower 7 | from .clip_encoder import CLIPVisionTower, CLIPVisionTowerS2 8 | 9 | # from .eva_clip.eva_clip_encoder import EvaClipVisionTower 10 | # from .dev_eva_clip.eva_vit import EvaViTWrapper 11 | 12 | 13 | def build_vision_tower(vision_tower_cfg, **kwargs): 14 | vision_tower = getattr(vision_tower_cfg, "mm_vision_tower", getattr(vision_tower_cfg, "vision_tower", None)) 15 | is_absolute_path_exists = os.path.exists(vision_tower) 16 | use_s2 = getattr(vision_tower_cfg, "s2", False) 17 | if is_absolute_path_exists or vision_tower.startswith("openai") or vision_tower.startswith("laion") or "ShareGPT4V" in vision_tower: 18 | if use_s2: 19 | return CLIPVisionTowerS2(vision_tower, args=vision_tower_cfg, **kwargs) 20 | else: 21 | return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 22 | elif "siglip" in vision_tower: 23 | return SigLipVisionTower(vision_tower, vision_tower_cfg=vision_tower_cfg, **kwargs) 24 | elif vision_tower.startswith("hf:"): 25 | return HFVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 26 | elif vision_tower in ["imagebind_huge"]: 27 | return ImageBindWrapper(vision_tower, args=vision_tower_cfg, **kwargs) 28 | elif vision_tower.startswith("open_clip_hub"): 29 | return OpenCLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 30 | # elif "internal-eva" in vision_tower.lower() or "eva02" in vision_tower.lower(): 31 | # return EvaClipVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 32 | # elif vision_tower in ["EVA-CLIP-8B", "EVA-CLIP-8B-plus"]: 33 | # return EvaViTWrapper(vision_tower, args=vision_tower_cfg, **kwargs) 34 | 35 | raise ValueError(f"Unknown vision tower: {vision_tower}") 36 | -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/__init__.py: -------------------------------------------------------------------------------- 1 | from .constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD 2 | from .factory import create_model, create_model_and_transforms, create_model_from_pretrained, get_tokenizer 3 | from .factory import list_models, add_model_config, get_model_config, load_checkpoint 4 | from .loss import ClipLoss 5 | from .model import CLIP, CustomCLIP, CLIPTextCfg, CLIPVisionCfg, convert_weights_to_lp, convert_weights_to_fp16, trace_model, get_cast_dtype 6 | from .openai import load_openai_model, list_openai_models 7 | from .pretrained import list_pretrained, list_pretrained_models_by_tag, list_pretrained_tags_by_model, get_pretrained_url, download_pretrained_from_url, is_pretrained_cfg, get_pretrained_cfg, download_pretrained 8 | from .tokenizer import SimpleTokenizer, tokenize 9 | from .transform import image_transform 10 | -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongyh20/Insight-V/5e9c6f934c1929e351dbff17a6c6c7bc2d8143b1/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/constants.py: -------------------------------------------------------------------------------- 1 | OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073) 2 | OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711) 3 | 
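OPENAI_DATASET_MEAN and OPENAI_DATASET_STD above are the per-channel RGB statistics used for CLIP-style preprocessing; image_transform() in transform.py further down in this dump falls back to them whenever mean/std are not supplied. A minimal sketch of the equivalent manual normalization, assuming torchvision is available and using a random tensor as a stand-in for an RGB image already scaled to [0, 1]:

import torch
from torchvision.transforms import Normalize

OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073)
OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711)

normalize = Normalize(mean=OPENAI_DATASET_MEAN, std=OPENAI_DATASET_STD)

image = torch.rand(3, 224, 224)      # stand-in for an RGB image in [0, 1]
pixel_values = normalize(image)      # per-channel (x - mean) / std, as the CLIP towers expect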
-------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/hf_configs.py: -------------------------------------------------------------------------------- 1 | # HF architecture dict: 2 | arch_dict = { 3 | # https://huggingface.co/docs/transformers/model_doc/roberta#roberta 4 | "roberta": { 5 | "config_names": { 6 | "context_length": "max_position_embeddings", 7 | "vocab_size": "vocab_size", 8 | "width": "hidden_size", 9 | "heads": "num_attention_heads", 10 | "layers": "num_hidden_layers", 11 | "layer_attr": "layer", 12 | "token_embeddings_attr": "embeddings", 13 | }, 14 | "pooler": "mean_pooler", 15 | }, 16 | # https://huggingface.co/docs/transformers/model_doc/xlm-roberta#transformers.XLMRobertaConfig 17 | "xlm-roberta": { 18 | "config_names": { 19 | "context_length": "max_position_embeddings", 20 | "vocab_size": "vocab_size", 21 | "width": "hidden_size", 22 | "heads": "num_attention_heads", 23 | "layers": "num_hidden_layers", 24 | "layer_attr": "layer", 25 | "token_embeddings_attr": "embeddings", 26 | }, 27 | "pooler": "mean_pooler", 28 | }, 29 | # https://huggingface.co/docs/transformers/model_doc/mt5#mt5 30 | "mt5": { 31 | "config_names": { 32 | # unlimited seqlen 33 | # https://github.com/google-research/text-to-text-transfer-transformer/issues/273 34 | # https://github.com/huggingface/transformers/blob/v4.24.0/src/transformers/models/t5/modeling_t5.py#L374 35 | "context_length": "", 36 | "vocab_size": "vocab_size", 37 | "width": "d_model", 38 | "heads": "num_heads", 39 | "layers": "num_layers", 40 | "layer_attr": "block", 41 | "token_embeddings_attr": "embed_tokens", 42 | }, 43 | "pooler": "mean_pooler", 44 | }, 45 | "bert": { 46 | "config_names": { 47 | "context_length": "max_position_embeddings", 48 | "vocab_size": "vocab_size", 49 | "width": "hidden_size", 50 | "heads": "num_attention_heads", 51 | "layers": "num_hidden_layers", 52 | "layer_attr": "layer", 53 | "token_embeddings_attr": "embeddings", 54 | }, 55 | "pooler": "mean_pooler", 56 | }, 57 | } 58 | -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/loss.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | from torch.nn import functional as F 5 | 6 | try: 7 | import torch.distributed.nn 8 | from torch import distributed as dist 9 | 10 | has_distributed = True 11 | except ImportError: 12 | has_distributed = False 13 | 14 | try: 15 | import horovod.torch as hvd 16 | except ImportError: 17 | hvd = None 18 | 19 | from timm.loss import LabelSmoothingCrossEntropy 20 | 21 | 22 | def gather_features(image_features, text_features, local_loss=False, gather_with_grad=False, rank=0, world_size=1, use_horovod=False): 23 | assert has_distributed, "torch.distributed did not import correctly, please use a PyTorch version with support." 
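# gather_features() collects the image and text embeddings from every rank so that
# ClipLoss can score the contrastive objective over the global batch: with Horovod it
# uses hvd.allgather, otherwise torch.distributed; gather_with_grad switches to the
# differentiable torch.distributed.nn.all_gather, and in the non-differentiable path,
# when local_loss is False, the local rank's tensors are spliced back into the gathered
# buffers so gradients still flow for the local shard. ClipLoss then pairs row i with
# column i of the similarity matrix (condensed from ClipLoss.forward below):
#
#     labels = torch.arange(num_logits, device=device)
#     if local_loss and world_size > 1:
#         labels = labels + num_logits * rank      # offset into the global similarity matrix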
24 | if use_horovod: 25 | assert hvd is not None, "Please install horovod" 26 | if gather_with_grad: 27 | all_image_features = hvd.allgather(image_features) 28 | all_text_features = hvd.allgather(text_features) 29 | else: 30 | with torch.no_grad(): 31 | all_image_features = hvd.allgather(image_features) 32 | all_text_features = hvd.allgather(text_features) 33 | if not local_loss: 34 | # ensure grads for local rank when all_* features don't have a gradient 35 | gathered_image_features = list(all_image_features.chunk(world_size, dim=0)) 36 | gathered_text_features = list(all_text_features.chunk(world_size, dim=0)) 37 | gathered_image_features[rank] = image_features 38 | gathered_text_features[rank] = text_features 39 | all_image_features = torch.cat(gathered_image_features, dim=0) 40 | all_text_features = torch.cat(gathered_text_features, dim=0) 41 | else: 42 | # We gather tensors from all gpus 43 | if gather_with_grad: 44 | all_image_features = torch.cat(torch.distributed.nn.all_gather(image_features), dim=0) 45 | all_text_features = torch.cat(torch.distributed.nn.all_gather(text_features), dim=0) 46 | # all_image_features = torch.cat(torch.distributed.nn.all_gather(image_features, async_op=True), dim=0) 47 | # all_text_features = torch.cat(torch.distributed.nn.all_gather(text_features, async_op=True), dim=0) 48 | else: 49 | gathered_image_features = [torch.zeros_like(image_features) for _ in range(world_size)] 50 | gathered_text_features = [torch.zeros_like(text_features) for _ in range(world_size)] 51 | dist.all_gather(gathered_image_features, image_features) 52 | dist.all_gather(gathered_text_features, text_features) 53 | if not local_loss: 54 | # ensure grads for local rank when all_* features don't have a gradient 55 | gathered_image_features[rank] = image_features 56 | gathered_text_features[rank] = text_features 57 | all_image_features = torch.cat(gathered_image_features, dim=0) 58 | all_text_features = torch.cat(gathered_text_features, dim=0) 59 | 60 | return all_image_features, all_text_features 61 | 62 | 63 | class ClipLoss(nn.Module): 64 | 65 | def __init__( 66 | self, 67 | local_loss=False, 68 | gather_with_grad=False, 69 | cache_labels=False, 70 | rank=0, 71 | world_size=1, 72 | use_horovod=False, 73 | smoothing=0.0, 74 | ): 75 | super().__init__() 76 | self.local_loss = local_loss 77 | self.gather_with_grad = gather_with_grad 78 | self.cache_labels = cache_labels 79 | self.rank = rank 80 | self.world_size = world_size 81 | self.use_horovod = use_horovod 82 | self.label_smoothing_cross_entropy = LabelSmoothingCrossEntropy(smoothing=smoothing) if smoothing > 0 else None 83 | 84 | # cache state 85 | self.prev_num_logits = 0 86 | self.labels = {} 87 | 88 | def forward(self, image_features, text_features, logit_scale=1.0): 89 | device = image_features.device 90 | if self.world_size > 1: 91 | all_image_features, all_text_features = gather_features(image_features, text_features, self.local_loss, self.gather_with_grad, self.rank, self.world_size, self.use_horovod) 92 | 93 | if self.local_loss: 94 | logits_per_image = logit_scale * image_features @ all_text_features.T 95 | logits_per_text = logit_scale * text_features @ all_image_features.T 96 | else: 97 | logits_per_image = logit_scale * all_image_features @ all_text_features.T 98 | logits_per_text = logits_per_image.T 99 | else: 100 | logits_per_image = logit_scale * image_features @ text_features.T 101 | logits_per_text = logit_scale * text_features @ image_features.T 102 | # calculated ground-truth and cache if enabled 103 | 
num_logits = logits_per_image.shape[0] 104 | if self.prev_num_logits != num_logits or device not in self.labels: 105 | labels = torch.arange(num_logits, device=device, dtype=torch.long) 106 | if self.world_size > 1 and self.local_loss: 107 | labels = labels + num_logits * self.rank 108 | if self.cache_labels: 109 | self.labels[device] = labels 110 | self.prev_num_logits = num_logits 111 | else: 112 | labels = self.labels[device] 113 | 114 | if self.label_smoothing_cross_entropy: 115 | total_loss = (self.label_smoothing_cross_entropy(logits_per_image, labels) + self.label_smoothing_cross_entropy(logits_per_text, labels)) / 2 116 | else: 117 | total_loss = (F.cross_entropy(logits_per_image, labels) + F.cross_entropy(logits_per_text, labels)) / 2 118 | 119 | acc = None 120 | i2t_acc = (logits_per_image.argmax(-1) == labels).sum() / len(logits_per_image) 121 | t2i_acc = (logits_per_text.argmax(-1) == labels).sum() / len(logits_per_text) 122 | acc = {"i2t": i2t_acc, "t2i": t2i_acc} 123 | return total_loss, acc 124 | -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA-CLIP-18B.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1536, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 48, 6 | "width": 5120, 7 | "head_width": 128, 8 | "mlp_ratio": 5, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-18b-14-x", 11 | "drop_path_rate": 0, 12 | "qkv_bias": false, 13 | "xattn": true, 14 | "postnorm": true, 15 | "fusedLN": false, 16 | "use_rms_norm": true 17 | }, 18 | "text_cfg": { 19 | "context_length": 77, 20 | "vocab_size": 49408, 21 | "width": 1280, 22 | "heads": 20, 23 | "layers": 32, 24 | "xattn": false, 25 | "fusedLN": false 26 | } 27 | } -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA-CLIP-8B-plus.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1280, 3 | "vision_cfg": { 4 | "image_size": 448, 5 | "layers": 32, 6 | "width": 4096, 7 | "head_width": 128, 8 | "mlp_ratio": 5, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-8b-14-plus-x", 11 | "drop_path_rate": 0, 12 | "qkv_bias": false, 13 | "xattn": true, 14 | "postnorm": false, 15 | "fusedLN": false, 16 | "use_rms_norm": true 17 | }, 18 | "text_cfg": { 19 | "context_length": 77, 20 | "vocab_size": 49408, 21 | "width": 1280, 22 | "heads": 20, 23 | "layers": 32, 24 | "xattn": false, 25 | "fusedLN": false 26 | } 27 | } -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA-CLIP-8B.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1280, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 4096, 7 | "head_width": 128, 8 | "mlp_ratio": 5, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-8b-14-x", 11 | "drop_path_rate": 0, 12 | "qkv_bias": false, 13 | "xattn": true, 14 | "postnorm": false, 15 | "fusedLN": false, 16 | "use_rms_norm": true 17 | }, 18 | "text_cfg": { 19 | "context_length": 77, 20 | "vocab_size": 49408, 21 | "width": 1280, 22 | "heads": 20, 23 | "layers": 32, 24 | "xattn": false, 25 | "fusedLN": false 26 | } 27 | } -------------------------------------------------------------------------------- 
/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA01-CLIP-B-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 16, 8 | "eva_model_name": "eva-clip-b-16", 9 | "ls_init_value": 0.1, 10 | "drop_path_rate": 0.0 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 512, 16 | "heads": 8, 17 | "layers": 12 18 | } 19 | } -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA01-CLIP-g-14-plus.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 40, 6 | "width": 1408, 7 | "head_width": 88, 8 | "mlp_ratio": 4.3637, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-g-14-x", 11 | "drop_path_rate": 0, 12 | "xattn": true, 13 | "fusedLN": true 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 1024, 19 | "heads": 16, 20 | "layers": 24, 21 | "xattn": false, 22 | "fusedLN": true 23 | } 24 | } -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA01-CLIP-g-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 40, 6 | "width": 1408, 7 | "head_width": 88, 8 | "mlp_ratio": 4.3637, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-g-14-x", 11 | "drop_path_rate": 0.4, 12 | "xattn": true, 13 | "fusedLN": true 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 768, 19 | "heads": 12, 20 | "layers": 12, 21 | "xattn": false, 22 | "fusedLN": true 23 | } 24 | } -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA02-CLIP-B-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "head_width": 64, 8 | "patch_size": 16, 9 | "mlp_ratio": 2.6667, 10 | "eva_model_name": "eva-clip-b-16-X", 11 | "drop_path_rate": 0.0, 12 | "xattn": true, 13 | "fusedLN": true, 14 | "rope": true, 15 | "pt_hw_seq_len": 16, 16 | "intp_freq": true, 17 | "naiveswiglu": true, 18 | "subln": true 19 | }, 20 | "text_cfg": { 21 | "context_length": 77, 22 | "vocab_size": 49408, 23 | "width": 512, 24 | "heads": 8, 25 | "layers": 12, 26 | "xattn": true, 27 | "fusedLN": true 28 | } 29 | } -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA02-CLIP-L-14-336.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 336, 5 | "layers": 24, 6 | "width": 1024, 7 | "drop_path_rate": 0, 8 | "head_width": 64, 9 | "mlp_ratio": 2.6667, 10 | "patch_size": 14, 11 | "eva_model_name": "eva-clip-l-14-336", 12 | "xattn": true, 13 | "fusedLN": true, 14 | "rope": true, 15 | "pt_hw_seq_len": 16, 16 | "intp_freq": true, 17 | "naiveswiglu": true, 18 | "subln": true 19 | }, 20 | "text_cfg": { 21 | 
"context_length": 77, 22 | "vocab_size": 49408, 23 | "width": 768, 24 | "heads": 12, 25 | "layers": 12, 26 | "xattn": false, 27 | "fusedLN": true 28 | } 29 | } -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA02-CLIP-L-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | "drop_path_rate": 0, 8 | "head_width": 64, 9 | "mlp_ratio": 2.6667, 10 | "patch_size": 14, 11 | "eva_model_name": "eva-clip-l-14", 12 | "xattn": true, 13 | "fusedLN": true, 14 | "rope": true, 15 | "pt_hw_seq_len": 16, 16 | "intp_freq": true, 17 | "naiveswiglu": true, 18 | "subln": true 19 | }, 20 | "text_cfg": { 21 | "context_length": 77, 22 | "vocab_size": 49408, 23 | "width": 768, 24 | "heads": 12, 25 | "layers": 12, 26 | "xattn": false, 27 | "fusedLN": true 28 | } 29 | } -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA02-CLIP-bigE-14-plus.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 64, 6 | "width": 1792, 7 | "head_width": 112, 8 | "mlp_ratio": 8.571428571428571, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-4b-14-x", 11 | "drop_path_rate": 0, 12 | "xattn": true, 13 | "postnorm": true, 14 | "fusedLN": true 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 1280, 20 | "heads": 20, 21 | "layers": 32, 22 | "xattn": false, 23 | "fusedLN": true 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA02-CLIP-bigE-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 64, 6 | "width": 1792, 7 | "head_width": 112, 8 | "mlp_ratio": 8.571428571428571, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-4b-14-x", 11 | "drop_path_rate": 0, 12 | "xattn": true, 13 | "postnorm": true, 14 | "fusedLN": true 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 1024, 20 | "heads": 16, 21 | "layers": 24, 22 | "xattn": false, 23 | "fusedLN": true 24 | } 25 | } -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/Internal-EVA02-CLIP-10B-14-448.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 448, 5 | "layers": 77, 6 | "width": 2304, 7 | "head_width": 144, 8 | "mlp_ratio": 10.9722, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-10b-14-x", 11 | "drop_path_rate": 0, 12 | "xattn": true, 13 | "postnorm": false, 14 | "fusedLN": true 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 1280, 20 | "heads": 20, 21 | "layers": 32, 22 | "xattn": false, 23 | "fusedLN": true 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/Internal-EVA02-CLIP-10B-14.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 77, 6 | "width": 2304, 7 | "head_width": 144, 8 | "mlp_ratio": 10.9722, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-10b-14-x", 11 | "drop_path_rate": 0, 12 | "xattn": true, 13 | "postnorm": false, 14 | "fusedLN": true 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 1280, 20 | "heads": 20, 21 | "layers": 32, 22 | "xattn": false, 23 | "fusedLN": true 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/modified_resnet.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | import torch 4 | from torch import nn 5 | from torch.nn import functional as F 6 | 7 | from .utils import freeze_batch_norm_2d 8 | 9 | 10 | class Bottleneck(nn.Module): 11 | expansion = 4 12 | 13 | def __init__(self, inplanes, planes, stride=1): 14 | super().__init__() 15 | 16 | # all conv layers have stride 1. an avgpool is performed after the second convolution when stride > 1 17 | self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False) 18 | self.bn1 = nn.BatchNorm2d(planes) 19 | self.act1 = nn.ReLU(inplace=True) 20 | 21 | self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias=False) 22 | self.bn2 = nn.BatchNorm2d(planes) 23 | self.act2 = nn.ReLU(inplace=True) 24 | 25 | self.avgpool = nn.AvgPool2d(stride) if stride > 1 else nn.Identity() 26 | 27 | self.conv3 = nn.Conv2d(planes, planes * self.expansion, 1, bias=False) 28 | self.bn3 = nn.BatchNorm2d(planes * self.expansion) 29 | self.act3 = nn.ReLU(inplace=True) 30 | 31 | self.downsample = None 32 | self.stride = stride 33 | 34 | if stride > 1 or inplanes != planes * Bottleneck.expansion: 35 | # downsampling layer is prepended with an avgpool, and the subsequent convolution has stride 1 36 | self.downsample = nn.Sequential(OrderedDict([("-1", nn.AvgPool2d(stride)), ("0", nn.Conv2d(inplanes, planes * self.expansion, 1, stride=1, bias=False)), ("1", nn.BatchNorm2d(planes * self.expansion))])) 37 | 38 | def forward(self, x: torch.Tensor): 39 | identity = x 40 | 41 | out = self.act1(self.bn1(self.conv1(x))) 42 | out = self.act2(self.bn2(self.conv2(out))) 43 | out = self.avgpool(out) 44 | out = self.bn3(self.conv3(out)) 45 | 46 | if self.downsample is not None: 47 | identity = self.downsample(x) 48 | 49 | out += identity 50 | out = self.act3(out) 51 | return out 52 | 53 | 54 | class AttentionPool2d(nn.Module): 55 | def __init__(self, spacial_dim: int, embed_dim: int, num_heads: int, output_dim: int = None): 56 | super().__init__() 57 | self.positional_embedding = nn.Parameter(torch.randn(spacial_dim**2 + 1, embed_dim) / embed_dim**0.5) 58 | self.k_proj = nn.Linear(embed_dim, embed_dim) 59 | self.q_proj = nn.Linear(embed_dim, embed_dim) 60 | self.v_proj = nn.Linear(embed_dim, embed_dim) 61 | self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim) 62 | self.num_heads = num_heads 63 | 64 | def forward(self, x): 65 | x = x.reshape(x.shape[0], x.shape[1], x.shape[2] * x.shape[3]).permute(2, 0, 1) # NCHW -> (HW)NC 66 | x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0) # (HW+1)NC 67 | x = x + self.positional_embedding[:, None, :].to(x.dtype) # (HW+1)NC 68 | x, _ = F.multi_head_attention_forward( 69 | query=x, 70 | key=x, 71 | value=x, 72 | embed_dim_to_check=x.shape[-1], 73 | 
num_heads=self.num_heads, 74 | q_proj_weight=self.q_proj.weight, 75 | k_proj_weight=self.k_proj.weight, 76 | v_proj_weight=self.v_proj.weight, 77 | in_proj_weight=None, 78 | in_proj_bias=torch.cat([self.q_proj.bias, self.k_proj.bias, self.v_proj.bias]), 79 | bias_k=None, 80 | bias_v=None, 81 | add_zero_attn=False, 82 | dropout_p=0.0, 83 | out_proj_weight=self.c_proj.weight, 84 | out_proj_bias=self.c_proj.bias, 85 | use_separate_proj_weight=True, 86 | training=self.training, 87 | need_weights=False, 88 | ) 89 | 90 | return x[0] 91 | 92 | 93 | class ModifiedResNet(nn.Module): 94 | """ 95 | A ResNet class that is similar to torchvision's but contains the following changes: 96 | - There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool. 97 | - Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1 98 | - The final pooling layer is a QKV attention instead of an average pool 99 | """ 100 | 101 | def __init__(self, layers, output_dim, heads, image_size=224, width=64): 102 | super().__init__() 103 | self.output_dim = output_dim 104 | self.image_size = image_size 105 | 106 | # the 3-layer stem 107 | self.conv1 = nn.Conv2d(3, width // 2, kernel_size=3, stride=2, padding=1, bias=False) 108 | self.bn1 = nn.BatchNorm2d(width // 2) 109 | self.act1 = nn.ReLU(inplace=True) 110 | self.conv2 = nn.Conv2d(width // 2, width // 2, kernel_size=3, padding=1, bias=False) 111 | self.bn2 = nn.BatchNorm2d(width // 2) 112 | self.act2 = nn.ReLU(inplace=True) 113 | self.conv3 = nn.Conv2d(width // 2, width, kernel_size=3, padding=1, bias=False) 114 | self.bn3 = nn.BatchNorm2d(width) 115 | self.act3 = nn.ReLU(inplace=True) 116 | self.avgpool = nn.AvgPool2d(2) 117 | 118 | # residual layers 119 | self._inplanes = width # this is a *mutable* variable used during construction 120 | self.layer1 = self._make_layer(width, layers[0]) 121 | self.layer2 = self._make_layer(width * 2, layers[1], stride=2) 122 | self.layer3 = self._make_layer(width * 4, layers[2], stride=2) 123 | self.layer4 = self._make_layer(width * 8, layers[3], stride=2) 124 | 125 | embed_dim = width * 32 # the ResNet feature dimension 126 | self.attnpool = AttentionPool2d(image_size // 32, embed_dim, heads, output_dim) 127 | 128 | self.init_parameters() 129 | 130 | def _make_layer(self, planes, blocks, stride=1): 131 | layers = [Bottleneck(self._inplanes, planes, stride)] 132 | 133 | self._inplanes = planes * Bottleneck.expansion 134 | for _ in range(1, blocks): 135 | layers.append(Bottleneck(self._inplanes, planes)) 136 | 137 | return nn.Sequential(*layers) 138 | 139 | def init_parameters(self): 140 | if self.attnpool is not None: 141 | std = self.attnpool.c_proj.in_features**-0.5 142 | nn.init.normal_(self.attnpool.q_proj.weight, std=std) 143 | nn.init.normal_(self.attnpool.k_proj.weight, std=std) 144 | nn.init.normal_(self.attnpool.v_proj.weight, std=std) 145 | nn.init.normal_(self.attnpool.c_proj.weight, std=std) 146 | 147 | for resnet_block in [self.layer1, self.layer2, self.layer3, self.layer4]: 148 | for name, param in resnet_block.named_parameters(): 149 | if name.endswith("bn3.weight"): 150 | nn.init.zeros_(param) 151 | 152 | def lock(self, unlocked_groups=0, freeze_bn_stats=False): 153 | assert unlocked_groups == 0, "partial locking not currently supported for this model" 154 | for param in self.parameters(): 155 | param.requires_grad = False 156 | if freeze_bn_stats: 157 | freeze_batch_norm_2d(self) 158 | 159 | @torch.jit.ignore 160 | def 
set_grad_checkpointing(self, enable=True): 161 | # FIXME support for non-transformer 162 | pass 163 | 164 | def stem(self, x): 165 | x = self.act1(self.bn1(self.conv1(x))) 166 | x = self.act2(self.bn2(self.conv2(x))) 167 | x = self.act3(self.bn3(self.conv3(x))) 168 | x = self.avgpool(x) 169 | return x 170 | 171 | def forward(self, x): 172 | x = self.stem(x) 173 | x = self.layer1(x) 174 | x = self.layer2(x) 175 | x = self.layer3(x) 176 | x = self.layer4(x) 177 | x = self.attnpool(x) 178 | 179 | return x 180 | -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/openai.py: -------------------------------------------------------------------------------- 1 | """ OpenAI pretrained model functions 2 | 3 | Adapted from https://github.com/openai/CLIP. Originally MIT License, Copyright (c) 2021 OpenAI. 4 | """ 5 | 6 | import os 7 | import warnings 8 | from typing import List, Optional, Union 9 | 10 | import torch 11 | 12 | from .model import build_model_from_openai_state_dict, convert_weights_to_lp, get_cast_dtype 13 | from .pretrained import get_pretrained_url, list_pretrained_models_by_tag, download_pretrained_from_url 14 | 15 | __all__ = ["list_openai_models", "load_openai_model"] 16 | 17 | 18 | def list_openai_models() -> List[str]: 19 | """Returns the names of available CLIP models""" 20 | return list_pretrained_models_by_tag("openai") 21 | 22 | 23 | def load_openai_model( 24 | name: str, 25 | precision: Optional[str] = None, 26 | device: Optional[Union[str, torch.device]] = None, 27 | jit: bool = True, 28 | cache_dir: Optional[str] = None, 29 | ): 30 | """Load a CLIP model 31 | 32 | Parameters 33 | ---------- 34 | name : str 35 | A model name listed by `clip.available_models()`, or the path to a model checkpoint containing the state_dict 36 | precision: str 37 | Model precision, if None defaults to 'fp32' if device == 'cpu' else 'fp16'. 38 | device : Union[str, torch.device] 39 | The device to put the loaded model 40 | jit : bool 41 | Whether to load the optimized JIT model (default) or more hackable non-JIT model. 42 | cache_dir : Optional[str] 43 | The directory to cache the downloaded model weights 44 | 45 | Returns 46 | ------- 47 | model : torch.nn.Module 48 | The CLIP model 49 | preprocess : Callable[[PIL.Image], torch.Tensor] 50 | A torchvision transform that converts a PIL image into a tensor that the returned model can take as its input 51 | """ 52 | if device is None: 53 | device = "cuda" if torch.cuda.is_available() else "cpu" 54 | if precision is None: 55 | precision = "fp32" if device == "cpu" else "fp16" 56 | 57 | if get_pretrained_url(name, "openai"): 58 | model_path = download_pretrained_from_url(get_pretrained_url(name, "openai"), cache_dir=cache_dir) 59 | elif os.path.isfile(name): 60 | model_path = name 61 | else: 62 | raise RuntimeError(f"Model {name} not found; available models = {list_openai_models()}") 63 | 64 | try: 65 | # loading JIT archive 66 | model = torch.jit.load(model_path, map_location=device if jit else "cpu").eval() 67 | state_dict = None 68 | except RuntimeError: 69 | # loading saved state dict 70 | if jit: 71 | warnings.warn(f"File {model_path} is not a JIT archive. 
Loading as a state dict instead") 72 | jit = False 73 | state_dict = torch.load(model_path, map_location="cpu") 74 | 75 | if not jit: 76 | # Build a non-jit model from the OpenAI jitted model state dict 77 | cast_dtype = get_cast_dtype(precision) 78 | try: 79 | model = build_model_from_openai_state_dict(state_dict or model.state_dict(), cast_dtype=cast_dtype) 80 | except KeyError: 81 | sd = {k[7:]: v for k, v in state_dict["state_dict"].items()} 82 | model = build_model_from_openai_state_dict(sd, cast_dtype=cast_dtype) 83 | 84 | # model from OpenAI state dict is in manually cast fp16 mode, must be converted for AMP/fp32/bf16 use 85 | model = model.to(device) 86 | if precision.startswith("amp") or precision == "fp32": 87 | model.float() 88 | elif precision == "bf16": 89 | convert_weights_to_lp(model, dtype=torch.bfloat16) 90 | 91 | return model 92 | 93 | # patch the device names 94 | device_holder = torch.jit.trace(lambda: torch.ones([]).to(torch.device(device)), example_inputs=[]) 95 | device_node = [n for n in device_holder.graph.findAllNodes("prim::Constant") if "Device" in repr(n)][-1] 96 | 97 | def patch_device(module): 98 | try: 99 | graphs = [module.graph] if hasattr(module, "graph") else [] 100 | except RuntimeError: 101 | graphs = [] 102 | 103 | if hasattr(module, "forward1"): 104 | graphs.append(module.forward1.graph) 105 | 106 | for graph in graphs: 107 | for node in graph.findAllNodes("prim::Constant"): 108 | if "value" in node.attributeNames() and str(node["value"]).startswith("cuda"): 109 | node.copyAttributes(device_node) 110 | 111 | model.apply(patch_device) 112 | patch_device(model.encode_image) 113 | patch_device(model.encode_text) 114 | 115 | # patch dtype to float32 (typically for CPU) 116 | if precision == "fp32": 117 | float_holder = torch.jit.trace(lambda: torch.ones([]).float(), example_inputs=[]) 118 | float_input = list(float_holder.graph.findNode("aten::to").inputs())[1] 119 | float_node = float_input.node() 120 | 121 | def patch_float(module): 122 | try: 123 | graphs = [module.graph] if hasattr(module, "graph") else [] 124 | except RuntimeError: 125 | graphs = [] 126 | 127 | if hasattr(module, "forward1"): 128 | graphs.append(module.forward1.graph) 129 | 130 | for graph in graphs: 131 | for node in graph.findAllNodes("aten::to"): 132 | inputs = list(node.inputs()) 133 | for i in [1, 2]: # dtype can be the second or third argument to aten::to() 134 | if inputs[i].node()["value"] == 5: 135 | inputs[i].node().copyAttributes(float_node) 136 | 137 | model.apply(patch_float) 138 | patch_float(model.encode_image) 139 | patch_float(model.encode_text) 140 | model.float() 141 | 142 | # ensure image_size attr available at consistent location for both jit and non-jit 143 | model.visual.image_size = model.input_resolution.item() 144 | return model 145 | -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/rope.py: -------------------------------------------------------------------------------- 1 | from math import pi 2 | import torch 3 | from torch import nn 4 | from einops import rearrange, repeat 5 | import logging 6 | 7 | 8 | def broadcat(tensors, dim=-1): 9 | num_tensors = len(tensors) 10 | shape_lens = set(list(map(lambda t: len(t.shape), tensors))) 11 | assert len(shape_lens) == 1, "tensors must all have the same number of dimensions" 12 | shape_len = list(shape_lens)[0] 13 | dim = (dim + shape_len) if dim < 0 else dim 14 | dims = list(zip(*map(lambda t: list(t.shape), tensors))) 15 | 
expandable_dims = [(i, val) for i, val in enumerate(dims) if i != dim] 16 | assert all([*map(lambda t: len(set(t[1])) <= 2, expandable_dims)]), "invalid dimensions for broadcastable concatentation" 17 | max_dims = list(map(lambda t: (t[0], max(t[1])), expandable_dims)) 18 | expanded_dims = list(map(lambda t: (t[0], (t[1],) * num_tensors), max_dims)) 19 | expanded_dims.insert(dim, (dim, dims[dim])) 20 | expandable_shapes = list(zip(*map(lambda t: t[1], expanded_dims))) 21 | tensors = list(map(lambda t: t[0].expand(*t[1]), zip(tensors, expandable_shapes))) 22 | return torch.cat(tensors, dim=dim) 23 | 24 | 25 | def rotate_half(x): 26 | x = rearrange(x, "... (d r) -> ... d r", r=2) 27 | x1, x2 = x.unbind(dim=-1) 28 | x = torch.stack((-x2, x1), dim=-1) 29 | return rearrange(x, "... d r -> ... (d r)") 30 | 31 | 32 | class VisionRotaryEmbedding(nn.Module): 33 | def __init__( 34 | self, 35 | dim, 36 | pt_seq_len, 37 | ft_seq_len=None, 38 | custom_freqs=None, 39 | freqs_for="lang", 40 | theta=10000, 41 | max_freq=10, 42 | num_freqs=1, 43 | ): 44 | super().__init__() 45 | if custom_freqs: 46 | freqs = custom_freqs 47 | elif freqs_for == "lang": 48 | freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim)) 49 | elif freqs_for == "pixel": 50 | freqs = torch.linspace(1.0, max_freq / 2, dim // 2) * pi 51 | elif freqs_for == "constant": 52 | freqs = torch.ones(num_freqs).float() 53 | else: 54 | raise ValueError(f"unknown modality {freqs_for}") 55 | 56 | if ft_seq_len is None: 57 | ft_seq_len = pt_seq_len 58 | t = torch.arange(ft_seq_len) / ft_seq_len * pt_seq_len 59 | 60 | freqs_h = torch.einsum("..., f -> ... f", t, freqs) 61 | freqs_h = repeat(freqs_h, "... n -> ... (n r)", r=2) 62 | 63 | freqs_w = torch.einsum("..., f -> ... f", t, freqs) 64 | freqs_w = repeat(freqs_w, "... n -> ... (n r)", r=2) 65 | 66 | freqs = broadcat((freqs_h[:, None, :], freqs_w[None, :, :]), dim=-1) 67 | 68 | self.register_buffer("freqs_cos", freqs.cos()) 69 | self.register_buffer("freqs_sin", freqs.sin()) 70 | 71 | logging.info(f"Shape of rope freq: {self.freqs_cos.shape}") 72 | 73 | def forward(self, t, start_index=0): 74 | rot_dim = self.freqs_cos.shape[-1] 75 | end_index = start_index + rot_dim 76 | assert rot_dim <= t.shape[-1], f"feature dimension {t.shape[-1]} is not of sufficient size to rotate in all the positions {rot_dim}" 77 | t_left, t, t_right = t[..., :start_index], t[..., start_index:end_index], t[..., end_index:] 78 | t = (t * self.freqs_cos) + (rotate_half(t) * self.freqs_sin) 79 | 80 | return torch.cat((t_left, t, t_right), dim=-1) 81 | 82 | 83 | class VisionRotaryEmbeddingFast(nn.Module): 84 | def __init__(self, dim, pt_seq_len, ft_seq_len=None, custom_freqs=None, freqs_for="lang", theta=10000, max_freq=10, num_freqs=1, patch_dropout=0.0): 85 | super().__init__() 86 | if custom_freqs: 87 | freqs = custom_freqs 88 | elif freqs_for == "lang": 89 | freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim)) 90 | elif freqs_for == "pixel": 91 | freqs = torch.linspace(1.0, max_freq / 2, dim // 2) * pi 92 | elif freqs_for == "constant": 93 | freqs = torch.ones(num_freqs).float() 94 | else: 95 | raise ValueError(f"unknown modality {freqs_for}") 96 | 97 | if ft_seq_len is None: 98 | ft_seq_len = pt_seq_len 99 | t = torch.arange(ft_seq_len) / ft_seq_len * pt_seq_len 100 | 101 | freqs = torch.einsum("..., f -> ... f", t, freqs) 102 | freqs = repeat(freqs, "... n -> ... 
(n r)", r=2) 103 | freqs = broadcat((freqs[:, None, :], freqs[None, :, :]), dim=-1) 104 | 105 | freqs_cos = freqs.cos().view(-1, freqs.shape[-1]) 106 | freqs_sin = freqs.sin().view(-1, freqs.shape[-1]) 107 | 108 | self.patch_dropout = patch_dropout 109 | 110 | self.register_buffer("freqs_cos", freqs_cos) 111 | self.register_buffer("freqs_sin", freqs_sin) 112 | 113 | logging.info(f"Shape of rope freq: {self.freqs_cos.shape}") 114 | 115 | def forward(self, t, patch_indices_keep=None): 116 | if patch_indices_keep is not None: 117 | batch = t.size()[0] 118 | batch_indices = torch.arange(batch) 119 | batch_indices = batch_indices[..., None] 120 | 121 | freqs_cos = repeat(self.freqs_cos, "i j -> n i m j", n=t.shape[0], m=t.shape[1]) 122 | freqs_sin = repeat(self.freqs_sin, "i j -> n i m j", n=t.shape[0], m=t.shape[1]) 123 | 124 | freqs_cos = freqs_cos[batch_indices, patch_indices_keep] 125 | freqs_cos = rearrange(freqs_cos, "n i m j -> n m i j") 126 | freqs_sin = freqs_sin[batch_indices, patch_indices_keep] 127 | freqs_sin = rearrange(freqs_sin, "n i m j -> n m i j") 128 | 129 | return t * freqs_cos + rotate_half(t) * freqs_sin 130 | 131 | return t * self.freqs_cos + rotate_half(t) * self.freqs_sin 132 | -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/timm_model.py: -------------------------------------------------------------------------------- 1 | """ timm model adapter 2 | 3 | Wraps timm (https://github.com/rwightman/pytorch-image-models) models for use as a vision tower in CLIP model. 4 | """ 5 | 6 | import logging 7 | from collections import OrderedDict 8 | 9 | import torch 10 | import torch.nn as nn 11 | 12 | try: 13 | import timm 14 | from timm.models.layers import Mlp, to_2tuple 15 | 16 | try: 17 | # old timm imports < 0.8.1 18 | from timm.models.layers.attention_pool2d import RotAttentionPool2d 19 | from timm.models.layers.attention_pool2d import AttentionPool2d as AbsAttentionPool2d 20 | except ImportError: 21 | # new timm imports >= 0.8.1 22 | from timm.layers import RotAttentionPool2d 23 | from timm.layers import AttentionPool2d as AbsAttentionPool2d 24 | except ImportError: 25 | timm = None 26 | 27 | from .utils import freeze_batch_norm_2d 28 | 29 | 30 | class TimmModel(nn.Module): 31 | """timm model adapter 32 | # FIXME this adapter is a work in progress, may change in ways that break weight compat 33 | """ 34 | 35 | def __init__(self, model_name, embed_dim, image_size=224, pool="avg", proj="linear", proj_bias=False, drop=0.0, pretrained=False): 36 | super().__init__() 37 | if timm is None: 38 | raise RuntimeError("Please `pip install timm` to use timm models.") 39 | 40 | self.image_size = to_2tuple(image_size) 41 | self.trunk = timm.create_model(model_name, pretrained=pretrained) 42 | feat_size = self.trunk.default_cfg.get("pool_size", None) 43 | feature_ndim = 1 if not feat_size else 2 44 | if pool in ("abs_attn", "rot_attn"): 45 | assert feature_ndim == 2 46 | # if attn pooling used, remove both classifier and default pool 47 | self.trunk.reset_classifier(0, global_pool="") 48 | else: 49 | # reset global pool if pool config set, otherwise leave as network default 50 | reset_kwargs = dict(global_pool=pool) if pool else {} 51 | self.trunk.reset_classifier(0, **reset_kwargs) 52 | prev_chs = self.trunk.num_features 53 | 54 | head_layers = OrderedDict() 55 | if pool == "abs_attn": 56 | head_layers["pool"] = AbsAttentionPool2d(prev_chs, feat_size=feat_size, out_features=embed_dim) 57 | prev_chs = 
embed_dim 58 | elif pool == "rot_attn": 59 | head_layers["pool"] = RotAttentionPool2d(prev_chs, out_features=embed_dim) 60 | prev_chs = embed_dim 61 | else: 62 | assert proj, "projection layer needed if non-attention pooling is used." 63 | 64 | # NOTE attention pool ends with a projection layer, so proj should usually be set to '' if such pooling is used 65 | if proj == "linear": 66 | head_layers["drop"] = nn.Dropout(drop) 67 | head_layers["proj"] = nn.Linear(prev_chs, embed_dim, bias=proj_bias) 68 | elif proj == "mlp": 69 | head_layers["mlp"] = Mlp(prev_chs, 2 * embed_dim, embed_dim, drop=drop, bias=(True, proj_bias)) 70 | 71 | self.head = nn.Sequential(head_layers) 72 | 73 | def lock(self, unlocked_groups=0, freeze_bn_stats=False): 74 | """lock modules 75 | Args: 76 | unlocked_groups (int): leave last n layer groups unlocked (default: 0) 77 | """ 78 | if not unlocked_groups: 79 | # lock full model 80 | for param in self.trunk.parameters(): 81 | param.requires_grad = False 82 | if freeze_bn_stats: 83 | freeze_batch_norm_2d(self.trunk) 84 | else: 85 | # NOTE: partial freeze requires latest timm (master) branch and is subject to change 86 | try: 87 | # FIXME import here until API stable and in an official release 88 | from timm.models.helpers import group_parameters, group_modules 89 | except ImportError: 90 | raise RuntimeError("Please install latest timm `pip install git+https://github.com/rwightman/pytorch-image-models`") 91 | matcher = self.trunk.group_matcher() 92 | gparams = group_parameters(self.trunk, matcher) 93 | max_layer_id = max(gparams.keys()) 94 | max_layer_id = max_layer_id - unlocked_groups 95 | for group_idx in range(max_layer_id + 1): 96 | group = gparams[group_idx] 97 | for param in group: 98 | self.trunk.get_parameter(param).requires_grad = False 99 | if freeze_bn_stats: 100 | gmodules = group_modules(self.trunk, matcher, reverse=True) 101 | gmodules = {k for k, v in gmodules.items() if v <= max_layer_id} 102 | freeze_batch_norm_2d(self.trunk, gmodules) 103 | 104 | @torch.jit.ignore 105 | def set_grad_checkpointing(self, enable=True): 106 | try: 107 | self.trunk.set_grad_checkpointing(enable) 108 | except Exception as e: 109 | logging.warning("grad checkpointing not supported for this timm image tower, continuing without...") 110 | 111 | def forward(self, x): 112 | x = self.trunk(x) 113 | x = self.head(x) 114 | return x 115 | -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/transform.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Sequence, Tuple 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torchvision.transforms.functional as F 6 | 7 | from torchvision.transforms import Normalize, Compose, RandomResizedCrop, InterpolationMode, ToTensor, Resize, CenterCrop 8 | 9 | from .constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD 10 | 11 | 12 | class ResizeMaxSize(nn.Module): 13 | 14 | def __init__(self, max_size, interpolation=InterpolationMode.BICUBIC, fn="max", fill=0): 15 | super().__init__() 16 | if not isinstance(max_size, int): 17 | raise TypeError(f"Size should be int. 
Got {type(max_size)}") 18 | self.max_size = max_size 19 | self.interpolation = interpolation 20 | self.fn = min if fn == "min" else min 21 | self.fill = fill 22 | 23 | def forward(self, img): 24 | if isinstance(img, torch.Tensor): 25 | height, width = img.shape[:2] 26 | else: 27 | width, height = img.size 28 | scale = self.max_size / float(max(height, width)) 29 | if scale != 1.0: 30 | new_size = tuple(round(dim * scale) for dim in (height, width)) 31 | img = F.resize(img, new_size, self.interpolation) 32 | pad_h = self.max_size - new_size[0] 33 | pad_w = self.max_size - new_size[1] 34 | img = F.pad(img, padding=[pad_w // 2, pad_h // 2, pad_w - pad_w // 2, pad_h - pad_h // 2], fill=self.fill) 35 | return img 36 | 37 | 38 | def _convert_to_rgb(image): 39 | return image.convert("RGB") 40 | 41 | 42 | # class CatGen(nn.Module): 43 | # def __init__(self, num=4): 44 | # self.num = num 45 | # def mixgen_batch(image, text): 46 | # batch_size = image.shape[0] 47 | # index = np.random.permutation(batch_size) 48 | 49 | # cat_images = [] 50 | # for i in range(batch_size): 51 | # # image mixup 52 | # image[i,:] = lam * image[i,:] + (1 - lam) * image[index[i],:] 53 | # # text concat 54 | # text[i] = tokenizer((str(text[i]) + " " + str(text[index[i]])))[0] 55 | # text = torch.stack(text) 56 | # return image, text 57 | 58 | 59 | def image_transform( 60 | image_size: int, 61 | is_train: bool, 62 | mean: Optional[Tuple[float, ...]] = None, 63 | std: Optional[Tuple[float, ...]] = None, 64 | resize_longest_max: bool = False, 65 | fill_color: int = 0, 66 | ): 67 | mean = mean or OPENAI_DATASET_MEAN 68 | if not isinstance(mean, (list, tuple)): 69 | mean = (mean,) * 3 70 | 71 | std = std or OPENAI_DATASET_STD 72 | if not isinstance(std, (list, tuple)): 73 | std = (std,) * 3 74 | 75 | if isinstance(image_size, (list, tuple)) and image_size[0] == image_size[1]: 76 | # for square size, pass size as int so that Resize() uses aspect preserving shortest edge 77 | image_size = image_size[0] 78 | 79 | normalize = Normalize(mean=mean, std=std) 80 | if is_train: 81 | return Compose( 82 | [ 83 | RandomResizedCrop(image_size, scale=(0.9, 1.0), interpolation=InterpolationMode.BICUBIC), 84 | _convert_to_rgb, 85 | ToTensor(), 86 | normalize, 87 | ] 88 | ) 89 | else: 90 | if resize_longest_max: 91 | transforms = [ResizeMaxSize(image_size, fill=fill_color)] 92 | else: 93 | transforms = [ 94 | Resize(image_size, interpolation=InterpolationMode.BICUBIC), 95 | CenterCrop(image_size), 96 | ] 97 | transforms.extend( 98 | [ 99 | _convert_to_rgb, 100 | ToTensor(), 101 | normalize, 102 | ] 103 | ) 104 | return Compose(transforms) 105 | -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_vit.py: -------------------------------------------------------------------------------- 1 | # Based on EVA, BEIT, timm and DeiT code bases 2 | # https://github.com/baaivision/EVA 3 | # https://github.com/rwightman/pytorch-image-models/tree/master/timm 4 | # https://github.com/microsoft/unilm/tree/master/beit 5 | # https://github.com/facebookresearch/deit/ 6 | # https://github.com/facebookresearch/dino 7 | # --------------------------------------------------------' 8 | # not tested yet 9 | import math 10 | from transformers import CLIPImageProcessor 11 | 12 | import torch 13 | import torch.nn as nn 14 | import torch.nn.functional as F 15 | import torch.utils.checkpoint as checkpoint 16 | from timm.models.layers import drop_path, to_2tuple, trunc_normal_ 17 | from .eva_clip 
import create_model_and_transforms, get_model_config 18 | import torch 19 | import torchvision 20 | import time 21 | 22 | from llava.utils import rank0_print 23 | 24 | 25 | class EvaViTWrapper(nn.Module): 26 | def __init__(self, vision_tower, args, delay_load=False): 27 | super().__init__() 28 | 29 | self.is_loaded = False 30 | self.vision_tower_name = vision_tower 31 | self.pretrained = args.vision_tower_pretrained 32 | self.args = args 33 | 34 | self.select_layer = args.mm_vision_select_layer 35 | if self.select_layer < -1: 36 | self.select_layer += 1 37 | self.select_feature = getattr(args, "mm_vision_select_feature", "patch") 38 | 39 | self.model_config = get_model_config(self.vision_tower_name) 40 | 41 | if not delay_load: 42 | rank0_print(f"Loading vision tower: {vision_tower}") 43 | self.load_model() 44 | elif getattr(args, "unfreeze_mm_vision_tower", False): 45 | # TODO: better detector is needed. 46 | rank0_print(f"The checkpoint seems to contain `vision_tower` weights: `unfreeze_mm_vision_tower`: True.") 47 | self.load_model() 48 | elif hasattr(args, "mm_tunable_parts") and "mm_vision_tower" in args.mm_tunable_parts: 49 | rank0_print(f"The checkpoint seems to contain `vision_tower` weights: `mm_tunable_parts` contains `mm_vision_tower`.") 50 | self.load_model() 51 | 52 | def load_model(self): 53 | rank0_print(f"Loading: {self.vision_tower_name}") 54 | rank0_print(f"Pretrained: {self.pretrained}") 55 | time_start = time.time() 56 | model, _, image_processor = create_model_and_transforms(self.vision_tower_name, self.pretrained, force_custom_clip=True, precision="fp16") 57 | time_end = time.time() 58 | rank0_print(f"Loaded: {self.vision_tower_name} in {time_end - time_start:.2f}s") 59 | self.device = next(model.parameters()).device 60 | self.dtype = next(model.parameters()).dtype 61 | if self.device.type != "meta": 62 | model = model.to("cuda") 63 | self.vision_tower = model.visual 64 | resize_transform = [t for t in image_processor.transforms if isinstance(t, torchvision.transforms.Resize)][0] 65 | normalize_transform = [t for t in image_processor.transforms if isinstance(t, torchvision.transforms.Normalize)][0] 66 | self.resize_transform_size = resize_transform.size 67 | self.image_processor = CLIPImageProcessor.from_pretrained( 68 | "openai/clip-vit-large-patch14", 69 | crop_size=resize_transform.size, 70 | size={"shortest_edge": resize_transform.size}, 71 | image_mean=list(normalize_transform.mean), 72 | image_std=list(normalize_transform.std), 73 | ) 74 | rank0_print(f"Loaded image processor: {self.image_processor}") 75 | self.vision_tower.requires_grad_(False) 76 | self.is_loaded = True 77 | 78 | def feature_select(self, image_features): 79 | select_feature_type = self.select_feature 80 | 81 | # if self.select_feature in ["slicefour_patch", "slicefour_cls_patch"]: 82 | # select_every_k_layer = len(image_features) // 4 83 | # image_features = torch.cat([image_features[i] for i in range(select_every_k_layer + self.select_layer, len(image_features), select_every_k_layer)], dim=-1) 84 | # select_feature_type = select_feature_type.replace("slicefour_", "") 85 | # elif self.select_feature in ["slice_m25811_f6_patch", "slice_m25811_f6_cls_patch"]: 86 | # select_layers = [-1, -4, -7, -10, 6] 87 | # image_features = torch.cat([image_features[i] for i in select_layers], dim=-1) 88 | # select_feature_type = select_feature_type.replace("slice_m25811_f6_", "") 89 | # else: 90 | # image_features = image_features[self.select_layer] 91 | 92 | if select_feature_type == "patch": 93 | 
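            # "patch": drop the leading CLS token and keep only the patch embeddings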
image_features = image_features[:, 1:] 94 | elif select_feature_type == "cls_patch": 95 | image_features = image_features 96 | else: 97 | raise ValueError(f"Unexpected select feature: {select_feature_type}") 98 | return image_features 99 | 100 | def train(self, mode=True): 101 | self.training = mode 102 | 103 | if self.is_loaded: 104 | self.vision_tower.eval() 105 | 106 | def forward(self, images): 107 | if type(images) is list: 108 | image_features = [] 109 | for image in images: 110 | image_features = self.vision_tower.forward_features(image.to(self.dtype), return_all_features=True) 111 | image_features = self.feature_select(image_features).to(self.dtype) 112 | image_features.append(image_features) 113 | else: 114 | image_features = self.vision_tower.forward_features(images.to(self.dtype), return_all_features=True) 115 | image_features = self.feature_select(image_features).to(self.dtype) 116 | 117 | return image_features 118 | 119 | @property 120 | def dummy_feature(self): 121 | return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype) 122 | 123 | @property 124 | def hidden_size(self): 125 | return self.model_config["vision_cfg"]["width"] 126 | 127 | @property 128 | def num_patches(self): 129 | return (self.model_config["vision_cfg"]["image_size"] // self.model_config["vision_cfg"]["patch_size"]) ** 2 130 | 131 | @property 132 | def num_patches_per_side(self): 133 | return self.model_config["vision_cfg"]["image_size"] // self.model_config["vision_cfg"]["patch_size"] 134 | 135 | @property 136 | def config(self): 137 | return self.model_config 138 | 139 | @property 140 | def image_size(self): 141 | return self.model_config["vision_cfg"]["image_size"] 142 | -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/eva_clip/eva_clip_encoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from .eva_clip_processors import EvaClipImageTrainProcessor 5 | from .eva_vit import EVAEncoderWrapper 6 | from .factory import list_models, add_model_config, get_model_config 7 | 8 | from llava.utils import rank0_print 9 | 10 | 11 | class EvaClipVisionTower(nn.Module): 12 | def __init__(self, vision_tower, args, delay_load=False): 13 | super().__init__() 14 | 15 | self.is_loaded = False 16 | self.vision_tower_name = vision_tower 17 | self.vision_tower_pretrained = args.vision_tower_pretrained 18 | self.config = get_model_config(vision_tower) 19 | 20 | if not delay_load: 21 | rank0_print(f"Loading EVA ViT: {self.vision_tower_name}") 22 | self.load_model() 23 | elif getattr(args, "unfreeze_mm_vision_tower", False): 24 | # TODO: better detector is needed. 
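            # Heuristic: when the vision tower is marked trainable, the checkpoint most likely
            # stores `vision_tower` weights, so the tower is loaded eagerly even if delay_load=True.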
25 | rank0_print(f"The checkpoint seems to contain `vision_tower` weights: `unfreeze_mm_vision_tower`: True.") 26 | self.load_model() 27 | elif hasattr(args, "mm_tunable_parts") and "mm_vision_tower" in args.mm_tunable_parts: 28 | rank0_print(f"The checkpoint seems to contain `vision_tower` weights: `mm_tunable_parts` contains `mm_vision_tower`.") 29 | self.load_model() 30 | else: 31 | self.cfg_only = self.config 32 | 33 | def load_model(self, device_map=None): 34 | rank0_print(f"Pretrained: {self.vision_tower_pretrained}") 35 | self.image_processor = EvaClipImageTrainProcessor(self.config["vision_cfg"]["image_size"]) 36 | self.vision_tower = EVAEncoderWrapper(self.vision_tower_pretrained, self.config) 37 | rank0_print(f"Loaded image processor: {self.image_processor}") 38 | self.vision_tower.requires_grad_(False) 39 | self.is_loaded = True 40 | 41 | def forward(self, images): 42 | if type(images) is list: 43 | image_features = [] 44 | for image in images: 45 | image_feature = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0)).to(image.dtype) 46 | image_features.append(image_feature) 47 | else: 48 | image_features = self.vision_tower(images.to(device=self.device, dtype=self.dtype)).to(images.dtype) 49 | 50 | return image_features 51 | 52 | @property 53 | def dtype(self): 54 | return self.vision_tower.dtype 55 | 56 | @property 57 | def device(self): 58 | return self.vision_tower.device 59 | 60 | @property 61 | def hidden_size(self): 62 | return self.config["vision_cfg"]["width"] 63 | 64 | @property 65 | def num_patches(self): 66 | return (self.config["vision_cfg"]["image_size"] // self.config["vision_cfg"]["patch_size"]) ** 2 67 | 68 | @property 69 | def num_patches_per_side(self): 70 | return self.config["vision_cfg"]["image_size"] // self.config["vision_cfg"]["patch_size"] 71 | 72 | @property 73 | def image_size(self): 74 | return self.config["vision_cfg"]["image_size"] 75 | -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/eva_clip/eva_clip_processors.py: -------------------------------------------------------------------------------- 1 | """ 2 | # Adapted from https://github.com/baaivision/EVA/tree/master/EVA-CLIP 3 | """ 4 | 5 | from torchvision import transforms 6 | from torchvision.transforms.functional import InterpolationMode 7 | from transformers.image_processing_utils import BatchFeature 8 | from PIL import Image 9 | from transformers.image_transforms import convert_to_rgb 10 | 11 | 12 | class BaseProcessor: 13 | def __init__(self): 14 | self.transform = lambda x: x 15 | return 16 | 17 | def __call__(self, item): 18 | return self.transform(item) 19 | 20 | 21 | class EvaClipImageBaseProcessor(BaseProcessor): 22 | def __init__(self, mean=None, std=None): 23 | self.mean = (0.48145466, 0.4578275, 0.40821073) if mean is None else mean 24 | self.std = (0.26862954, 0.26130258, 0.27577711) if std is None else std 25 | 26 | self.normalize = transforms.Normalize(self.mean, self.std) 27 | 28 | @property 29 | def image_mean(self): 30 | return self.mean 31 | 32 | 33 | class EvaClipImageTrainProcessor(EvaClipImageBaseProcessor): 34 | def __init__(self, image_size=224, mean=None, std=None, min_scale=0.5, max_scale=1.0): 35 | super().__init__(mean=mean, std=std) 36 | 37 | self.transform = transforms.Compose( 38 | [ 39 | convert_to_rgb, 40 | transforms.Resize( 41 | image_size, 42 | interpolation=InterpolationMode.BICUBIC, 43 | ), 44 | transforms.CenterCrop(image_size), 45 | transforms.ToTensor(), 46 | 
self.normalize, 47 | ] 48 | ) 49 | 50 | self.image_size = image_size 51 | 52 | def preprocess(self, images, return_tensors): 53 | if isinstance(images, Image.Image): 54 | images = [images] 55 | else: 56 | assert isinstance(images, list) 57 | 58 | transformed_images = [self.transform(image).numpy() for image in images] 59 | data = {"pixel_values": transformed_images} 60 | 61 | return BatchFeature(data=data, tensor_type=return_tensors) 62 | 63 | def __call__(self, item): 64 | return self.transform(item) 65 | 66 | @property 67 | def crop_size(self): 68 | return {"height": self.image_size, "width": self.image_size} 69 | 70 | @property 71 | def size(self): 72 | return {"shortest_edge": self.image_size} 73 | -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/eva_clip/factory.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | import pathlib 5 | import re 6 | from copy import deepcopy 7 | from pathlib import Path 8 | from typing import Optional, Tuple, Union, Dict, Any 9 | import torch 10 | 11 | _MODEL_CONFIG_PATHS = [Path(__file__).parent / f"model_configs/"] 12 | _MODEL_CONFIGS = {} # directory (model_name: config) of model architecture configs 13 | 14 | 15 | def _natural_key(string_): 16 | return [int(s) if s.isdigit() else s for s in re.split(r"(\d+)", string_.lower())] 17 | 18 | 19 | def _rescan_model_configs(): 20 | global _MODEL_CONFIGS 21 | 22 | config_ext = (".json",) 23 | config_files = [] 24 | for config_path in _MODEL_CONFIG_PATHS: 25 | if config_path.is_file() and config_path.suffix in config_ext: 26 | config_files.append(config_path) 27 | elif config_path.is_dir(): 28 | for ext in config_ext: 29 | config_files.extend(config_path.glob(f"*{ext}")) 30 | 31 | for cf in config_files: 32 | with open(cf, "r", encoding="utf8") as f: 33 | model_cfg = json.load(f) 34 | if all(a in model_cfg for a in ("embed_dim", "vision_cfg", "text_cfg")): 35 | _MODEL_CONFIGS[cf.stem] = model_cfg 36 | 37 | _MODEL_CONFIGS = dict(sorted(_MODEL_CONFIGS.items(), key=lambda x: _natural_key(x[0]))) 38 | 39 | 40 | _rescan_model_configs() # initial populate of model config registry 41 | 42 | 43 | def list_models(): 44 | """enumerate available model architectures based on config files""" 45 | return list(_MODEL_CONFIGS.keys()) 46 | 47 | 48 | def add_model_config(path): 49 | """add model config path or file and update registry""" 50 | if not isinstance(path, Path): 51 | path = Path(path) 52 | _MODEL_CONFIG_PATHS.append(path) 53 | _rescan_model_configs() 54 | 55 | 56 | def get_model_config(model_name): 57 | if model_name in _MODEL_CONFIGS: 58 | return deepcopy(_MODEL_CONFIGS[model_name]) 59 | else: 60 | return None 61 | -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/eva_clip/model_configs/EVA-CLIP-18B.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1536, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 48, 6 | "width": 5120, 7 | "head_width": 128, 8 | "mlp_ratio": 5, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-18b-14-x", 11 | "drop_path_rate": 0, 12 | "qkv_bias": false, 13 | "xattn": true, 14 | "postnorm": true, 15 | "fusedLN": false, 16 | "use_rms_norm": true 17 | }, 18 | "text_cfg": { 19 | "context_length": 77, 20 | "vocab_size": 49408, 21 | "width": 1280, 22 | "heads": 20, 23 | "layers": 32, 24 | "xattn": false, 25 | 
"fusedLN": false 26 | } 27 | } -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/eva_clip/model_configs/EVA-CLIP-8B-plus.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1280, 3 | "vision_cfg": { 4 | "image_size": 448, 5 | "layers": 32, 6 | "width": 4096, 7 | "head_width": 128, 8 | "mlp_ratio": 5, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-8b-14-plus-x", 11 | "drop_path_rate": 0, 12 | "qkv_bias": false, 13 | "xattn": true, 14 | "postnorm": false, 15 | "fusedLN": false, 16 | "use_rms_norm": true 17 | }, 18 | "text_cfg": { 19 | "context_length": 77, 20 | "vocab_size": 49408, 21 | "width": 1280, 22 | "heads": 20, 23 | "layers": 32, 24 | "xattn": false, 25 | "fusedLN": false 26 | } 27 | } -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/eva_clip/model_configs/EVA-CLIP-8B.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1280, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 4096, 7 | "head_width": 128, 8 | "mlp_ratio": 5, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-8b-14-x", 11 | "drop_path_rate": 0, 12 | "qkv_bias": false, 13 | "xattn": true, 14 | "postnorm": false, 15 | "fusedLN": false, 16 | "use_rms_norm": true 17 | }, 18 | "text_cfg": { 19 | "context_length": 77, 20 | "vocab_size": 49408, 21 | "width": 1280, 22 | "heads": 20, 23 | "layers": 32, 24 | "xattn": false, 25 | "fusedLN": false 26 | } 27 | } -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/eva_clip/model_configs/EVA01-CLIP-B-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 16, 8 | "eva_model_name": "eva-clip-b-16", 9 | "ls_init_value": 0.1, 10 | "drop_path_rate": 0.0 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 512, 16 | "heads": 8, 17 | "layers": 12 18 | } 19 | } -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/eva_clip/model_configs/EVA01-CLIP-g-14-plus.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 40, 6 | "width": 1408, 7 | "head_width": 88, 8 | "mlp_ratio": 4.3637, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-g-14-x", 11 | "drop_path_rate": 0, 12 | "xattn": true, 13 | "fusedLN": true 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 1024, 19 | "heads": 16, 20 | "layers": 24, 21 | "xattn": false, 22 | "fusedLN": true 23 | } 24 | } -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/eva_clip/model_configs/EVA01-CLIP-g-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 40, 6 | "width": 1408, 7 | "head_width": 88, 8 | "mlp_ratio": 4.3637, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-g-14-x", 11 | "drop_path_rate": 0.4, 12 | "xattn": true, 13 | "fusedLN": true 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 
49408, 18 | "width": 768, 19 | "heads": 12, 20 | "layers": 12, 21 | "xattn": false, 22 | "fusedLN": true 23 | } 24 | } -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/eva_clip/model_configs/EVA02-CLIP-B-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "head_width": 64, 8 | "patch_size": 16, 9 | "mlp_ratio": 2.6667, 10 | "eva_model_name": "eva-clip-b-16-X", 11 | "drop_path_rate": 0.0, 12 | "xattn": true, 13 | "fusedLN": true, 14 | "rope": true, 15 | "pt_hw_seq_len": 16, 16 | "intp_freq": true, 17 | "naiveswiglu": true, 18 | "subln": true 19 | }, 20 | "text_cfg": { 21 | "context_length": 77, 22 | "vocab_size": 49408, 23 | "width": 512, 24 | "heads": 8, 25 | "layers": 12, 26 | "xattn": true, 27 | "fusedLN": true 28 | } 29 | } -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/eva_clip/model_configs/EVA02-CLIP-L-14-336.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 336, 5 | "layers": 24, 6 | "width": 1024, 7 | "drop_path_rate": 0, 8 | "head_width": 64, 9 | "mlp_ratio": 2.6667, 10 | "patch_size": 14, 11 | "eva_model_name": "eva-clip-l-14-336", 12 | "xattn": true, 13 | "fusedLN": true, 14 | "rope": true, 15 | "pt_hw_seq_len": 16, 16 | "intp_freq": true, 17 | "naiveswiglu": true, 18 | "subln": true 19 | }, 20 | "text_cfg": { 21 | "context_length": 77, 22 | "vocab_size": 49408, 23 | "width": 768, 24 | "heads": 12, 25 | "layers": 12, 26 | "xattn": false, 27 | "fusedLN": true 28 | } 29 | } -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/eva_clip/model_configs/EVA02-CLIP-L-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | "drop_path_rate": 0, 8 | "head_width": 64, 9 | "mlp_ratio": 2.6667, 10 | "patch_size": 14, 11 | "eva_model_name": "eva-clip-l-14", 12 | "xattn": true, 13 | "fusedLN": true, 14 | "rope": true, 15 | "pt_hw_seq_len": 16, 16 | "intp_freq": true, 17 | "naiveswiglu": true, 18 | "subln": true 19 | }, 20 | "text_cfg": { 21 | "context_length": 77, 22 | "vocab_size": 49408, 23 | "width": 768, 24 | "heads": 12, 25 | "layers": 12, 26 | "xattn": false, 27 | "fusedLN": true 28 | } 29 | } -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/eva_clip/model_configs/EVA02-CLIP-bigE-14-plus.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 64, 6 | "width": 1792, 7 | "head_width": 112, 8 | "mlp_ratio": 8.571428571428571, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-4b-14-x", 11 | "drop_path_rate": 0, 12 | "xattn": true, 13 | "postnorm": true, 14 | "fusedLN": true 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 1280, 20 | "heads": 20, 21 | "layers": 32, 22 | "xattn": false, 23 | "fusedLN": true 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/eva_clip/model_configs/EVA02-CLIP-bigE-14.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 64, 6 | "width": 1792, 7 | "head_width": 112, 8 | "mlp_ratio": 8.571428571428571, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-4b-14-x", 11 | "drop_path_rate": 0, 12 | "xattn": true, 13 | "postnorm": true, 14 | "fusedLN": true 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 1024, 20 | "heads": 16, 21 | "layers": 24, 22 | "xattn": false, 23 | "fusedLN": true 24 | } 25 | } -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/eva_clip/model_configs/Internal-EVA02-CLIP-10B-14-448.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 448, 5 | "layers": 77, 6 | "width": 2304, 7 | "head_width": 144, 8 | "mlp_ratio": 10.9722, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-10b-14-x", 11 | "drop_path_rate": 0, 12 | "xattn": true, 13 | "postnorm": false, 14 | "fusedLN": true 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 1280, 20 | "heads": 20, 21 | "layers": 32, 22 | "xattn": false, 23 | "fusedLN": true 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/eva_clip/model_configs/Internal-EVA02-CLIP-10B-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 77, 6 | "width": 2304, 7 | "head_width": 144, 8 | "mlp_ratio": 10.9722, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-10b-14-x", 11 | "drop_path_rate": 0, 12 | "xattn": true, 13 | "postnorm": false, 14 | "fusedLN": true 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 1280, 20 | "heads": 20, 21 | "layers": 32, 22 | "xattn": false, 23 | "fusedLN": true 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/hf_vision.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from transformers import AutoModel, AutoImageProcessor, AutoConfig, CLIPImageProcessor 5 | from llava.utils import rank0_print 6 | 7 | 8 | class HFVisionTower(nn.Module): 9 | def __init__(self, vision_tower, args, delay_load=False): 10 | super().__init__() 11 | 12 | self.is_loaded = False 13 | 14 | self.vision_tower_name = vision_tower.replace("hf:", "", 1) 15 | self.select_layer = args.mm_vision_select_layer 16 | self.select_feature = getattr(args, "mm_vision_select_feature", "patch") 17 | 18 | if not delay_load: 19 | self.load_model() 20 | else: 21 | self.cfg_only = AutoConfig.from_pretrained(self.vision_tower_name) 22 | 23 | def load_model(self): 24 | try: 25 | self.image_processor = AutoImageProcessor.from_pretrained(self.vision_tower_name) 26 | except Exception as e: 27 | if "448" in self.vision_tower_name: 28 | image_size = 448 29 | # use image processor with conig 30 | self.image_processor = CLIPImageProcessor(size={"shortest_edge": image_size}, do_center_crop=True, crop_size=image_size) 31 | else: 32 | self.image_processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14") 33 | rank0_print(f"Loaded image processor: 
{self.image_processor}") 34 | self.vision_tower = AutoModel.from_pretrained(self.vision_tower_name, torch_dtype=torch.bfloat16, trust_remote_code=True).to("cuda") 35 | self.device = self.vision_tower.device 36 | self.dtype = self.vision_tower.dtype 37 | self.config = self.vision_tower.config 38 | 39 | if hasattr(self.vision_tower, "vision_model"): 40 | self.vision_tower = self.vision_tower.vision_model 41 | self.vision_tower.requires_grad_(False) 42 | # self.vision_tower.eval() 43 | self.is_loaded = True 44 | 45 | def feature_select(self, image_forward_outs): 46 | select_feature_type = self.select_feature 47 | 48 | if self.select_feature in ["slicefour_patch", "slicefour_cls_patch"]: 49 | select_every_k_layer = len(image_forward_outs.hidden_states) // 4 50 | image_features = torch.cat([image_forward_outs.hidden_states[i] for i in range(select_every_k_layer + self.select_layer, len(image_forward_outs.hidden_states), select_every_k_layer)], dim=-1) 51 | select_feature_type = select_feature_type.replace("slicefour_", "") 52 | else: 53 | image_features = image_forward_outs.hidden_states[self.select_layer] 54 | 55 | if select_feature_type == "patch": 56 | image_features = image_features[:, 1:] 57 | elif select_feature_type == "cls_patch": 58 | image_features = image_features 59 | else: 60 | raise ValueError(f"Unexpected select feature: {select_feature_type}") 61 | return image_features 62 | 63 | def forward(self, images): 64 | if type(images) is list: 65 | image_features = [] 66 | for image in images: 67 | image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True) 68 | image_feature = self.feature_select(image_forward_out).to(image.dtype) 69 | image_features.append(image_feature) 70 | else: 71 | image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True) 72 | image_features = self.feature_select(image_forward_outs).to(images.dtype) 73 | 74 | return image_features 75 | 76 | @property 77 | def dummy_feature(self): 78 | return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype) 79 | 80 | # @property 81 | # def dtype(self): 82 | # return self.vision_tower.dtype 83 | 84 | # @property 85 | # def device(self): 86 | # return self.vision_tower.device 87 | 88 | @property 89 | def hidden_size(self): 90 | try: 91 | _hidden_size = self.config.hidden_size 92 | except: 93 | _hidden_size = self.config.vision_config.hidden_size 94 | if "slicefour" in self.select_feature: 95 | _hidden_size *= 4 96 | return _hidden_size 97 | 98 | @property 99 | def num_patches(self): 100 | _num_patches = (self.config.image_size // self.config.patch_size) ** 2 101 | if "cls_patch" in self.select_feature: 102 | _num_patches += 1 103 | return _num_patches 104 | 105 | @property 106 | def num_patches_per_side(self): 107 | return self.config.image_size // self.config.patch_size 108 | 109 | @property 110 | def image_size(self): 111 | return self.config.image_size 112 | -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/imagebind.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from transformers import CLIPImageProcessor 5 | 6 | try: 7 | from imagebind.models import imagebind_model 8 | from imagebind.models.imagebind_model import ModalityType 9 | from imagebind.data import load_and_transform_audio_data 10 | except ImportError: 11 | pass 12 | 13 | 14 | 
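# `imagebind` is an optional dependency: the guarded import above lets this module be imported
# even when the package is missing; loading only fails if the wrapper below is actually used.
# Minimal usage sketch, kept as comments and assuming the `imagebind` package is installed
# (the tower name is just a label here, and the input shapes are illustrative only):
#   tower = ImageBindWrapper("imagebind_huge", select_layer=-2)  # downloads/loads imagebind_huge
#   vision_feats = tower(images)                                 # images: (B, 3, 224, 224) batch
#   audio_feats = tower({"audios": ["clip.wav"]})                # or audio file paths via the dict path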
class ImageBindWrapper(nn.Module): 15 | def __init__(self, vision_tower, select_layer, select_feature="patch", delay_load=False): 16 | super().__init__() 17 | 18 | self.is_loaded = False 19 | 20 | self.vision_tower_name = vision_tower 21 | self.select_layer = select_layer 22 | self.select_feature = select_feature 23 | 24 | if not delay_load: 25 | self.load_model() 26 | 27 | def load_model(self): 28 | self.image_processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14") 29 | self.vision_tower = imagebind_model.imagebind_huge(pretrained=True) 30 | for p in self.vision_tower.parameters(): 31 | p.requires_grad = False 32 | self.vision_tower.eval() 33 | self.is_loaded = True 34 | 35 | def train(self, mode=True): 36 | self.training = mode 37 | 38 | if self.is_loaded: 39 | self.vision_tower.eval() 40 | 41 | @torch.no_grad() 42 | def forward(self, x): 43 | if type(x) == dict: 44 | if x["audios"] is not None: 45 | inputs = {ModalityType.AUDIO: load_and_transform_audio_data(x["audios"], device=self.device).half()} 46 | embeddings = self.vision_tower(inputs) 47 | audio_embedding = embeddings[ModalityType.AUDIO] 48 | return audio_embedding.unsqueeze(1) 49 | else: 50 | inputs = {ModalityType.VISION: x.to(dtype=self.dtype)} 51 | embeddings = self.vision_tower(inputs) 52 | vision_embedding = embeddings[ModalityType.VISION] 53 | if vision_embedding.ndim == 2: 54 | return vision_embedding.unsqueeze(1) 55 | if vision_embedding.shape[1] == 257: 56 | return vision_embedding[:, 1:] 57 | raise ValueError(f"Unexpected shape: {vision_embedding.shape}") 58 | 59 | @property 60 | def dummy_feature(self): 61 | return torch.zeros(1, 1024, device=self.device, dtype=self.dtype) 62 | 63 | @property 64 | def dtype(self): 65 | return self.vision_tower.modality_preprocessors.vision.cls_token.dtype 66 | 67 | @property 68 | def device(self): 69 | return self.vision_tower.modality_preprocessors.vision.cls_token.device 70 | 71 | @property 72 | def hidden_size(self): 73 | return 1024 74 | -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/open_clip_encoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from transformers import CLIPImageProcessor 4 | from llava.utils import rank0_print 5 | 6 | try: 7 | import open_clip 8 | import torchvision 9 | from open_clip.transformer import _expand_token 10 | except ImportError: 11 | print("OpenCLIP not installed") 12 | open_clip = None 13 | 14 | HIDDEN_SIZE_DICT = { 15 | "ViT-H-14-378-quickgelu": 1280, 16 | } 17 | 18 | 19 | class OpenCLIPVisionTower(nn.Module): 20 | def __init__(self, vision_tower, args, delay_load=False): 21 | super().__init__() 22 | 23 | self.is_loaded = False 24 | self.model_name = vision_tower.replace("open_clip_hub:", "") 25 | self.pretrained = args.vision_tower_pretrained 26 | self.select_layer = args.mm_vision_select_layer 27 | self.select_feature = getattr(args, "mm_vision_select_feature", "patch") 28 | 29 | if not delay_load: 30 | rank0_print(f"Loading vision tower: {vision_tower}") 31 | self.load_model() 32 | elif getattr(args, "unfreeze_mm_vision_tower", False): 33 | # TODO: better detector is needed. 
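            # Same best-effort check as in the EVA towers: string-matching the training args to decide
            # whether the checkpoint is expected to carry vision-tower weights, in which case lazy
            # loading is skipped.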
34 | rank0_print(f"The checkpoint seems to contain `vision_tower` weights: `unfreeze_mm_vision_tower`: True.") 35 | self.load_model() 36 | elif hasattr(args, "mm_tunable_parts") and "mm_vision_tower" in args.mm_tunable_parts: 37 | rank0_print(f"The checkpoint seems to contain `vision_tower` weights: `mm_tunable_parts` contains `mm_vision_tower`.") 38 | self.load_model() 39 | 40 | def load_model(self, device_map="auto"): 41 | rank0_print(f"Loading OpenCLIP model: {self.model_name}") 42 | rank0_print(f"Pretrained: {self.pretrained}") 43 | vision_tower, _, image_processor = open_clip.create_model_and_transforms(model_name=self.model_name, pretrained=self.pretrained, precision="fp32", device="cuda") 44 | 45 | resize_transform = [t for t in image_processor.transforms if isinstance(t, torchvision.transforms.Resize)][0] 46 | normalize_transform = [t for t in image_processor.transforms if isinstance(t, torchvision.transforms.Normalize)][0] 47 | self.resize_transform_size = resize_transform.size # 224 or 384 48 | self.patch_size = vision_tower.visual.conv1.kernel_size[0] # 14 or 16 49 | 50 | self.image_processor = CLIPImageProcessor.from_pretrained( 51 | "openai/clip-vit-large-patch14", 52 | crop_size=resize_transform.size, 53 | size={"shortest_edge": resize_transform.size}, 54 | image_mean=list(normalize_transform.mean), 55 | image_std=list(normalize_transform.std), 56 | ) 57 | rank0_print(f"Loaded image processor: {self.image_processor}") 58 | self.vision_tower = vision_tower.visual 59 | self.vision_tower.requires_grad_(False) 60 | 61 | self.is_loaded = True 62 | 63 | def feature_select(self, image_forward_outs): 64 | image_features = image_forward_outs[self.select_layer] 65 | if self.select_feature == "patch": 66 | image_features = image_features[:, 1:] 67 | elif self.select_feature == "cls_patch": 68 | image_features = image_features 69 | elif self.select_feature == "conv_flatten": 70 | image_features = image_features.flatten(2).transpose(1, 2) 71 | else: 72 | raise ValueError(f"Unexpected select feature: {self.select_feature}") 73 | return image_features 74 | 75 | def forward_visual(self, x, output_hidden_states=False): 76 | if hasattr(self.vision_tower, "trunk") and hasattr(self.vision_tower.trunk, "_intermediate_layers"): 77 | return self.vision_tower.trunk._intermediate_layers(x, abs(self.select_layer)) 78 | else: 79 | 80 | def forward_openclip(self, x: torch.Tensor): 81 | features = [] 82 | x = self.conv1(x) # shape = [*, width, grid, grid] 83 | x = x.reshape(x.shape[0], x.shape[1], -1) # shape = [*, width, grid ** 2] 84 | x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width] 85 | 86 | # class embeddings and positional embeddings 87 | x = torch.cat( 88 | [_expand_token(self.class_embedding, x.shape[0]).to(x.dtype), x], 89 | dim=1, 90 | ) 91 | # shape = [*, grid ** 2 + 1, width] 92 | x = x + self.positional_embedding.to(x.dtype) 93 | 94 | x = self.patch_dropout(x) 95 | x = self.ln_pre(x) 96 | 97 | x = x.permute(1, 0, 2) # NLD -> LND 98 | for r in self.transformer.resblocks: 99 | x = r(x, attn_mask=None) 100 | features.append(x) 101 | return features 102 | 103 | return forward_openclip(self.vision_tower, x) 104 | 105 | def forward(self, images): 106 | if type(images) is list: 107 | image_features = [] 108 | for image in images: 109 | image_forward_out = self.forward_visual(image.to(self.dtype).unsqueeze(0), output_hidden_states=True) 110 | image_feature = self.feature_select(image_forward_out).to(image.dtype) 111 | image_features.append(image_feature) 112 | else: 113 | image_forward_outs = 
self.forward_visual(images.to(self.dtype), output_hidden_states=True) 114 | image_features = self.feature_select(image_forward_outs).to(images.dtype) 115 | 116 | return image_features 117 | 118 | @property 119 | def dummy_feature(self): 120 | return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype) 121 | 122 | @property 123 | def dtype(self): 124 | if hasattr(self.vision_tower, "conv1"): 125 | return self.vision_tower.conv1.weight.dtype 126 | if hasattr(self.vision_tower, "trunk"): 127 | return self.vision_tower.trunk.patch_embed.proj.weight.dtype 128 | raise NotImplementedError 129 | 130 | @property 131 | def device(self): 132 | if hasattr(self.vision_tower, "conv1"): 133 | return self.vision_tower.conv1.weight.device 134 | if hasattr(self.vision_tower, "trunk"): 135 | return self.vision_tower.trunk.patch_embed.proj.weight.device 136 | raise NotImplementedError 137 | 138 | @property 139 | def config(self): 140 | return None 141 | 142 | @property 143 | def hidden_size(self): 144 | if self.model_name in HIDDEN_SIZE_DICT: 145 | return HIDDEN_SIZE_DICT[self.model_name] 146 | else: 147 | raise NotImplementedError 148 | 149 | @property 150 | def num_patches(self): 151 | image_size = self.resize_transform_size if isinstance(self.resize_transform_size, int) else self.resize_transform_size[0] 152 | _num_patches = (image_size // self.patch_size) ** 2 153 | if "cls_patch" in self.select_feature: 154 | _num_patches += 1 155 | return _num_patches 156 | 157 | @property 158 | def image_size(self): 159 | return self.resize_transform_size 160 | 161 | @property 162 | def num_patches_per_side(self): 163 | return self.resize_transform_size // self.patch_size 164 | -------------------------------------------------------------------------------- /llava/model/multimodal_projector/__pycache__/builder.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongyh20/Insight-V/5e9c6f934c1929e351dbff17a6c6c7bc2d8143b1/llava/model/multimodal_projector/__pycache__/builder.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/multimodal_projector/__pycache__/pooler_projector.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongyh20/Insight-V/5e9c6f934c1929e351dbff17a6c6c7bc2d8143b1/llava/model/multimodal_projector/__pycache__/pooler_projector.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/multimodal_projector/builder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import re 4 | 5 | from .pooler_projector import PoolerProjector 6 | 7 | 8 | class IdentityMap(nn.Module): 9 | def __init__(self): 10 | super().__init__() 11 | 12 | def forward(self, x, *args, **kwargs): 13 | return x 14 | 15 | @property 16 | def config(self): 17 | return {"mm_projector_type": "identity"} 18 | 19 | 20 | class SimpleResBlock(nn.Module): 21 | def __init__(self, channels): 22 | super().__init__() 23 | self.pre_norm = nn.LayerNorm(channels) 24 | 25 | self.proj = nn.Sequential(nn.Linear(channels, channels), nn.GELU(), nn.Linear(channels, channels)) 26 | 27 | def forward(self, x): 28 | x = self.pre_norm(x) 29 | return x + self.proj(x) 30 | 31 | 32 | def build_vision_projector(config, delay_load=False, **kwargs): 33 | projector_type = getattr(config, "mm_projector_type", 
"linear") 34 | 35 | if projector_type == "linear": 36 | return nn.Linear(config.mm_hidden_size, config.hidden_size) 37 | 38 | if projector_type == "pooler": 39 | return PoolerProjector(config, kwargs["vision_cfg"]) 40 | 41 | mlp_gelu_match = re.match(r"^mlp(\d+)x_gelu$", projector_type) 42 | if mlp_gelu_match: 43 | mlp_depth = int(mlp_gelu_match.group(1)) 44 | modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)] 45 | for _ in range(1, mlp_depth): 46 | modules.append(nn.GELU()) 47 | modules.append(nn.Linear(config.hidden_size, config.hidden_size)) 48 | return nn.Sequential(*modules) 49 | 50 | mlp_gelu_resnet_match = re.match(r"^mlp(\d+)x_res(\d+)x_gelu$", projector_type) 51 | if mlp_gelu_resnet_match: 52 | mlp_depth = int(mlp_gelu_resnet_match.group(1)) 53 | res_depth = int(mlp_gelu_resnet_match.group(2)) 54 | modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)] 55 | for _ in range(1, mlp_depth): 56 | modules.append(nn.GELU()) 57 | modules.append(nn.Linear(config.hidden_size, config.hidden_size)) 58 | for _ in range(res_depth): 59 | modules.append(SimpleResBlock(config.hidden_size)) 60 | return nn.Sequential(*modules) 61 | 62 | if projector_type == "identity": 63 | return IdentityMap() 64 | 65 | raise ValueError(f"Unknown projector type: {projector_type}") 66 | -------------------------------------------------------------------------------- /llava/model/multimodal_projector/pooler_projector.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | import math 5 | 6 | from transformers.models.clip.modeling_clip import CLIPVisionModel 7 | 8 | 9 | class PoolerProjector(nn.Module): 10 | def __init__(self, config, vision_cfg): 11 | super().__init__() 12 | self._config = config 13 | self.hw = vision_cfg.image_size // vision_cfg.patch_size 14 | 15 | self.conv_pool = nn.Conv2d(config.mm_hidden_size, config.hidden_size, kernel_size=2, stride=2) 16 | 17 | self.proj = nn.Sequential( 18 | nn.GELU(), 19 | nn.Linear(config.hidden_size, config.hidden_size), 20 | ) 21 | 22 | def forward(self, x, *args, **kwargs): 23 | height = width = self.hw 24 | assert height * width == x.shape[1] 25 | x = x.view(x.shape[0], height, width, -1).permute(0, 3, 1, 2) 26 | x = self.conv_pool(x) 27 | x = x.flatten(2).transpose(1, 2) 28 | x = self.proj(x) 29 | return x 30 | 31 | @property 32 | def config(self): 33 | return {"mm_projector_type": "pooler"} 34 | -------------------------------------------------------------------------------- /llava/model/multimodal_resampler/__pycache__/builder.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongyh20/Insight-V/5e9c6f934c1929e351dbff17a6c6c7bc2d8143b1/llava/model/multimodal_resampler/__pycache__/builder.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/multimodal_resampler/__pycache__/masked_drop.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongyh20/Insight-V/5e9c6f934c1929e351dbff17a6c6c7bc2d8143b1/llava/model/multimodal_resampler/__pycache__/masked_drop.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/multimodal_resampler/__pycache__/perceiver.cpython-310.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/dongyh20/Insight-V/5e9c6f934c1929e351dbff17a6c6c7bc2d8143b1/llava/model/multimodal_resampler/__pycache__/perceiver.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/multimodal_resampler/__pycache__/qformer.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongyh20/Insight-V/5e9c6f934c1929e351dbff17a6c6c7bc2d8143b1/llava/model/multimodal_resampler/__pycache__/qformer.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/multimodal_resampler/__pycache__/spatial_pool.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongyh20/Insight-V/5e9c6f934c1929e351dbff17a6c6c7bc2d8143b1/llava/model/multimodal_resampler/__pycache__/spatial_pool.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/multimodal_resampler/builder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from .masked_drop import MaskedDrop 4 | from .spatial_pool import SpatialPool 5 | from .perceiver import PerceiverResampler 6 | from .qformer import Qformer 7 | 8 | 9 | class IdentityMap(torch.nn.Module): 10 | def __init__(self): 11 | super().__init__() 12 | 13 | def forward(self, x, *args, **kwargs): 14 | return x 15 | 16 | @property 17 | def config(self): 18 | return {"mm_resampler_type": None} 19 | 20 | 21 | def build_vision_resampler(model_args, delay_load=False, **kwargs): 22 | resampler_type = getattr(model_args, "mm_resampler_type", None) 23 | if resampler_type == "masked_drop": 24 | return MaskedDrop(model_args) 25 | elif resampler_type == "spatial_pool": 26 | return SpatialPool(model_args, **kwargs) 27 | elif resampler_type == "perceiver": 28 | return PerceiverResampler(model_args, **kwargs) 29 | elif resampler_type == "qformer": 30 | return Qformer(model_args, **kwargs) 31 | elif resampler_type is None: 32 | return IdentityMap() 33 | 34 | raise ValueError(f"Unknown resampler type: {resampler_type}") 35 | -------------------------------------------------------------------------------- /llava/model/multimodal_resampler/masked_drop.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | import random 5 | 6 | 7 | class MaskedDrop(nn.Module): 8 | def __init__(self, model_args): 9 | super().__init__() 10 | 11 | self.mode = model_args.mm_mask_drop_mode 12 | self.skip_percentage = model_args.mm_mask_drop_skip_percentage 13 | self.ratio = model_args.mm_mask_drop_ratio 14 | self.ratio_upper = model_args.mm_mask_drop_ratio_upper 15 | self.ratio_lower = model_args.mm_mask_drop_ratio_lower 16 | 17 | def forward(self, image_features, *args, **kwargs): 18 | 19 | if not self.training: 20 | return image_features 21 | 22 | if self.skip_percentage > random.random(): 23 | return image_features 24 | 25 | masked_features = [] 26 | 27 | for image_feature in image_features: 28 | num_tokens = image_feature.shape[0] 29 | if self.mode == "fixed": 30 | num_keep = int(num_tokens * self.ratio) 31 | masked_features.append(self.random_masking(image_feature.unsqueeze(0), num_keep)[0][0]) 32 | elif self.mode == "range": 33 | num_keep = int(num_tokens * random.uniform(self.ratio_lower, self.ratio_upper)) 34 | 
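                # random_masking returns (x_masked, mask, ids_restore); [0] keeps only the masked tokens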
masked_features.append(self.random_masking(image_feature.unsqueeze(0), num_keep)[0]) 35 | elif self.mode == "cls_only": 36 | masked_features.append(image_feature[0:1]) 37 | else: 38 | raise ValueError(f"Unexpected masked drop mode: {self.mode}") 39 | 40 | if self.mode not in ["range"] and (type(image_features) is not list or self.mode in ["cls_only"]): 41 | masked_features = torch.stack(masked_features, dim=0) 42 | 43 | return masked_features 44 | 45 | @property 46 | def config(self): 47 | return { 48 | "mm_resampler_type": "masked_drop", 49 | "mm_mask_drop_mode": self.mode, 50 | "mm_mask_drop_skip_percentage": self.skip_percentage, 51 | "mm_mask_drop_ratio": self.ratio, 52 | "mm_mask_drop_ratio_upper": self.ratio_upper, 53 | "mm_mask_drop_ratio_lower": self.ratio_lower, 54 | } 55 | 56 | def random_masking(self, x, len_keep): 57 | """ 58 | Perform per-sample random masking by per-sample shuffling. 59 | Per-sample shuffling is done by argsort random noise. 60 | x: [N, L, D], sequence 61 | """ 62 | N, L, D = x.shape # batch, length, dim 63 | 64 | noise = torch.rand(N, L, device=x.device) # noise in [0, 1] 65 | 66 | # sort noise for each sample 67 | ids_shuffle = torch.argsort(noise, dim=1) # ascend: small is keep, large is remove 68 | ids_restore = torch.argsort(ids_shuffle, dim=1) 69 | 70 | # keep the first subset 71 | ids_keep = ids_shuffle[:, :len_keep] 72 | x_masked = torch.gather(x, dim=1, index=ids_keep.unsqueeze(-1).repeat(1, 1, D)) 73 | 74 | # generate the binary mask: 0 is keep, 1 is remove 75 | mask = torch.ones([N, L], device=x.device) 76 | mask[:, :len_keep] = 0 77 | # unshuffle to get the binary mask 78 | mask = torch.gather(mask, dim=1, index=ids_restore) 79 | 80 | return x_masked, mask, ids_restore 81 | -------------------------------------------------------------------------------- /llava/model/multimodal_resampler/perceiver.py: -------------------------------------------------------------------------------- 1 | """ 2 | Taken from https://github.com/lucidrains/flamingo-pytorch 3 | """ 4 | 5 | import torch 6 | from einops import rearrange, repeat 7 | 8 | try: 9 | from einops_exts import rearrange_many 10 | except: 11 | pass 12 | 13 | from torch import einsum, nn 14 | 15 | 16 | def exists(val): 17 | return val is not None 18 | 19 | 20 | def FeedForward(dim, mult=4): 21 | inner_dim = int(dim * mult) 22 | return nn.Sequential( 23 | nn.LayerNorm(dim), 24 | nn.Linear(dim, inner_dim, bias=False), 25 | nn.GELU(), 26 | nn.Linear(inner_dim, dim, bias=False), 27 | ) 28 | 29 | 30 | class PerceiverAttention(nn.Module): 31 | def __init__(self, *, dim, dim_head=64, heads=8): 32 | super().__init__() 33 | self.scale = dim_head**-0.5 34 | self.heads = heads 35 | inner_dim = dim_head * heads 36 | 37 | self.norm_media = nn.LayerNorm(dim) 38 | self.norm_latents = nn.LayerNorm(dim) 39 | 40 | self.to_q = nn.Linear(dim, inner_dim, bias=False) 41 | self.to_kv = nn.Linear(dim, inner_dim * 2, bias=False) 42 | self.to_out = nn.Linear(inner_dim, dim, bias=False) 43 | 44 | def forward(self, x, latents): 45 | """ 46 | Args: 47 | x (torch.Tensor): image features 48 | shape (b, T, n1, D) 49 | latent (torch.Tensor): latent features 50 | shape (b, T, n2, D) 51 | """ 52 | x = self.norm_media(x) 53 | latents = self.norm_latents(latents) 54 | 55 | h = self.heads 56 | 57 | q = self.to_q(latents) 58 | kv_input = torch.cat((x, latents), dim=-2) 59 | k, v = self.to_kv(kv_input).chunk(2, dim=-1) 60 | q, k, v = rearrange_many((q, k, v), "b t n (h d) -> b h t n d", h=h) 61 | q = q * self.scale 62 | 63 | # attention 64 | 
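        # latents act as queries while keys/values cover the media tokens concatenated with the
        # latents themselves (Flamingo-style); the row-wise max is subtracted before softmax for
        # numerical stability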
sim = einsum("... i d, ... j d -> ... i j", q, k) 65 | sim = sim - sim.amax(dim=-1, keepdim=True).detach() 66 | attn = sim.softmax(dim=-1) 67 | 68 | out = einsum("... i j, ... j d -> ... i d", attn, v) 69 | out = rearrange(out, "b h t n d -> b t n (h d)", h=h) 70 | return self.to_out(out) 71 | 72 | 73 | class PerceiverResamplerModule(nn.Module): 74 | def __init__( 75 | self, 76 | *, 77 | dim, 78 | depth=6, 79 | dim_head=64, 80 | heads=8, 81 | num_latents=64, 82 | max_num_media=None, 83 | max_num_frames=None, 84 | ff_mult=4, 85 | ): 86 | super().__init__() 87 | self.latents = nn.Parameter(torch.randn(num_latents, dim)) 88 | self.frame_embs = nn.Parameter(torch.randn(max_num_frames, dim)) if exists(max_num_frames) else None 89 | self.media_time_embs = nn.Parameter(torch.randn(max_num_media, 1, dim)) if exists(max_num_media) else None 90 | 91 | self.layers = nn.ModuleList([]) 92 | for _ in range(depth): 93 | self.layers.append( 94 | nn.ModuleList( 95 | [ 96 | PerceiverAttention(dim=dim, dim_head=dim_head, heads=heads), 97 | FeedForward(dim=dim, mult=ff_mult) if ff_mult > 0 else nn.Identity(), 98 | ] 99 | ) 100 | ) 101 | 102 | self.norm = nn.LayerNorm(dim) 103 | 104 | def forward(self, x): 105 | """ 106 | Args: 107 | x (torch.Tensor): image features 108 | shape (b, T, F, v, D) 109 | Returns: 110 | shape (b, T, n, D) where n is self.num_latents 111 | """ 112 | b, T, F, v = x.shape[:4] 113 | 114 | # frame and media time embeddings 115 | if exists(self.frame_embs): 116 | frame_embs = repeat(self.frame_embs[:F], "F d -> b T F v d", b=b, T=T, v=v) 117 | x = x + frame_embs 118 | x = rearrange(x, "b T F v d -> b T (F v) d") # flatten the frame and spatial dimensions 119 | if exists(self.media_time_embs): 120 | x = x + self.media_time_embs[:T] 121 | 122 | # blocks 123 | latents = repeat(self.latents, "n d -> b T n d", b=b, T=T) 124 | for attn, ff in self.layers: 125 | latents = attn(x, latents) + latents 126 | latents = ff(latents) + latents 127 | return self.norm(latents) 128 | 129 | 130 | class PerceiverResampler(nn.Module): 131 | def __init__(self, model_args, vision_tower): 132 | super().__init__() 133 | 134 | self.depth = model_args.mm_perceiver_depth 135 | self.num_latents = model_args.mm_perceiver_latents 136 | self.ff_mult = model_args.mm_perceiver_ff_mult 137 | self.pretrained = model_args.mm_perceiver_pretrained 138 | 139 | self.perceiver = PerceiverResamplerModule(dim=vision_tower.hidden_size, depth=self.depth, num_latents=self.num_latents, ff_mult=self.ff_mult) 140 | 141 | if self.pretrained is not None: 142 | self.load_state_dict(torch.load(self.pretrained)) 143 | 144 | def forward(self, image_features, *args, **kwargs): 145 | return self.perceiver(image_features[:, None, None]).squeeze(1) 146 | 147 | @property 148 | def config(self): 149 | return { 150 | "mm_resampler_type": "perceiver", 151 | "mm_perceiver_depth": self.depth, 152 | "mm_perceiver_latents": self.num_latents, 153 | "mm_perceiver_ff_mult": self.ff_mult, 154 | "mm_perceiver_pretrained": self.pretrained, 155 | } 156 | -------------------------------------------------------------------------------- /llava/model/multimodal_resampler/spatial_pool.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import math 4 | 5 | 6 | class SpatialPool(nn.Module): 7 | def __init__(self, model_args, vision_tower): 8 | super().__init__() 9 | 10 | self.mode = model_args.mm_spatial_pool_mode 11 | self.stride = model_args.mm_spatial_pool_stride 12 | self.out_channels = 
getattr(model_args, "mm_spatial_pool_out_channels", vision_tower.hidden_size) 13 | 14 | if self.mode == "average": 15 | self.pool = nn.AvgPool2d(kernel_size=self.stride, stride=self.stride) 16 | elif self.mode == "max": 17 | self.pool = nn.MaxPool2d(kernel_size=self.stride, stride=self.stride) 18 | elif self.mode == "conv": 19 | self.pool = nn.Conv2d(in_channels=vision_tower.hidden_size, out_channels=self.out_channels, kernel_size=self.stride, stride=self.stride) 20 | else: 21 | raise ValueError(f"Unknown pooling mode: {self.pool}.") 22 | 23 | def forward(self, image_features, images, *args, **kwargs): 24 | ori_W = int(math.sqrt(image_features.shape[1] * images.shape[3] // images.shape[2])) 25 | ori_H = int(ori_W * images.shape[2] // images.shape[3]) 26 | 27 | B, _, F = image_features.shape 28 | 29 | image_features_spatial = image_features.view(B, ori_H, ori_H, F).permute(0, 3, 1, 2) 30 | image_features_spatial_pool = self.pool(image_features_spatial) 31 | 32 | return image_features_spatial_pool.flatten(2).transpose(1, 2).contiguous() 33 | 34 | @property 35 | def config(self): 36 | return { 37 | "mm_resampler_type": "spatial_pool", 38 | "mm_spatial_pool_stride": self.stride, 39 | "mm_spatial_pool_mode": self.mode, 40 | "mm_spatial_pool_out_channels": self.out_channels, 41 | } 42 | 43 | @property 44 | def hidden_size(self): 45 | return self.out_channels 46 | -------------------------------------------------------------------------------- /llava/model/utils.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoConfig 2 | 3 | 4 | def auto_upgrade(config): 5 | cfg = AutoConfig.from_pretrained(config) 6 | if "llava" in config and "llava" not in cfg.model_type: 7 | assert cfg.model_type == "llama" 8 | print("You are using newer LLaVA code base, while the checkpoint of v0 is from older code base.") 9 | print("You must upgrade the checkpoint to the new code base (this can be done automatically).") 10 | confirm = input("Please confirm that you want to upgrade the checkpoint. 
[Y/N]") 11 | if confirm.lower() in ["y", "yes"]: 12 | print("Upgrading checkpoint...") 13 | assert len(cfg.architectures) == 1 14 | setattr(cfg.__class__, "model_type", "llava") 15 | cfg.architectures[0] = "LlavaLlamaForCausalLM" 16 | cfg.save_pretrained(config) 17 | print("Checkpoint upgraded.") 18 | else: 19 | print("Checkpoint upgrade aborted.") 20 | exit(1) 21 | -------------------------------------------------------------------------------- /llava/serve/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongyh20/Insight-V/5e9c6f934c1929e351dbff17a6c6c7bc2d8143b1/llava/serve/__init__.py -------------------------------------------------------------------------------- /llava/serve/cli.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | 4 | from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN 5 | from llava.conversation import conv_templates, SeparatorStyle 6 | from llava.model.builder import load_pretrained_model 7 | from llava.utils import disable_torch_init 8 | from llava.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria 9 | 10 | from PIL import Image 11 | 12 | import requests 13 | from PIL import Image 14 | from io import BytesIO 15 | from transformers import TextStreamer 16 | 17 | 18 | def load_image(image_file): 19 | if image_file.startswith("http") or image_file.startswith("https"): 20 | response = requests.get(image_file) 21 | image = Image.open(BytesIO(response.content)).convert("RGB") 22 | else: 23 | image = Image.open(image_file).convert("RGB") 24 | return image 25 | 26 | 27 | def main(args): 28 | # Model 29 | disable_torch_init() 30 | 31 | model_name = get_model_name_from_path(args.model_path) 32 | tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_path, args.model_base, model_name, args.load_8bit, args.load_4bit) 33 | 34 | if "llama-2" in model_name.lower(): 35 | conv_mode = "llava_llama_2" 36 | elif "v1" in model_name.lower(): 37 | conv_mode = "llava_v1" 38 | elif "mpt" in model_name.lower(): 39 | conv_mode = "mpt" 40 | else: 41 | conv_mode = "llava_v0" 42 | 43 | if args.conv_mode is not None and conv_mode != args.conv_mode: 44 | print("[WARNING] the auto inferred conversation mode is {}, while `--conv-mode` is {}, using {}".format(conv_mode, args.conv_mode, args.conv_mode)) 45 | else: 46 | args.conv_mode = conv_mode 47 | 48 | conv = conv_templates[args.conv_mode].copy() 49 | if "mpt" in model_name.lower(): 50 | roles = ("user", "assistant") 51 | else: 52 | roles = conv.roles 53 | 54 | image = load_image(args.image_file) 55 | image_tensor = image_processor.preprocess(image, return_tensors="pt")["pixel_values"].half().cuda() 56 | 57 | while True: 58 | try: 59 | inp = input(f"{roles[0]}: ") 60 | except EOFError: 61 | inp = "" 62 | if not inp: 63 | print("exit...") 64 | break 65 | 66 | print(f"{roles[1]}: ", end="") 67 | 68 | if image is not None: 69 | # first message 70 | if model.config.mm_use_im_start_end: 71 | inp = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + "\n" + inp 72 | else: 73 | inp = DEFAULT_IMAGE_TOKEN + "\n" + inp 74 | conv.append_message(conv.roles[0], inp) 75 | image = None 76 | else: 77 | # later messages 78 | conv.append_message(conv.roles[0], inp) 79 | conv.append_message(conv.roles[1], None) 80 | prompt = conv.get_prompt() 81 | 82 | input_ids = 
tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).cuda() 83 | stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2 84 | keywords = [stop_str] 85 | stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids) 86 | streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True) 87 | 88 | with torch.inference_mode(): 89 | output_ids = model.generate(input_ids, images=image_tensor, do_sample=True if args.temperature > 0 else False, temperature=args.temperature, max_new_tokens=args.max_new_tokens, streamer=streamer, use_cache=True, stopping_criteria=[stopping_criteria]) 90 | 91 | outputs = tokenizer.decode(output_ids[0, input_ids.shape[1] :]).strip() 92 | conv.messages[-1][-1] = outputs 93 | 94 | if args.debug: 95 | print("\n", {"prompt": prompt, "outputs": outputs}, "\n") 96 | 97 | 98 | if __name__ == "__main__": 99 | parser = argparse.ArgumentParser() 100 | parser.add_argument("--model-path", type=str, default="facebook/opt-350m") 101 | parser.add_argument("--model-base", type=str, default=None) 102 | parser.add_argument("--image-file", type=str, required=True) 103 | parser.add_argument("--num-gpus", type=int, default=1) 104 | parser.add_argument("--conv-mode", type=str, default=None) 105 | parser.add_argument("--temperature", type=float, default=0.2) 106 | parser.add_argument("--max-new-tokens", type=int, default=512) 107 | parser.add_argument("--load-8bit", action="store_true") 108 | parser.add_argument("--load-4bit", action="store_true") 109 | parser.add_argument("--debug", action="store_true") 110 | args = parser.parse_args() 111 | main(args) 112 | -------------------------------------------------------------------------------- /llava/serve/examples/extreme_ironing.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongyh20/Insight-V/5e9c6f934c1929e351dbff17a6c6c7bc2d8143b1/llava/serve/examples/extreme_ironing.jpg -------------------------------------------------------------------------------- /llava/serve/examples/waterview.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongyh20/Insight-V/5e9c6f934c1929e351dbff17a6c6c7bc2d8143b1/llava/serve/examples/waterview.jpg -------------------------------------------------------------------------------- /llava/serve/register_worker.py: -------------------------------------------------------------------------------- 1 | """ 2 | Manually register workers. 
3 | 4 | Usage: 5 | python3 -m fastchat.serve.register_worker --controller http://localhost:21001 --worker-name http://localhost:21002 6 | """ 7 | 8 | import argparse 9 | 10 | import requests 11 | 12 | if __name__ == "__main__": 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("--controller-address", type=str) 15 | parser.add_argument("--worker-name", type=str) 16 | parser.add_argument("--check-heart-beat", action="store_true") 17 | args = parser.parse_args() 18 | 19 | url = args.controller_address + "/register_worker" 20 | data = { 21 | "worker_name": args.worker_name, 22 | "check_heart_beat": args.check_heart_beat, 23 | "worker_status": None, 24 | } 25 | r = requests.post(url, json=data) 26 | assert r.status_code == 200 27 | -------------------------------------------------------------------------------- /llava/serve/test_message.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | import requests 5 | 6 | from llava.conversation import default_conversation 7 | 8 | 9 | def main(): 10 | if args.worker_address: 11 | worker_addr = args.worker_address 12 | else: 13 | controller_addr = args.controller_address 14 | ret = requests.post(controller_addr + "/refresh_all_workers") 15 | ret = requests.post(controller_addr + "/list_models") 16 | models = ret.json()["models"] 17 | models.sort() 18 | print(f"Models: {models}") 19 | 20 | ret = requests.post(controller_addr + "/get_worker_address", json={"model": args.model_name}) 21 | worker_addr = ret.json()["address"] 22 | print(f"worker_addr: {worker_addr}") 23 | 24 | if worker_addr == "": 25 | return 26 | 27 | conv = default_conversation.copy() 28 | conv.append_message(conv.roles[0], args.message) 29 | prompt = conv.get_prompt() 30 | 31 | headers = {"User-Agent": "LLaVA Client"} 32 | pload = { 33 | "model": args.model_name, 34 | "prompt": prompt, 35 | "max_new_tokens": args.max_new_tokens, 36 | "temperature": 0.7, 37 | "stop": conv.sep, 38 | } 39 | response = requests.post(worker_addr + "/worker_generate_stream", headers=headers, json=pload, stream=True) 40 | 41 | print(prompt.replace(conv.sep, "\n"), end="") 42 | for chunk in response.iter_lines(chunk_size=8192, decode_unicode=False, delimiter=b"\0"): 43 | if chunk: 44 | data = json.loads(chunk.decode("utf-8")) 45 | output = data["text"].split(conv.sep)[-1] 46 | print(output, end="\r") 47 | print("") 48 | 49 | 50 | if __name__ == "__main__": 51 | parser = argparse.ArgumentParser() 52 | parser.add_argument("--controller-address", type=str, default="http://localhost:21001") 53 | parser.add_argument("--worker-address", type=str) 54 | parser.add_argument("--model-name", type=str, default="facebook/opt-350m") 55 | parser.add_argument("--max-new-tokens", type=int, default=32) 56 | parser.add_argument("--message", type=str, default="Tell me a story with more than 1000 words.") 57 | args = parser.parse_args() 58 | 59 | main() 60 | -------------------------------------------------------------------------------- /llava/train/__pycache__/llava_trainer.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongyh20/Insight-V/5e9c6f934c1929e351dbff17a6c6c7bc2d8143b1/llava/train/__pycache__/llava_trainer.cpython-310.pyc -------------------------------------------------------------------------------- /llava/train/__pycache__/train.cpython-310.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/dongyh20/Insight-V/5e9c6f934c1929e351dbff17a6c6c7bc2d8143b1/llava/train/__pycache__/train.cpython-310.pyc -------------------------------------------------------------------------------- /llava/train/llama_flash_attn_monkey_patch.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Tuple 2 | import warnings 3 | 4 | import torch 5 | 6 | import transformers 7 | from transformers.models.llama.modeling_llama import apply_rotary_pos_emb, repeat_kv 8 | 9 | try: 10 | from flash_attn.flash_attn_interface import flash_attn_unpadded_qkvpacked_func 11 | except ImportError: 12 | from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func as flash_attn_unpadded_qkvpacked_func 13 | from flash_attn.bert_padding import unpad_input, pad_input 14 | 15 | 16 | def forward( 17 | self, 18 | hidden_states: torch.Tensor, 19 | attention_mask: Optional[torch.Tensor] = None, 20 | position_ids: Optional[torch.Tensor] = None, 21 | past_key_value: Optional[Tuple[torch.Tensor]] = None, 22 | output_attentions: bool = False, 23 | use_cache: bool = False, 24 | padding_mask: Optional[torch.Tensor] = None, 25 | ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: 26 | if output_attentions: 27 | warnings.warn("Output attentions is not supported for patched `LlamaAttention`, returning `None` instead.") 28 | 29 | bsz, q_len, _ = hidden_states.size() 30 | 31 | query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) 32 | key_states = self.k_proj(hidden_states).view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) 33 | value_states = self.v_proj(hidden_states).view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) # shape: (b, num_heads, s, head_dim) 34 | 35 | kv_seq_len = key_states.shape[-2] 36 | if past_key_value is not None: 37 | kv_seq_len += past_key_value[0].shape[-2] 38 | 39 | cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) 40 | query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) 41 | 42 | if past_key_value is not None: 43 | # reuse k, v 44 | key_states = torch.cat([past_key_value[0], key_states], dim=2) 45 | value_states = torch.cat([past_key_value[1], value_states], dim=2) 46 | 47 | past_key_value = (key_states, value_states) if use_cache else None 48 | 49 | # repeat k/v heads if n_kv_heads < n_heads 50 | key_states = repeat_kv(key_states, self.num_key_value_groups) 51 | value_states = repeat_kv(value_states, self.num_key_value_groups) 52 | 53 | # Transform the data into the format required by flash attention 54 | qkv = torch.stack([query_states, key_states, value_states], dim=2) 55 | qkv = qkv.transpose(1, 3) # shape: [b, s, 3, num_heads, head_dim] 56 | key_padding_mask = attention_mask 57 | 58 | if key_padding_mask is None: 59 | qkv = qkv.reshape(-1, 3, self.num_heads, self.head_dim) 60 | cu_q_lens = torch.arange(0, (bsz + 1) * q_len, step=q_len, dtype=torch.int32, device=qkv.device) 61 | max_s = q_len 62 | output = flash_attn_unpadded_qkvpacked_func(qkv, cu_q_lens, max_s, 0.0, softmax_scale=None, causal=True) 63 | output = output.view(bsz, q_len, -1) 64 | else: 65 | qkv = qkv.reshape(bsz, q_len, -1) 66 | qkv, indices, cu_q_lens, max_s = unpad_input(qkv, key_padding_mask) 67 | qkv = qkv.view(-1, 3, self.num_heads, self.head_dim) 68 | output_unpad = flash_attn_unpadded_qkvpacked_func(qkv, cu_q_lens, max_s, 0.0, softmax_scale=None, 
causal=True) 69 | output_unpad = output_unpad.reshape(-1, self.num_heads * self.head_dim) 70 | output = pad_input(output_unpad, indices, bsz, q_len) 71 | 72 | return self.o_proj(output), None, past_key_value 73 | 74 | 75 | # Disable the transformation of the attention mask in LlamaModel as the flash attention 76 | # requires the attention mask to be the same as the key_padding_mask 77 | def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length): 78 | # [bsz, seq_len] 79 | return attention_mask 80 | 81 | 82 | def replace_llama_attn_with_flash_attn(): 83 | cuda_major, cuda_minor = torch.cuda.get_device_capability() 84 | if cuda_major < 8: 85 | warnings.warn("Flash attention is only supported on A100 or H100 GPU during training due to head dim > 64 backward." "ref: https://github.com/HazyResearch/flash-attention/issues/190#issuecomment-1523359593") 86 | transformers.models.llama.modeling_llama.LlamaModel._prepare_decoder_attention_mask = _prepare_decoder_attention_mask 87 | transformers.models.llama.modeling_llama.LlamaAttention.forward = forward 88 | -------------------------------------------------------------------------------- /llava/train/llava_trainer_eval.py: -------------------------------------------------------------------------------- 1 | import json 2 | import subprocess 3 | 4 | from llava.train.llava_trainer import LLaVATrainer 5 | 6 | 7 | class LLaVAEvalTrainer(LLaVATrainer): 8 | def evaluate(self, evaluate_args): 9 | cmd = f"accelerate launch --num_processes {evaluate_args.eval_num_processes} -m lmms_eval \ 10 | --model {evaluate_args.model} \ 11 | --model_args {evaluate_args.model_args} \ 12 | --tasks {evaluate_args.task_names} \ 13 | --batch_size {evaluate_args.batch_size} \ 14 | --log_samples_suffix {evaluate_args.log_samples_suffix} \ 15 | --output_path {evaluate_args.output_path}" 16 | if evaluate_args.limit: 17 | cmd += f" --limit {evaluate_args.limit}" 18 | if evaluate_args.num_fewshot: 19 | cmd += f" --num_fewshot {evaluate_args.num_fewshot}" 20 | if evaluate_args.gen_kwargs != "": 21 | cmd += f" --gen_kwargs {evaluate_args.gen_kwargs}" 22 | if evaluate_args.log_samples: 23 | cmd += f" --log_samples" 24 | else: 25 | assert False, "Please log samples so that the result can be parsed" 26 | results = subprocess.run([cmd], shell=True, capture_output=True, text=True) 27 | try: 28 | result_file_index_start = results.stdout.index("Saved samples to ") 29 | result_file_index_end = results.stdout.index(f".json") 30 | result_file_index_start += len("Saved samples to ") 31 | file = results.stdout[result_file_index_start:result_file_index_end] 32 | except: 33 | result_file_index_start = results.stderr.index("Saved samples to ") 34 | result_file_index_end = results.stderr.index(f".json") 35 | result_file_index_start += len("Saved samples to ") 36 | file = results.stderr[result_file_index_start:result_file_index_end] 37 | file = file.split("/")[:-1] 38 | file = "/".join(file) + "/results.json" 39 | with open(file, "r") as f: 40 | lmms_eval_results = json.load(f) 41 | result_dict = {} 42 | tasks_list = evaluate_args.task_names.split(",") 43 | for task in tasks_list: 44 | task_results = lmms_eval_results["results"][task] 45 | for k, v in task_results.items(): 46 | if k != "alias" and "stderr" not in k: 47 | metric = k.split(",")[0] 48 | result_dict[f"{task}_{metric}"] = v 49 | return result_dict 50 | 51 | """def evaluate(self, evaluate_args): 52 | initialize_tasks() 53 | tasks_list = evaluate_args.task_names.split(",") 54 | result_dict 
= {} 55 | results = evaluator.simple_evaluate( 56 | model=evaluate_args.model, 57 | model_args=evaluate_args.model_args, 58 | tasks=tasks_list, 59 | num_fewshot=evaluate_args.num_fewshot, 60 | batch_size=evaluate_args.batch_size, 61 | device=evaluate_args.device, 62 | limit=evaluate_args.limit, 63 | check_integrity=evaluate_args.check_integrity, 64 | show_task_to_terminal=evaluate_args.show_task_to_terminal, 65 | log_samples=evaluate_args.log_samples, 66 | gen_kwargs=evaluate_args.gen_kwargs, 67 | cli_args=evaluate_args, 68 | ) 69 | for task in tasks_list: 70 | task_results = results["results"][task] 71 | for k,v in task_results.items(): 72 | if k != "alias" and "stderr" not in k: 73 | metric = k.split(",")[0] 74 | result_dict[f"{task}_{metric}"] = v 75 | 76 | return result_dict""" 77 | -------------------------------------------------------------------------------- /llava/train/train_mem.py: -------------------------------------------------------------------------------- 1 | from llava.train.train import train 2 | 3 | if __name__ == "__main__": 4 | train() 5 | -------------------------------------------------------------------------------- /llava/utils.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import logging 3 | import logging.handlers 4 | import os 5 | import sys 6 | import numpy as np 7 | 8 | import requests 9 | 10 | from llava.constants import LOGDIR 11 | 12 | server_error_msg = "**NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.**" 13 | moderation_msg = "I am sorry. Your input may violate our content moderation guidelines. Please avoid using harmful or offensive content." 14 | 15 | handler = None 16 | 17 | import torch.distributed as dist 18 | 19 | try: 20 | import av 21 | from decord import VideoReader, cpu 22 | except ImportError: 23 | print("Please install pyav to use video processing functions.") 24 | 25 | def process_video_with_decord(video_file, data_args): 26 | vr = VideoReader(video_file, ctx=cpu(0), num_threads=1) 27 | total_frame_num = len(vr) 28 | video_time = total_frame_num / vr.get_avg_fps() 29 | avg_fps = round(vr.get_avg_fps() / data_args.video_fps) 30 | frame_idx = [i for i in range(0, total_frame_num, avg_fps)] 31 | frame_time = [i/avg_fps for i in frame_idx] 32 | 33 | 34 | if data_args.frames_upbound > 0: 35 | if len(frame_idx) > data_args.frames_upbound or data_args.force_sample: 36 | uniform_sampled_frames = np.linspace(0, total_frame_num - 1, data_args.frames_upbound, dtype=int) 37 | frame_idx = uniform_sampled_frames.tolist() 38 | frame_time = [i/vr.get_avg_fps() for i in frame_idx] 39 | 40 | video = vr.get_batch(frame_idx).asnumpy() 41 | frame_time = ",".join([f"{i:.2f}s" for i in frame_time]) 42 | 43 | num_frames_to_sample = num_frames = len(frame_idx) 44 | # https://github.com/dmlc/decord/issues/208 45 | vr.seek(0) 46 | return video, video_time, frame_time, num_frames_to_sample 47 | 48 | def process_video_with_pyav(video_file, data_args): 49 | container = av.open(video_file) 50 | # !!! This is the only difference. 
Using auto threading 51 | container.streams.video[0].thread_type = "AUTO" 52 | 53 | video_frames = [] 54 | for packet in container.demux(): 55 | if packet.stream.type == 'video': 56 | for frame in packet.decode(): 57 | video_frames.append(frame) 58 | total_frame_num = len(video_frames) 59 | video_time = video_frames[-1].time 60 | avg_fps = round(total_frame_num / video_time / data_args.video_fps) 61 | frame_idx = [i for i in range(0, total_frame_num, avg_fps)] 62 | 63 | if data_args.frames_upbound > 0: 64 | if len(frame_idx) > data_args.frames_upbound: 65 | uniform_sampled_frames = np.linspace(0, total_frame_num - 1, data_args.frames_upbound, dtype=int) 66 | frame_idx = uniform_sampled_frames.tolist() 67 | 68 | 69 | frames = [video_frames[i] for i in frame_idx] 70 | return np.stack([x.to_ndarray(format="rgb24") for x in frames]) 71 | 72 | 73 | def rank0_print(*args): 74 | if dist.is_initialized(): 75 | if dist.get_rank() == 0: 76 | print(f"Rank {dist.get_rank()}: ", *args) 77 | else: 78 | print(*args) 79 | 80 | 81 | def rank_print(*args): 82 | if dist.is_initialized(): 83 | print(f"Rank {dist.get_rank()}: ", *args) 84 | else: 85 | print(*args) 86 | 87 | def build_logger(logger_name, logger_filename): 88 | global handler 89 | 90 | formatter = logging.Formatter( 91 | fmt="%(asctime)s | %(levelname)s | %(name)s | %(message)s", 92 | datefmt="%Y-%m-%d %H:%M:%S", 93 | ) 94 | 95 | # Set the format of root handlers 96 | if not logging.getLogger().handlers: 97 | logging.basicConfig(level=logging.INFO) 98 | logging.getLogger().handlers[0].setFormatter(formatter) 99 | 100 | # Redirect stdout and stderr to loggers 101 | stdout_logger = logging.getLogger("stdout") 102 | stdout_logger.setLevel(logging.INFO) 103 | sl = StreamToLogger(stdout_logger, logging.INFO) 104 | sys.stdout = sl 105 | 106 | stderr_logger = logging.getLogger("stderr") 107 | stderr_logger.setLevel(logging.ERROR) 108 | sl = StreamToLogger(stderr_logger, logging.ERROR) 109 | sys.stderr = sl 110 | 111 | # Get logger 112 | logger = logging.getLogger(logger_name) 113 | logger.setLevel(logging.INFO) 114 | 115 | # Add a file handler for all loggers 116 | if handler is None: 117 | os.makedirs(LOGDIR, exist_ok=True) 118 | filename = os.path.join(LOGDIR, logger_filename) 119 | handler = logging.handlers.TimedRotatingFileHandler(filename, when="D", utc=True) 120 | handler.setFormatter(formatter) 121 | 122 | for name, item in logging.root.manager.loggerDict.items(): 123 | if isinstance(item, logging.Logger): 124 | item.addHandler(handler) 125 | 126 | return logger 127 | 128 | 129 | class StreamToLogger(object): 130 | """ 131 | Fake file-like stream object that redirects writes to a logger instance. 132 | """ 133 | 134 | def __init__(self, logger, log_level=logging.INFO): 135 | self.terminal = sys.stdout 136 | self.logger = logger 137 | self.log_level = log_level 138 | self.linebuf = "" 139 | 140 | def __getattr__(self, attr): 141 | return getattr(self.terminal, attr) 142 | 143 | def write(self, buf): 144 | temp_linebuf = self.linebuf + buf 145 | self.linebuf = "" 146 | for line in temp_linebuf.splitlines(True): 147 | # From the io.TextIOWrapper docs: 148 | # On output, if newline is None, any '\n' characters written 149 | # are translated to the system default line separator. 150 | # By default sys.stdout.write() expects '\n' newlines and then 151 | # translates them so this is still cross platform. 
152 | if line[-1] == "\n": 153 | self.logger.log(self.log_level, line.rstrip()) 154 | else: 155 | self.linebuf += line 156 | 157 | def flush(self): 158 | if self.linebuf != "": 159 | self.logger.log(self.log_level, self.linebuf.rstrip()) 160 | self.linebuf = "" 161 | 162 | 163 | def disable_torch_init(): 164 | """ 165 | Disable the redundant torch default initialization to accelerate model creation. 166 | """ 167 | import torch 168 | 169 | setattr(torch.nn.Linear, "reset_parameters", lambda self: None) 170 | setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None) 171 | 172 | 173 | def violates_moderation(text): 174 | """ 175 | Check whether the text violates OpenAI moderation API. 176 | """ 177 | url = "https://api.openai.com/v1/moderations" 178 | headers = {"Content-Type": "application/json", "Authorization": "Bearer " + os.environ["OPENAI_API_KEY"]} 179 | text = text.replace("\n", "") 180 | data = "{" + '"input": ' + f'"{text}"' + "}" 181 | data = data.encode("utf-8") 182 | try: 183 | ret = requests.post(url, headers=headers, data=data, timeout=5) 184 | flagged = ret.json()["results"][0]["flagged"] 185 | except requests.exceptions.RequestException as e: 186 | print(f"######################### Moderation Error: {e} #########################") 187 | flagged = False 188 | except KeyError as e: 189 | print(f"######################### Moderation Error: {e} #########################") 190 | flagged = False 191 | 192 | return flagged 193 | 194 | 195 | def pretty_print_semaphore(semaphore): 196 | if semaphore is None: 197 | return "None" 198 | return f"Semaphore(value={semaphore._value}, locked={semaphore.locked()})" 199 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Babel==2.14.0 2 | DataProperty==1.0.1 3 | Deprecated==1.2.14 4 | GitPython==3.1.43 5 | Jinja2==3.1.3 6 | Levenshtein==0.25.1 7 | MarkupSafe==2.1.5 8 | PyJWT==2.8.0 9 | PyYAML==6.0.1 10 | Pygments==2.17.2 11 | QtPy==2.4.1 12 | Send2Trash==1.8.3 13 | absl-py==2.1.0 14 | accelerate==0.29.3 15 | aiofiles==22.1.0 16 | aiohttp==3.9.5 17 | aiosignal==1.3.1 18 | aiosqlite==0.20.0 19 | altair==5.3.0 20 | anyio==4.3.0 21 | appdirs==1.4.4 22 | argon2-cffi-bindings==21.2.0 23 | argon2-cffi==23.1.0 24 | arrow==1.3.0 25 | asttokens==2.4.1 26 | async-timeout==4.0.3 27 | attrs==23.1.0 28 | beautifulsoup4==4.12.3 29 | bidict==0.23.1 30 | bitsandbytes==0.41.0 31 | black==24.1.0 32 | bleach==6.1.0 33 | byted-remote-ikernel==0.4.8 34 | byted-torch-monitor==0.0.1 35 | byted-wandb==0.13.72 36 | bytedance-context==0.7.1 37 | bytedance-metrics==0.5.1 38 | bytedance.modelhub==0.0.64 39 | bytedance.servicediscovery==0.1.2 40 | bytedbackgrounds==0.0.6 41 | byteddatabus==1.0.6 42 | byteddps==0.1.2 43 | bytedenv==0.6.2 44 | bytedlogger==0.15.1 45 | bytedmemfd==0.2 46 | bytedmetrics==0.10.2 47 | bytedpymongo==2.0.5 48 | bytedrh2==1.18.7a2 49 | bytedservicediscovery==0.17.4 50 | bytedtcc==1.4.2 51 | bytedtos==1.1.16 52 | bytedtrace==0.3.0 53 | bytedztijwthelper==0.0.22 54 | bytedztispiffe==0.0.11 55 | certifi==2024.2.2 56 | cffi==1.16.0 57 | cfgv==3.4.0 58 | chardet==5.2.0 59 | charset-normalizer==3.3.2 60 | click==8.1.7 61 | colorama==0.4.6 62 | comm==0.2.2 63 | contourpy==1.2.1 64 | crcmod==1.7 65 | cryptography==38.0.4 66 | cycler==0.12.1 67 | datasets==2.16.1 68 | debugpy==1.8.1 69 | decorator==5.1.1 70 | decord==0.6.0 71 | deepspeed==0.12.2 72 | defusedxml==0.7.1 73 | dill==0.3.7 74 | distlib==0.3.8 75 | 
distro==1.9.0 76 | dnspython==2.6.1 77 | docker-pycreds==0.4.0 78 | docstring_parser==0.16 79 | einops-exts==0.0.4 80 | einops==0.6.1 81 | entrypoints==0.4 82 | et-xmlfile==1.1.0 83 | eval_type_backport==0.2.0 84 | evaluate==0.4.1 85 | exceptiongroup==1.2.1 86 | executing==2.0.1 87 | fastapi==0.110.2 88 | fastjsonschema==2.19.1 89 | ffmpy==0.3.2 90 | filelock==3.13.4 91 | flash-attn==2.5.7 92 | fonttools==4.51.0 93 | fqdn==1.5.1 94 | frozenlist==1.4.1 95 | fsspec==2023.10.0 96 | ftfy==6.2.0 97 | gitdb==4.0.11 98 | gradio==3.35.2 99 | gradio_client==0.2.9 100 | grpcio==1.62.2 101 | h11==0.14.0 102 | hf_transfer==0.1.6 103 | hjson==3.1.0 104 | httpcore==0.17.3 105 | httpx==0.24.0 106 | huggingface-hub==0.22.2 107 | identify==2.5.36 108 | idna==3.7 109 | importlib_metadata==7.1.0 110 | importlib_resources==6.4.0 111 | iniconfig==2.0.0 112 | ipaddress==1.0.23 113 | ipykernel==6.29.4 114 | ipython-genutils==0.2.0 115 | ipython==8.18.1 116 | ipywidgets==8.1.2 117 | isoduration==20.11.0 118 | jedi==0.19.1 119 | joblib==1.4.0 120 | json5==0.9.25 121 | jsonlines==4.0.0 122 | jsonpointer==2.4 123 | jsonschema-specifications==2023.12.1 124 | jsonschema==4.21.1 125 | jupyter-client==7.0.0 126 | jupyter-console==6.6.3 127 | jupyter-events==0.10.0 128 | jupyter-ydoc==0.2.5 129 | jupyter==1.0.0 130 | jupyter_core==5.7.2 131 | jupyter_server==2.14.0 132 | jupyter_server_fileid==0.9.2 133 | jupyter_server_terminals==0.5.3 134 | jupyter_server_ydoc==0.8.0 135 | jupyterlab==3.6.4 136 | jupyterlab_pygments==0.3.0 137 | jupyterlab_server==2.27.1 138 | jupyterlab_widgets==3.0.10 139 | kiwisolver==1.4.5 140 | linkify-it-py==2.0.3 141 | llava==1.7.0.dev0 142 | llava==1.7.0.dev0 143 | lmms_eval==0.1.1 144 | lxml==5.2.1 145 | markdown-it-py==2.2.0 146 | markdown2==2.4.13 147 | matplotlib-inline==0.1.7 148 | matplotlib==3.8.4 149 | mbstrdecoder==1.1.3 150 | mdit-py-plugins==0.3.3 151 | mdurl==0.1.2 152 | mistune==3.0.2 153 | mpmath==1.3.0 154 | msgpack==1.0.8 155 | multidict==6.0.5 156 | multiprocess==0.70.15 157 | mypy-extensions==1.0.0 158 | nbclassic==1.0.0 159 | nbclient==0.10.0 160 | nbconvert==7.16.3 161 | nbformat==5.10.4 162 | nest-asyncio==1.6.0 163 | networkx==3.2.1 164 | ninja==1.11.1.1 165 | nltk==3.8.1 166 | nodeenv==1.8.0 167 | notebook==6.5.6 168 | notebook_shim==0.2.4 169 | numexpr==2.10.0 170 | numpy==1.26.4 171 | nvidia-cublas-cu12==12.1.3.1 172 | nvidia-cuda-cupti-cu12==12.1.105 173 | nvidia-cuda-nvrtc-cu12==12.1.105 174 | nvidia-cuda-runtime-cu12==12.1.105 175 | nvidia-cudnn-cu12==8.9.2.26 176 | nvidia-cufft-cu12==11.0.2.54 177 | nvidia-curand-cu12==10.3.2.106 178 | nvidia-cusolver-cu12==11.4.5.107 179 | nvidia-cusparse-cu12==12.1.0.106 180 | nvidia-nccl-cu12==2.18.1 181 | nvidia-nvjitlink-cu12==12.4.127 182 | nvidia-nvtx-cu12==12.1.105 183 | open-clip-torch==2.24.0 184 | openai==1.23.6 185 | opencv-python-headless==4.9.0.80 186 | openpyxl==3.1.2 187 | orjson==3.10.1 188 | overrides==7.7.0 189 | packaging==24.0 190 | pandas==2.2.2 191 | pandocfilters==1.5.1 192 | parso==0.8.4 193 | pathlib2==2.3.7.post1 194 | pathspec==0.12.1 195 | pathtools==0.1.2 196 | pathvalidate==3.2.0 197 | peft==0.4.0 198 | pexpect==4.8.0 199 | pillow==10.3.0 200 | pip==23.3.1 201 | pip==24.0 202 | platformdirs==4.2.1 203 | pluggy==1.5.0 204 | ply==3.11 205 | portalocker==2.8.2 206 | pre-commit==3.7.0 207 | prometheus_client==0.20.0 208 | promise==2.3 209 | prompt-toolkit==3.0.43 210 | protobuf==3.20.3 211 | psutil==5.9.8 212 | ptyprocess==0.7.0 213 | pure-eval==0.2.2 214 | py-cpuinfo==9.0.0 215 | py-spy==0.3.14 216 | 
py==1.11.0 217 | pyOpenSSL==22.1.0 218 | pyarrow-hotfix==0.6 219 | pyarrow==16.0.0 220 | pybind11==2.12.0 221 | pycocoevalcap==1.2 222 | pycocotools==2.0.7 223 | pycparser==2.22 224 | pycryptodomex==3.20.0 225 | pydantic==1.10.8 226 | pydub==0.25.1 227 | pynvml==11.5.0 228 | pyparsing==3.1.2 229 | pytablewriter==1.2.0 230 | pytest==6.2.5 231 | python-consul==1.1.0 232 | python-dateutil==2.9.0.post0 233 | python-engineio==4.9.0 234 | python-etcd==0.4.5 235 | python-json-logger==2.0.7 236 | python-multipart==0.0.9 237 | python-socketio==5.11.2 238 | pytz==2024.1 239 | pyzmq==24.0.1 240 | qtconsole==5.5.1 241 | rapidfuzz==3.8.1 242 | referencing==0.35.0 243 | regex==2024.4.16 244 | requests==2.31.0 245 | responses==0.18.0 246 | rfc3339-validator==0.1.4 247 | rfc3986-validator==0.1.1 248 | rich==13.7.1 249 | rouge-score==0.1.2 250 | rpds-py==0.18.0 251 | sacrebleu==2.4.2 252 | safetensors==0.4.3 253 | schedule==1.2.1 254 | scikit-learn==1.2.2 255 | scipy==1.13.0 256 | semantic-version==2.10.0 257 | sentencepiece==0.1.99 258 | sentry-sdk==2.0.0 259 | setproctitle==1.3.3 260 | setuptools==68.2.2 261 | shortuuid==1.0.13 262 | shtab==1.7.1 263 | simple-websocket==1.0.0 264 | six==1.16.0 265 | smmap==5.0.1 266 | sniffio==1.3.1 267 | soupsieve==2.5 268 | sqlitedict==2.1.0 269 | stack-data==0.6.3 270 | starlette==0.37.2 271 | svgwrite==1.4.3 272 | sympy==1.12 273 | tabledata==1.3.3 274 | tabulate==0.9.0 275 | tcolorpy==0.1.4 276 | tenacity==8.2.3 277 | terminado==0.18.1 278 | threadpoolctl==3.4.0 279 | thriftpy2==0.4.20 280 | tiktoken==0.6.0 281 | timm==0.9.16 282 | tinycss2==1.3.0 283 | tokenizers==0.15.2 284 | toml==0.10.2 285 | tomli==2.0.1 286 | toolz==0.12.1 287 | torch==2.1.2 288 | torchvision==0.16.2 289 | tornado==6.4 290 | tox==3.28.0 291 | tqdm-multiprocess==0.0.11 292 | tqdm==4.66.2 293 | traitlets==5.14.3 294 | transformers-stream-generator==0.0.5 295 | transformers==4.40.0.dev0 296 | triton==2.1.0 297 | typepy==1.3.2 298 | types-python-dateutil==2.9.0.20240316 299 | typing_extensions==4.11.0 300 | tyro==0.8.3 301 | tzdata==2024.1 302 | uc-micro-py==1.0.3 303 | uri-template==1.3.0 304 | urllib3==2.2.1 305 | uvicorn==0.29.0 306 | virtualenv==20.26.0 307 | wandb==0.16.5 308 | watchdog==4.0.0 309 | wavedrom==2.0.3.post3 310 | wcwidth==0.2.13 311 | webcolors==1.13 312 | webencodings==0.5.1 313 | websocket-client==1.8.0 314 | websockets==12.0 315 | wheel==0.41.2 316 | widgetsnbextension==4.0.10 317 | wrapt==1.16.0 318 | wsproto==1.2.0 319 | xxhash==3.4.1 320 | y-py==0.6.2 321 | yarl==1.9.4 322 | ypy-websocket==0.8.4 323 | zipp==3.18.1 324 | zstandard==0.22.0 -------------------------------------------------------------------------------- /scripts/insight-v/llava_dpo.sh: -------------------------------------------------------------------------------- 1 | EXP_NAME="llama3-llava-next-reason-dpo" 2 | VISION_TOWER='/home/models/clip-vit-large-patch14-336' 3 | 4 | echo $MASTER_ADDR; echo $nnode; echo $nrank 5 | 6 | # ----------- finetune mlp+llm ------------- 7 | 8 | export DPO_FORWARD=1 9 | 10 | torchrun --nproc_per_node 8 --nnodes=$nnode --node_rank=$nrank --master_addr=$MASTER_ADDR --master_port=23333 \ 11 | llava/train/train_dpo.py \ 12 | --deepspeed ./scripts/zero3.json \ 13 | --model_name_or_path /path/to/reason_models \ 14 | --version llava_llama_3 \ 15 | --data_path /path/to/dpo_data \ 16 | --image_folder ./playground/data \ 17 | --mm_tunable_parts="mm_mlp_adapter,mm_language_model" \ 18 | --vision_tower $VISION_TOWER \ 19 | --mm_projector_type mlp2x_gelu \ 20 | --mm_vision_select_layer -1 \ 
21 | --mm_use_im_start_end False \ 22 | --mm_use_im_patch_token False \ 23 | --mm_use_think_token True \ 24 | --group_by_modality_length True \ 25 | --image_aspect_ratio anyres \ 26 | --image_grid_pinpoints "(1x1),...,(3x3)" \ 27 | --mm_patch_merge_type spatial_unpad \ 28 | --bf16 True \ 29 | --output_dir ./checkpoints_new/$EXP_NAME \ 30 | --num_train_epochs 2 \ 31 | --per_device_train_batch_size 2 \ 32 | --per_device_eval_batch_size 4 \ 33 | --gradient_accumulation_steps 4 \ 34 | --evaluation_strategy "no" \ 35 | --save_strategy "steps" \ 36 | --save_steps 500 \ 37 | --save_total_limit 1 \ 38 | --learning_rate 5e-6 \ 39 | --weight_decay 0. \ 40 | --warmup_ratio 0.05 \ 41 | --lr_scheduler_type "cosine" \ 42 | --logging_steps 1 \ 43 | --tf32 True \ 44 | --model_max_length 12288 \ 45 | --gradient_checkpointing True \ 46 | --dataloader_num_workers 4 \ 47 | --lazy_preprocess True \ 48 | --report_to none 49 | -------------------------------------------------------------------------------- /scripts/insight-v/llava_next_reason.sh: -------------------------------------------------------------------------------- 1 | EXP_NAME="llama3-llava-next-reason" 2 | VISION_TOWER='/home/models/clip-vit-large-patch14-336' 3 | 4 | echo $MASTER_ADDR; echo $nnode; echo $nrank 5 | 6 | # ----------- finetune mlp+llm ------------- 7 | 8 | torchrun --nproc_per_node 8 --nnodes=$nnode --node_rank=$nrank --master_addr=$MASTER_ADDR --master_port=23333 \ 9 | llava/train/train_mem.py \ 10 | --deepspeed ./scripts/zero3.json \ 11 | --model_name_or_path /path/to/models \ 12 | --version llava_llama_3 \ 13 | --data_path /path/to/reason_data \ 14 | --image_folder ./playground/data \ 15 | --mm_tunable_parts="mm_mlp_adapter,mm_language_model" \ 16 | --vision_tower $VISION_TOWER \ 17 | --mm_projector_type mlp2x_gelu \ 18 | --mm_vision_select_layer -1 \ 19 | --mm_use_im_start_end False \ 20 | --mm_use_im_patch_token False \ 21 | --mm_use_think_token True \ 22 | --group_by_modality_length True \ 23 | --image_aspect_ratio anyres \ 24 | --image_grid_pinpoints "(1x1),...,(3x3)" \ 25 | --mm_patch_merge_type spatial_unpad \ 26 | --bf16 True \ 27 | --output_dir ./checkpoints_new/$EXP_NAME \ 28 | --num_train_epochs 2 \ 29 | --per_device_train_batch_size 2 \ 30 | --per_device_eval_batch_size 4 \ 31 | --gradient_accumulation_steps 4 \ 32 | --evaluation_strategy "no" \ 33 | --save_strategy "steps" \ 34 | --save_steps 500 \ 35 | --save_total_limit 1 \ 36 | --learning_rate 5e-6 \ 37 | --weight_decay 0. 
\ 38 | --warmup_ratio 0.05 \ 39 | --lr_scheduler_type "cosine" \ 40 | --logging_steps 1 \ 41 | --tf32 True \ 42 | --model_max_length 12288 \ 43 | --gradient_checkpointing True \ 44 | --dataloader_num_workers 4 \ 45 | --lazy_preprocess True \ 46 | --report_to none 47 | -------------------------------------------------------------------------------- /scripts/insight-v/llava_next_summary.sh: -------------------------------------------------------------------------------- 1 | EXP_NAME="llama3-llava-next-summary" 2 | VISION_TOWER='/home/models/clip-vit-large-patch14-336' 3 | 4 | echo $MASTER_ADDR; echo $nnode; echo $nrank 5 | 6 | torchrun --nproc_per_node 8 --nnodes=$nnode --node_rank=$nrank --master_addr=$MASTER_ADDR --master_port=23333 \ 7 | llava/train/train_mem.py \ 8 | --deepspeed ./scripts/zero3.json \ 9 | --model_name_or_path /path/to/model \ 10 | --version llava_llama_3 \ 11 | --data_path /path/to/data \ 12 | --image_folder ./playground/data \ 13 | --mm_tunable_parts="mm_mlp_adapter,mm_language_model" \ 14 | --vision_tower $VISION_TOWER \ 15 | --mm_projector_type mlp2x_gelu \ 16 | --mm_vision_select_layer -2 \ 17 | --mm_use_im_start_end False \ 18 | --mm_use_im_patch_token False \ 19 | --mm_use_think_token True \ 20 | --group_by_modality_length True \ 21 | --image_aspect_ratio anyres \ 22 | --image_grid_pinpoints "(1x1),...,(3x3)" \ 23 | --mm_patch_merge_type spatial_unpad \ 24 | --bf16 True \ 25 | --output_dir ./checkpoints_new/$EXP_NAME \ 26 | --num_train_epochs 1 \ 27 | --per_device_train_batch_size 2 \ 28 | --per_device_eval_batch_size 4 \ 29 | --gradient_accumulation_steps 1 \ 30 | --evaluation_strategy "no" \ 31 | --save_strategy "steps" \ 32 | --save_steps 1000 \ 33 | --save_total_limit 1 \ 34 | --learning_rate 1e-5 \ 35 | --weight_decay 0. 
\ 36 | --warmup_ratio 0.05 \ 37 | --lr_scheduler_type "cosine" \ 38 | --logging_steps 1 \ 39 | --tf32 True \ 40 | --model_max_length 12288 \ 41 | --gradient_checkpointing True \ 42 | --dataloader_num_workers 4 \ 43 | --lazy_preprocess True \ 44 | --report_to none 45 | -------------------------------------------------------------------------------- /scripts/zero2.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "optimizer": { 14 | "type": "AdamW", 15 | "params": { 16 | "lr": "auto", 17 | "betas": "auto", 18 | "eps": "auto", 19 | "weight_decay": "auto" 20 | } 21 | }, 22 | "zero_optimization": { 23 | "stage": 2, 24 | "offload_optimizer": { 25 | "device": "none", 26 | "pin_memory": true 27 | }, 28 | "allgather_partitions": true, 29 | "allgather_bucket_size": 2e8, 30 | "overlap_comm": false, 31 | "reduce_scatter": true, 32 | "reduce_bucket_size": 2e8, 33 | "contiguous_gradients": true 34 | }, 35 | "gradient_accumulation_steps": "auto", 36 | "gradient_clipping": "auto", 37 | "steps_per_print": 100, 38 | "train_batch_size": "auto", 39 | "train_micro_batch_size_per_gpu": "auto", 40 | "wall_clock_breakdown": false 41 | } -------------------------------------------------------------------------------- /scripts/zero2_fused_adamw.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "optimizer": { 14 | "type": "AdamW", 15 | "params": { 16 | "lr": "auto", 17 | "betas": "auto", 18 | "eps": "auto", 19 | "weight_decay": "auto" 20 | } 21 | }, 22 | "zero_optimization": { 23 | "stage": 2, 24 | "offload_optimizer": { 25 | "device": "none", 26 | "pin_memory": true 27 | }, 28 | "allgather_partitions": true, 29 | "allgather_bucket_size": 2e8, 30 | "overlap_comm": true, 31 | "reduce_scatter": true, 32 | "reduce_bucket_size": 2e8, 33 | "contiguous_gradients": true 34 | }, 35 | "gradient_accumulation_steps": "auto", 36 | "gradient_clipping": "auto", 37 | "steps_per_print": 100, 38 | "train_batch_size": "auto", 39 | "train_micro_batch_size_per_gpu": "auto", 40 | "wall_clock_breakdown": false 41 | } -------------------------------------------------------------------------------- /scripts/zero2_offload.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "train_micro_batch_size_per_gpu": "auto", 14 | "train_batch_size": "auto", 15 | "gradient_accumulation_steps": "auto", 16 | "zero_optimization": { 17 | "stage": 2, 18 | "offload_optimizer": { 19 | "device": "cpu", 20 | "pin_memory": true 21 | }, 22 | "offload_param": { 23 | "device": "cpu", 24 | "pin_memory": true 25 | }, 26 | "overlap_comm": true, 27 | "contiguous_gradients": true, 28 | "sub_group_size": 1e9, 29 | "reduce_bucket_size": "auto" 30 | } 31 | } -------------------------------------------------------------------------------- /scripts/zero3.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | 14 | "zero_optimization": { 15 | "stage": 3, 16 | "offload_optimizer": { 17 | "device": "none", 18 | "pin_memory": true 19 | }, 20 | "offload_param": { 21 | "device": "none", 22 | "pin_memory": true 23 | }, 24 | "overlap_comm": true, 25 | "contiguous_gradients": true, 26 | "sub_group_size": 1e9, 27 | "reduce_bucket_size": "auto", 28 | "stage3_prefetch_bucket_size": "auto", 29 | "stage3_param_persistence_threshold": "auto", 30 | "stage3_max_live_parameters": 1e9, 31 | "stage3_max_reuse_distance": 1e9, 32 | "stage3_gather_16bit_weights_on_model_save": true 33 | }, 34 | 35 | "gradient_accumulation_steps": "auto", 36 | "gradient_clipping": "auto", 37 | "steps_per_print": 100, 38 | "train_batch_size": "auto", 39 | "train_micro_batch_size_per_gpu": "auto", 40 | "wall_clock_breakdown": false 41 | } -------------------------------------------------------------------------------- /scripts/zero3_offload.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "optimizer": { 14 | "type": "AdamW", 15 | "params": { 16 | "lr": "auto", 17 | "betas": "auto", 18 | "eps": "auto", 19 | "weight_decay": "auto" 20 | } 21 | }, 22 | "zero_optimization": { 23 | "stage": 3, 24 | "offload_optimizer": { 25 | "device": "cpu", 26 | "pin_memory": true 27 | }, 28 | "offload_param": { 29 | "device": "cpu", 30 | "pin_memory": true 31 | }, 32 | "overlap_comm": true, 33 | "contiguous_gradients": true, 34 | "sub_group_size": 1e9, 35 | "reduce_bucket_size": "auto", 36 | "stage3_prefetch_bucket_size": "auto", 37 | "stage3_param_persistence_threshold": "auto", 38 | "stage3_max_live_parameters": 1e9, 39 | "stage3_max_reuse_distance": 1e9, 40 | "gather_16bit_weights_on_model_save": true 41 | }, 42 | "gradient_accumulation_steps": "auto", 43 | "gradient_clipping": "auto", 44 | "train_batch_size": "auto", 45 | "train_micro_batch_size_per_gpu": "auto", 46 | "steps_per_print": 1e5, 47 | "wall_clock_breakdown": false 48 | } -------------------------------------------------------------------------------- /scripts/zero3pp.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "optimizer": { 14 | "type": "AdamW", 15 | "params": { 16 | "lr": "auto", 17 | "betas": "auto", 18 | "eps": "auto", 19 | "weight_decay": "auto" 20 | } 21 | }, 22 | 23 | "zero_optimization": { 24 | "stage": 3, 25 | "offload_optimizer": { 26 | "device": "none", 27 | "pin_memory": true 28 | }, 29 | "offload_param": { 30 | "device": "none", 31 | "pin_memory": true 32 | }, 33 | "overlap_comm": true, 34 | "contiguous_gradients": true, 35 | "zero_quantized_weights": true, 36 | "zero_hpz_partition_size": 16, 37 | "zero_quantized_gradients": true, 38 | "sub_group_size": 1e9, 39 | "reduce_bucket_size": "auto", 40 | "stage3_prefetch_bucket_size": "auto", 41 | 
"stage3_param_persistence_threshold": "auto", 42 | "stage3_max_live_parameters": 1e9, 43 | "stage3_max_reuse_distance": 1e9, 44 | "stage3_gather_16bit_weights_on_model_save": true 45 | }, 46 | 47 | "gradient_accumulation_steps": "auto", 48 | "gradient_clipping": "auto", 49 | "steps_per_print": 100, 50 | "train_batch_size": "auto", 51 | "train_micro_batch_size_per_gpu": "auto", 52 | "wall_clock_breakdown": false 53 | } --------------------------------------------------------------------------------