├── .gitignore ├── Eval ├── README.md ├── eval_gpt_mmhal.py ├── eval_gpt_review.py ├── eval_gpt_review_bench.py ├── eval_gpt_review_visual.py ├── eval_pope.py ├── eval_science_qa.py ├── eval_science_qa_gpt4.py ├── eval_science_qa_gpt4_requery.py ├── eval_scripts │ ├── eval_llava_bench.sh │ ├── eval_mmbench.sh │ ├── eval_mmhal.sh │ └── eval_pope.sh ├── generate_webpage_data_from_table.py ├── llava │ ├── qa90_gpt4_answer.jsonl │ └── qa90_questions.jsonl ├── mmagibench.py ├── model_mmbench.py ├── model_qa.py ├── model_vqa.py ├── model_vqa_ds.py ├── model_vqa_logit.py ├── model_vqa_mmhal.py ├── model_vqa_science.py ├── pope │ ├── coco_pope_adversarial.jsonl │ ├── coco_pope_popular.jsonl │ └── coco_pope_random.jsonl ├── qa_baseline_gpt35.py ├── run_llava.py ├── summarize_eval_pope.py ├── summarize_gpt_mmhal.py ├── summarize_gpt_review.py ├── table │ ├── answer │ │ ├── answer_alpaca-13b.jsonl │ │ ├── answer_bard.jsonl │ │ ├── answer_gpt35.jsonl │ │ ├── answer_llama-13b.jsonl │ │ └── answer_vicuna-13b.jsonl │ ├── caps_boxes_coco2014_val_80.jsonl │ ├── model.jsonl │ ├── prompt.jsonl │ ├── question.jsonl │ ├── results │ │ └── test_sqa_llava_13b_v0.json │ ├── review │ │ ├── review_alpaca-13b_vicuna-13b.jsonl │ │ ├── review_bard_vicuna-13b.jsonl │ │ ├── review_gpt35_vicuna-13b.jsonl │ │ └── review_llama-13b_vicuna-13b.jsonl │ ├── reviewer.jsonl │ └── rule.json └── webpage │ ├── figures │ ├── alpaca.png │ ├── bard.jpg │ ├── chatgpt.svg │ ├── llama.jpg │ ├── swords_FILL0_wght300_GRAD0_opsz48.svg │ └── vicuna.jpeg │ ├── index.html │ ├── script.js │ └── styles.css ├── LICENSE ├── README.md ├── RLHF ├── README.md ├── data_utils │ ├── common_utils.py │ ├── constants.py │ ├── data_utils_ppo.py │ ├── data_utils_rm.py │ └── data_utils_sft.py ├── finetune_lora_ppo.py ├── finetune_lora_rm.py ├── finetune_lora_sft_ds.py ├── lora_utils.py ├── models │ ├── distributed_utils.py │ ├── ppo_trainer.py │ ├── qlora_model.py │ ├── reward_model.py │ ├── rl_models.py │ ├── rl_trainer.py │ └── trainer_utils.py ├── prompts │ ├── fact_rlhf_reward_prompt.txt │ └── reward_prompt.txt └── scripts │ ├── 13b-v1.5-336 │ ├── initialize_policy_model.sh │ ├── train_reward_model.sh │ └── train_rl_model.sh │ ├── 7b-v1.5-224 │ ├── initialize_policy_model.sh │ └── train_rl_model.sh │ └── zero2.json ├── SFT ├── README.md ├── scripts │ ├── 13b-v1.5-336 │ │ ├── pretrain.sh │ │ └── sft.sh │ ├── 7b-v1.5-224 │ │ ├── pretrain.sh │ │ └── sft.sh │ └── zero3.json └── train │ ├── __init__.py │ ├── llava_trainer.py │ └── train.py ├── assets └── images │ └── llava_rlhf_logo.png ├── demo ├── README.md ├── model_builder.py └── model_worker.py └── llava_setup ├── .gitignore ├── README.md └── fix_llava_padding.patch /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | .DS_Store 132 | .idea 133 | 134 | # temporary scripts 135 | tmp_scripts/ 136 | 137 | # Data 138 | !**/alpaca-data-conversation.json 139 | 140 | # Editor 141 | *.swp 142 | 143 | # Other 144 | wandb 145 | output 146 | 147 | checkpoints 148 | ckpts* 149 | 150 | DeepSpeed 151 | vlm_eval_hard_v2_images 152 | vlm_eval_hard.zip 153 | -------------------------------------------------------------------------------- /Eval/README.md: -------------------------------------------------------------------------------- 1 | # Evaluations 2 | 3 | This directory contains end-to-end pipelines for AI-enhanced evaluation. We will introduce the evaluation pipeline and the data format in this document. 4 | 5 | ## Generate Answers 6 | 7 | ### ChatGPT (gpt-3.5-turbo) 8 | 9 | Make sure you have setup the OpenAI API Key in your environment. Then run: 10 | 11 | ```bash 12 | python qa_baseline_gpt35.py --question table/question.jsonl --output table/answer/answer_gpt35.jsonl 13 | ``` 14 | 15 | ### Bard 16 | 17 | Unfortunately, Bard has not release its public APIs till now. You may have to enter the anwsers manually. Or you could find a third-party project that interfaces with Bard. 18 | 19 | ### Vicuna and others 20 | 21 | To generate answers with Vicuna or other models, specify path to the model checkpoint. Then run: 22 | ```bash 23 | python model_qa.py --model-name /model/path --question-file tables/question.jsonl --answer-file table/answer/answer.jsonl 24 | ``` 25 | 26 | ## Evaluate Answers Automatically 27 | 28 | ### Generete Reviews with GPT-4 29 | 30 | PS: If you do not current have access to GPT-4 API, but you have access to GPT-4 chatbot, you can evaluate the answers manually, according to the instructions in the **Data Format** section. `table/review/*.jsonl` are some examples of reviews. 31 | 32 | TODO: add instructions 33 | 34 | ## Visualize Results 35 | 36 | You can generate the data for the webpage by running: 37 | 38 | ```bash 39 | python eval/generate_webpage_data_from_table.py 40 | ``` 41 | 42 | Then you can serve a static website in `webpage` to see the results. 43 | 44 | ## Data Format 45 | 46 | If you want to have a deeper understanding of our evaluation pipeline or want to contribute to the evaluation process, you need to learn the data format we used for evaluation. 47 | 48 | Our evaluation data are encoded with [JSON Lines](https://jsonlines.org/). 49 | 50 | ### Random ID Generation 51 | 52 | We use the `shortuuid` Python library for generating short random UUIDs. 53 | 54 | ```python 55 | import shortuuid 56 | shortuuid.uuid() -> str 57 | ``` 58 | 59 | ### Models 60 | 61 | `model.jsonl` contains model information we used for generating anwsers. 62 | 63 | Each row contains a record of a model with the following field: 64 | 65 | * `model_id` (str): A unique ID for a model. Models with different IDs is supposed to have different performance. This ID is generated by `{model_name}:{model_version}`. 66 | * `model_name` (str): The name of a model. This is not unique, because a model could be trained and updated continuously, but it is still considered as the same model with different versions. 67 | * `model_version` (str): The version of a model. 68 | * `model_metadata` (Any): Any metadata of a model (descriptions etc). This is optional. 69 | 70 | For example: 71 | 72 | ```json 73 | { 74 | "model_id": "vicuna-13b:v1", 75 | "model_name": "vicuna-13b", 76 | "model_version": "v1", 77 | "model_metadata": "learning rate 1e-5, 3 epochs, 13b" 78 | } 79 | ``` 80 | 81 | ### Prompts 82 | 83 | We store prompts in `prompt.jsonl`. Each row contains a record of a prompt with the following field: 84 | 85 | * `prompt_id` (int): A unique integer ID for a prompt. Prompts with different IDs are supposed to have different purpose. 86 | * `system_prompt` (str): The system prompt given to a model. This is the prompt that the model sees first. 87 | * `prompt_template` (str): The prompt body. This is the user prompt that the model sees after the system prompt. It is a Python f-string template, so that we can fill in the inputs later. 88 | * `defaults` (dict): A dictionary of default values for the prompt template. It can be empty. 89 | * `description` (str): A description of the functionality of the prompt. 90 | 91 | For example: 92 | 93 | ```json 94 | { 95 | "prompt_id": 1, 96 | "system_prompt": "You are a helpful assistant.", 97 | "prompt_template": "[Question]\n{question}\n\n[Assistant 1]\n{answer_1}\n\n[End of Assistant 1]\n\n[Assistant 2]\n{answer_2}\n\n[End of Assistant 2]\n\n[System]\n{prompt}\n\n", 98 | "defaults": {"prompt": "Which assistant is more helpful?"}, 99 | "description": "Compare two assistants' answers to a question." 100 | } 101 | ``` 102 | 103 | ### Reviewers 104 | 105 | `reviewer.jsonl` contains reviewer information we used for reviewing answers generated by different models. Each row contains a record of a reviewer with the following field: 106 | 107 | * `reviewer_id` (str): A unique ID for a reviewer. Reviewers with different IDs is supposed to have different reviewing performance. 108 | * `prompt_id` (str): The ID of the prompt given to the reviewer (e.g., an AI assistant). Different prompts could result in different reviewing performance. 109 | * `metadata` (dict): Metadata of a reviewer about its configurations. 110 | * `description` (str): A description of the reviewer. 111 | 112 | For example: 113 | 114 | ```json 115 | { 116 | "reviewer_id": "gpt-4-0328-default", 117 | "prompt_id": 1, 118 | "temperature": 0.2, 119 | "max_tokens": 8192, 120 | "description": "GPT-4 for generic questions." 121 | } 122 | ``` 123 | 124 | ### Questions 125 | 126 | `question.jsonl` contains questions we used for evaluation. Each row contains a record of a question with the following field: 127 | 128 | * `question_id` (int): A unique integer for a question. Questions with different IDs is supposed to be different. 129 | * `text` (str): The question text. 130 | * `category` (str): The category of the question. Questions with the same category are supposed to be similar or originate from the same source. 131 | 132 | ### Answers 133 | 134 | `answer/xxx.jsonl` contains answers generated by different models. Each row contains a record of an answer with the following field: 135 | 136 | * `answer_id` (str): A unique UUID for an answer. Answers with different IDs is supposed to be different. 137 | * `question_id` (int): The ID of the question the answer is generated for. 138 | * `model_id` (str): The ID of the model the answer is generated by. 139 | * `text` (str): The answer text. 140 | * `metadata` (dict): Any metadata of the answer. 141 | 142 | Example: 143 | 144 | ```json 145 | { 146 | "answer_id": "[short uuid]", 147 | "question_id": 1, 148 | "model_id": "vicuna-13b:v1", 149 | "text": "Here are five tips...", 150 | "metadata": {} 151 | } 152 | ``` 153 | 154 | ### Reviews 155 | 156 | `review/xxx.jsonl` contains reviews given by reviewers, comparing peformance between a pair of models. Each row contains a record of a review with the following field: 157 | 158 | * `review_id` (str): A unique UUID for a review. Reviews with different IDs is supposed to be different. 159 | * `question_id` (int): The ID of the question the review is given for. 160 | * `answer1_id` (str): The ID of the first answer. 161 | * `answer2_id` (str): The ID of the second answer. 162 | * `text` (str): The review text. 163 | * `score` (list): A list of scores given by the reviewer. The first score is for the first answer, and the second score is for the second answer. 164 | * `reviewer_id` (str): The ID of the reviewer. 165 | * `metadata` (dict): Any metadata of the review. 166 | 167 | ```json 168 | { 169 | "review_id": "[short uuid]", 170 | "question_id": 1, 171 | "answer1_id": "[answer1_id]", 172 | "answer2_id": "[answer2_id]", 173 | "text": "Assistant 2 is better...", 174 | "score": [9.0, 7.5], 175 | "reviewer_id": "gpt-4-0328-default", 176 | "metadata": {} 177 | } 178 | ``` 179 | -------------------------------------------------------------------------------- /Eval/eval_gpt_mmhal.py: -------------------------------------------------------------------------------- 1 | import openai 2 | import argparse 3 | import json 4 | import time 5 | 6 | template = '''Please act as an impartial and objective judge and evaluate the quality of the response provided by a Large Multimodal Model (LMM) to the user question. Your evaluation should be mainly based on whether the response is informative, and whether the response contains any hallucination. Hallucination, in this context, refers to a situation where the LMM generates a response that includes information not present or implied in the image or previous conversation. A hallucination could be a false claim about an object, action, emotion, or any other detail that is not grounded in the image. 7 | 8 | For clarity, consider these examples: 9 | 10 | ### Example 1: 11 | - Image Contents: Lynx, Cat 12 | - Question: How many cats can you see in this image? 13 | - Standard Human-Generated Answer: There are two cats visible in the image. 14 | - LMM Response to Evaluate: There is one cat in the image. 15 | - Analysis: The LMM's response is false because it claims there is only one cat in the image, when in fact there are two. 16 | - Hallucination: Yes. 17 | 18 | ### Example 2: 19 | - Image Contents: Tree, Plant, Panda, Animal 20 | - Question: What can you see from this photo? 21 | - Standard Human-Generated Answer: In this photo we can see branches. We can see a panda on a branch. The background is blurry. 22 | - LMM Response to Evaluate: In the photo, I can see a large black and white panda sitting on a tree branch. The image captures the panda's unique appearance and its natural habitat, showcasing the beauty and rarity of these animals. 23 | - Analysis: The LMM's response is not false because it does not make any false claims about the image contents. For example, it does not claim that the panda is eating bamboo. 24 | - Hallucination: No. 25 | 26 | ### Example 3: 27 | - Image Contents: Fashion accessory, Mammal, Dog, Umbrella 28 | - Question: Describe this image in details. 29 | - Standard Human-Generated Answer: In this picture I can see a dog laying under a yellow umbrella on a beach, and there is a rope on the right side. In the background I can see few persons talking or lying. At the top of the image, I can see the blue and clear sky. 30 | - LMM Response to Evaluate: The image features a brown dog laying on the sand at the beach, underneath a yellow umbrella. The dog is positioned near the water, and there are people in the background. The beach is sandy, and the sun is shining, creating a pleasant atmosphere. The dog is enjoying the shade provided by the umbrella, which is a common way to protect pets from the sun's harmful rays. The presence of people in the background suggests that the beach is a popular spot for leisure activities, such as sunbathing, swimming, and relaxing. 31 | - Analysis: The LMM's response is false because it claims that the dog is positioned near the water, but there is no water in the image. 32 | - Hallucination: Yes. 33 | 34 | ### Example 4: 35 | - Image Contents: Clothing, Luggage and bags, Man, Building, Woman, Person 36 | - Question: Where is this photo taken? 37 | - Standard Human-Generated Answer: From the slogan "Welcome to Houston, Texas", we can infer that this photo is taken at an airport in Houston, Texas. 38 | - LMM Response to Evaluate: This photo is taken at an airport, specifically at the entrance of a terminal. However, the specific location of this airport cannot be determined from the image. 39 | - Analysis: Although the LMM's response does not explicitly mention the location "Houston, Texas" and does not perfectly answer the question, it makes no false claims about the image contents. Therefore, this response is not considered hallucination. It is OK if the LMM fails to directly answer the question, expresses uncertainty, or cannot include all the objects or details in the human-generated answer. 40 | - Hallucination: No. 41 | 42 | ### Example 5: 43 | - Image Contents: Tree, Tower 44 | - Question: What environmental concerns are involved in this photo? 45 | - Standard Human-Generated Answer: This photo shows a smokestack emitting smoke. It may cause severe air pollution which threatens human health. 46 | - LMM Response to Evaluate: In the photo, there are several environmental concerns related to the smokestack emitting smoke. The smoke from the smokestack is a byproduct of industrial processes, which can contribute to air pollution, climate change, and negative impacts on human health. The smoke contains harmful pollutants, such as particulate matter, sulfur dioxide, nitrogen oxides, and carbon monoxide, which can lead to respiratory problems, heart disease, and other health issues. Additionally, the smoke contributes to the greenhouse effect, which can lead to global warming and climate change. The smokestack's emissions also affect the environment, as they can harm wildlife, vegetation, and ecosystems. Therefore, it is essential to address these environmental concerns by implementing measures to reduce emissions and promote sustainable practices in industrial processes. 47 | - Analysis: Although the LMM's response is significantly longer than the standard human-generated answer, it does not contain any false claims about the image contents. Instead, it provides additional general information about the environmental concerns, which can be inferred from the smoke emission. Such detailed analysis or reasoning should be considered as a positive aspect, as long as it contains no false claims. 48 | - Hallucination: No. 49 | 50 | With these examples in mind, please help me evaluate whether the response by the LMM is informative, and whether hallucination exists in it, based on the comparison between the LMM's response and the factual information provided in the image contents, question, and the standard human-generated answer below. 51 | 52 | Please note that the standard human-generated answer may only contain factual information but may not give a detailed analysis. Also, the standard human-generated answer may not be completely comprehensive in describing all the objects and their attributes, so please be a bit more cautious during evalutation. LMM's detailed analysis or reasoning should be encouraged. 53 | 54 | To evaluate the LMM responses, first, begin your evaluation by providing a short explanation. Second, after providing your explanation, you must rate the response by choosing from the following options: 55 | - Rating: 6, very informative with good analysis or reasoning, no hallucination 56 | - Rating: 5, very informative, no hallucination 57 | - Rating: 4, somewhat informative, no hallucination 58 | - Rating: 3, not informative, no hallucination 59 | - Rating: 2, very informative, with hallucination 60 | - Rating: 1, somewhat informative, with hallucination 61 | - Rating: 0, not informative, with hallucination 62 | 63 | ### Image Contents 64 | {} 65 | 66 | ### Question 67 | {} 68 | 69 | ### Standard Human-Generated Answer 70 | {} 71 | 72 | ### LMM Response to Evaluate 73 | {} 74 | ''' 75 | 76 | if __name__ == '__main__': 77 | parser = argparse.ArgumentParser() 78 | parser.add_argument('--response', type=str, default='responses/idefics_80b.json', help='response file containing images, questions, and model responses') 79 | parser.add_argument('--evaluation', type=str, default=None, help='GPT-4 evaluation results to be saved') 80 | parser.add_argument('--api-key', type=str, required=True) 81 | parser.add_argument('--gpt-model', type=str, default='gpt-4-0314') 82 | args = parser.parse_args() 83 | 84 | openai.api_key = args.api_key 85 | 86 | # load json file 87 | with open(args.response, 'r') as f: 88 | records = json.load(f) 89 | 90 | assert len(records) == 96 91 | 92 | # ask GPT-4 to evaluate 93 | responses = [] 94 | for i, record in enumerate(records): 95 | image_content = ', '.join(record['image_content']) 96 | input_text = template.format(image_content, record['question'], record['gt_answer'], record['model_answer']) 97 | # print(input_text) 98 | 99 | response = None 100 | while response is None: 101 | try: 102 | response = openai.ChatCompletion.create( 103 | model=args.gpt_model, 104 | messages=[ 105 | {"role": "user", "content": input_text} 106 | ], 107 | temperature=0.0, 108 | ) 109 | except Exception as e: 110 | print(e) 111 | print('retrying...') 112 | time.sleep(10) 113 | continue 114 | 115 | print(i, response['choices'][0]['message']['content'], flush=True) 116 | responses.append(response) 117 | time.sleep(1) 118 | 119 | # save responses 120 | if args.evaluation is not None: 121 | with open(args.evaluation, 'w') as f: 122 | json.dump(responses, f, indent=2) 123 | 124 | # analyze responses 125 | scores = [] 126 | for i, response in enumerate(responses): 127 | response = response['choices'][0]['message']['content'] 128 | scores_found = [] 129 | for s in range(7): 130 | if f'rating: {s}' in response.lower(): 131 | scores_found.append(s) 132 | if len(scores_found) == 1: 133 | scores.append(scores_found[0]) 134 | else: 135 | print('Warning: multiple or zero scores found') 136 | print(i, response) 137 | scores.append(0) 138 | 139 | hallucination = [] 140 | for s in scores: 141 | if s >= 3: 142 | hallucination.append(0) 143 | else: 144 | hallucination.append(1) 145 | 146 | scores_each = [[] for _ in range(8)] 147 | # assuming order of 96 questions is not changed 148 | for i in range(96): 149 | question_type = i % 8 150 | scores_each[question_type].append(scores[i]) 151 | 152 | print('Average score: {:.2f}'.format(sum(scores) / len(scores))) 153 | print('Hallucination rate: {:.2f}'.format(sum(hallucination) / len(hallucination))) 154 | print('Average score for each question type:', ','.join([str(round(sum(scores_each[i]) / len(scores_each[i]), 2)) for i in range(8)]), flush=True) 155 | -------------------------------------------------------------------------------- /Eval/eval_gpt_review.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | 5 | import openai 6 | import tqdm 7 | import ray 8 | import time 9 | 10 | NUM_SECONDS_TO_SLEEP = 3 11 | 12 | @ray.remote(num_cpus=4) 13 | def get_eval(content: str, max_tokens: int): 14 | while True: 15 | try: 16 | response = openai.ChatCompletion.create( 17 | model='gpt-4', 18 | messages=[{ 19 | 'role': 'system', 20 | 'content': 'You are a helpful and precise assistant for checking the quality of the answer.' 21 | }, { 22 | 'role': 'user', 23 | 'content': content, 24 | }], 25 | temperature=0.2, # TODO: figure out which temperature is best for evaluation 26 | max_tokens=max_tokens, 27 | ) 28 | break 29 | except openai.error.RateLimitError: 30 | pass 31 | except Exception as e: 32 | print(e) 33 | time.sleep(NUM_SECONDS_TO_SLEEP) 34 | 35 | print('success!') 36 | return response['choices'][0]['message']['content'] 37 | 38 | 39 | def parse_score(review): 40 | try: 41 | score_pair = review.split('\n')[0] 42 | score_pair = score_pair.replace(',', ' ') 43 | sp = score_pair.split(' ') 44 | if len(sp) == 2: 45 | return [float(sp[0]), float(sp[1])] 46 | else: 47 | print('error', review) 48 | return [-1, -1] 49 | except Exception as e: 50 | print(e) 51 | print('error', review) 52 | return [-1, -1] 53 | 54 | 55 | if __name__ == '__main__': 56 | parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.') 57 | parser.add_argument('-q', '--question') 58 | # parser.add_argument('-a', '--answer') 59 | parser.add_argument('-a', '--answer-list', nargs='+', default=[]) 60 | parser.add_argument('-r', '--rule') 61 | parser.add_argument('-o', '--output') 62 | parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output') 63 | args = parser.parse_args() 64 | 65 | ray.init() 66 | 67 | f_q = open(os.path.expanduser(args.question)) 68 | f_ans1 = open(os.path.expanduser(args.answer_list[0])) 69 | f_ans2 = open(os.path.expanduser(args.answer_list[1])) 70 | rule_dict = json.load(open(os.path.expanduser(args.rule), 'r')) 71 | 72 | review_file = open(f'{args.output}', 'w') 73 | 74 | js_list = [] 75 | handles = [] 76 | idx = 0 77 | for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2): 78 | # if idx == 1: 79 | # break 80 | 81 | ques = json.loads(ques_js) 82 | ans1 = json.loads(ans1_js) 83 | ans2 = json.loads(ans2_js) 84 | 85 | category = json.loads(ques_js)['category'] 86 | if category in rule_dict: 87 | rule = rule_dict[category] 88 | else: 89 | rule = rule_dict['default'] 90 | prompt = rule['prompt'] 91 | role = rule['role'] 92 | content = (f'[Question]\n{ques["text"]}\n\n' 93 | f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n' 94 | f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n' 95 | f'[System]\n{prompt}\n\n') 96 | js_list.append({ 97 | 'id': idx+1, 98 | 'question_id': ques['question_id'], 99 | 'answer1_id': ans1['answer_id'], 100 | 'answer2_id': ans2['answer_id'], 101 | 'category': category}) 102 | idx += 1 103 | handles.append(get_eval.remote(content, args.max_tokens)) 104 | # To avoid the rate limit set by OpenAI 105 | time.sleep(NUM_SECONDS_TO_SLEEP) 106 | 107 | reviews = ray.get(handles) 108 | for idx, review in enumerate(reviews): 109 | scores = parse_score(review) 110 | js_list[idx]['content'] = review 111 | js_list[idx]['tuple'] = scores 112 | review_file.write(json.dumps(js_list[idx]) + '\n') 113 | review_file.close() 114 | -------------------------------------------------------------------------------- /Eval/eval_gpt_review_bench.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | 5 | import openai 6 | import time 7 | 8 | NUM_SECONDS_TO_SLEEP = 0.5 9 | 10 | 11 | def get_eval(content: str, max_tokens: int): 12 | while True: 13 | try: 14 | response = openai.ChatCompletion.create( 15 | model='gpt-4-0314', 16 | messages=[{ 17 | 'role': 'system', 18 | 'content': 'You are a helpful and precise assistant for checking the quality of the answer.' 19 | }, { 20 | 'role': 'user', 21 | 'content': content, 22 | }], 23 | temperature=0.2, # TODO: figure out which temperature is best for evaluation 24 | max_tokens=max_tokens, 25 | ) 26 | break 27 | except openai.error.RateLimitError: 28 | pass 29 | except Exception as e: 30 | print(e) 31 | time.sleep(NUM_SECONDS_TO_SLEEP) 32 | 33 | return response['choices'][0]['message']['content'] 34 | 35 | 36 | def parse_score(review): 37 | try: 38 | score_pair = review.split('\n')[0] 39 | score_pair = score_pair.replace(',', ' ') 40 | sp = score_pair.split(' ') 41 | if len(sp) == 2: 42 | return [float(sp[0]), float(sp[1])] 43 | else: 44 | print('error', review) 45 | return [-1, -1] 46 | except Exception as e: 47 | print(e) 48 | print('error', review) 49 | return [-1, -1] 50 | 51 | 52 | if __name__ == '__main__': 53 | parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.') 54 | parser.add_argument('-q', '--question') 55 | parser.add_argument('-c', '--context') 56 | parser.add_argument('-a', '--answer-list', nargs='+', default=[]) 57 | parser.add_argument('-r', '--rule') 58 | parser.add_argument('-o', '--output') 59 | parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output') 60 | args = parser.parse_args() 61 | 62 | f_q = open(os.path.expanduser(args.question)) 63 | f_ans1 = open(os.path.expanduser(args.answer_list[0])) 64 | f_ans2 = open(os.path.expanduser(args.answer_list[1])) 65 | rule_dict = json.load(open(os.path.expanduser(args.rule), 'r')) 66 | 67 | if os.path.isfile(os.path.expanduser(args.output)): 68 | cur_reviews = [json.loads(line) for line in open(os.path.expanduser(args.output))] 69 | else: 70 | cur_reviews = [] 71 | 72 | review_file = open(f'{args.output}', 'a') 73 | 74 | context_list = [json.loads(line) for line in open(os.path.expanduser(args.context))] 75 | image_to_context = {context['image']: context for context in context_list} 76 | 77 | handles = [] 78 | idx = 0 79 | for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2): 80 | ques = json.loads(ques_js) 81 | ans1 = json.loads(ans1_js) 82 | ans2 = json.loads(ans2_js) 83 | 84 | inst = image_to_context[ques['image']] 85 | cap_str = '\n'.join(inst['caption']) 86 | 87 | category = 'llava_bench_' + json.loads(ques_js)['category'] 88 | if category in rule_dict: 89 | rule = rule_dict[category] 90 | else: 91 | assert False, f"Visual QA category not found in rule file: {category}." 92 | prompt = rule['prompt'] 93 | role = rule['role'] 94 | content = (f'[Context]\n{cap_str}\n\n' 95 | f'[Question]\n{ques["text"]}\n\n' 96 | f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n' 97 | f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n' 98 | f'[System]\n{prompt}\n\n') 99 | cur_js = { 100 | 'id': idx+1, 101 | 'question_id': ques['question_id'], 102 | 'answer1_id': ans1.get('answer_id', ans1['question_id']), 103 | 'answer2_id': ans2.get('answer_id', ans2['answer_id']), 104 | 'category': category 105 | } 106 | if idx >= len(cur_reviews): 107 | review = get_eval(content, args.max_tokens) 108 | scores = parse_score(review) 109 | cur_js['content'] = review 110 | cur_js['tuple'] = scores 111 | review_file.write(json.dumps(cur_js) + '\n') 112 | review_file.flush() 113 | else: 114 | print(f'Skipping {idx} as we already have it.') 115 | idx += 1 116 | print(idx) 117 | review_file.close() 118 | -------------------------------------------------------------------------------- /Eval/eval_gpt_review_visual.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | 5 | import openai 6 | import time 7 | from tqdm import tqdm 8 | NUM_SECONDS_TO_SLEEP = 0.5 9 | 10 | 11 | def get_eval(content: str, max_tokens: int): 12 | while True: 13 | try: 14 | response = openai.ChatCompletion.create( 15 | model='gpt-4-0314', 16 | messages=[{ 17 | 'role': 'system', 18 | 'content': 'You are a helpful and precise assistant for checking the quality of the answer.' 19 | }, { 20 | 'role': 'user', 21 | 'content': content, 22 | }], 23 | temperature=0.2, # TODO: figure out which temperature is best for evaluation 24 | max_tokens=max_tokens, 25 | ) 26 | break 27 | except openai.error.RateLimitError: 28 | pass 29 | except Exception as e: 30 | print(e) 31 | time.sleep(NUM_SECONDS_TO_SLEEP) 32 | 33 | return response['choices'][0]['message']['content'] 34 | 35 | 36 | def parse_score(review): 37 | try: 38 | score_pair = review.split('\n')[0] 39 | score_pair = score_pair.replace(',', ' ') 40 | sp = score_pair.split(' ') 41 | if len(sp) == 2: 42 | return [float(sp[0]), float(sp[1])] 43 | else: 44 | print('error', review) 45 | return [-1, -1] 46 | except Exception as e: 47 | print(e) 48 | print('error', review) 49 | return [-1, -1] 50 | 51 | 52 | if __name__ == '__main__': 53 | parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.') 54 | parser.add_argument('-q', '--question') 55 | parser.add_argument('-c', '--context') 56 | parser.add_argument('-a', '--answer-list', nargs='+', default=[]) 57 | parser.add_argument('-r', '--rule') 58 | parser.add_argument('-o', '--output') 59 | parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output') 60 | args = parser.parse_args() 61 | 62 | f_q = open(os.path.expanduser(args.question)) 63 | f_ans1 = open(os.path.expanduser(args.answer_list[0])) 64 | f_ans2 = open(os.path.expanduser(args.answer_list[1])) 65 | rule_dict = json.load(open(os.path.expanduser(args.rule), 'r')) 66 | 67 | if os.path.isfile(os.path.expanduser(args.output)): 68 | cur_reviews = [json.loads(line) for line in open(os.path.expanduser(args.output))] 69 | else: 70 | cur_reviews = [] 71 | 72 | review_file = open(f'{args.output}', 'a') 73 | 74 | context_list = [json.loads(line) for line in open(os.path.expanduser(args.context))] 75 | image_to_context = {context['image']: context for context in context_list} 76 | 77 | handles = [] 78 | idx = 0 79 | for ques_js, ans1_js, ans2_js in tqdm(zip(f_q, f_ans1, f_ans2)): 80 | ques = json.loads(ques_js) 81 | ans1 = json.loads(ans1_js) 82 | ans2 = json.loads(ans2_js) 83 | 84 | inst = image_to_context[ques['image']] 85 | cap_str = '\n'.join(inst['captions']) 86 | box_str = '\n'.join([f'{instance["category"]}: {instance["bbox"]}' for instance in inst['instances']]) 87 | 88 | category = json.loads(ques_js)['category'] 89 | if category in rule_dict: 90 | rule = rule_dict[category] 91 | else: 92 | assert False, f"Visual QA category not found in rule file: {category}." 93 | prompt = rule['prompt'] 94 | role = rule['role'] 95 | content = (f'[Context]\n{cap_str}\n\n{box_str}\n\n' 96 | f'[Question]\n{ques["text"]}\n\n' 97 | f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n' 98 | f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n' 99 | f'[System]\n{prompt}\n\n') 100 | cur_js = { 101 | 'id': idx+1, 102 | 'question_id': ques['question_id'], 103 | 'answer1_id': ans1.get('answer_id', ans1['question_id']), 104 | 'answer2_id': ans2.get('answer_id', ans2['answer_id']), 105 | 'category': category 106 | } 107 | if idx >= len(cur_reviews): 108 | review = get_eval(content, args.max_tokens) 109 | scores = parse_score(review) 110 | cur_js['content'] = review 111 | cur_js['tuple'] = scores 112 | review_file.write(json.dumps(cur_js) + '\n') 113 | review_file.flush() 114 | else: 115 | print(f'Skipping {idx} as we already have it.') 116 | idx += 1 117 | # print(idx) 118 | review_file.close() 119 | -------------------------------------------------------------------------------- /Eval/eval_pope.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from transformers import AutoTokenizer, AutoModelForCausalLM 3 | import torch 4 | import os 5 | import sys 6 | import json 7 | from llava.conversation import conv_templates, SeparatorStyle 8 | from llava.utils import disable_torch_init 9 | from transformers import CLIPVisionModel, CLIPImageProcessor, StoppingCriteria 10 | from llava.model import * 11 | from llava.model.utils import KeywordsStoppingCriteria 12 | from llava.utils import disable_torch_init 13 | from PIL import Image 14 | 15 | import os 16 | from tqdm import tqdm 17 | import requests 18 | from PIL import Image 19 | from io import BytesIO 20 | from glob import glob 21 | 22 | 23 | IGNORE_INDEX = -100 24 | IMAGE_TOKEN_INDEX = -200 25 | DEFAULT_IMAGE_TOKEN = "" 26 | DEFAULT_IMAGE_PATCH_TOKEN = "" 27 | DEFAULT_IM_START_TOKEN = "" 28 | DEFAULT_IM_END_TOKEN = "" 29 | # image_dir = "/mnt/bn/data-tns-algo-masp/data/coco/val2017" 30 | 31 | def divide_chunks(l, n=2): 32 | # looping till length l 33 | for i in range(0, len(l), n): 34 | yield l[i:i + n] 35 | 36 | return 37 | 38 | def load_image(image_file): 39 | if image_file.startswith('http') or image_file.startswith('https'): 40 | response = requests.get(image_file) 41 | image = Image.open(BytesIO(response.content)).convert('RGB') 42 | else: 43 | image = Image.open(image_file).convert('RGB') 44 | return image 45 | 46 | def load_images(image_folder): 47 | image_files = glob(os.path.join(image_folder, '*')) 48 | images = [] 49 | for image_file in image_files: 50 | images.append(load_image(image_file)) 51 | return images 52 | 53 | def read_sources(source_file): 54 | # task_txt = "/mnt/bd/bohanzhaiv1/LLM/bohan/Awesome-Multimodal-Large-Language-Models/tools/eval_tool/LaVIN/existence.txt" 55 | # lines = open(task_txt, 'r').readlines() 56 | lines = json.load(open(source_file, 'r')) 57 | chunk_lines = list(lines) # one image corresponds to two questions 58 | return chunk_lines 59 | 60 | def model_inference(model, tokenizer, question, image_path, image_processor): 61 | conv = conv_templates["multimodal"].copy() 62 | conv.append_message(conv.roles[0], question) 63 | conv.append_message(conv.roles[1], None) 64 | prompt = conv.get_prompt() 65 | inputs = tokenizer([prompt]) 66 | image = load_image(image_path) 67 | image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0] 68 | 69 | input_ids = torch.as_tensor(inputs.input_ids).cuda() 70 | 71 | stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2 72 | keywords = [stop_str] 73 | stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids) 74 | with torch.inference_mode(): 75 | output_ids = model.generate( 76 | input_ids, 77 | images=image_tensor.unsqueeze(0).half().cuda(), 78 | do_sample=True, 79 | temperature=0.2, 80 | max_new_tokens=1024, 81 | stopping_criteria=[stopping_criteria]) 82 | 83 | input_token_len = input_ids.shape[1] 84 | n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() 85 | if n_diff_input_output > 0: 86 | print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 87 | outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 88 | outputs = outputs.strip() 89 | if outputs.endswith(stop_str): 90 | outputs = outputs[:-len(stop_str)] 91 | outputs = outputs.strip() 92 | return outputs 93 | 94 | 95 | 96 | def process_line(line): 97 | question = line['text'] 98 | ans = line['label'] 99 | image_name = line['image'] 100 | return image_name, question, ans 101 | 102 | def tokenizer_image_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None): 103 | prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split('')] 104 | 105 | def insert_separator(X, sep): 106 | return [ele for sublist in zip(X, [sep]*len(X)) for ele in sublist][:-1] 107 | 108 | input_ids = [] 109 | offset = 0 110 | if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id: 111 | offset = 1 112 | input_ids.append(prompt_chunks[0][0]) 113 | 114 | for x in insert_separator(prompt_chunks, [image_token_index] * (offset + 1)): 115 | input_ids.extend(x[offset:]) 116 | 117 | if return_tensors is not None: 118 | if return_tensors == 'pt': 119 | return torch.tensor(input_ids, dtype=torch.long) 120 | raise ValueError(f'Unsupported tensor type: {return_tensors}') 121 | return input_ids 122 | 123 | 124 | def main(): 125 | disable_torch_init() 126 | image_dir = '/mnt/bn/algo-masp-nas-2/masp_data/coco_2014/val2014' 127 | 128 | model_name = "/mnt/bn/algo-masp-nas-2/weights/llava/LLaVA-13b-v1-1" 129 | model_type = 'llava' 130 | 131 | model = LlavaLlamaForCausalLM.from_pretrained(model_name, low_cpu_mem_usage=True, torch_dtype=torch.float16, use_cache=True).cuda() 132 | image_processor = CLIPImageProcessor.from_pretrained(model.config.mm_vision_tower, torch_dtype=torch.float16) 133 | tokenizer = AutoTokenizer.from_pretrained(model_name) 134 | mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False) 135 | 136 | tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True) 137 | if mm_use_im_start_end: 138 | tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True) 139 | 140 | vision_tower = model.get_model().vision_tower[0] 141 | # vision_tower.to(device='cuda', dtype=torch.float16)\ 142 | 143 | vision_config = vision_tower.config 144 | vision_config.im_patch_token = tokenizer.convert_tokens_to_ids([DEFAULT_IMAGE_PATCH_TOKEN])[0] 145 | vision_config.use_im_start_end = mm_use_im_start_end 146 | if mm_use_im_start_end: 147 | vision_config.im_start_token, vision_config.im_end_token = tokenizer.convert_tokens_to_ids([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN]) 148 | 149 | image_token_len = (vision_config.image_size // vision_config.patch_size) ** 2 150 | 151 | 152 | ck_lines = read_sources("/mnt/bd/bohanzhaiv1/LLM/bohan/POPE/output/coco/coco_pope_adversarial.json") 153 | results = [] 154 | for i, ck_line in tqdm(enumerate(ck_lines), total=len(ck_lines)): 155 | image_name, question, ans = process_line(ck_line) 156 | rt = {'question_id':ck_line['question_id'], 'image':image_name, 'text':question} 157 | image_path = os.path.join(image_dir, image_name) 158 | qs = question 159 | if mm_use_im_start_end: 160 | qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_PATCH_TOKEN * image_token_len + DEFAULT_IM_END_TOKEN + '\n' + qs 161 | else: 162 | qs = DEFAULT_IMAGE_PATCH_TOKEN * image_token_len + '\n' + qs 163 | 164 | if model_type == 'mpt': 165 | conv_mode = "mpt_multimodal" 166 | else: 167 | conv_mode = "multimodal" 168 | conv_mode = "vicuna_v1_1" 169 | # conv_mode = "v1" 170 | # conv_mode = "vicuna_v1" 171 | 172 | conv = conv_templates[conv_mode].copy() 173 | conv.append_message(conv.roles[0], qs) 174 | conv.append_message(conv.roles[1], None) 175 | prompt = conv.get_prompt() 176 | inputs = tokenizer([prompt]) 177 | 178 | image = load_image(image_path) 179 | image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0] 180 | 181 | input_ids = torch.as_tensor(inputs.input_ids).cuda() 182 | 183 | stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2 184 | keywords = [stop_str] 185 | stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids) 186 | 187 | with torch.inference_mode(): 188 | output_ids = model.generate( 189 | input_ids, 190 | images=image_tensor.unsqueeze(0).half().cuda(), 191 | do_sample=True, 192 | num_beams=1, 193 | temperature=0.2, 194 | max_new_tokens=1024, 195 | use_cache=True, 196 | stopping_criteria=[stopping_criteria]) 197 | 198 | input_token_len = input_ids.shape[1] 199 | n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() 200 | if n_diff_input_output > 0: 201 | print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 202 | outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 203 | outputs = outputs.strip() 204 | if outputs.endswith(stop_str): 205 | outputs = outputs[:-len(stop_str)] 206 | outputs = outputs.strip().replace('\n', '') 207 | rt['answer'] = outputs 208 | results.append(rt) 209 | 210 | with open('/mnt/bd/bohanzhaiv1/LLM/bohan/POPE/answer/coco_pope_adversarial.json', 'w') as f: 211 | json.dump(results, f) 212 | 213 | 214 | if __name__ == "__main__": 215 | main() -------------------------------------------------------------------------------- /Eval/eval_science_qa.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | import re 5 | import random 6 | 7 | 8 | def get_args(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--base-dir', type=str) 11 | parser.add_argument('--result-file', type=str) 12 | parser.add_argument('--output-file', type=str) 13 | parser.add_argument('--output-result', type=str) 14 | parser.add_argument('--split', type=str, default='test') 15 | parser.add_argument('--options', type=list, default=["A", "B", "C", "D", "E"]) 16 | return parser.parse_args() 17 | 18 | 19 | def convert_caps(results): 20 | fakecaps = [] 21 | for result in results: 22 | image_id = result['question_id'] 23 | caption = result['text'] 24 | fakecaps.append({"image_id": int(image_id), "caption": caption}) 25 | return fakecaps 26 | 27 | 28 | def get_pred_idx(prediction, choices, options): 29 | """ 30 | Get the index (e.g. 2) from the prediction (e.g. 'C') 31 | """ 32 | if prediction in options[:len(choices)]: 33 | return options.index(prediction) 34 | else: 35 | return random.choice(range(len(choices))) 36 | 37 | 38 | if __name__ == "__main__": 39 | args = get_args() 40 | 41 | base_dir = args.base_dir 42 | split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[args.split] 43 | problems = json.load(open(os.path.join(base_dir, "problems.json"))) 44 | predictions = [json.loads(line) for line in open(args.result_file)] 45 | predictions = {pred['question_id']: pred for pred in predictions} 46 | split_problems = {idx: problems[idx] for idx in split_indices} 47 | 48 | results = {'correct': [], 'incorrect': []} 49 | sqa_results = {} 50 | sqa_results['acc'] = None 51 | sqa_results['correct'] = None 52 | sqa_results['count'] = None 53 | sqa_results['results'] = {} 54 | sqa_results['outputs'] = {} 55 | 56 | for prob_id, prob in split_problems.items(): 57 | if prob_id not in predictions: 58 | continue 59 | pred = predictions[prob_id] 60 | pred_text = pred['text'] 61 | 62 | pattern = re.compile(r'The answer is ([A-Z]).') 63 | res = pattern.findall(pred_text) 64 | if len(res) == 1: 65 | answer = res[0] # 'A', 'B', ... 66 | else: 67 | answer = "FAILED" 68 | 69 | pred_idx = get_pred_idx(answer, prob['choices'], args.options) 70 | 71 | analysis = { 72 | 'question_id': prob_id, 73 | 'parsed_ans': answer, 74 | 'ground_truth': args.options[prob['answer']], 75 | 'question': pred['prompt'], 76 | 'pred': pred_text, 77 | 'is_multimodal': '' in pred['prompt'], 78 | } 79 | 80 | sqa_results['results'][prob_id] = get_pred_idx(answer, prob['choices'], args.options) 81 | sqa_results['outputs'][prob_id] = pred_text 82 | 83 | if pred_idx == prob['answer']: 84 | results['correct'].append(analysis) 85 | else: 86 | results['incorrect'].append(analysis) 87 | 88 | correct = len(results['correct']) 89 | total = len(results['correct']) + len(results['incorrect']) 90 | print(f'Total: {total}, Correct: {correct}, Accuracy: {correct / total * 100:.2f}%') 91 | 92 | sqa_results['acc'] = correct / total * 100 93 | sqa_results['correct'] = correct 94 | sqa_results['count'] = total 95 | 96 | with open(args.output_file, 'w') as f: 97 | json.dump(results, f, indent=2) 98 | with open(args.output_result, 'w') as f: 99 | json.dump(sqa_results, f, indent=2) 100 | -------------------------------------------------------------------------------- /Eval/eval_science_qa_gpt4.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | import re 5 | import random 6 | from collections import defaultdict 7 | 8 | 9 | def get_args(): 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--base-dir', type=str) 12 | parser.add_argument('--gpt4-result', type=str) 13 | parser.add_argument('--our-result', type=str) 14 | parser.add_argument('--split', type=str, default='test') 15 | parser.add_argument('--options', type=list, default=["A", "B", "C", "D", "E"]) 16 | return parser.parse_args() 17 | 18 | 19 | def convert_caps(results): 20 | fakecaps = [] 21 | for result in results: 22 | image_id = result['question_id'] 23 | caption = result['text'] 24 | fakecaps.append({"image_id": int(image_id), "caption": caption}) 25 | return fakecaps 26 | 27 | 28 | def get_pred_idx(prediction, choices, options): 29 | """ 30 | Get the index (e.g. 2) from the prediction (e.g. 'C') 31 | """ 32 | if prediction in options[:len(choices)]: 33 | return options.index(prediction) 34 | else: 35 | return random.choice(range(len(choices))) 36 | 37 | 38 | if __name__ == "__main__": 39 | args = get_args() 40 | 41 | base_dir = args.base_dir 42 | split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[args.split] 43 | problems = json.load(open(os.path.join(base_dir, "problems.json"))) 44 | our_predictions = [json.loads(line) for line in open(args.our_result)] 45 | our_predictions = {pred['question_id']: pred for pred in our_predictions} 46 | split_problems = {idx: problems[idx] for idx in split_indices} 47 | 48 | gpt4_predictions = json.load(open(args.gpt4_result))['outputs'] 49 | 50 | results = defaultdict(lambda: 0) 51 | 52 | for prob_id, prob in split_problems.items(): 53 | if prob_id not in our_predictions: 54 | continue 55 | if prob_id not in gpt4_predictions: 56 | continue 57 | our_pred = our_predictions[prob_id]['text'] 58 | gpt4_pred = gpt4_predictions[prob_id] 59 | 60 | pattern = re.compile(r'The answer is ([A-Z]).') 61 | our_res = pattern.findall(our_pred) 62 | if len(our_res) == 1: 63 | our_answer = our_res[0] # 'A', 'B', ... 64 | else: 65 | our_answer = "FAILED" 66 | gpt4_res = pattern.findall(gpt4_pred) 67 | if len(gpt4_res) == 1: 68 | gpt4_answer = gpt4_res[0] # 'A', 'B', ... 69 | else: 70 | gpt4_answer = "FAILED" 71 | 72 | our_pred_idx = get_pred_idx(our_answer, prob['choices'], args.options) 73 | gpt4_pred_idx = get_pred_idx(gpt4_answer, prob['choices'], args.options) 74 | 75 | if gpt4_answer == 'FAILED': 76 | results['gpt4_failed'] += 1 77 | # continue 78 | gpt4_pred_idx = our_pred_idx 79 | # if our_pred_idx != prob['answer']: 80 | # print(our_predictions[prob_id]['prompt']) 81 | # print('-----------------') 82 | # print(f'LECTURE: {prob["lecture"]}') 83 | # print(f'SOLUTION: {prob["solution"]}') 84 | # print('=====================') 85 | else: 86 | # continue 87 | pass 88 | # gpt4_pred_idx = our_pred_idx 89 | 90 | if gpt4_pred_idx == prob['answer']: 91 | results['correct'] += 1 92 | else: 93 | results['incorrect'] += 1 94 | 95 | 96 | if gpt4_pred_idx == prob['answer'] or our_pred_idx == prob['answer']: 97 | results['correct_upperbound'] += 1 98 | 99 | correct = results['correct'] 100 | total = results['correct'] + results['incorrect'] 101 | print(f'Total: {total}, Correct: {correct}, Accuracy: {correct / total * 100:.2f}%') 102 | print(f'Total: {total}, Correct (upper): {results["correct_upperbound"]}, Accuracy: {results["correct_upperbound"] / total * 100:.2f}%') 103 | print(f'Total: {total}, GPT-4 NO-ANS (RANDOM): {results["gpt4_failed"]}, Percentage: {results["gpt4_failed"] / total * 100:.2f}%') 104 | 105 | -------------------------------------------------------------------------------- /Eval/eval_science_qa_gpt4_requery.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | import re 5 | import random 6 | from collections import defaultdict 7 | 8 | 9 | def get_args(): 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--base-dir', type=str) 12 | parser.add_argument('--gpt4-result', type=str) 13 | parser.add_argument('--requery-result', type=str) 14 | parser.add_argument('--our-result', type=str) 15 | parser.add_argument('--output-result', type=str) 16 | parser.add_argument('--split', type=str, default='test') 17 | parser.add_argument('--options', type=list, default=["A", "B", "C", "D", "E"]) 18 | return parser.parse_args() 19 | 20 | 21 | def convert_caps(results): 22 | fakecaps = [] 23 | for result in results: 24 | image_id = result['question_id'] 25 | caption = result['text'] 26 | fakecaps.append({"image_id": int(image_id), "caption": caption}) 27 | return fakecaps 28 | 29 | 30 | def get_pred_idx(prediction, choices, options): 31 | """ 32 | Get the index (e.g. 2) from the prediction (e.g. 'C') 33 | """ 34 | if prediction in options[:len(choices)]: 35 | return options.index(prediction) 36 | else: 37 | return random.choice(range(len(choices))) 38 | 39 | 40 | if __name__ == "__main__": 41 | args = get_args() 42 | 43 | base_dir = args.base_dir 44 | split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[args.split] 45 | problems = json.load(open(os.path.join(base_dir, "problems.json"))) 46 | our_predictions = [json.loads(line) for line in open(args.our_result)] 47 | our_predictions = {pred['question_id']: pred for pred in our_predictions} 48 | split_problems = {idx: problems[idx] for idx in split_indices} 49 | 50 | requery_predictions = [json.loads(line) for line in open(args.requery_result)] 51 | requery_predictions = {pred['question_id']: pred for pred in requery_predictions} 52 | 53 | gpt4_predictions = json.load(open(args.gpt4_result))['outputs'] 54 | 55 | results = defaultdict(lambda: 0) 56 | 57 | sqa_results = {} 58 | sqa_results['acc'] = None 59 | sqa_results['correct'] = None 60 | sqa_results['count'] = None 61 | sqa_results['results'] = {} 62 | sqa_results['outputs'] = {} 63 | 64 | for prob_id, prob in split_problems.items(): 65 | if prob_id not in our_predictions: 66 | assert False 67 | if prob_id not in gpt4_predictions: 68 | assert False 69 | our_pred = our_predictions[prob_id]['text'] 70 | gpt4_pred = gpt4_predictions[prob_id] 71 | if prob_id not in requery_predictions: 72 | results['missing_requery'] += 1 73 | requery_pred = "MISSING" 74 | else: 75 | requery_pred = requery_predictions[prob_id]['text'] 76 | 77 | pattern = re.compile(r'The answer is ([A-Z]).') 78 | our_res = pattern.findall(our_pred) 79 | if len(our_res) == 1: 80 | our_answer = our_res[0] # 'A', 'B', ... 81 | else: 82 | our_answer = "FAILED" 83 | 84 | requery_res = pattern.findall(requery_pred) 85 | if len(requery_res) == 1: 86 | requery_answer = requery_res[0] # 'A', 'B', ... 87 | else: 88 | requery_answer = "FAILED" 89 | 90 | gpt4_res = pattern.findall(gpt4_pred) 91 | if len(gpt4_res) == 1: 92 | gpt4_answer = gpt4_res[0] # 'A', 'B', ... 93 | else: 94 | gpt4_answer = "FAILED" 95 | 96 | our_pred_idx = get_pred_idx(our_answer, prob['choices'], args.options) 97 | gpt4_pred_idx = get_pred_idx(gpt4_answer, prob['choices'], args.options) 98 | requery_pred_idx = get_pred_idx(requery_answer, prob['choices'], args.options) 99 | 100 | results['total'] += 1 101 | 102 | if gpt4_answer == 'FAILED': 103 | results['gpt4_failed'] += 1 104 | if gpt4_pred_idx == prob['answer']: 105 | results['gpt4_correct'] += 1 106 | if our_pred_idx == prob['answer']: 107 | results['gpt4_ourvisual_correct'] += 1 108 | elif gpt4_pred_idx == prob['answer']: 109 | results['gpt4_correct'] += 1 110 | results['gpt4_ourvisual_correct'] += 1 111 | 112 | if our_pred_idx == prob['answer']: 113 | results['our_correct'] += 1 114 | 115 | if requery_answer == 'FAILED': 116 | sqa_results['results'][prob_id] = our_pred_idx 117 | if our_pred_idx == prob['answer']: 118 | results['requery_correct'] += 1 119 | else: 120 | sqa_results['results'][prob_id] = requery_pred_idx 121 | if requery_pred_idx == prob['answer']: 122 | results['requery_correct'] += 1 123 | else: 124 | print(f""" 125 | Question ({args.options[prob['answer']]}): {our_predictions[prob_id]['prompt']} 126 | Our ({our_answer}): {our_pred} 127 | GPT-4 ({gpt4_answer}): {gpt4_pred} 128 | Requery ({requery_answer}): {requery_pred} 129 | print("=====================================") 130 | """) 131 | 132 | if gpt4_pred_idx == prob['answer'] or our_pred_idx == prob['answer']: 133 | results['correct_upperbound'] += 1 134 | 135 | total = results['total'] 136 | print(f'Total: {total}, Our-Correct: {results["our_correct"]}, Accuracy: {results["our_correct"] / total * 100:.2f}%') 137 | print(f'Total: {total}, GPT-4-Correct: {results["gpt4_correct"]}, Accuracy: {results["gpt4_correct"] / total * 100:.2f}%') 138 | print(f'Total: {total}, GPT-4 NO-ANS (RANDOM): {results["gpt4_failed"]}, Percentage: {results["gpt4_failed"] / total * 100:.2f}%') 139 | print(f'Total: {total}, GPT-4-OursVisual-Correct: {results["gpt4_ourvisual_correct"]}, Accuracy: {results["gpt4_ourvisual_correct"] / total * 100:.2f}%') 140 | print(f'Total: {total}, Requery-Correct: {results["requery_correct"]}, Accuracy: {results["requery_correct"] / total * 100:.2f}%') 141 | print(f'Total: {total}, Correct upper: {results["correct_upperbound"]}, Accuracy: {results["correct_upperbound"] / total * 100:.2f}%') 142 | 143 | sqa_results['acc'] = results["requery_correct"] / total * 100 144 | sqa_results['correct'] = results["requery_correct"] 145 | sqa_results['count'] = total 146 | 147 | with open(args.output_result, 'w') as f: 148 | json.dump(sqa_results, f, indent=2) 149 | 150 | -------------------------------------------------------------------------------- /Eval/eval_scripts/eval_llava_bench.sh: -------------------------------------------------------------------------------- 1 | # POPE Evaluation 2 | export HF_HOME=/shared/sheng/huggingface 3 | export XDG_CACHE_HOME=/shared/sheng/ 4 | 5 | export CUDA_VISIBLE_DEVICES=2 6 | 7 | MODEL_BASE=LLaVA-RLHF-13b-v1.5-336/sft_model 8 | MODEL_QLORA_BASE=LLaVA-RL-Fact-RLHF-13b-v1.5-336-lora-padding 9 | MODEL_SUFFIX=$MODEL_QLORA_BASE 10 | 11 | python model_vqa.py \ 12 | --model-path ./checkpoints/${MODEL_BASE}/ \ 13 | --use-qlora True --qlora-path ./checkpoints/${MODEL_QLORA_BASE} \ 14 | --question-file \ 15 | ./llava/qa90_questions.jsonl \ 16 | --image-folder \ 17 | ./eval_image/ \ 18 | --answers-file \ 19 | ./eval/llava/answer-file-${MODEL_SUFFIX}.jsonl --image_aspect_ratio pad --test-prompt '' 20 | 21 | OPENAI_API_KEY="" python eval_gpt_review_visual.py \ 22 | --question ./llava/qa90_questions.jsonl \ 23 | --context ./table/caps_boxes_coco2014_val_80.jsonl \ 24 | --answer-list \ 25 | ./llava/qa90_gpt4_answer.jsonl \ 26 | ./eval/llava/answer-file-${MODEL_SUFFIX}.jsonl \ 27 | --rule ./table/rule.json \ 28 | --output ./eval/llava/review/review-file-${MODEL_SUFFIX}.jsonl 29 | 30 | python summarize_gpt_review.py -d ./eval/llava/review/ -f review-file-${MODEL_SUFFIX}.jsonl 31 | -------------------------------------------------------------------------------- /Eval/eval_scripts/eval_mmbench.sh: -------------------------------------------------------------------------------- 1 | # POPE Evaluation 2 | export HF_HOME=/shared/sheng/huggingface 3 | export XDG_CACHE_HOME=/shared/sheng/ 4 | 5 | MMBENCH_CAT='dev' 6 | 7 | export CUDA_VISIBLE_DEVICES=2 8 | 9 | MODEL_BASE=LLaVA-RLHF-13b-v1.5-336/sft_model 10 | MODEL_QLORA_BASE=LLaVA-RL-Fact-RLHF-13b-v1.5-336-lora-padding 11 | MODEL_SUFFIX=$MODEL_QLORA_BASE 12 | 13 | python model_mmbench.py \ 14 | --short_eval True \ 15 | --model-path ./checkpoints/${MODEL_BASE}/ \ 16 | --use-qlora True --qlora-path ./checkpoints/${MODEL_QLORA_BASE} \ 17 | --question-file \ 18 | ./mmbench/mmbench_${MMBENCH_CAT}_20230712.tsv \ 19 | --image-folder \ 20 | ./eval_image/ \ 21 | --answers-file \ 22 | ./eval/mmbench/answer-file-${MODEL_SUFFIX}_${MMBENCH_CAT}$.xlsx --image_aspect_ratio square --test-prompt '\nAnswer the question using a single word or phrase.' 23 | 24 | # submit the answer file to https://opencompass.org.cn/mmbench-submission 25 | 26 | -------------------------------------------------------------------------------- /Eval/eval_scripts/eval_mmhal.sh: -------------------------------------------------------------------------------- 1 | # MMHal-Bench Evaluation 2 | export HF_HOME=/shared/sheng/huggingface 3 | export XDG_CACHE_HOME=/shared/sheng/ 4 | 5 | export CUDA_VISIBLE_DEVICES=2 6 | 7 | MODEL_BASE=LLaVA-RLHF-13b-v1.5-336/sft_model 8 | MODEL_QLORA_BASE=LLaVA-RL-Fact-RLHF-13b-v1.5-336-lora-padding 9 | MODEL_SUFFIX=$MODEL_QLORA_BASE 10 | 11 | python model_vqa_mmhal.py \ 12 | --model-path ./checkpoints/${MODEL_BASE}/ \ 13 | --use-qlora True --qlora-path ./checkpoints/${MODEL_QLORA_BASE} \ 14 | --temperature 0.0 \ 15 | --answers-file \ 16 | ./eval/mmhal/answer-file-${MODEL_SUFFIX}.json --image_aspect_ratio pad --test-prompt '' 17 | 18 | python eval_gpt_mmhal.py \ 19 | --response ./eval/mmhal/answer-file-${MODEL_SUFFIX}.json \ 20 | --evaluation ./eval/mmhal/review-file-${MODEL_SUFFIX}.json \ 21 | --api-key "" \ 22 | --gpt-model gpt-4-0314 23 | 24 | python summarize_gpt_mmhal.py \ 25 | --evaluation ./eval/mmhal/review-file-${MODEL_SUFFIX}.json 26 | -------------------------------------------------------------------------------- /Eval/eval_scripts/eval_pope.sh: -------------------------------------------------------------------------------- 1 | # POPE Evaluation 2 | export HF_HOME=/shared/sheng/huggingface 3 | export XDG_CACHE_HOME=/shared/sheng/ 4 | 5 | export CUDA_VISIBLE_DEVICES=2 6 | 7 | MODEL_BASE=LLaVA-RLHF-13b-v1.5-336/sft_model 8 | MODEL_QLORA_BASE=LLaVA-RL-Fact-RLHF-13b-v1.5-336-lora-padding 9 | MODEL_SUFFIX=$MODEL_QLORA_BASE 10 | 11 | for POPE_CAT in popular random adversarial; do 12 | echo ${MODEL_SUFFIX} ${POPE_CAT} 13 | python model_vqa.py \ 14 | --short_eval True \ 15 | --model-path ./checkpoints/${MODEL_BASE}/ \ 16 | --use-qlora True --qlora-path ./checkpoints/${MODEL_QLORA_BASE} \ 17 | --question-file \ 18 | ./pope/coco_pope_${POPE_CAT}.jsonl \ 19 | --image-folder \ 20 | ./eval_image/ \ 21 | --answers-file \ 22 | ./eval/pope/answer-file-${MODEL_SUFFIX}_${POPE_CAT}.jsonl --image_aspect_ratio pad --test-prompt '\nAnswer the question using a single word or phrase.' 23 | python summarize_eval_pope.py \ 24 | --answers-file ./eval/pope/answer-file-${MODEL_SUFFIX}_${POPE_CAT}.jsonl \ 25 | --label-file ./pope/coco_pope_${POPE_CAT}.jsonl 26 | done -------------------------------------------------------------------------------- /Eval/generate_webpage_data_from_table.py: -------------------------------------------------------------------------------- 1 | """Generate json file for webpage.""" 2 | import json 3 | import os 4 | import re 5 | 6 | # models = ['llama', 'alpaca', 'gpt35', 'bard'] 7 | models = ['vicuna'] 8 | 9 | 10 | def read_jsonl(path: str, key: str=None): 11 | data = [] 12 | with open(os.path.expanduser(path)) as f: 13 | for line in f: 14 | if not line: 15 | continue 16 | data.append(json.loads(line)) 17 | if key is not None: 18 | data.sort(key=lambda x: x[key]) 19 | data = {item[key]: item for item in data} 20 | return data 21 | 22 | 23 | def trim_hanging_lines(s: str, n: int) -> str: 24 | s = s.strip() 25 | for _ in range(n): 26 | s = s.split('\n', 1)[1].strip() 27 | return s 28 | 29 | 30 | if __name__ == '__main__': 31 | questions = read_jsonl('table/question.jsonl', key='question_id') 32 | 33 | # alpaca_answers = read_jsonl('table/answer/answer_alpaca-13b.jsonl', key='question_id') 34 | # bard_answers = read_jsonl('table/answer/answer_bard.jsonl', key='question_id') 35 | # gpt35_answers = read_jsonl('table/answer/answer_gpt35.jsonl', key='question_id') 36 | # llama_answers = read_jsonl('table/answer/answer_llama-13b.jsonl', key='question_id') 37 | vicuna_answers = read_jsonl('table/answer/answer_vicuna-13b.jsonl', key='question_id') 38 | ours_answers = read_jsonl('table/results/llama-13b-hf-alpaca.jsonl', key='question_id') 39 | 40 | review_vicuna = read_jsonl('table/review/review_vicuna-13b_llama-13b-hf-alpaca.jsonl', key='question_id') 41 | # review_alpaca = read_jsonl('table/review/review_alpaca-13b_vicuna-13b.jsonl', key='question_id') 42 | # review_bard = read_jsonl('table/review/review_bard_vicuna-13b.jsonl', key='question_id') 43 | # review_gpt35 = read_jsonl('table/review/review_gpt35_vicuna-13b.jsonl', key='question_id') 44 | # review_llama = read_jsonl('table/review/review_llama-13b_vicuna-13b.jsonl', key='question_id') 45 | 46 | records = [] 47 | for qid in questions.keys(): 48 | r = { 49 | 'id': qid, 50 | 'category': questions[qid]['category'], 51 | 'question': questions[qid]['text'], 52 | 'answers': { 53 | # 'alpaca': alpaca_answers[qid]['text'], 54 | # 'llama': llama_answers[qid]['text'], 55 | # 'bard': bard_answers[qid]['text'], 56 | # 'gpt35': gpt35_answers[qid]['text'], 57 | 'vicuna': vicuna_answers[qid]['text'], 58 | 'ours': ours_answers[qid]['text'], 59 | }, 60 | 'evaluations': { 61 | # 'alpaca': review_alpaca[qid]['text'], 62 | # 'llama': review_llama[qid]['text'], 63 | # 'bard': review_bard[qid]['text'], 64 | 'vicuna': review_vicuna[qid]['content'], 65 | # 'gpt35': review_gpt35[qid]['text'], 66 | }, 67 | 'scores': { 68 | 'vicuna': review_vicuna[qid]['tuple'], 69 | # 'alpaca': review_alpaca[qid]['score'], 70 | # 'llama': review_llama[qid]['score'], 71 | # 'bard': review_bard[qid]['score'], 72 | # 'gpt35': review_gpt35[qid]['score'], 73 | }, 74 | } 75 | 76 | # cleanup data 77 | cleaned_evals = {} 78 | for k, v in r['evaluations'].items(): 79 | v = v.strip() 80 | lines = v.split('\n') 81 | # trim the first line if it's a pair of numbers 82 | if re.match(r'\d+[, ]+\d+', lines[0]): 83 | lines = lines[1:] 84 | v = '\n'.join(lines) 85 | cleaned_evals[k] = v.replace('Assistant 1', "**Assistant 1**").replace('Assistant 2', '**Assistant 2**') 86 | 87 | r['evaluations'] = cleaned_evals 88 | records.append(r) 89 | 90 | # Reorder the records, this is optional 91 | for r in records: 92 | if r['id'] <= 20: 93 | r['id'] += 60 94 | else: 95 | r['id'] -= 20 96 | for r in records: 97 | if r['id'] <= 50: 98 | r['id'] += 10 99 | elif 50 < r['id'] <= 60: 100 | r['id'] -= 50 101 | for r in records: 102 | if r['id'] == 7: 103 | r['id'] = 1 104 | elif r['id'] < 7: 105 | r['id'] += 1 106 | 107 | records.sort(key=lambda x: x['id']) 108 | 109 | # Write to file 110 | with open('webpage/data.json', 'w') as f: 111 | json.dump({'questions': records, 'models': models}, f, indent=2) 112 | -------------------------------------------------------------------------------- /Eval/mmagibench.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import io 3 | import random 4 | 5 | import pandas as pd 6 | # from mmengine.dataset import Compose 7 | from PIL import Image 8 | from torch.utils.data import Dataset 9 | 10 | 11 | def decode_base64_to_image(base64_string): 12 | image_data = base64.b64decode(base64_string) 13 | image = Image.open(io.BytesIO(image_data)) 14 | return image 15 | 16 | class MMAGIBenchDataset(Dataset): 17 | def __init__(self, 18 | data_file, 19 | sys_prompt='There are several options:'): 20 | self.df = pd.read_csv(data_file, sep='\t') 21 | # self.pipeline = Compose(pipeline) 22 | self.sys_prompt = sys_prompt 23 | 24 | def __len__(self): 25 | return len(self.df) 26 | 27 | def __getitem__(self, idx): 28 | index = self.df.iloc[idx]['index'] 29 | image = self.df.iloc[idx]['image'] 30 | image = decode_base64_to_image(image) 31 | question = self.df.iloc[idx]['question'] 32 | answer = self.df.iloc[idx]['answer'] if 'answer' in self.df.iloc[0].keys() else None 33 | catetory = self.df.iloc[idx]['category'] 34 | l2_catetory = self.df.iloc[idx]['l2-category'] 35 | 36 | option_candidate = ['A', 'B', 'C', 'D', 'E'] 37 | options = { 38 | cand: self.load_from_df(idx, cand) 39 | for cand in option_candidate 40 | if self.load_from_df(idx, cand) is not None 41 | } 42 | options_prompt = f'{self.sys_prompt}\n' 43 | for key, item in options.items(): 44 | options_prompt += f'{key}. {item}\n' 45 | 46 | # trim last \n 47 | options_prompt = options_prompt[:-1] 48 | hint = self.load_from_df(idx, 'hint') 49 | data = { 50 | 'img': image, 51 | 'question': question, 52 | 'answer': answer, 53 | 'options': options_prompt, 54 | 'category': catetory, 55 | 'l2-category': l2_catetory, 56 | 'options_dict': options, 57 | 'index': index, 58 | 'context': hint, 59 | } 60 | # data = self.pipeline(data) 61 | return data 62 | 63 | def load_from_df(self, idx, key): 64 | if key in self.df.iloc[idx] and not pd.isna(self.df.iloc[idx][key]): 65 | return self.df.iloc[idx][key] 66 | else: 67 | return None 68 | 69 | -------------------------------------------------------------------------------- /Eval/model_qa.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from transformers import AutoTokenizer, AutoModelForCausalLM, StoppingCriteria 3 | import torch 4 | import os 5 | import json 6 | from tqdm import tqdm 7 | import shortuuid 8 | 9 | from llava.conversation import default_conversation 10 | from llava.utils import disable_torch_init 11 | 12 | 13 | # new stopping implementation 14 | class KeywordsStoppingCriteria(StoppingCriteria): 15 | def __init__(self, keywords, tokenizer, input_ids): 16 | self.keywords = keywords 17 | self.tokenizer = tokenizer 18 | self.start_len = None 19 | self.input_ids = input_ids 20 | 21 | def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool: 22 | if self.start_len is None: 23 | self.start_len = self.input_ids.shape[1] 24 | else: 25 | outputs = self.tokenizer.batch_decode(output_ids[:, self.start_len:], skip_special_tokens=True)[0] 26 | for keyword in self.keywords: 27 | if keyword in outputs: 28 | return True 29 | return False 30 | 31 | 32 | @torch.inference_mode() 33 | def eval_model(model_name, questions_file, answers_file): 34 | # Model 35 | disable_torch_init() 36 | model_name = os.path.expanduser(model_name) 37 | tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False) 38 | model = AutoModelForCausalLM.from_pretrained(model_name, 39 | torch_dtype=torch.float16).cuda() 40 | 41 | 42 | ques_file = open(os.path.expanduser(questions_file), "r") 43 | ans_file = open(os.path.expanduser(answers_file), "w") 44 | for i, line in enumerate(tqdm(ques_file)): 45 | idx = json.loads(line)["question_id"] 46 | qs = json.loads(line)["text"] 47 | cat = json.loads(line)["category"] 48 | conv = default_conversation.copy() 49 | conv.append_message(conv.roles[0], qs) 50 | prompt = conv.get_prompt() 51 | inputs = tokenizer([prompt]) 52 | input_ids = torch.as_tensor(inputs.input_ids).cuda() 53 | stopping_criteria = KeywordsStoppingCriteria([conv.sep], tokenizer, input_ids) 54 | output_ids = model.generate( 55 | input_ids, 56 | do_sample=True, 57 | use_cache=True, 58 | temperature=0.7, 59 | max_new_tokens=1024, 60 | stopping_criteria=[stopping_criteria]) 61 | outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0] 62 | try: 63 | index = outputs.index(conv.sep, len(prompt)) 64 | except ValueError: 65 | outputs += conv.sep 66 | index = outputs.index(conv.sep, len(prompt)) 67 | 68 | outputs = outputs[len(prompt) + len(conv.roles[1]) + 2:index].strip() 69 | ans_id = shortuuid.uuid() 70 | ans_file.write(json.dumps({"question_id": idx, 71 | "text": outputs, 72 | "answer_id": ans_id, 73 | "model_id": model_name, 74 | "metadata": {}}) + "\n") 75 | ans_file.flush() 76 | ans_file.close() 77 | 78 | if __name__ == "__main__": 79 | parser = argparse.ArgumentParser() 80 | parser.add_argument("--model-name", type=str, default="facebook/opt-350m") 81 | parser.add_argument("--question-file", type=str, default="tables/question.jsonl") 82 | parser.add_argument("--answers-file", type=str, default="answer.jsonl") 83 | args = parser.parse_args() 84 | 85 | eval_model(args.model_name, args.question_file, args.answers_file) 86 | -------------------------------------------------------------------------------- /Eval/model_vqa.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import os 4 | import json 5 | from tqdm import tqdm 6 | import shortuuid 7 | 8 | from llava.constants import ( 9 | IMAGE_TOKEN_INDEX, 10 | DEFAULT_IMAGE_TOKEN, 11 | DEFAULT_IM_START_TOKEN, 12 | DEFAULT_IM_END_TOKEN, 13 | DEFAULT_IMAGE_PATCH_TOKEN, 14 | ) 15 | from llava.conversation import conv_templates, SeparatorStyle 16 | from llava.model.builder import load_pretrained_model 17 | from llava.utils import disable_torch_init 18 | from llava.mm_utils import ( 19 | tokenizer_image_token, 20 | get_model_name_from_path, 21 | KeywordsStoppingCriteria, 22 | ) 23 | from llava.model import * 24 | from PIL import Image 25 | import math 26 | from peft import PeftModel 27 | 28 | from transformers import ( 29 | AutoTokenizer, 30 | BitsAndBytesConfig, 31 | ) 32 | 33 | 34 | def split_list(lst, n): 35 | """Split a list into n (roughly) equal-sized chunks""" 36 | chunk_size = math.ceil(len(lst) / n) # integer division 37 | return [lst[i : i + chunk_size] for i in range(0, len(lst), chunk_size)] 38 | 39 | 40 | def get_chunk(lst, n, k): 41 | chunks = split_list(lst, n) 42 | return chunks[k] 43 | 44 | 45 | def eval_model(args): 46 | # Model 47 | disable_torch_init() 48 | model_path = os.path.expanduser(args.model_path) 49 | model_name = get_model_name_from_path(model_path) 50 | compute_dtype = torch.float16 51 | if args.use_qlora: 52 | tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False) 53 | 54 | bits = 16 55 | dtype = torch.bfloat16 56 | compute_dtype = torch.bfloat16 57 | 58 | model = LlavaLlamaForCausalLM.from_pretrained( 59 | model_path, 60 | device_map={"": "cuda:0"}, 61 | torch_dtype=dtype, 62 | load_in_4bit=(bits == 4), 63 | load_in_8bit=(bits == 8), 64 | quantization_config=BitsAndBytesConfig( 65 | load_in_4bit=(bits == 4), 66 | load_in_8bit=(bits == 8), 67 | llm_int8_threshold=6.0, 68 | llm_int8_skip_modules=["mm_projector", "lm_head"], 69 | llm_int8_has_fp16_weight=False, 70 | bnb_4bit_compute_dtype=compute_dtype, 71 | bnb_4bit_use_double_quant=True, 72 | bnb_4bit_quant_type="nf4", 73 | ), 74 | ) 75 | model = PeftModel.from_pretrained( 76 | model, 77 | args.qlora_path, 78 | ) 79 | 80 | mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False) 81 | mm_use_im_patch_token = getattr(model.config, "mm_use_im_patch_token", True) 82 | if mm_use_im_patch_token: 83 | tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True) 84 | if mm_use_im_start_end: 85 | tokenizer.add_tokens( 86 | [DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True 87 | ) 88 | model.resize_token_embeddings(len(tokenizer)) 89 | 90 | vision_tower = model.get_vision_tower() 91 | if not vision_tower.is_loaded: 92 | vision_tower.load_model() 93 | vision_tower.to(device="cuda", dtype=compute_dtype) 94 | image_processor = vision_tower.image_processor 95 | else: 96 | tokenizer, model, image_processor, context_len = load_pretrained_model( 97 | model_path, args.model_base, model_name 98 | ) 99 | 100 | questions = [ 101 | json.loads(q) for q in open(os.path.expanduser(args.question_file), "r") 102 | ] 103 | questions = get_chunk(questions, args.num_chunks, args.chunk_idx) 104 | answers_file = os.path.expanduser(args.answers_file) 105 | os.makedirs(os.path.dirname(answers_file), exist_ok=True) 106 | ans_file = open(answers_file, "w") 107 | for line in tqdm(questions): 108 | idx = line["question_id"] 109 | image_file = line["image"] 110 | # image_file = 'COCO_val2014_' + image_file 111 | qs = line["text"] 112 | cur_prompt = qs 113 | if model.config.mm_use_im_start_end: 114 | qs = ( 115 | DEFAULT_IM_START_TOKEN 116 | + DEFAULT_IMAGE_TOKEN 117 | + DEFAULT_IM_END_TOKEN 118 | + "\n" 119 | + qs 120 | ) 121 | else: 122 | qs = DEFAULT_IMAGE_TOKEN + "\n" + qs 123 | if args.test_prompt: 124 | qs += args.test_prompt 125 | conv = conv_templates[args.conv_mode].copy() 126 | conv.append_message(conv.roles[0], qs) 127 | conv.append_message(conv.roles[1], None) 128 | prompt = conv.get_prompt() 129 | 130 | input_ids = ( 131 | tokenizer_image_token( 132 | prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt" 133 | ) 134 | .unsqueeze(0) 135 | .cuda() 136 | ) 137 | 138 | image = Image.open(os.path.join(args.image_folder, image_file)) 139 | if args.image_aspect_ratio == "pad": 140 | image = image.convert("RGB") 141 | 142 | def expand2square(pil_img, background_color): 143 | # print(background_color) 144 | width, height = pil_img.size 145 | if width == height: 146 | return pil_img 147 | elif width > height: 148 | result = Image.new(pil_img.mode, (width, width), background_color) 149 | result.paste(pil_img, (0, (width - height) // 2)) 150 | return result 151 | else: 152 | result = Image.new(pil_img.mode, (height, height), background_color) 153 | result.paste(pil_img, ((height - width) // 2, 0)) 154 | return result 155 | 156 | image = expand2square( 157 | image, tuple(int(x * 255) for x in image_processor.image_mean) 158 | ) 159 | image_tensor = image_processor.preprocess(image, return_tensors="pt")[ 160 | "pixel_values" 161 | ][0] 162 | 163 | stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2 164 | keywords = [stop_str] 165 | stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids) 166 | 167 | model.config.use_cache = True 168 | model.config.cache_shape = (2048,) 169 | with torch.inference_mode(): 170 | output_ids = model.generate( 171 | input_ids=input_ids, 172 | images=image_tensor.unsqueeze(0).to(dtype=compute_dtype).cuda(), 173 | do_sample=True if args.temperature > 0 else False, 174 | temperature=args.temperature if args.temperature > 0 else 1.0, 175 | top_p=args.top_p, 176 | num_beams=args.num_beams, 177 | # no_repeat_ngram_size=3, 178 | max_new_tokens=64 if args.short_eval else 1024, 179 | # stopping_criteria=[stopping_criteria], 180 | use_cache=True, 181 | ) 182 | 183 | input_token_len = input_ids.shape[1] 184 | n_diff_input_output = ( 185 | (input_ids != output_ids[:, :input_token_len]).sum().item() 186 | ) 187 | if n_diff_input_output > 0: 188 | print( 189 | f"[Warning] {n_diff_input_output} output_ids are not the same as the input_ids" 190 | ) 191 | outputs = tokenizer.batch_decode( 192 | output_ids[:, input_token_len:], skip_special_tokens=True 193 | )[0] 194 | outputs = outputs.strip() 195 | if outputs.endswith(stop_str): 196 | outputs = outputs[: -len(stop_str)] 197 | outputs = outputs.strip() 198 | 199 | ans_id = shortuuid.uuid() 200 | ans_file.write( 201 | json.dumps( 202 | { 203 | "question_id": idx, 204 | "prompt": cur_prompt, 205 | "text": outputs, 206 | "answer_id": ans_id, 207 | "model_id": model_name, 208 | "metadata": {}, 209 | } 210 | ) 211 | + "\n" 212 | ) 213 | ans_file.flush() 214 | ans_file.close() 215 | 216 | 217 | if __name__ == "__main__": 218 | parser = argparse.ArgumentParser() 219 | parser.add_argument("--model-path", type=str, default="facebook/opt-350m") 220 | parser.add_argument("--model-base", type=str, default=None) 221 | parser.add_argument("--image-folder", type=str, default="") 222 | parser.add_argument("--question-file", type=str, default="tables/question.jsonl") 223 | parser.add_argument("--answers-file", type=str, default="answer.jsonl") 224 | parser.add_argument("--conv-mode", type=str, default="llava_v1") 225 | parser.add_argument("--num-chunks", type=int, default=1) 226 | parser.add_argument("--chunk-idx", type=int, default=0) 227 | parser.add_argument("--temperature", type=float, default=0.2) 228 | parser.add_argument("--top_p", type=float, default=None) 229 | parser.add_argument("--num_beams", type=int, default=1) 230 | parser.add_argument("--use-qlora", type=bool, default=False) 231 | parser.add_argument("--qlora-path", type=str, default="") 232 | parser.add_argument("--short_eval", type=bool, default=False) 233 | parser.add_argument("--image_aspect_ratio", type=str, default="pad") 234 | parser.add_argument( 235 | "--test-prompt", 236 | type=str, 237 | default="\nAnswer the question using a single word or phrase.", 238 | ) 239 | args = parser.parse_args() 240 | 241 | if os.path.exists(args.answers_file): 242 | print(f"{args.answers_file} already exists. Please delete it first.") 243 | exit(1) 244 | eval_model(args) 245 | -------------------------------------------------------------------------------- /Eval/model_vqa_ds.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import os 4 | import json 5 | from tqdm import tqdm 6 | import shortuuid 7 | 8 | from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN 9 | from llava.conversation import conv_templates, SeparatorStyle 10 | from llava.model.builder import load_pretrained_model 11 | from llava.utils import disable_torch_init 12 | from llava.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria 13 | 14 | import deepspeed 15 | 16 | from PIL import Image 17 | import math 18 | 19 | 20 | def split_list(lst, n): 21 | """Split a list into n (roughly) equal-sized chunks""" 22 | chunk_size = math.ceil(len(lst) / n) # integer division 23 | return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)] 24 | 25 | 26 | def get_chunk(lst, n, k): 27 | chunks = split_list(lst, n) 28 | return chunks[k] 29 | 30 | 31 | def eval_model(args): 32 | # Model 33 | disable_torch_init() 34 | model_path = os.path.expanduser(args.model_path) 35 | model_name = get_model_name_from_path(model_path) 36 | tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name, device_map=None) 37 | vision_tower = model.model.vision_tower 38 | model.model.vision_tower = None 39 | model = deepspeed.init_inference(model, mp_size=1, dtype=torch.half, replace_with_kernel_inject=True) 40 | model.module.model.vision_tower = vision_tower 41 | questions = [json.loads(q) for q in open(os.path.expanduser(args.question_file), "r")] 42 | questions = get_chunk(questions, args.num_chunks, args.chunk_idx) 43 | answers_file = os.path.expanduser(args.answers_file) 44 | os.makedirs(os.path.dirname(answers_file), exist_ok=True) 45 | ans_file = open(answers_file, "w") 46 | for line in tqdm(questions): 47 | idx = line["question_id"] 48 | image_file = line["image"] 49 | qs = line["text"] 50 | cur_prompt = qs 51 | if model.config.mm_use_im_start_end: 52 | qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs 53 | else: 54 | qs = DEFAULT_IMAGE_TOKEN + '\n' + qs 55 | 56 | conv = conv_templates[args.conv_mode].copy() 57 | conv.append_message(conv.roles[0], qs) 58 | conv.append_message(conv.roles[1], None) 59 | prompt = conv.get_prompt() 60 | 61 | input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() 62 | 63 | image = Image.open(os.path.join(args.image_folder, image_file)) 64 | image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0] 65 | 66 | stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2 67 | keywords = [stop_str] 68 | stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids) 69 | 70 | with torch.inference_mode(): 71 | output_ids = model.generate( 72 | input_ids, 73 | images=image_tensor.unsqueeze(0).half().cuda(), 74 | do_sample=True, 75 | temperature=0.2, 76 | max_new_tokens=1024, 77 | use_cache=True, 78 | stopping_criteria=[stopping_criteria]) 79 | 80 | input_token_len = input_ids.shape[1] 81 | n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() 82 | if n_diff_input_output > 0: 83 | print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 84 | outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 85 | outputs = outputs.strip() 86 | if outputs.endswith(stop_str): 87 | outputs = outputs[:-len(stop_str)] 88 | outputs = outputs.strip() 89 | 90 | ans_id = shortuuid.uuid() 91 | ans_file.write(json.dumps({"question_id": idx, 92 | "prompt": cur_prompt, 93 | "text": outputs, 94 | "answer_id": ans_id, 95 | "model_id": model_name, 96 | "metadata": {}}) + "\n") 97 | ans_file.flush() 98 | ans_file.close() 99 | 100 | if __name__ == "__main__": 101 | parser = argparse.ArgumentParser() 102 | parser.add_argument("--model-path", type=str, default="facebook/opt-350m") 103 | parser.add_argument("--model-base", type=str, default=None) 104 | parser.add_argument("--image-folder", type=str, default="") 105 | parser.add_argument("--question-file", type=str, default="tables/question.jsonl") 106 | parser.add_argument("--answers-file", type=str, default="answer.jsonl") 107 | parser.add_argument("--conv-mode", type=str, default="llava_v1") 108 | parser.add_argument("--num-chunks", type=int, default=1) 109 | parser.add_argument("--chunk-idx", type=int, default=0) 110 | parser.add_argument("--local_rank", type=int, default=0) 111 | args = parser.parse_args() 112 | 113 | eval_model(args) 114 | -------------------------------------------------------------------------------- /Eval/model_vqa_mmhal.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import os 4 | import json 5 | from tqdm import tqdm 6 | import shortuuid 7 | 8 | from llava.constants import ( 9 | IMAGE_TOKEN_INDEX, 10 | DEFAULT_IMAGE_TOKEN, 11 | DEFAULT_IM_START_TOKEN, 12 | DEFAULT_IM_END_TOKEN, 13 | DEFAULT_IMAGE_PATCH_TOKEN, 14 | ) 15 | from llava.conversation import conv_templates, SeparatorStyle 16 | from llava.model.builder import load_pretrained_model 17 | from llava.utils import disable_torch_init 18 | from llava.mm_utils import ( 19 | tokenizer_image_token, 20 | get_model_name_from_path, 21 | KeywordsStoppingCriteria, 22 | ) 23 | from llava.model import * 24 | from PIL import Image 25 | import math 26 | from peft import PeftModel 27 | 28 | from transformers import ( 29 | AutoTokenizer, 30 | BitsAndBytesConfig, 31 | ) 32 | from datasets import load_dataset 33 | 34 | 35 | def split_list(lst, n): 36 | """Split a list into n (roughly) equal-sized chunks""" 37 | chunk_size = math.ceil(len(lst) / n) # integer division 38 | return [lst[i : i + chunk_size] for i in range(0, len(lst), chunk_size)] 39 | 40 | 41 | def get_chunk(lst, n, k): 42 | chunks = split_list(lst, n) 43 | return chunks[k] 44 | 45 | 46 | def eval_model(args): 47 | # Model 48 | disable_torch_init() 49 | model_path = os.path.expanduser(args.model_path) 50 | model_name = 'LLaVa-RLHF_' + get_model_name_from_path(model_path) 51 | compute_dtype = torch.float16 52 | if args.use_qlora: 53 | tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False) 54 | 55 | bits = 16 56 | dtype = torch.bfloat16 57 | compute_dtype = torch.bfloat16 58 | 59 | model = LlavaLlamaForCausalLM.from_pretrained( 60 | model_path, 61 | device_map={"": "cuda:0"}, 62 | torch_dtype=dtype, 63 | load_in_4bit=(bits == 4), 64 | load_in_8bit=(bits == 8), 65 | quantization_config=BitsAndBytesConfig( 66 | load_in_4bit=(bits == 4), 67 | load_in_8bit=(bits == 8), 68 | llm_int8_threshold=6.0, 69 | llm_int8_skip_modules=["mm_projector", "lm_head"], 70 | llm_int8_has_fp16_weight=False, 71 | bnb_4bit_compute_dtype=compute_dtype, 72 | bnb_4bit_use_double_quant=True, 73 | bnb_4bit_quant_type="nf4", 74 | ), 75 | ) 76 | model = PeftModel.from_pretrained( 77 | model, 78 | args.qlora_path, 79 | ) 80 | 81 | mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False) 82 | mm_use_im_patch_token = getattr(model.config, "mm_use_im_patch_token", True) 83 | if mm_use_im_patch_token: 84 | tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True) 85 | if mm_use_im_start_end: 86 | tokenizer.add_tokens( 87 | [DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True 88 | ) 89 | model.resize_token_embeddings(len(tokenizer)) 90 | 91 | vision_tower = model.get_vision_tower() 92 | if not vision_tower.is_loaded: 93 | vision_tower.load_model() 94 | vision_tower.to(device="cuda", dtype=compute_dtype) 95 | image_processor = vision_tower.image_processor 96 | else: 97 | tokenizer, model, image_processor, context_len = load_pretrained_model( 98 | model_path, args.model_base, model_name 99 | ) 100 | 101 | dataset = load_dataset("Shengcao1006/MMHal-Bench")['test'] 102 | answers_file = os.path.expanduser(args.answers_file) 103 | os.makedirs(os.path.dirname(answers_file), exist_ok=True) 104 | ans_file = open(answers_file, "w") 105 | record = [] 106 | for line in tqdm(dataset): 107 | # use the pre-downloaded images 108 | qs = line["question"] 109 | cur_prompt = qs 110 | if model.config.mm_use_im_start_end: 111 | qs = ( 112 | DEFAULT_IM_START_TOKEN 113 | + DEFAULT_IMAGE_TOKEN 114 | + DEFAULT_IM_END_TOKEN 115 | + "\n" 116 | + qs 117 | ) 118 | else: 119 | qs = DEFAULT_IMAGE_TOKEN + "\n" + qs 120 | if args.test_prompt: 121 | qs += args.test_prompt 122 | conv = conv_templates[args.conv_mode].copy() 123 | conv.append_message(conv.roles[0], qs) 124 | conv.append_message(conv.roles[1], None) 125 | prompt = conv.get_prompt() 126 | 127 | input_ids = ( 128 | tokenizer_image_token( 129 | prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt" 130 | ) 131 | .unsqueeze(0) 132 | .cuda() 133 | ) 134 | 135 | image = Image.open(line['image_path']) 136 | if args.image_aspect_ratio == 'pad': 137 | image = image.convert('RGB') 138 | def expand2square(pil_img, background_color): 139 | # print(background_color) 140 | width, height = pil_img.size 141 | if width == height: 142 | return pil_img 143 | elif width > height: 144 | result = Image.new(pil_img.mode, (width, width), background_color) 145 | result.paste(pil_img, (0, (width - height) // 2)) 146 | return result 147 | else: 148 | result = Image.new(pil_img.mode, (height, height), background_color) 149 | result.paste(pil_img, ((height - width) // 2, 0)) 150 | return result 151 | image = expand2square(image, tuple(int(x*255) for x in image_processor.image_mean)) 152 | image_tensor = image_processor.preprocess(image, return_tensors="pt")[ 153 | "pixel_values" 154 | ][0] 155 | 156 | stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2 157 | keywords = [stop_str] 158 | stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids) 159 | 160 | model.config.use_cache = True 161 | model.config.cache_shape = (2048,) 162 | with torch.inference_mode(): 163 | output_ids = model.generate( 164 | input_ids=input_ids, 165 | images=image_tensor.unsqueeze(0).to(dtype=compute_dtype).cuda(), 166 | do_sample=True if args.temperature > 0 else False, 167 | temperature=args.temperature if args.temperature > 0 else 1.0, 168 | top_p=args.top_p, 169 | num_beams=args.num_beams, 170 | # no_repeat_ngram_size=3, 171 | max_new_tokens=64 if args.short_eval else 1024, 172 | # stopping_criteria=[stopping_criteria], 173 | use_cache=True, 174 | ) 175 | 176 | input_token_len = input_ids.shape[1] 177 | n_diff_input_output = ( 178 | (input_ids != output_ids[:, :input_token_len]).sum().item() 179 | ) 180 | if n_diff_input_output > 0: 181 | print( 182 | f"[Warning] {n_diff_input_output} output_ids are not the same as the input_ids" 183 | ) 184 | outputs = tokenizer.batch_decode( 185 | output_ids[:, input_token_len:], skip_special_tokens=True 186 | )[0] 187 | outputs = outputs.strip() 188 | if outputs.endswith(stop_str): 189 | outputs = outputs[: -len(stop_str)] 190 | outputs = outputs.strip() 191 | line["model_answer"] = outputs 192 | record.append(line) 193 | 194 | json.dump(record, ans_file, indent=2) 195 | ans_file.close() 196 | 197 | 198 | if __name__ == "__main__": 199 | parser = argparse.ArgumentParser() 200 | parser.add_argument("--model-path", type=str, default="facebook/opt-350m") 201 | parser.add_argument("--model-base", type=str, default=None) 202 | parser.add_argument("--answers-file", type=str, default="answer.jsonl") 203 | parser.add_argument("--conv-mode", type=str, default="llava_v1") 204 | parser.add_argument("--num-chunks", type=int, default=1) 205 | parser.add_argument("--chunk-idx", type=int, default=0) 206 | parser.add_argument("--temperature", type=float, default=0.0) 207 | parser.add_argument("--top_p", type=float, default=None) 208 | parser.add_argument("--num_beams", type=int, default=1) 209 | parser.add_argument("--use-qlora", type=bool, default=False) 210 | parser.add_argument("--qlora-path", type=str, default="") 211 | parser.add_argument("--short_eval", type=bool, default=False) 212 | parser.add_argument("--image_aspect_ratio", type=str, default='pad') 213 | parser.add_argument("--test-prompt", type=str, default='\nAnswer the question using a single word or phrase.') 214 | args = parser.parse_args() 215 | 216 | if os.path.exists(args.answers_file): 217 | print(f"{args.answers_file} already exists. Please delete it first.") 218 | exit(1) 219 | eval_model(args) 220 | -------------------------------------------------------------------------------- /Eval/model_vqa_science.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import os 4 | import json 5 | from tqdm import tqdm 6 | import shortuuid 7 | 8 | from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN 9 | from llava.conversation import conv_templates, SeparatorStyle 10 | from llava.model.builder import load_pretrained_model 11 | from llava.utils import disable_torch_init 12 | from llava.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria 13 | 14 | from PIL import Image 15 | import math 16 | 17 | 18 | def split_list(lst, n): 19 | """Split a list into n (roughly) equal-sized chunks""" 20 | chunk_size = math.ceil(len(lst) / n) # integer division 21 | return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)] 22 | 23 | 24 | def get_chunk(lst, n, k): 25 | chunks = split_list(lst, n) 26 | return chunks[k] 27 | 28 | 29 | def eval_model(args): 30 | # Model 31 | disable_torch_init() 32 | model_path = os.path.expanduser(args.model_path) 33 | model_name = get_model_name_from_path(model_path) 34 | tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name) 35 | 36 | questions = json.load(open(os.path.expanduser(args.question_file), "r")) 37 | questions = get_chunk(questions, args.num_chunks, args.chunk_idx) 38 | answers_file = os.path.expanduser(args.answers_file) 39 | os.makedirs(os.path.dirname(answers_file), exist_ok=True) 40 | ans_file = open(answers_file, "w") 41 | for i, line in enumerate(tqdm(questions)): 42 | idx = line["id"] 43 | question = line['conversations'][0] 44 | gt_ans = line["conversations"][1] 45 | qs = question['value'].replace('', '').strip() 46 | cur_prompt = qs 47 | 48 | if 'image' in line: 49 | image_file = line["image"] 50 | image = Image.open(os.path.join(args.image_folder, image_file)) 51 | image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0] 52 | images = image_tensor.unsqueeze(0).half().cuda() 53 | if getattr(model.config, 'mm_use_im_start_end', False): 54 | qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs 55 | else: 56 | qs = DEFAULT_IMAGE_TOKEN + '\n' + qs 57 | cur_prompt = '' + '\n' + cur_prompt 58 | else: 59 | images = None 60 | 61 | conv = conv_templates[args.conv_mode].copy() 62 | conv.append_message(conv.roles[0], qs) 63 | conv.append_message(conv.roles[1], None) 64 | prompt = conv.get_prompt() 65 | 66 | input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() 67 | 68 | stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2 69 | keywords = [stop_str] 70 | stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids) 71 | 72 | with torch.inference_mode(): 73 | output_ids = model.generate( 74 | input_ids, 75 | images=images, 76 | do_sample=True, 77 | temperature=0.2, 78 | max_new_tokens=1024, 79 | use_cache=True, 80 | stopping_criteria=[stopping_criteria]) 81 | 82 | input_token_len = input_ids.shape[1] 83 | n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() 84 | if n_diff_input_output > 0: 85 | print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 86 | outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 87 | outputs = outputs.strip() 88 | if outputs.endswith(stop_str): 89 | outputs = outputs[:-len(stop_str)] 90 | outputs = outputs.strip() 91 | 92 | # prompt for answer 93 | if args.answer_prompter: 94 | outputs_reasoning = outputs 95 | input_ids = tokenizer_image_token(prompt + outputs_reasoning + ' ###\nANSWER:', tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() 96 | 97 | with torch.inference_mode(): 98 | output_ids = model.generate( 99 | input_ids, 100 | images=images, 101 | do_sample=True, 102 | temperature=0.2, 103 | max_new_tokens=64, 104 | use_cache=True, 105 | stopping_criteria=[stopping_criteria]) 106 | 107 | input_token_len = input_ids.shape[1] 108 | n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() 109 | if n_diff_input_output > 0: 110 | print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 111 | outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 112 | outputs = outputs.strip() 113 | if outputs.endswith(stop_str): 114 | outputs = outputs[:-len(stop_str)] 115 | outputs = outputs.strip() 116 | outputs = outputs_reasoning + '\n The answer is ' + outputs 117 | 118 | ans_id = shortuuid.uuid() 119 | ans_file.write(json.dumps({"question_id": idx, 120 | "prompt": cur_prompt, 121 | "text": outputs, 122 | "answer_id": ans_id, 123 | "model_id": model_name, 124 | "metadata": {}}) + "\n") 125 | ans_file.flush() 126 | ans_file.close() 127 | 128 | if __name__ == "__main__": 129 | parser = argparse.ArgumentParser() 130 | parser.add_argument("--model-path", type=str, default="facebook/opt-350m") 131 | parser.add_argument("--model-base", type=str, default=None) 132 | parser.add_argument("--image-folder", type=str, default="") 133 | parser.add_argument("--question-file", type=str, default="tables/question.json") 134 | parser.add_argument("--answers-file", type=str, default="answer.jsonl") 135 | parser.add_argument("--conv-mode", type=str, default="llava_v0") 136 | parser.add_argument("--num-chunks", type=int, default=1) 137 | parser.add_argument("--chunk-idx", type=int, default=0) 138 | parser.add_argument("--answer-prompter", action="store_true") 139 | args = parser.parse_args() 140 | 141 | eval_model(args) 142 | -------------------------------------------------------------------------------- /Eval/qa_baseline_gpt35.py: -------------------------------------------------------------------------------- 1 | """Generate answers with GPT-3.5""" 2 | # Note: you need to be using OpenAI Python v0.27.0 for the code below to work 3 | import argparse 4 | import json 5 | import os 6 | import time 7 | import concurrent.futures 8 | 9 | import openai 10 | import tqdm 11 | import shortuuid 12 | 13 | MODEL = 'gpt-3.5-turbo' 14 | MODEL_ID = 'gpt-3.5-turbo:20230327' 15 | 16 | def get_answer(question_id: int, question: str, max_tokens: int): 17 | ans = { 18 | 'answer_id': shortuuid.uuid(), 19 | 'question_id': question_id, 20 | 'model_id': MODEL_ID, 21 | } 22 | for _ in range(3): 23 | try: 24 | response = openai.ChatCompletion.create( 25 | model=MODEL, 26 | messages=[{ 27 | 'role': 'system', 28 | 'content': 'You are a helpful assistant.' 29 | }, { 30 | 'role': 'user', 31 | 'content': question, 32 | }], 33 | max_tokens=max_tokens, 34 | ) 35 | ans['text'] = response['choices'][0]['message']['content'] 36 | return ans 37 | except Exception as e: 38 | print('[ERROR]', e) 39 | ans['text'] = '#ERROR#' 40 | time.sleep(1) 41 | return ans 42 | 43 | 44 | if __name__ == '__main__': 45 | parser = argparse.ArgumentParser(description='ChatGPT answer generation.') 46 | parser.add_argument('-q', '--question') 47 | parser.add_argument('-o', '--output') 48 | parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output') 49 | args = parser.parse_args() 50 | 51 | questions_dict = {} 52 | with open(os.path.expanduser(args.question)) as f: 53 | for line in f: 54 | if not line: 55 | continue 56 | q = json.loads(line) 57 | questions_dict[q['question_id']] = q['text'] 58 | 59 | answers = [] 60 | 61 | with concurrent.futures.ThreadPoolExecutor(max_workers=32) as executor: 62 | futures = [] 63 | for qid, question in questions_dict.items(): 64 | future = executor.submit(get_answer, qid, question, args.max_tokens) 65 | futures.append(future) 66 | 67 | for future in tqdm.tqdm(concurrent.futures.as_completed(futures), total=len(futures)): 68 | answers.append(future.result()) 69 | 70 | answers.sort(key=lambda x: x['question_id']) 71 | 72 | with open(os.path.expanduser(args.output), 'w') as f: 73 | table = [json.dumps(ans) for ans in answers] 74 | f.write('\n'.join(table)) 75 | -------------------------------------------------------------------------------- /Eval/run_llava.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | 4 | from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN 5 | from llava.conversation import conv_templates, SeparatorStyle 6 | from llava.model.builder import load_pretrained_model 7 | from llava.utils import disable_torch_init 8 | from llava.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria 9 | 10 | from PIL import Image 11 | 12 | import requests 13 | from PIL import Image 14 | from io import BytesIO 15 | 16 | 17 | def load_image(image_file): 18 | if image_file.startswith('http') or image_file.startswith('https'): 19 | response = requests.get(image_file) 20 | image = Image.open(BytesIO(response.content)).convert('RGB') 21 | else: 22 | image = Image.open(image_file).convert('RGB') 23 | return image 24 | 25 | 26 | def eval_model(args): 27 | # Model 28 | disable_torch_init() 29 | 30 | model_name = get_model_name_from_path(args.model_path) 31 | tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_path, args.model_base, model_name) 32 | 33 | qs = args.query 34 | if model.config.mm_use_im_start_end: 35 | qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs 36 | else: 37 | qs = DEFAULT_IMAGE_TOKEN + '\n' + qs 38 | 39 | if "v1" in model_name.lower(): 40 | conv_mode = "llava_v1" 41 | elif "mpt" in model_name.lower(): 42 | conv_mode = "mpt" 43 | else: 44 | conv_mode = "llava_v0" 45 | 46 | if args.conv_mode is not None and conv_mode != args.conv_mode: 47 | print('[WARNING] the auto inferred conversation mode is {}, while `--conv-mode` is {}, using {}'.format(conv_mode, args.conv_mode, args.conv_mode)) 48 | else: 49 | args.conv_mode = conv_mode 50 | 51 | conv = conv_templates[args.conv_mode].copy() 52 | conv.append_message(conv.roles[0], qs) 53 | conv.append_message(conv.roles[1], None) 54 | prompt = conv.get_prompt() 55 | 56 | image = load_image(args.image_file) 57 | image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'].half().cuda() 58 | 59 | input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() 60 | 61 | stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2 62 | keywords = [stop_str] 63 | stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids) 64 | 65 | with torch.inference_mode(): 66 | output_ids = model.generate( 67 | input_ids, 68 | images=image_tensor, 69 | do_sample=True, 70 | temperature=0.2, 71 | max_new_tokens=1024, 72 | use_cache=True, 73 | stopping_criteria=[stopping_criteria]) 74 | 75 | input_token_len = input_ids.shape[1] 76 | n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() 77 | if n_diff_input_output > 0: 78 | print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 79 | outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 80 | outputs = outputs.strip() 81 | if outputs.endswith(stop_str): 82 | outputs = outputs[:-len(stop_str)] 83 | outputs = outputs.strip() 84 | print(outputs) 85 | 86 | if __name__ == "__main__": 87 | parser = argparse.ArgumentParser() 88 | parser.add_argument("--model-path", type=str, default="facebook/opt-350m") 89 | parser.add_argument("--model-base", type=str, default=None) 90 | parser.add_argument("--image-file", type=str, required=True) 91 | parser.add_argument("--query", type=str, required=True) 92 | parser.add_argument("--conv-mode", type=str, default=None) 93 | args = parser.parse_args() 94 | 95 | eval_model(args) 96 | -------------------------------------------------------------------------------- /Eval/summarize_eval_pope.py: -------------------------------------------------------------------------------- 1 | import json 2 | import argparse 3 | 4 | def eval_pope(args): 5 | ans_file = args.answers_file 6 | label_file = args.label_file 7 | 8 | answers = [json.loads(q) for q in open(ans_file, 'r')] 9 | label_list = [json.loads(q)['label'] for q in open(label_file, 'r')] 10 | 11 | for answer in answers: 12 | text = answer['text'] 13 | 14 | # Only keep the first sentence 15 | if text.find('.') != -1: 16 | text = text.split('.')[0] 17 | 18 | text = text.replace(',', '') 19 | words = text.split(' ') 20 | if 'No' in words or 'not' in words or 'no' in words: 21 | answer['text'] = 'no' 22 | else: 23 | answer['text'] = 'yes' 24 | 25 | for i in range(len(label_list)): 26 | if label_list[i] == 'no': 27 | label_list[i] = 0 28 | else: 29 | label_list[i] = 1 30 | 31 | pred_list = [] 32 | for answer in answers: 33 | if answer['text'] == 'no': 34 | pred_list.append(0) 35 | else: 36 | pred_list.append(1) 37 | 38 | pos = 1 39 | neg = 0 40 | yes_ratio = pred_list.count(1) / len(pred_list) 41 | 42 | TP, TN, FP, FN = 0, 0, 0, 0 43 | for pred, label in zip(pred_list, label_list): 44 | if pred == pos and label == pos: 45 | TP += 1 46 | elif pred == pos and label == neg: 47 | FP += 1 48 | elif pred == neg and label == neg: 49 | TN += 1 50 | elif pred == neg and label == pos: 51 | FN += 1 52 | 53 | print('TP\tFP\tTN\tFN\t') 54 | print('{}\t{}\t{}\t{}'.format(TP, FP, TN, FN)) 55 | 56 | precision = float(TP) / float(TP + FP) 57 | recall = float(TP) / float(TP + FN) 58 | f1 = 2*precision*recall / (precision + recall) 59 | acc = (TP + TN) / (TP + TN + FP + FN) 60 | print('Accuracy: {}'.format(acc)) 61 | print('Precision: {}'.format(precision)) 62 | print('Recall: {}'.format(recall)) 63 | print('F1 score: {}'.format(f1)) 64 | print('Yes ratio: {}'.format(yes_ratio)) 65 | print('# Example: {}'.format(len(answers))) 66 | print('%.3f, %.3f, %.3f, %.3f, %.3f' % (f1, acc, precision, recall, yes_ratio) ) 67 | 68 | if __name__ == "__main__": 69 | parser = argparse.ArgumentParser() 70 | parser.add_argument("--label-file", type=str, default="") 71 | parser.add_argument("--answers-file", type=str, default="tables/question.jsonl") 72 | args = parser.parse_args() 73 | 74 | eval_pope(args) -------------------------------------------------------------------------------- /Eval/summarize_gpt_mmhal.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | if __name__ == '__main__': 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument('--evaluation', type=str) 7 | args = parser.parse_args() 8 | 9 | responses = json.load(open(args.evaluation, 'r')) 10 | assert(len(responses) == 96) 11 | 12 | # analyze responses 13 | scores = [] 14 | for i, response in enumerate(responses): 15 | response = response['choices'][0]['message']['content'] 16 | scores_found = [] 17 | for s in range(7): 18 | if f'rating: {s}' in response.lower(): 19 | scores_found.append(s) 20 | if len(scores_found) == 1: 21 | scores.append(scores_found[0]) 22 | else: 23 | print('Warning: multiple or zero scores found') 24 | print(i, response) 25 | scores.append(0) 26 | 27 | hallucination = [] 28 | for s in scores: 29 | if s >= 3: 30 | hallucination.append(0) 31 | else: 32 | hallucination.append(1) 33 | 34 | scores_each = [[] for _ in range(8)] 35 | # assuming order of 96 questions is not changed 36 | for i in range(96): 37 | question_type = i % 8 38 | scores_each[question_type].append(scores[i]) 39 | 40 | print('Average score: {:.2f}'.format(sum(scores) / len(scores))) 41 | print('Hallucination rate: {:.2f}'.format(sum(hallucination) / len(hallucination))) 42 | print('Average score for each question type:', ','.join([str(round(sum(scores_each[i]) / len(scores_each[i]), 2)) for i in range(8)]), flush=True) 43 | -------------------------------------------------------------------------------- /Eval/summarize_gpt_review.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from collections import defaultdict 4 | 5 | import numpy as np 6 | 7 | import argparse 8 | 9 | 10 | def parse_args(): 11 | parser = argparse.ArgumentParser(description="ChatGPT-based QA evaluation.") 12 | parser.add_argument("-d", "--dir", default=None) 13 | parser.add_argument("-f", "--files", nargs="*", default=None) 14 | parser.add_argument("-i", "--ignore", nargs="*", default=None) 15 | return parser.parse_args() 16 | 17 | 18 | if __name__ == "__main__": 19 | args = parse_args() 20 | 21 | if args.ignore is not None: 22 | args.ignore = [int(x) for x in args.ignore] 23 | 24 | if args.files is not None and len(args.files) > 0: 25 | review_files = args.files 26 | else: 27 | review_files = [ 28 | x 29 | for x in os.listdir(args.dir) 30 | if x.endswith(".json") 31 | and ( 32 | x.startswith("gpt4_text") 33 | or x.startswith("reviews_") 34 | or x.startswith("review-") 35 | ) 36 | ] 37 | 38 | for review_file in sorted(review_files): 39 | config = ( 40 | os.path.basename(review_file) 41 | .replace("gpt4_text_", "") 42 | .replace(".jsonl", "") 43 | ) 44 | scores = defaultdict(list) 45 | print(config) 46 | with open( 47 | os.path.join(args.dir, review_file) if args.dir is not None else review_file 48 | ) as f: 49 | for review_str in f: 50 | review = json.loads(review_str) 51 | if args.ignore is not None and review["question_id"] in args.ignore: 52 | continue 53 | if "category" in review: 54 | scores[review["category"]].append(review["tuple"]) 55 | scores["all"].append(review["tuple"]) 56 | else: 57 | if "tuple" in review: 58 | scores["all"].append(review["tuple"]) 59 | else: 60 | scores["all"].append(review["score"]) 61 | print_out = [] 62 | for k, v in sorted(scores.items()): 63 | stats = np.asarray(v).mean(0).tolist() 64 | stats = [round(x, 3) for x in stats] 65 | # print(k, stats, round(stats[1]/stats[0]*100, 1)) 66 | print(k, round(stats[1] / stats[0] * 100, 1)) 67 | print_out.append( str( round(stats[1] / stats[0] * 100, 1) ) ) 68 | print(",".join(print_out)) 69 | print("=================================") 70 | -------------------------------------------------------------------------------- /Eval/table/model.jsonl: -------------------------------------------------------------------------------- 1 | {"model_id": "vicuna-13b:20230322-clean-lang", "model_name": "vicuna-13b", "model_version": "20230322-clean-lang", "model_metadata": "vicuna-13b-20230322-clean-lang"} 2 | {"model_id": "alpaca-13b:v1", "model_name": "alpaca-13b", "model_version": "v1", "model_metadata": "alpaca-13b"} 3 | {"model_id": "llama-13b:v1", "model_name": "llama-13b", "model_version": "v1", "model_metadata": "hf-llama-13b"} 4 | {"model_id": "bard:20230327", "model_name": "bard", "model_version": "20230327", "model_metadata": "Google Bard 20230327"} 5 | {"model_id": "gpt-3.5-turbo:20230327", "model_name": "gpt-3.5-turbo", "model_version": "20230327", "model_metadata": "OpenAI ChatGPT gpt-3.5-turbo Chat Completion"} 6 | -------------------------------------------------------------------------------- /Eval/table/prompt.jsonl: -------------------------------------------------------------------------------- 1 | {"prompt_id": 1, "system_prompt": "You are a helpful and precise assistant for checking the quality of the answer.", "prompt_template": "[Question]\n{question}\n\n[Assistant 1]\n{answer_1}\n\n[End of Assistant 1]\n\n[Assistant 2]\n{answer_2}\n\n[End of Assistant 2]\n\n[System]\n{prompt}\n\n", "defaults": {"prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above.\nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."}, "description": "Prompt for general questions"} 2 | {"prompt_id": 2, "system_prompt": "You are a helpful and precise assistant for checking the quality of the answer.", "prompt_template": "[Question]\n{question}\n\n[Assistant 1]\n{answer_1}\n\n[End of Assistant 1]\n\n[Assistant 2]\n{answer_2}\n\n[End of Assistant 2]\n\n[System]\n{prompt}\n\n", "defaults": {"prompt": "Your task is to evaluate the coding abilities of the above two assistants. They have been asked to implement a program to solve a given problem. Please review their code submissions, paying close attention to their problem-solving approach, code structure, readability, and the inclusion of helpful comments.\n\nPlease ensure that the assistants' submissions:\n\n1. Correctly implement the given problem statement.\n2. Contain accurate and efficient code.\n3. Include clear and concise comments that explain the code's logic and functionality.\n4. Adhere to proper coding standards and best practices.\n\nOnce you have carefully reviewed both submissions, provide detailed feedback on their strengths and weaknesses, along with any suggestions for improvement. You should first output a single line containing two scores on the scale of 1-10 (1: no code/no sense; 10: perfect) for Assistant 1 and 2, respectively. Then give extra comments starting from the next line."}, "description": "Prompt for coding questions"} 3 | {"prompt_id": 3, "system_prompt": "You are a helpful and precise assistant for checking the quality of the answer.", "prompt_template": "[Question]\n{question}\n\n[Assistant 1]\n{answer_1}\n\n[End of Assistant 1]\n\n[Assistant 2]\n{answer_2}\n\n[End of Assistant 2]\n\n[System]\n{prompt}\n\n", "defaults": {"prompt": "We would like to request your feedback on the mathematical proficiency of two AI assistants regarding the given user question.\nFirstly, please solve the problem independently, without referring to the answers provided by Assistant 1 and Assistant 2.\nAfterward, please examine the problem-solving process of Assistant 1 and Assistant 2 step-by-step to ensure their correctness, identifying any incorrect steps if present. Your evaluation should take into account not only the answer but also the problem-solving steps.\nFinally, please output a Python tuple containing two numerical scores for Assistant 1 and Assistant 2, ranging from 1 to 10, respectively. If applicable, explain the reasons for any variations in their scores and determine which assistant performed better."}, "description": "Prompt for math questions"} 4 | {"prompt_id": 4, "system_prompt": "You are a helpful and precise assistant for checking the quality of the answer.", "prompt_template": "[Visual Context]\n{context}\n[Question]\n{question}\n\n[Assistant 1]\n{answer_1}\n\n[End of Assistant 1]\n\n[Assistant 2]\n{answer_2}\n\n[End of Assistant 2]\n\n[System]\n{prompt}\n\n", "defaults": {"prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question on observing an image. For your reference, the visual content in the image is represented with five descriptive sentences describing the same image and the bounding box coordinates of each object in the scene. These coordinates are in the form of bounding boxes, represented as (x1, y1, x2, y2) with floating numbers ranging from 0 to 1. These values correspond to the top left x, top left y, bottom right x, and bottom right y. \nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."}, "description": "Prompt for visual questions"} 5 | -------------------------------------------------------------------------------- /Eval/table/reviewer.jsonl: -------------------------------------------------------------------------------- 1 | {"reviewer_id": "gpt-4-0328-default", "prompt_id": 1, "metadata": {"temperature": 0.2, "max_tokens": 1024}, "description": "GPT-4 for general questions"} 2 | {"reviewer_id": "gpt-4-0328-coding", "prompt_id": 2, "metadata": {"temperature": 0.2, "max_tokens": 1024}, "description": "GPT-4 for coding questions"} 3 | {"reviewer_id": "gpt-4-0328-math", "prompt_id": 3, "metadata": {"temperature": 0.2, "max_tokens": 1024}, "description": "GPT-4 for math questions"} 4 | {"reviewer_id": "gpt-4-0417-visual", "prompt_id": 4, "metadata": {"temperature": 0.2, "max_tokens": 1024}, "description": "GPT-4 for math questions"} 5 | -------------------------------------------------------------------------------- /Eval/table/rule.json: -------------------------------------------------------------------------------- 1 | { 2 | "coding": {"role": "Assistant", "prompt": "Your task is to evaluate the coding abilities of the above two assistants. They have been asked to implement a program to solve a given problem. Please review their code submissions, paying close attention to their problem-solving approach, code structure, readability, and the inclusion of helpful comments.\n\nPlease ensure that the assistants' submissions:\n\n1. Correctly implement the given problem statement.\n2. Contain accurate and efficient code.\n3. Include clear and concise comments that explain the code's logic and functionality.\n4. Adhere to proper coding standards and best practices.\n\nOnce you have carefully reviewed both submissions, provide detailed feedback on their strengths and weaknesses, along with any suggestions for improvement. You should first output a single line containing two scores on the scale of 1-10 (1: no code/no sense; 10: perfect) for Assistant 1 and 2, respectively. Then give extra comments starting from the next line."}, 3 | "math": {"role": "Assistant", "prompt": "We would like to request your feedback on the mathematical proficiency of two AI assistants regarding the given user question.\nFirstly, please solve the problem independently, without referring to the answers provided by Assistant 1 and Assistant 2.\nAfterward, please examine the problem-solving process of Assistant 1 and Assistant 2 step-by-step to ensure their correctness, identifying any incorrect steps if present. Your evaluation should take into account not only the answer but also the problem-solving steps.\nFinally, please output a Python tuple containing two numerical scores for Assistant 1 and Assistant 2, ranging from 1 to 10, respectively. If applicable, explain the reasons for any variations in their scores and determine which assistant performed better."}, 4 | "default": {"role": "Assistant", "prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above.\nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."}, 5 | "conv": {"role": "Assistant", "prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question on observing an image. For your reference, the visual content in the image is represented with five descriptive sentences describing the same image and the bounding box coordinates of each object in the scene. These coordinates are in the form of bounding boxes, represented as (x1, y1, x2, y2) with floating numbers ranging from 0 to 1. These values correspond to the top left x, top left y, bottom right x, and bottom right y. \nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."}, 6 | "detail": {"role": "Assistant", "prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question on observing an image. For your reference, the visual content in the image is represented with five descriptive sentences describing the same image and the bounding box coordinates of each object in the scene. These coordinates are in the form of bounding boxes, represented as (x1, y1, x2, y2) with floating numbers ranging from 0 to 1. These values correspond to the top left x, top left y, bottom right x, and bottom right y. \nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."}, 7 | "complex": {"role": "Assistant", "prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question on observing an image. For your reference, the visual content in the image is represented with five descriptive sentences describing the same image and the bounding box coordinates of each object in the scene. These coordinates are in the form of bounding boxes, represented as (x1, y1, x2, y2) with floating numbers ranging from 0 to 1. These values correspond to the top left x, top left y, bottom right x, and bottom right y. \nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."}, 8 | "llava_bench_conv": {"role": "Assistant", "prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question on observing an image. For your reference, the visual content in the image is represented with a few sentences describing the image. \nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."}, 9 | "llava_bench_detail": {"role": "Assistant", "prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question on observing an image. For your reference, the visual content in the image is represented with a few sentences describing the image. \nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."}, 10 | "llava_bench_complex": {"role": "Assistant", "prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question on observing an image. For your reference, the visual content in the image is represented with a few sentences describing the image. \nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."} 11 | } -------------------------------------------------------------------------------- /Eval/webpage/figures/alpaca.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/llava-rlhf/LLaVA-RLHF/830a083fd83e607da3c35f2e6aef833523c1e722/Eval/webpage/figures/alpaca.png -------------------------------------------------------------------------------- /Eval/webpage/figures/bard.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/llava-rlhf/LLaVA-RLHF/830a083fd83e607da3c35f2e6aef833523c1e722/Eval/webpage/figures/bard.jpg -------------------------------------------------------------------------------- /Eval/webpage/figures/chatgpt.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Eval/webpage/figures/llama.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/llava-rlhf/LLaVA-RLHF/830a083fd83e607da3c35f2e6aef833523c1e722/Eval/webpage/figures/llama.jpg -------------------------------------------------------------------------------- /Eval/webpage/figures/swords_FILL0_wght300_GRAD0_opsz48.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Eval/webpage/figures/vicuna.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/llava-rlhf/LLaVA-RLHF/830a083fd83e607da3c35f2e6aef833523c1e722/Eval/webpage/figures/vicuna.jpeg -------------------------------------------------------------------------------- /Eval/webpage/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Who's GPT-4's favorite? Battles between State-of-the-Art Chatbots 7 | 8 | 9 | 10 | 11 | 12 | 13 | 32 | 33 |
34 |

Who's GPT-4's favorite? Battles between State-of-the-Art Chatbots

35 | 36 | 37 |
38 |
39 | 40 | 41 |
42 |
43 | 44 | 45 |
46 |
47 |
48 |
49 | 50 | 51 |
52 |
53 |
54 | 55 | 56 |
57 |
58 | 59 |
60 |
61 |
62 | other logo 63 |
64 |
65 |
66 |
67 | 68 | 69 |
70 |
71 |
72 |
73 | vicuna logo 74 |
75 |
76 |
77 | 78 |
79 |
80 | 81 | 82 |
83 |
84 |
85 | 86 | 87 |
88 |
89 |
90 |
91 |
92 |
93 | 94 |
95 |
96 | 97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 |
107 |
108 |
109 |
110 |
111 | Assistant #2 (Vicuna, our model) 112 |
113 |
114 |
115 |
116 |
117 |
118 |
119 |
120 |
121 |
122 | 123 | 124 |
125 |
GPT-4 Evaluation
126 |
127 |
128 |
129 |
130 |
131 |
132 |
133 |
134 | 135 |
136 |
137 | This website is co-authored with GPT-4. 138 |
139 |
140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 160 | 161 | 162 | 163 | -------------------------------------------------------------------------------- /Eval/webpage/script.js: -------------------------------------------------------------------------------- 1 | // Description: Script for the evaluation webpage. 2 | 3 | let currentQuestionIndex = 1; 4 | 5 | // Store the model name mapping for later use. 6 | modelNameMapping = { 7 | "gpt35": "ChatGPT-3.5", 8 | "gpt4": "GPT-4", 9 | "alpaca": "Alpaca-13b", 10 | "vicuna": "Vicuna-13b", 11 | "llama": "LLaMA-13b", 12 | "bard": "Bard", 13 | }; 14 | 15 | modelFigureMapping = { 16 | "vicuna": "figures/vicuna.jpeg", 17 | // Image from: https://commons.wikimedia.org/wiki/File:ChatGPT_logo.svg 18 | "gpt35": "figures/chatgpt.svg", 19 | // Image from: https://www.reddit.com/r/logodesign/comments/1128aat/google_ai_bard_logo_design/ 20 | "bard": "figures/bard.jpg", 21 | // Image from: https://crfm.stanford.edu/2023/03/13/alpaca.html 22 | "alpaca": "figures/alpaca.png", 23 | // Image adapted from https://commons.wikimedia.org/wiki/File:Llama_on_Machu_Picchu.jpg 24 | "llama": "figures/llama.jpg", 25 | } 26 | 27 | // Store the question data in a mapping for later use. 28 | questionMapping = {}; 29 | // Store the question ids in a mapping for later use. 30 | categoryMapping = {}; 31 | // Store the number of questions for later use. 32 | questionsCount = 0; 33 | 34 | 35 | function text2Markdown(text) { 36 | // Normalize the text for markdown rendering. 37 | text = text.trim().replaceAll('\n\n', '\n').replaceAll('\n', '\n\n'); 38 | return marked.parse(text); 39 | } 40 | 41 | function capitalizeFirstChar(str) { 42 | if (!str || str.length === 0) { 43 | return str; 44 | } 45 | return str.charAt(0).toUpperCase() + str.slice(1); 46 | } 47 | 48 | function updateQuestionSelect(question_id) { 49 | const select = document.getElementById('question-select'); 50 | // Clear the question select. 51 | select.innerHTML = ''; 52 | // Populate the question select. 53 | category = questionMapping[question_id].category; 54 | categoryMapping[category].forEach(question_id => { 55 | const question = questionMapping[question_id]; 56 | const option = document.createElement('option'); 57 | option.value = question_id; 58 | option.textContent = 'Q' + question_id.toString() + ': ' + question.question; 59 | select.appendChild(option); 60 | }); 61 | select.value = question_id; 62 | } 63 | 64 | function updateModelSelect() { 65 | const select = document.getElementById('model-select'); 66 | img_path = modelFigureMapping[select.value]; 67 | document.getElementById('other-model-figure').src = img_path; 68 | } 69 | 70 | function populateModels(models) { 71 | const select = document.getElementById('model-select'); 72 | models.forEach(model => { 73 | const option = document.createElement('option'); 74 | option.value = model; 75 | option.textContent = modelNameMapping[model]; 76 | select.appendChild(option); 77 | }); 78 | updateModelSelect(); 79 | } 80 | 81 | function populateQuestions(questions) { 82 | const category_select = document.getElementById('category-select'); 83 | 84 | questionsCount = questions.length; 85 | questions.forEach(question => { 86 | const option = document.createElement('option'); 87 | // Store the question data in a mapping for later use. 88 | questionMapping[question.id] = { 89 | category: question.category, 90 | question: question.question, 91 | answers: question.answers, 92 | evaluations: question.evaluations, 93 | scores: question.scores, 94 | }; 95 | // Store the question id in the category mapping. 96 | if (question.category in categoryMapping) { 97 | categoryMapping[question.category].push(question.id); 98 | } else { 99 | categoryMapping[question.category] = [question.id]; 100 | const category_option = document.createElement('option'); 101 | category_option.value = question.category; 102 | category_option.textContent = capitalizeFirstChar(question.category); 103 | category_select.appendChild(category_option); 104 | } 105 | }); 106 | // Set the default category. 107 | updateQuestionSelect(currentQuestionIndex); 108 | } 109 | 110 | function displayQuestion(index) { 111 | const question = questionMapping[index].question; 112 | document.getElementById('selected-question').innerHTML = text2Markdown('**Question:** ' + question); 113 | displayAnswers(index); 114 | } 115 | 116 | function displayAnswers(index) { 117 | const question = questionMapping[index]; 118 | const otherModel = document.getElementById('model-select').value; 119 | // render the answers with markdown 120 | document.getElementById('other-model-answer').innerHTML = text2Markdown(question.answers[otherModel]); 121 | document.getElementById('our-model-answer').innerHTML = text2Markdown(question.answers.vicuna); 122 | 123 | // Display evaluation 124 | score = question.scores[otherModel]; 125 | score_text = modelNameMapping[otherModel] + " " + score[0] + "/10, Vicuna-13b " + score[1] + "/10"; 126 | document.getElementById('evaluation-header').textContent = "GPT-4 Evaluation" + " (Score: " + score_text + ")"; 127 | document.getElementById('evaluation-result').innerHTML = text2Markdown(question.evaluations[otherModel]); 128 | 129 | // Update model names 130 | let assistant1_title = "Assistant #1"; // (" + modelNameMapping[otherModel] + ")"; 131 | let assistant2_title = "Assistant #2 (Vicuna-13b, our model)"; 132 | // Update scores/labels. 133 | let assistant1_score_label = score[0].toString() + '/10'; 134 | let assistant2_score_label = score[1].toString() + '/10'; 135 | 136 | const colorRed ='#fa9'; // '#eb978d'; 137 | // const colorGreen = '#c9f2c9'; 138 | const colorBlue = '#8ef'; // '#71dbf9'; 139 | const colorYellow = '#fe7'; // '#fada57'; 140 | let otherModelHeaderColor = ''; 141 | let ourModelHeaderColor = ''; 142 | // Update the winner. 143 | if (score[0] == score[1]) { 144 | assistant1_title = '🏆 ' + assistant1_title; 145 | assistant1_score_label = '🏆 ' + assistant1_score_label; 146 | assistant2_title = '🏆 ' + assistant2_title; 147 | assistant2_score_label = '🏆 ' + assistant2_score_label; 148 | otherModelHeaderColor = colorYellow; 149 | ourModelHeaderColor = colorYellow; 150 | } else if (score[0] > score[1]) { 151 | assistant1_title = '🏆 ' + assistant1_title; 152 | assistant1_score_label = '🏆 ' + assistant1_score_label; 153 | otherModelHeaderColor = colorBlue; 154 | ourModelHeaderColor = colorRed; 155 | } else if (score[0] < score[1]) { 156 | assistant2_title = '🏆 ' + assistant2_title; 157 | assistant2_score_label = '🏆 ' + assistant2_score_label; 158 | otherModelHeaderColor = colorRed; 159 | ourModelHeaderColor = colorBlue; 160 | } 161 | 162 | document.getElementById('other-model-header-bg').style.backgroundColor = otherModelHeaderColor; 163 | document.getElementById('our-model-header').style.backgroundColor = ourModelHeaderColor; 164 | 165 | document.getElementById('other-model-header').textContent = assistant1_title; 166 | document.getElementById('our-model-header').textContent = assistant2_title; 167 | 168 | document.getElementById('other-score-label').textContent = assistant1_score_label; 169 | document.getElementById('our-score-label').textContent = assistant2_score_label; 170 | 171 | // Update expand buttons visibility for both cards after displaying answers 172 | // Reset the expanded state and update expand buttons visibility for both cards after displaying answers 173 | document.querySelectorAll('.expandable-card').forEach(card => { 174 | card.classList.remove('expanded'); 175 | updateExpandButtonVisibility(card); 176 | const expandBtn = card.querySelector('.expand-btn'); 177 | expandBtn.innerHTML = 'keyboard_arrow_down Show more'; // .textContent = 'Show more'; 178 | }); 179 | } 180 | 181 | document.getElementById('question-select').addEventListener('change', e => { 182 | currentQuestionIndex = parseInt(e.target.value); 183 | displayQuestion(currentQuestionIndex); 184 | }); 185 | 186 | document.getElementById('category-select').addEventListener('change', e => { 187 | let currentCategory = e.target.value; 188 | const questionIds = categoryMapping[currentCategory]; 189 | currentQuestionIndex = questionIds[0]; 190 | updateQuestionSelect(currentQuestionIndex); 191 | displayQuestion(currentQuestionIndex); 192 | }); 193 | 194 | // Update expand buttons whenever the model is changed 195 | document.getElementById('model-select').addEventListener('change', () => { 196 | displayAnswers(currentQuestionIndex); 197 | document.querySelectorAll('.expandable-card').forEach(card => { 198 | updateExpandButtonVisibility(card); 199 | }); 200 | updateModelSelect(); 201 | }); 202 | 203 | function switchQuestionAndCategory() { 204 | document.getElementById('question-select').value = currentQuestionIndex; 205 | old_category = document.getElementById('category-select').value; 206 | new_category = questionMapping[currentQuestionIndex].category; 207 | if (old_category != new_category) { 208 | document.getElementById('category-select').value = new_category; 209 | updateQuestionSelect(currentQuestionIndex); 210 | } 211 | displayQuestion(currentQuestionIndex); 212 | } 213 | 214 | document.getElementById('prev-question').addEventListener('click', () => { 215 | // Question index starts from 1. 216 | currentQuestionIndex = Math.max(1, currentQuestionIndex - 1); 217 | switchQuestionAndCategory(); 218 | }); 219 | 220 | document.getElementById('next-question').addEventListener('click', () => { 221 | // Question index starts from 1. 222 | currentQuestionIndex = Math.min(questionsCount, currentQuestionIndex + 1); 223 | switchQuestionAndCategory(); 224 | }); 225 | 226 | function updateExpandButtonVisibility(card) { 227 | const cardTextContainer = card.querySelector('.card-text-container'); 228 | const expandBtn = card.querySelector('.expand-btn'); 229 | if (cardTextContainer.scrollHeight > cardTextContainer.offsetHeight) { 230 | expandBtn.style.display = 'flex'; 231 | } else { 232 | expandBtn.style.display = 'none'; 233 | card.classList.add('expanded'); 234 | } 235 | } 236 | 237 | document.querySelectorAll('.expand-btn').forEach(btn => { 238 | btn.addEventListener('click', e => { 239 | const card = e.target.closest('.expandable-card'); 240 | card.classList.toggle('expanded'); 241 | const more = 'keyboard_arrow_down Show more'; 242 | const less = 'keyboard_arrow_up Show less'; 243 | e.target.innerHTML = card.classList.contains('expanded') ? less : more; 244 | }); 245 | }); 246 | -------------------------------------------------------------------------------- /Eval/webpage/styles.css: -------------------------------------------------------------------------------- 1 | body { 2 | font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; 3 | background-color: #f8f9fa; 4 | } 5 | 6 | .navbar-dark .navbar-nav .nav-link { 7 | color: #f1cf68; 8 | font-size: 1.1rem; 9 | padding: 0.5rem 0.6rem; 10 | } 11 | 12 | .card-header { 13 | font-weight: bold; 14 | } 15 | 16 | .card { 17 | box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1); 18 | transition: 0.3s; 19 | } 20 | 21 | .card:hover { 22 | box-shadow: 0 8px 16px rgba(0, 0, 0, 0.2); 23 | } 24 | 25 | button { 26 | transition: background-color 0.3s; 27 | } 28 | 29 | button:hover { 30 | background-color: #007bff; 31 | } 32 | 33 | @media (max-width: 767px) { 34 | .form-row .form-group { 35 | margin-bottom: 10px; 36 | } 37 | } 38 | 39 | /* Extra styles */ 40 | 41 | .expandable-card .card-text-container { 42 | max-height: 200px; 43 | overflow-y: hidden; 44 | position: relative; 45 | } 46 | 47 | .expandable-card.expanded .card-text-container { 48 | max-height: none; 49 | } 50 | 51 | .expand-btn { 52 | position: relative; 53 | display: none; 54 | background-color: rgba(255, 255, 255, 0.8); 55 | color: #510c75; 56 | border-color: transparent; 57 | } 58 | 59 | .expand-btn:hover { 60 | background-color: rgba(200, 200, 200, 0.8); 61 | text-decoration: none; 62 | border-color: transparent; 63 | color: #510c75; 64 | } 65 | 66 | .expand-btn:focus { 67 | outline: none; 68 | text-decoration: none; 69 | } 70 | 71 | .expandable-card:not(.expanded) .card-text-container:after { 72 | content: ""; 73 | position: absolute; 74 | bottom: 0; 75 | left: 0; 76 | width: 100%; 77 | height: 90px; 78 | background: linear-gradient(rgba(255, 255, 255, 0.2), rgba(255, 255, 255, 1)); 79 | } 80 | 81 | .expandable-card:not(.expanded) .expand-btn { 82 | margin-top: -40px; 83 | } 84 | 85 | .card-body { 86 | padding-bottom: 5px; 87 | } 88 | 89 | .vertical-flex-layout { 90 | justify-content: center; 91 | align-items: center; 92 | height: 100%; 93 | display: flex; 94 | flex-direction: column; 95 | gap: 5px; 96 | } 97 | 98 | .figure-img { 99 | max-width: 100%; 100 | height: auto; 101 | } 102 | 103 | .adjustable-font-size { 104 | font-size: calc(0.5rem + 2vw); 105 | } 106 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | LLaVA-RLHF Logo 3 |

Generated by DALL·E 3

4 |
5 | 6 |
7 | 8 | 9 | 10 | # LLaVA-RLHF: Aligning Large Multimodal Models with Factually Augmented RLHF 11 | 12 |
13 | 14 | [[Project Page / Demo / Model Weights](https://llava-rlhf.github.io/)] 15 | 16 | LLaVA-RLHF represents the first open-source **RLHF-trained** large multimodal model for general-purpose visual and language understanding, achieving impressive visual reasoning and perception capabilities. For comprehensive details and insights, we kindly direct you to our [project page](https://llava-rlhf.github.io/) and [paper](https://arxiv.org/abs/2309.14525). 17 | 18 | ## Inference 19 | 20 | To deploy or play with our model, please refer to [the demo directory](./demo). 21 | 22 | ## Train 23 | 24 | We propose a new alignment algorithm called **Factually Augmented RLHF (Fact-RLHF)** that augments the reward model with additional factual information such as image captions and ground-truth multi-choice options, which alleviates the reward hacking phenomenon in RLHF and further improves the performance. 25 | 26 | LLaVA-RLHF is trained on 8 A100 GPUs with 80GB memory. To train on fewer GPUs, you can reduce the `per_device_train_batch_size` and increase the `gradient_accumulation_steps` accordingly. Always keep the global batch size the same: `per_device_train_batch_size` x `gradient_accumulation_steps` x `num_gpus`. 27 | 28 | The SFT training pipeline is provided in [the SFT directory](./SFT), and the RLHF training pipeline is provided in [the RLHF directory](./RLHF). 29 | 30 | ## Examples 31 | 32 |
33 | Example 1 34 |
35 | 36 |
37 | Example 2 38 |
39 | 40 | ## Citations 41 | 42 | If you find this repo useful for your research, please consider citing the paper 43 | 44 | LLaVA-RLHF: 45 | 46 | ```bibtex 47 | @article{sun2023aligning, 48 | title={Aligning large multimodal models with factually augmented rlhf}, 49 | author={Sun, Zhiqing and Shen, Sheng and Cao, Shengcao and Liu, Haotian and Li, Chunyuan and Shen, Yikang and Gan, Chuang and Gui, Liang-Yan and Wang, Yu-Xiong and Yang, Yiming and others}, 50 | journal={arXiv preprint arXiv:2309.14525}, 51 | year={2023} 52 | } 53 | ``` 54 | 55 | LLaVA: 56 | 57 | ```bibtex 58 | @misc{liu2023llava, 59 | title={Visual Instruction Tuning}, 60 | author={Liu, Haotian and Li, Chunyuan and Wu, Qingyang and Lee, Yong Jae}, 61 | publisher={arXiv:2304.08485}, 62 | year={2023}, 63 | } 64 | ``` 65 | 66 | SALMON: 67 | 68 | ```bibtex 69 | @article{sun2023salmon, 70 | title={SALMON: Self-Alignment with Principle-Following Reward Models}, 71 | author={Sun, Zhiqing and Shen, Yikang and Zhang, Hongxin and Zhou, Qinhong and Chen, Zhenfang and Cox, David and Yang, Yiming and Gan, Chuang}, 72 | journal={arXiv preprint arXiv:2310.05910}, 73 | year={2023} 74 | } 75 | ``` 76 | 77 | ## Acknowledgements 78 | 79 | We thank [Meta LLaMA team](https://github.com/facebookresearch/llama), [Standford Alpaca team](https://github.com/tatsu-lab/stanford_alpaca), [Vicuna team](https://github.com/lm-sys/FastChat), [LLaVA team](https://github.com/haotian-liu/LLaVA), [QLoRA team](https://github.com/artidoro/qlora), [Hugging Face PEFT](https://github.com/huggingface/peft), and [AlpacaFarm team](https://github.com/tatsu-lab/alpaca_farm) for their open-source efforts in democratizing large language models. 80 | -------------------------------------------------------------------------------- /RLHF/README.md: -------------------------------------------------------------------------------- 1 | # (Factually Augmented) RL from Human Feedback 2 | 3 | This RLHF codebase is mainly adapted from the [SALMON](https://github.com/Edward-Sun/SALMON) codebase, which is adapted from [AlpacaFarm](https://github.com/tatsu-lab/alpaca_farm) and [QLoRA](https://github.com/artidoro/qlora). 4 | 5 | ## 0. Setup 6 | 7 | Please refer to [`llava_setup`](../llava_setup) for instructions on how to set up the customized llava package. 8 | 9 | Additionally, you **should** run the following command to make sure the versions of some essential packages are correct: 10 | 11 | ```bash 12 | pip install torch==2.0.1+cu118 torchvision==0.15.2+cu118 torchaudio==2.0.2 --index-url https://download.pytorch.org/whl/cu118 13 | pip install deepspeed==0.9.3 14 | pip install peft==0.4.0 15 | pip install transformers==4.31.0 16 | pip install bitsandbytes==0.41.0 17 | pip install datasets 18 | ``` 19 | 20 | **Note:** please install Pytorch 2.0.1 following the guidelines [here](https://pytorch.org/get-started/previous-versions/#v201). We found that the flash-attention implementation in the newest Pytorch Stable (2.1.0) could lead to buggy results. The codebase is tested with `torch==2.0.1+cu118`. 21 | 22 | ## 1. Training the Instruction-Following Reward Model 23 | 24 | We first train an [instruction-following reward model](https://arxiv.org/abs/2310.05910) based on the [following judging creteria](prompts/reward_prompt.txt): 25 | 26 | ```text 27 | 1. Accurate: The AI should provide factual and accurate information from the image, and refrain from making statements that are not supported by the image or inconsistent with the image. 28 | 2. Helpful: The AI’s response should precisely serve the user's needs and interests, while grounding the response in the image. 29 | 3. Language Natural: The AI should employ language that flows smoothly and is free from repetitive or awkward constructs. 30 | 4. Concise: The AI should efficiently address the task or answer the question, communicating the necessary information with brevity and clarity. 31 | ``` 32 | 33 | After downloading the SFT model checkpoint from [`LLaVA-RLHF-13b-v1.5-336`](https://huggingface.co/zhiqings/LLaVA-RLHF-13b-v1.5-336), the human preference data from [`LLaVA-Human-Preference-10K`](https://huggingface.co/datasets/zhiqings/LLaVA-Human-Preference-10K), and the image captions from [`LLaVA-RLHF-Data/image_to_caption.json`](https://huggingface.co/datasets/zhiqings/LLaVA-RLHF-Data/tree/main), you can run the training script for the reward model: 34 | 35 | ```bash 36 | bash scripts/13b-v1.5-336/train_reward_model.sh 37 | ``` 38 | 39 | **Note**: For both 7b and 13b policy models, we use the same 13b reward model. We also provide the pretrained reward model checkpoint at [`LLaVA-RLHF-13b-v1.5-336/rm_lora_adapter_model`](https://huggingface.co/zhiqings/LLaVA-RLHF-13b-v1.5-336/tree/main/rm_lora_adapter_model). To use the pretrained LoRA checkpoint, the `base_model_name_or_path` in [adapter_config.json](https://huggingface.co/zhiqings/LLaVA-RLHF-13b-v1.5-336/blob/main/rm_lora_adapter_model/adapter_config.json) need to be modified to the actual path of the [SFT model](https://huggingface.co/zhiqings/LLaVA-RLHF-13b-v1.5-336/tree/main/sft_model). 40 | 41 | ## 2. Initialize the RL Model 42 | 43 | We initialize the LoRA weights of the policy model by fine-tuning the SFT model for one epoch on the combination of: 44 | 45 | 1. Our preference modeling split of the LLaVA data (10k) 46 | 2. A-OKVQA in the CoT format (5k) 47 | 48 | We provide the processed data in [`LLaVA-RLHF-Data/llava_reward10k-aokvqa5k.json`](https://huggingface.co/datasets/zhiqings/LLaVA-RLHF-Data/tree/main). After downloading the data (and potentially the 7b SFT model checkpoint from [`LLaVA-RLHF-7b-v1.5-224`](https://huggingface.co/zhiqings/LLaVA-RLHF-7b-v1.5-224)), you can run the following script to initialize the policy model: 49 | 50 | ```bash 51 | bash scripts/7b-v1.5-224/initialize_policy_model.sh 52 | bash scripts/13b-v1.5-336/initialize_policy_model.sh 53 | ``` 54 | 55 | ## 3. Training the RL Model with PPO 56 | 57 | The PPO training of the policy model is based on the prompt combination of: 58 | 59 | 1. Our RL split of the LLaVA data (50k) 60 | 2. A-OKVQA in the CoT format (12k) 61 | 3. Yes/No Questions from VQA-v2 (10k) 62 | 63 | We provide the processed data in [`LLaVA-RLHF-Data/llava_ppo50k-aokvqa12k-vqa10k.json`](https://huggingface.co/datasets/zhiqings/LLaVA-RLHF-Data/tree/main). After downloading the data, you can run the following script to train the RL model: 64 | 65 | ```bash 66 | bash scripts/7b-v1.5-224/train_rl_model.sh 67 | bash scripts/13b-v1.5-336/train_rl_model.sh 68 | ``` 69 | -------------------------------------------------------------------------------- /RLHF/data_utils/constants.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 The LLaVA-RLHF Team 2 | # Copyright 2023 The Alpaca Team 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | # PPO Constants 17 | from enum import Enum 18 | 19 | FACTUAL_PROMPT = "Specifically, the AI's response should be fully supported by the combination of the following captions:\n" 20 | 21 | class AnswerType(Enum): 22 | GENERAL = 1 23 | A_IN_ABCD = 2 24 | B_IN_ABCD = 3 25 | C_IN_ABCD = 4 26 | D_IN_ABCD = 5 27 | NO_IN_YESNO = 6 28 | YES_IN_YESNO = 7 29 | -------------------------------------------------------------------------------- /RLHF/data_utils/data_utils_ppo.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 The LLaVA-RLHF Team 2 | # Copyright 2023 The Alpaca Team 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import dataclasses 17 | from typing import Callable, Dict, Optional, List, Sequence 18 | 19 | import logging 20 | import pandas as pd 21 | 22 | import torch 23 | from torch.utils.data import Dataset 24 | 25 | import transformers 26 | import datasets 27 | 28 | import data_utils.common_utils as utils 29 | 30 | from PIL import Image 31 | import copy 32 | import os 33 | import tqdm 34 | 35 | import data_utils.common_utils as utils 36 | from data_utils.common_utils import preprocess, preprocess_multimodal 37 | 38 | logger = logging.getLogger(__name__) 39 | 40 | 41 | class QueryResponseDataset(Dataset): 42 | """Dataset that emits tokenized left-padded queries.""" 43 | 44 | def __init__( 45 | self, 46 | df: pd.DataFrame, 47 | tokenizer: transformers.PreTrainedTokenizer, 48 | query_len: int, 49 | df_postprocessor: Optional[Callable] = None, 50 | data_args: Optional[Dict] = None, 51 | ): 52 | self.data_args = data_args 53 | super(QueryResponseDataset, self).__init__() 54 | 55 | if df_postprocessor is not None: 56 | df = df_postprocessor(df) 57 | list_dict_data = df.to_dict(orient="records") 58 | 59 | _s = copy.deepcopy([ex["conversations"] for ex in list_dict_data]) 60 | _s = preprocess_multimodal(_s, data_args) 61 | 62 | _s = [__s[:2] for __s in _s] 63 | 64 | for __s in _s: 65 | assert __s[-1]["from"] == "gpt", f"{__s}" 66 | __s[-1]["value"] = "\n" 67 | 68 | queries = [ 69 | preprocess( 70 | [__s], 71 | tokenizer, 72 | has_image=True, 73 | mask_target=False, 74 | query_len=query_len, 75 | )["input_ids"] 76 | for __s in tqdm.tqdm(_s) 77 | ] 78 | 79 | queries = [ 80 | torch.tensor(query, dtype=torch.long).view(-1)[:-3] for query in queries 81 | ] 82 | 83 | filtered_queries = [] 84 | 85 | for query in queries: 86 | if len(query) <= query_len: 87 | filtered_queries.append(query) 88 | 89 | max_query_len = max(len(query) for query in filtered_queries) 90 | logger.warning(f"Max query length: {max_query_len}") 91 | 92 | logger.warning( 93 | f"Filtered out {len(queries) - len(filtered_queries)} instances out of {len(queries)} that " 94 | f"exceed length limit. These examples are not used for training, but will still be used in evaluation. " 95 | ) 96 | 97 | queries = torch.stack( 98 | [ 99 | utils.left_pad( 100 | query, target_size=(query_len,), value=tokenizer.pad_token_id 101 | ) 102 | for query in filtered_queries 103 | ] 104 | ) 105 | 106 | self.queries = queries 107 | self.query_attn_masks = queries.ne(tokenizer.pad_token_id).long() 108 | 109 | # Auxiliary data. 110 | self.list_dict_data = list_dict_data 111 | 112 | def __getitem__(self, idx): 113 | return_dict = dict( 114 | queries=self.queries[idx], 115 | query_attn_masks=self.query_attn_masks[idx], 116 | ) 117 | 118 | image_file = self.list_dict_data[idx]["image"] 119 | image_folder = self.data_args.image_folder 120 | processor = self.data_args.image_processor 121 | reward_processor = self.data_args.reward_image_processor 122 | 123 | if "caption_type" in self.list_dict_data[idx]: 124 | caption_type = self.list_dict_data[idx]["caption_type"] 125 | else: 126 | caption_type = 1 127 | 128 | if "length_bonus" in self.list_dict_data[idx]: 129 | length_bonus = self.list_dict_data[idx]["length_bonus"] 130 | else: 131 | length_bonus = 1.0 132 | 133 | try: 134 | image = Image.open(os.path.join(image_folder, image_file)).convert("RGB") 135 | except: 136 | raise ValueError(f"Error loading image {image_file} for index {idx}") 137 | 138 | original_image = image 139 | 140 | if self.data_args.image_aspect_ratio == "pad": 141 | 142 | def expand2square(pil_img, background_color): 143 | width, height = pil_img.size 144 | if width == height: 145 | return pil_img 146 | elif width > height: 147 | result = Image.new(pil_img.mode, (width, width), background_color) 148 | result.paste(pil_img, (0, (width - height) // 2)) 149 | return result 150 | else: 151 | result = Image.new(pil_img.mode, (height, height), background_color) 152 | result.paste(pil_img, ((height - width) // 2, 0)) 153 | return result 154 | 155 | image = expand2square( 156 | image, tuple(int(x * 255) for x in processor.image_mean) 157 | ) 158 | image = processor.preprocess(image, return_tensors="pt")["pixel_values"][0] 159 | 160 | reward_image = expand2square( 161 | original_image, tuple(int(x * 255) for x in reward_processor.image_mean) 162 | ) 163 | reward_image = reward_processor.preprocess( 164 | reward_image, return_tensors="pt" 165 | )["pixel_values"][0] 166 | else: 167 | image = processor.preprocess(image, return_tensors="pt")["pixel_values"][0] 168 | reward_image = reward_processor.preprocess( 169 | original_image, return_tensors="pt" 170 | )["pixel_values"][0] 171 | 172 | return_dict["images"] = image 173 | return_dict["reward_images"] = reward_image 174 | return_dict["image_file_ids"] = torch.tensor( 175 | int(image_file.split(".")[0]), dtype=torch.long 176 | ) 177 | return_dict["caption_types"] = torch.tensor(caption_type, dtype=torch.long) 178 | return_dict["length_bonus_multiplier"] = torch.tensor( 179 | length_bonus, dtype=torch.float 180 | ) 181 | 182 | return return_dict 183 | 184 | def __len__(self): 185 | return len(self.queries) 186 | 187 | 188 | @dataclasses.dataclass 189 | class DataCollatorForQueryResponseDataset(object): 190 | def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]: 191 | return { 192 | key: torch.stack([instance[key] for instance in instances]) 193 | for key in instances[0].keys() 194 | } 195 | 196 | 197 | def make_rl_data_module( 198 | tokenizer: transformers.PreTrainedTokenizer, 199 | data_args, 200 | training_args, 201 | ): 202 | if data_args.dataset_path.endswith("json"): 203 | train_instructions = datasets.load_dataset( 204 | "json", data_files=data_args.dataset_path 205 | ) 206 | else: 207 | train_instructions = datasets.load_dataset( 208 | data_args.dataset_path, data_args.dataset_name 209 | ) 210 | train_df = pd.concat( 211 | [pd.DataFrame(train_instructions[split]) for split in data_args.train_splits] 212 | ) 213 | 214 | train_dataset = QueryResponseDataset( 215 | df=train_df, 216 | tokenizer=tokenizer, 217 | query_len=training_args.query_len, 218 | data_args=data_args, 219 | ) 220 | return dict( 221 | train_dataset=train_dataset, 222 | eval_dataset=None, 223 | data_collator=DataCollatorForQueryResponseDataset(), 224 | ) 225 | -------------------------------------------------------------------------------- /RLHF/lora_utils.py: -------------------------------------------------------------------------------- 1 | # This source code is licensed under the MIT license found in the 2 | # LICENSE file in the root directory of this source tree. 3 | 4 | import glob 5 | import os 6 | from os.path import exists, join, isdir 7 | import shutil 8 | import sys 9 | from typing import Optional, Dict, Sequence, List 10 | 11 | import torch 12 | import transformers 13 | from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR 14 | 15 | from models.reward_model import RewardModel 16 | 17 | DEFAULT_PAD_TOKEN = "[PAD]" 18 | 19 | 20 | class SavePeftModelCallback(transformers.TrainerCallback): 21 | def save_model(self, args, state, kwargs): 22 | print("Saving PEFT checkpoint...") 23 | 24 | global_rank = int(os.environ.get("RANK", 0)) 25 | 26 | if global_rank == 0: 27 | print("Saving model checkpoint to %s" % args.output_dir) 28 | if state.best_model_checkpoint is not None: 29 | checkpoint_folder = state.best_model_checkpoint 30 | else: 31 | checkpoint_folder = os.path.join( 32 | args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}" 33 | ) 34 | 35 | peft_model_path = os.path.join(checkpoint_folder, "adapter_model") 36 | reward_head_path = os.path.join(checkpoint_folder, "reward_head") 37 | 38 | if isinstance(kwargs["model"], RewardModel): 39 | kwargs["model"].backbone_model.save_pretrained(peft_model_path) 40 | torch.save( 41 | kwargs["model"].reward_head.state_dict(), 42 | reward_head_path, 43 | ) 44 | else: 45 | kwargs["model"].save_pretrained(peft_model_path) 46 | 47 | pytorch_model_paths = glob.glob( 48 | os.path.join(checkpoint_folder, "pytorch_model*.bin") 49 | ) 50 | for pytorch_model_path in pytorch_model_paths: 51 | if os.path.exists(pytorch_model_path): 52 | os.remove(pytorch_model_path) 53 | 54 | optimizer_path = os.path.join(checkpoint_folder, "optimizer.pt") 55 | if os.path.exists(optimizer_path): 56 | os.remove(optimizer_path) 57 | 58 | else: 59 | print("Skipping PEFT checkpoint save on rank %d" % global_rank) 60 | 61 | def on_save(self, args, state, control, **kwargs): 62 | self.save_model(args, state, kwargs) 63 | return control 64 | 65 | def on_train_end(self, args, state, control, **kwargs): 66 | def touch(fname, times=None): 67 | global_rank = int(os.environ.get("RANK", 0)) 68 | if global_rank == 0: 69 | with open(fname, "a"): 70 | os.utime(fname, times) 71 | 72 | touch(join(args.output_dir, "completed")) 73 | self.save_model(args, state, kwargs) 74 | 75 | 76 | def print_trainable_parameters(args, model): 77 | """ 78 | Prints the number of trainable parameters in the model. 79 | """ 80 | trainable_params = 0 81 | all_param = 0 82 | for _, param in model.named_parameters(): 83 | all_param += param.numel() 84 | if param.requires_grad: 85 | trainable_params += param.numel() 86 | if args.bits == 4: 87 | trainable_params /= 2 88 | print( 89 | f"trainable params: {trainable_params} || " 90 | f"all params: {all_param} || " 91 | f"trainable: {100 * trainable_params / all_param}" 92 | ) 93 | 94 | 95 | def get_last_checkpoint(checkpoint_dir): 96 | if isdir(checkpoint_dir): 97 | is_completed = exists(join(checkpoint_dir, "completed")) 98 | if is_completed: 99 | return None, True # already finished 100 | max_step = 0 101 | for filename in os.listdir(checkpoint_dir): 102 | if isdir(join(checkpoint_dir, filename)) and filename.startswith( 103 | "checkpoint" 104 | ): 105 | max_step = max(max_step, int(filename.replace("checkpoint-", ""))) 106 | if max_step == 0: 107 | return None, is_completed # training started, but no checkpoint 108 | checkpoint_dir = join(checkpoint_dir, f"checkpoint-{max_step}") 109 | print(f"Found a previous checkpoint at: {checkpoint_dir}") 110 | return checkpoint_dir, is_completed # checkpoint found! 111 | return None, False # first training 112 | -------------------------------------------------------------------------------- /RLHF/models/distributed_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 The Alpaca Team 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Utilities for PyTorch's distributed training. 16 | 17 | Compatible with torchrun / elastic. 18 | 19 | Internal map: 20 | https://github.com/lxuechen/ml-swissknife/blob/main/ml_swissknife/distributed_utils.py 21 | """ 22 | 23 | import os 24 | import sys 25 | from typing import Optional 26 | 27 | import torch 28 | import torch.distributed as dist 29 | 30 | 31 | def setup(rank: Optional[int] = None, world_size: Optional[int] = None): 32 | if rank is None: 33 | rank = get_rank() 34 | if world_size is None: 35 | world_size = get_world_size() 36 | 37 | if world_size <= 1: 38 | return rank, world_size 39 | 40 | if not dist.is_initialized(): 41 | if sys.platform == "win32": 42 | # Distributed package only covers collective communications with Gloo 43 | # backend and FileStore on Windows platform. Set init_method parameter 44 | # in init_process_group to a local file. 45 | # Example init_method="file:///f:/libtmp/some_file" 46 | init_method = "file:///f:/libtmp/dist-tmp" 47 | dist.init_process_group( 48 | backend="gloo", 49 | init_method=init_method, 50 | rank=rank, 51 | world_size=world_size, 52 | ) 53 | elif torch.cuda.is_available(): 54 | dist.init_process_group(backend="nccl", rank=rank, world_size=world_size) 55 | else: 56 | dist.init_process_group(backend="gloo", rank=rank, world_size=world_size) 57 | 58 | return rank, world_size 59 | 60 | 61 | def cleanup(): 62 | dist.destroy_process_group() 63 | 64 | 65 | def get_rank(): 66 | return int(os.getenv("RANK", 0)) 67 | 68 | 69 | def get_local_rank(): 70 | return int(os.getenv("LOCAL_RANK", 0)) 71 | 72 | 73 | def get_world_size(): 74 | return int(os.getenv("WORLD_SIZE", 1)) 75 | 76 | 77 | def should_save(): 78 | """Return True if the current process is the main process.""" 79 | return get_rank() <= 0 80 | 81 | 82 | def all_gather_and_cat(tensor: torch.Tensor, dim=0): 83 | if get_world_size() > 1: 84 | tensor_list = [torch.empty_like(tensor) for _ in range(get_world_size())] 85 | dist.all_gather(tensor_list, tensor) 86 | tensor = torch.cat(tensor_list, dim=dim) 87 | return tensor 88 | 89 | 90 | is_main_process = should_save 91 | -------------------------------------------------------------------------------- /RLHF/models/trainer_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 The Alpaca Team 2 | # Copyright 2022 The HuggingFace Team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | from typing import Optional 17 | 18 | from torch import nn, optim 19 | from transformers import Trainer 20 | from transformers.optimization import get_scheduler 21 | from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS 22 | from transformers.trainer_pt_utils import get_parameter_names 23 | 24 | 25 | def create_optimizer( 26 | args, model: nn.Module, optimizer: Optional[optim.Optimizer] = None 27 | ): 28 | """Create optimizer for trainer. 29 | 30 | This is detached version of the `Trainer.create_optimizer` method. 31 | We don't support sagemaker and fairscale for simplicity. 32 | 33 | Reference: 34 | https://github.com/huggingface/transformers/blob/main/src/transformers/trainer.py 35 | """ 36 | opt_model = model 37 | 38 | if optimizer is None: 39 | decay_parameters = get_parameter_names(opt_model, ALL_LAYERNORM_LAYERS) 40 | decay_parameters = [name for name in decay_parameters if "bias" not in name] 41 | optimizer_grouped_parameters = [ 42 | { 43 | "params": [ 44 | p 45 | for n, p in opt_model.named_parameters() 46 | if (n in decay_parameters and p.requires_grad) 47 | ], 48 | "weight_decay": args.weight_decay, 49 | }, 50 | { 51 | "params": [ 52 | p 53 | for n, p in opt_model.named_parameters() 54 | if (n not in decay_parameters and p.requires_grad) 55 | ], 56 | "weight_decay": 0.0, 57 | }, 58 | ] 59 | 60 | optimizer_cls, optimizer_kwargs = Trainer.get_optimizer_cls_and_kwargs(args) 61 | 62 | optimizer = optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs) 63 | return optimizer 64 | 65 | 66 | def create_scheduler(args, optimizer, lr_scheduler, num_training_steps): 67 | """Create scheduler for trainer. 68 | 69 | This is detached version of the `Trainer.create_scheduler` method. 70 | 71 | Reference: 72 | https://github.com/huggingface/transformers/blob/main/src/transformers/trainer.py 73 | """ 74 | if lr_scheduler is None: 75 | lr_scheduler = get_scheduler( 76 | args.lr_scheduler_type, 77 | optimizer=optimizer, 78 | num_warmup_steps=args.get_warmup_steps(num_training_steps), 79 | num_training_steps=num_training_steps, 80 | ) 81 | return lr_scheduler 82 | -------------------------------------------------------------------------------- /RLHF/prompts/fact_rlhf_reward_prompt.txt: -------------------------------------------------------------------------------- 1 | USER: Please evaluate the quality of your last response. There are several dimensions you should consider in your evaluation: 2 | 3 | 1. Accurate: The AI should provide factual and accurate information from the image, and refrain from making statements that are not supported by the image or inconsistent with the image. {factual_prompt} 4 | 2. Helpful: The AI’s response should precisely serve the user's needs and interests, while grounding the response in the image. 5 | 3. Language Natural: The AI should employ language that flows smoothly and is free from repetitive or awkward constructs. 6 | 4. Concise: The AI should efficiently address the task or answer the question, communicating the necessary information with brevity and clarity. 7 | 8 | A good response should be accurate, helpful, language natural, and concise. ASSISTANT: Following your definitions, the quality score of my last response is -------------------------------------------------------------------------------- /RLHF/prompts/reward_prompt.txt: -------------------------------------------------------------------------------- 1 | USER: Please evaluate the quality of your last response. There are several dimensions you should consider in your evaluation: 2 | 3 | 1. Accurate: The AI should provide factual and accurate information from the image, and refrain from making statements that are not supported by the image or inconsistent with the image. 4 | 2. Helpful: The AI’s response should precisely serve the user's needs and interests, while grounding the response in the image. 5 | 3. Language Natural: The AI should employ language that flows smoothly and is free from repetitive or awkward constructs. 6 | 4. Concise: The AI should efficiently address the task or answer the question, communicating the necessary information with brevity and clarity. 7 | 8 | A good response should be accurate, helpful, language natural, and concise. ASSISTANT: Following your definitions, the quality score of my last response is -------------------------------------------------------------------------------- /RLHF/scripts/13b-v1.5-336/initialize_policy_model.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | set -x 5 | 6 | export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 7 | export DATA_DIR="/path/to/your/data/directory" 8 | export MODEL_DIR="/path/to/your/model/directory" 9 | export PYTHONPATH="$PWD:$PYTHONPATH" 10 | export GPUS_PER_NODE=8 11 | export OMP_NUM_THREADS=8 12 | 13 | # MODEL CONFIG 14 | VISION_TOWER=openai/clip-vit-large-patch14-336 15 | LM_MODEL_NAME=LLaVA-RLHF-13b-v1.5-336/sft_model 16 | 17 | # SAVE CONFIG 18 | MODEL_NAME=LLaVA-RL-INIT-13b-v1.5-336-lora-padding 19 | 20 | # TRAINING CONFIG 21 | NUM_EPOCHS=1 22 | LEARNING_RATE=1e-4 23 | BATCH_SIZE=8 24 | GRAD_ACCUMULATION=2 25 | 26 | deepspeed \ 27 | finetune_lora_sft_ds.py \ 28 | --deepspeed scripts/zero2.json \ 29 | --do_train \ 30 | --do_eval \ 31 | --seed 42 \ 32 | --per_device_train_batch_size $BATCH_SIZE \ 33 | --per_device_eval_batch_size 8 \ 34 | --gradient_accumulation_steps $GRAD_ACCUMULATION \ 35 | --model_name_or_path $MODEL_DIR/$LM_MODEL_NAME \ 36 | --image_folder $DATA_DIR/coco/train2017 \ 37 | --vision_tower $VISION_TOWER \ 38 | --learning_rate $LEARNING_RATE \ 39 | --mm_vision_select_layer -2 \ 40 | --mm_use_im_start_end False \ 41 | --mm_use_im_patch_token False \ 42 | --freeze_mm_mlp_adapter True \ 43 | --query_len 1280 \ 44 | --response_len 768 \ 45 | --dataset $DATA_DIR/llava_reward10k-aokvqa5k.json \ 46 | --dataset_format "v1" \ 47 | --eval_size 500 \ 48 | --bits 16 \ 49 | --lora_r 64 \ 50 | --lora_modules q_proj k_proj v_proj o_proj gate_proj up_proj down_proj \ 51 | --output_dir "$MODEL_DIR/$MODEL_NAME" \ 52 | --num_train_epochs $NUM_EPOCHS \ 53 | --group_by_length False \ 54 | --evaluation_strategy "steps" \ 55 | --eval_steps 50 \ 56 | --save_strategy "steps" \ 57 | --save_steps 1000000 \ 58 | --save_total_limit 1 \ 59 | --weight_decay 0.0 \ 60 | --warmup_ratio 0.03 \ 61 | --lr_scheduler_type "cosine" \ 62 | --logging_steps 5 \ 63 | --report_to "tensorboard" \ 64 | --ddp_backend "nccl" \ 65 | --bf16 True \ 66 | --ddp_find_unused_parameters False \ 67 | --resume_from_training True \ 68 | --image_aspect_ratio 'pad' 69 | -------------------------------------------------------------------------------- /RLHF/scripts/13b-v1.5-336/train_reward_model.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | set -x 5 | 6 | export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 7 | export DATA_DIR="/path/to/your/data/directory" 8 | export MODEL_DIR="/path/to/your/model/directory" 9 | export PYTHONPATH="$PWD:$PYTHONPATH" 10 | export GPUS_PER_NODE=8 11 | export OMP_NUM_THREADS=8 12 | 13 | 14 | # MODEL CONFIG 15 | VISION_TOWER=openai/clip-vit-large-patch14-336 16 | LM_MODEL_NAME=LLaVA-RLHF-13b-v1.5-336/sft_model 17 | 18 | # DATA CONFIG 19 | PREFERENCE_DATA=llava_7b_v1_preference.json 20 | 21 | # SAVE CONFIG 22 | MODEL_NAME=LLaVA-Fact-RM-13b-v1.5-336-lora-padding 23 | 24 | # TRAINING CONFIG 25 | NUM_EPOCHS=1 26 | LEARNING_RATE=2e-5 27 | BATCH_SIZE=4 28 | GRAD_ACCUMULATION=1 29 | 30 | torchrun \ 31 | --standalone \ 32 | --nnodes=1 \ 33 | --nproc-per-node=$GPUS_PER_NODE \ 34 | finetune_lora_rm.py \ 35 | --do_train \ 36 | --do_eval \ 37 | --seed 42 \ 38 | --per_device_train_batch_size $BATCH_SIZE \ 39 | --per_device_eval_batch_size $BATCH_SIZE \ 40 | --gradient_accumulation_steps $GRAD_ACCUMULATION \ 41 | --model_name_or_path $MODEL_DIR/$LM_MODEL_NAME \ 42 | --image_folder $DATA_DIR/coco/train2017 \ 43 | --vision_tower $VISION_TOWER \ 44 | --learning_rate $LEARNING_RATE \ 45 | --mm_vision_select_layer -2 \ 46 | --mm_use_im_start_end False \ 47 | --mm_use_im_patch_token False \ 48 | --freeze_mm_mlp_adapter True \ 49 | --model_max_length 2048 \ 50 | --query_len 1280 \ 51 | --response_len 768 \ 52 | --dataset_path $DATA_DIR/$PREFERENCE_DATA \ 53 | --eval_dataset_path $DATA_DIR/$PREFERENCE_DATA \ 54 | --dataset_name "none" \ 55 | --eval_dataset_name "none" \ 56 | --eval_size 500 \ 57 | --bits 16 \ 58 | --lora_r 64 \ 59 | --lora_modules q_proj k_proj v_proj o_proj gate_proj up_proj down_proj \ 60 | --output_dir "$MODEL_DIR/$MODEL_NAME" \ 61 | --num_train_epochs $NUM_EPOCHS \ 62 | --group_by_length False \ 63 | --evaluation_strategy "steps" \ 64 | --eval_steps 50 \ 65 | --save_strategy "steps" \ 66 | --save_steps 50 \ 67 | --save_total_limit 10 \ 68 | --weight_decay 0.0 \ 69 | --warmup_ratio 0.03 \ 70 | --lr_scheduler_type "constant_with_warmup" \ 71 | --logging_steps 5 \ 72 | --report_to "tensorboard" \ 73 | --ddp_backend "nccl" \ 74 | --bf16 True \ 75 | --ddp_find_unused_parameters False \ 76 | --resume_from_training True \ 77 | --reward_prompt_file "./prompts/fact_rlhf_reward_prompt.txt" \ 78 | --image_to_caption_file "$DATA_DIR/image_to_caption.json" \ 79 | --image_aspect_ratio 'pad' 80 | -------------------------------------------------------------------------------- /RLHF/scripts/13b-v1.5-336/train_rl_model.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | set -x 5 | 6 | export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 7 | export DATA_DIR="/path/to/your/data/directory" 8 | export MODEL_DIR="/path/to/your/model/directory" 9 | export PYTHONPATH="$PWD:$PYTHONPATH" 10 | export GPUS_PER_NODE=8 11 | export OMP_NUM_THREADS=8 12 | export TRANSFORMERS_OFFLINE=1 13 | 14 | # MODEL CONFIG 15 | VISION_TOWER=openai/clip-vit-large-patch14-336 16 | BASE_MODEL_NAME=LLaVA-RLHF-13b-v1.5-336/sft_model 17 | 18 | POLICY_LORA=LLaVA-RL-INIT-13b-v1.5-336-lora-padding/lora_default 19 | RM_LORA=LLaVA-Fact-RM-13b-v1.5-336-lora-padding/checkpoint-200 # we use early stopping 20 | 21 | # SAVE CONFIG 22 | MODEL_NAME=LLaVA-RL-Fact-RLHF-13b-v1.5-336-lora-padding 23 | 24 | # TRAINING CONFIG 25 | LEARNING_RATE=3e-5 26 | KL_COEF=0.1 27 | EPOCH=4 28 | ROLLOUT_BATCH_SIZE=512 29 | STEP_BATCH_SZIE=256 30 | ROLLOUT_PER_DEVICE_BATCH_SIZE=32 31 | REWARD_MODEL_PER_DEVICE_BATCH_SIZE=16 32 | STEP_PER_DEVICE_BATCH_SIZE=16 33 | NOPTEPOCHS=2 34 | 35 | # FACT-RLHF CONFIG 36 | INCOMPLETE_RESPONSE=-8.0 37 | LENGTH_BONUS=-10.0 38 | CORRECT_BONUS=2.0 39 | 40 | torchrun \ 41 | --standalone \ 42 | --nnodes=1 \ 43 | --nproc-per-node=$GPUS_PER_NODE \ 44 | finetune_lora_ppo.py \ 45 | --do_train \ 46 | --seed 42 \ 47 | --step_batch_size $STEP_BATCH_SZIE \ 48 | --step_per_device_batch_size $STEP_PER_DEVICE_BATCH_SIZE \ 49 | --rollout_batch_size $ROLLOUT_BATCH_SIZE \ 50 | --rollout_per_device_batch_size $ROLLOUT_PER_DEVICE_BATCH_SIZE \ 51 | --reward_model_per_device_batch_size $REWARD_MODEL_PER_DEVICE_BATCH_SIZE \ 52 | --base_model_name "$LM_MODEL_CKPT" \ 53 | --policy_model_name_or_path "$MODEL_DIR/$POLICY_LORA" \ 54 | --reward_model_name_or_path "$MODEL_DIR/$RM_LORA" \ 55 | --learning_rate $LEARNING_RATE \ 56 | --init_value_with_reward True \ 57 | --warmup_steps 5 \ 58 | --dataset_path $DATA_DIR/llava_ppo50k-aokvqa12k-vqa10k.json \ 59 | --train_splits "train" \ 60 | --output_dir "$MODEL_DIR/$MODEL_NAME" \ 61 | --total_epochs $EPOCH \ 62 | --group_by_length False \ 63 | --evaluation_strategy "no" \ 64 | --save_strategy "steps" \ 65 | --save_steps 10 \ 66 | --save_total_limit 100000 \ 67 | --weight_decay 0.0 \ 68 | --lr_scheduler_type "cosine" \ 69 | --logging_steps 1 \ 70 | --report_to "tensorboard" \ 71 | --ddp_backend "nccl" \ 72 | --bf16 True \ 73 | --penalty_reward_value $INCOMPLETE_RESPONSE \ 74 | --length_bonus_score $LENGTH_BONUS \ 75 | --correct_bonus_score $CORRECT_BONUS \ 76 | --relative_stop_token_penalty True \ 77 | --penalize_no_stop_token True \ 78 | --ddp_find_unused_parameters False \ 79 | --resume_from_training True \ 80 | --kl_coef $KL_COEF \ 81 | --max_grad_norm 1.0 \ 82 | --whitening_async_stats "full_batch" \ 83 | --clean_tokens_after_eos True \ 84 | --temperature 1.0 \ 85 | --whiten_rewards False \ 86 | --model_max_length 2048 \ 87 | --query_len 128 \ 88 | --response_len 896 \ 89 | --noptepochs $NOPTEPOCHS \ 90 | --image_folder $DATA_DIR/coco/train2017 \ 91 | --vision_tower $VISION_TOWER \ 92 | --mm_vision_select_layer -2 \ 93 | --mm_use_im_start_end False \ 94 | --mm_use_im_patch_token False \ 95 | --freeze_mm_mlp_adapter True \ 96 | --reward_prompt_file "./prompts/fact_rlhf_reward_prompt.txt" \ 97 | --image_to_caption_file "$DATA_DIR/image_to_caption.json" \ 98 | --image_aspect_ratio 'pad' 99 | -------------------------------------------------------------------------------- /RLHF/scripts/7b-v1.5-224/initialize_policy_model.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | set -x 5 | 6 | export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 7 | export DATA_DIR="/path/to/your/data/directory" 8 | export MODEL_DIR="/path/to/your/model/directory" 9 | export PYTHONPATH="$PWD:$PYTHONPATH" 10 | export GPUS_PER_NODE=8 11 | export OMP_NUM_THREADS=8 12 | 13 | # MODEL CONFIG 14 | VISION_TOWER=openai/clip-vit-large-patch14 15 | LM_MODEL_NAME=LLaVA-RLHF-7b-v1.5-224/sft_model 16 | 17 | # SAVE CONFIG 18 | MODEL_NAME=LLaVA-RL-INIT-7b-v1.5-224-lora-padding 19 | 20 | # TRAINING CONFIG 21 | NUM_EPOCHS=1 22 | LEARNING_RATE=1e-4 23 | BATCH_SIZE=8 24 | GRAD_ACCUMULATION=2 25 | 26 | deepspeed \ 27 | finetune_lora_sft_ds.py \ 28 | --deepspeed scripts/zero2.json \ 29 | --do_train \ 30 | --do_eval \ 31 | --seed 42 \ 32 | --per_device_train_batch_size $BATCH_SIZE \ 33 | --per_device_eval_batch_size 8 \ 34 | --gradient_accumulation_steps $GRAD_ACCUMULATION \ 35 | --model_name_or_path $MODEL_DIR/$LM_MODEL_NAME \ 36 | --image_folder $DATA_DIR/coco/train2017 \ 37 | --vision_tower $VISION_TOWER \ 38 | --learning_rate $LEARNING_RATE \ 39 | --mm_vision_select_layer -2 \ 40 | --mm_use_im_start_end False \ 41 | --mm_use_im_patch_token False \ 42 | --freeze_mm_mlp_adapter True \ 43 | --query_len 1280 \ 44 | --response_len 768 \ 45 | --dataset $DATA_DIR/llava_reward10k-aokvqa5k.json \ 46 | --dataset_format "v1" \ 47 | --eval_size 500 \ 48 | --bits 16 \ 49 | --lora_r 64 \ 50 | --lora_modules q_proj k_proj v_proj o_proj gate_proj up_proj down_proj \ 51 | --output_dir "$MODEL_DIR/$MODEL_NAME" \ 52 | --num_train_epochs $NUM_EPOCHS \ 53 | --group_by_length False \ 54 | --evaluation_strategy "steps" \ 55 | --eval_steps 50 \ 56 | --save_strategy "steps" \ 57 | --save_steps 1000000 \ 58 | --save_total_limit 1 \ 59 | --weight_decay 0.0 \ 60 | --warmup_ratio 0.03 \ 61 | --lr_scheduler_type "cosine" \ 62 | --logging_steps 5 \ 63 | --report_to "tensorboard" \ 64 | --ddp_backend "nccl" \ 65 | --bf16 True \ 66 | --ddp_find_unused_parameters False \ 67 | --resume_from_training True \ 68 | --image_aspect_ratio 'pad' 69 | -------------------------------------------------------------------------------- /RLHF/scripts/7b-v1.5-224/train_rl_model.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | set -x 5 | 6 | export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 7 | export DATA_DIR="/path/to/your/data/directory" 8 | export MODEL_DIR="/path/to/your/model/directory" 9 | export PYTHONPATH="$PWD:$PYTHONPATH" 10 | export GPUS_PER_NODE=8 11 | export OMP_NUM_THREADS=8 12 | export TRANSFORMERS_OFFLINE=1 13 | 14 | # MODEL CONFIG 15 | POLICY_BASE_MODEL_NAME=LLaVA-RLHF-7b-v1.5-224/sft_model 16 | RM_BASE_MODEL_NAME=LLaVA-RLHF-13b-v1.5-336/sft_model 17 | 18 | POLICY_LORA=LLaVA-RL-INIT-7b-v1.5-224-lora-padding/lora_default 19 | RM_LORA=LLaVA-Fact-RM-13b-v1.5-336-lora-padding/checkpoint-200 # we use early stopping 20 | 21 | # SAVE CONFIG 22 | MODEL_NAME=LLaVA-RL-Fact-RLHF-7b-v1.5-224-lora-padding 23 | 24 | # TRAINING CONFIG 25 | LEARNING_RATE=3e-5 26 | KL_COEF=0.1 27 | EPOCH=4 28 | ROLLOUT_BATCH_SIZE=512 29 | STEP_BATCH_SZIE=256 30 | ROLLOUT_PER_DEVICE_BATCH_SIZE=64 31 | REWARD_MODEL_PER_DEVICE_BATCH_SIZE=16 32 | STEP_PER_DEVICE_BATCH_SIZE=16 33 | NOPTEPOCHS=2 34 | 35 | # FACT-RLHF CONFIG 36 | INCOMPLETE_RESPONSE=-8.0 37 | LENGTH_BONUS=-10.0 38 | CORRECT_BONUS=2.0 39 | 40 | torchrun \ 41 | --standalone \ 42 | --nnodes=1 \ 43 | --nproc-per-node=$GPUS_PER_NODE \ 44 | finetune_lora_ppo.py \ 45 | --do_train \ 46 | --seed 42 \ 47 | --step_batch_size $STEP_BATCH_SZIE \ 48 | --step_per_device_batch_size $STEP_PER_DEVICE_BATCH_SIZE \ 49 | --rollout_batch_size $ROLLOUT_BATCH_SIZE \ 50 | --rollout_per_device_batch_size $ROLLOUT_PER_DEVICE_BATCH_SIZE \ 51 | --reward_model_per_device_batch_size $REWARD_MODEL_PER_DEVICE_BATCH_SIZE \ 52 | --base_model_name "$MODEL_DIR/$POLICY_BASE_MODEL_NAME" \ 53 | --reward_base_model_name "$MODEL_DIR/$RM_BASE_MODEL_NAME" \ 54 | --policy_model_name_or_path "$MODEL_DIR/$POLICY_LORA" \ 55 | --reward_model_name_or_path "$MODEL_DIR/$RM_LORA" \ 56 | --learning_rate $LEARNING_RATE \ 57 | --init_value_with_reward True \ 58 | --warmup_steps 5 \ 59 | --dataset_path $DATA_DIR/llava_ppo50k-aokvqa12k-vqa10k.json \ 60 | --train_splits "train" \ 61 | --output_dir "$MODEL_DIR/$MODEL_NAME" \ 62 | --total_epochs $EPOCH \ 63 | --group_by_length False \ 64 | --evaluation_strategy "no" \ 65 | --save_strategy "steps" \ 66 | --save_steps 10 \ 67 | --save_total_limit 100000 \ 68 | --weight_decay 0.0 \ 69 | --lr_scheduler_type "cosine" \ 70 | --logging_steps 1 \ 71 | --report_to "tensorboard" \ 72 | --ddp_backend "nccl" \ 73 | --bf16 True \ 74 | --penalty_reward_value $INCOMPLETE_RESPONSE \ 75 | --length_bonus_score $LENGTH_BONUS \ 76 | --correct_bonus_score $CORRECT_BONUS \ 77 | --relative_stop_token_penalty True \ 78 | --penalize_no_stop_token True \ 79 | --ddp_find_unused_parameters False \ 80 | --resume_from_training True \ 81 | --kl_coef $KL_COEF \ 82 | --max_grad_norm 1.0 \ 83 | --whitening_async_stats "full_batch" \ 84 | --clean_tokens_after_eos True \ 85 | --temperature 1.0 \ 86 | --whiten_rewards False \ 87 | --model_max_length 2048 \ 88 | --query_len 128 \ 89 | --response_len 896 \ 90 | --noptepochs $NOPTEPOCHS \ 91 | --image_folder $DATA_DIR/coco/train2017 \ 92 | --vision_tower different \ 93 | --mm_vision_select_layer -2 \ 94 | --mm_use_im_start_end False \ 95 | --mm_use_im_patch_token False \ 96 | --freeze_mm_mlp_adapter True \ 97 | --reward_prompt_file "./prompts/fact_rlhf_reward_prompt.txt" \ 98 | --image_to_caption_file "$DATA_DIR/image_to_caption.json" \ 99 | --image_aspect_ratio 'pad' 100 | -------------------------------------------------------------------------------- /RLHF/scripts/zero2.json: -------------------------------------------------------------------------------- 1 | { 2 | "bf16": { 3 | "enabled": true 4 | }, 5 | "train_micro_batch_size_per_gpu": "auto", 6 | "train_batch_size": "auto", 7 | "gradient_accumulation_steps": "auto", 8 | "zero_optimization": { 9 | "stage": 2, 10 | "overlap_comm": true, 11 | "contiguous_gradients": true, 12 | "sub_group_size": 1e9, 13 | "reduce_bucket_size": "auto" 14 | } 15 | } -------------------------------------------------------------------------------- /SFT/README.md: -------------------------------------------------------------------------------- 1 | # Supervised Fine-Tuning 2 | 3 | [Coming Soon] 4 | -------------------------------------------------------------------------------- /SFT/scripts/13b-v1.5-336/pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Uncomment and set the following variables correspondingly to run this script: 4 | 5 | ########### DO NOT CHANGE ########### 6 | ########### USE THIS FOR BOTH ########### 7 | PROMPT_VERSION=v0_plain 8 | ########### DO NOT CHANGE ########### 9 | 10 | DATA_PATH=/data/blip_laion_cc_sbu_558k.json 11 | LM_MODEL_CKPT="lmsys/vicuna-13b-v1.5" 12 | MODEL_VERSION=vicuna-13b-v1.5-336 13 | 14 | deepspeed train/train.py \ 15 | --deepspeed ./scripts/zero3.json \ 16 | --model_name_or_path $LM_MODEL_CKPT \ 17 | --version $PROMPT_VERSION \ 18 | --data_path $DATA_PATH \ 19 | --image_folder /shared/group/coco/train2017 \ 20 | --vision_tower openai/clip-vit-large-patch14-336 \ 21 | --tune_mm_mlp_adapter True \ 22 | --mm_vision_select_layer -2 \ 23 | --mm_use_im_start_end False \ 24 | --mm_use_im_patch_token False \ 25 | --bf16 True \ 26 | --output_dir /model/llava-$MODEL_VERSION-pretrain \ 27 | --num_train_epochs 1 \ 28 | --per_device_train_batch_size 8 \ 29 | --per_device_eval_batch_size 8 \ 30 | --gradient_accumulation_steps 2 \ 31 | --evaluation_strategy "no" \ 32 | --save_strategy "steps" \ 33 | --save_steps 24000 \ 34 | --save_total_limit 1 \ 35 | --learning_rate 2e-3 \ 36 | --weight_decay 0. \ 37 | --warmup_ratio 0.03 \ 38 | --lr_scheduler_type "cosine" \ 39 | --logging_steps 1 \ 40 | --tf32 True \ 41 | --model_max_length 2048 \ 42 | --gradient_checkpointing True \ 43 | --dataloader_num_workers 4 \ 44 | --lazy_preprocess True \ 45 | --report_to wandb \ 46 | --image_folder 47 | -------------------------------------------------------------------------------- /SFT/scripts/13b-v1.5-336/sft.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Uncomment and set the following variables correspondingly to run this script: 4 | export HF_HOME=/shared/sheng/huggingface 5 | 6 | ################## VICUNA ################## 7 | PROMPT_VERSION=v1 8 | MODEL_VERSION=vicuna-13b-v1.5-336 9 | ################## VICUNA ################## 10 | 11 | ################## LLaMA-2 ################## 12 | # PROMPT_VERSION="llava_llama_2" 13 | # MODEL_VERSION="llama-2-7b-chat" 14 | ################## LLaMA-2 ################## 15 | 16 | LM_MODEL_CKPT=lmsys/vicuna-13b-v1.5 17 | MM_CKPT=/shared/llava-$MODEL_VERSION-pretrain/mm_projector.bin 18 | DATA_PATH=mix-llava-sft90k-vqav2_83k-okvqa_16k-flickr_23k.json 19 | 20 | 21 | deepspeed train/train.py \ 22 | --deepspeed ./scripts/zero3.json \ 23 | --model_name_or_path $LM_MODEL_CKPT \ 24 | --version $PROMPT_VERSION \ 25 | --data_path ${DATA_PATH} \ 26 | --image_folder /shared/group/coco/train2017 \ 27 | --vision_tower openai/clip-vit-large-patch14-336 \ 28 | --pretrain_mm_mlp_adapter $MM_CKPT \ 29 | --mm_vision_select_layer -2 \ 30 | --mm_use_im_start_end False \ 31 | --mm_use_im_patch_token False \ 32 | --bf16 True \ 33 | --output_dir ./checkpoints/llava-$MODEL_VERSION-sft-llava-sft90k-vqav2_83k-okvqa_16k-pad-ep3 \ 34 | --num_train_epochs 3 \ 35 | --per_device_train_batch_size 8 \ 36 | --per_device_eval_batch_size 8 \ 37 | --gradient_accumulation_steps 2 \ 38 | --evaluation_strategy "no" \ 39 | --save_strategy "steps" \ 40 | --save_steps 500 \ 41 | --save_total_limit 1 \ 42 | --learning_rate 2e-5 \ 43 | --weight_decay 0. \ 44 | --warmup_ratio 0.03 \ 45 | --lr_scheduler_type "cosine" \ 46 | --logging_steps 1 \ 47 | --tf32 True \ 48 | --model_max_length 1280 \ 49 | --gradient_checkpointing True \ 50 | --dataloader_num_workers 4 \ 51 | --lazy_preprocess True \ 52 | --report_to wandb \ 53 | --image_aspect_ratio 'pad' 54 | -------------------------------------------------------------------------------- /SFT/scripts/7b-v1.5-224/pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Uncomment and set the following variables correspondingly to run this script: 4 | LM_MODEL_CKPT="lmsys/vicuna-13b-v1.5" 5 | MODEL_VERSION=vicuna-7b-v1.5 6 | ########### DO NOT CHANGE ########### 7 | ########### USE THIS FOR BOTH ########### 8 | PROMPT_VERSION=v0_plain 9 | ########### DO NOT CHANGE ########### 10 | 11 | DATA_PATH=/data/blip_laion_cc_sbu_558k.json 12 | 13 | deepspeed train/train.py \ 14 | --deepspeed ./scripts/zero3.json \ 15 | --model_name_or_path $LM_MODEL_CKPT \ 16 | --version $PROMPT_VERSION \ 17 | --data_path $DATA_PATH \ 18 | --image_folder /shared/group/coco/train2017 \ 19 | --vision_tower openai/clip-vit-large-patch14 \ 20 | --tune_mm_mlp_adapter True \ 21 | --mm_vision_select_layer -2 \ 22 | --mm_use_im_start_end False \ 23 | --mm_use_im_patch_token False \ 24 | --bf16 True \ 25 | --output_dir /model/llava-$MODEL_VERSION-pretrain \ 26 | --num_train_epochs 1 \ 27 | --per_device_train_batch_size 16 \ 28 | --per_device_eval_batch_size 16 \ 29 | --gradient_accumulation_steps 1 \ 30 | --evaluation_strategy "no" \ 31 | --save_strategy "steps" \ 32 | --save_steps 24000 \ 33 | --save_total_limit 1 \ 34 | --learning_rate 2e-3 \ 35 | --weight_decay 0. \ 36 | --warmup_ratio 0.03 \ 37 | --lr_scheduler_type "cosine" \ 38 | --logging_steps 1 \ 39 | --tf32 True \ 40 | --model_max_length 2048 \ 41 | --gradient_checkpointing True \ 42 | --dataloader_num_workers 4 \ 43 | --lazy_preprocess True \ 44 | --report_to wandb \ 45 | --image_folder 46 | -------------------------------------------------------------------------------- /SFT/scripts/7b-v1.5-224/sft.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Uncomment and set the following variables correspondingly to run this script: 4 | export HF_HOME=/shared/sheng/huggingface 5 | 6 | ################## VICUNA ################## 7 | PROMPT_VERSION=v1 8 | MODEL_VERSION=vicuna-v1-5-7b 9 | ################## VICUNA ################## 10 | 11 | ################## LLaMA-2 ################## 12 | # PROMPT_VERSION="llava_llama_2" 13 | # MODEL_VERSION="llama-2-7b-chat" 14 | ################## LLaMA-2 ################## 15 | 16 | LM_MODEL_CKPT=lmsys/vicuna-7b-v1.5 17 | MM_CKPT=/shared/llava-$MODEL_VERSION-pretrain/mm_projector.bin 18 | DATA_PATH=mix-llava-sft90k-vqav2_83k-okvqa_16k-flickr_23k.json 19 | 20 | deepspeed train/train.py \ 21 | --deepspeed ./scripts/zero3.json \ 22 | --model_name_or_path $LM_MODEL_CKPT \ 23 | --version $PROMPT_VERSION \ 24 | --data_path ${DATA_PATH} \ 25 | --image_folder /shared/group/coco/train2017 \ 26 | --vision_tower openai/clip-vit-large-patch14 \ 27 | --pretrain_mm_mlp_adapter $MM_CKPT \ 28 | --mm_vision_select_layer -2 \ 29 | --mm_use_im_start_end False \ 30 | --mm_use_im_patch_token False \ 31 | --bf16 True \ 32 | --output_dir ./checkpoints/llava-$MODEL_VERSION-sft-llava-sft90k-vqav2_83k-okvqa_16k-pad-ep3 \ 33 | --num_train_epochs 3 \ 34 | --per_device_train_batch_size 16 \ 35 | --per_device_eval_batch_size 16 \ 36 | --gradient_accumulation_steps 1 \ 37 | --evaluation_strategy "no" \ 38 | --save_strategy "steps" \ 39 | --save_steps 500 \ 40 | --save_total_limit 1 \ 41 | --learning_rate 2e-5 \ 42 | --weight_decay 0. \ 43 | --warmup_ratio 0.03 \ 44 | --lr_scheduler_type "cosine" \ 45 | --logging_steps 1 \ 46 | --tf32 True \ 47 | --model_max_length 1280 \ 48 | --gradient_checkpointing True \ 49 | --dataloader_num_workers 4 \ 50 | --lazy_preprocess True \ 51 | --report_to wandb \ 52 | --image_aspect_ratio 'pad' 53 | -------------------------------------------------------------------------------- /SFT/scripts/zero3.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "train_micro_batch_size_per_gpu": "auto", 14 | "train_batch_size": "auto", 15 | "gradient_accumulation_steps": "auto", 16 | "zero_optimization": { 17 | "stage": 3, 18 | "overlap_comm": true, 19 | "contiguous_gradients": true, 20 | "sub_group_size": 1e9, 21 | "reduce_bucket_size": "auto", 22 | "stage3_prefetch_bucket_size": "auto", 23 | "stage3_param_persistence_threshold": "auto", 24 | "stage3_max_live_parameters": 1e9, 25 | "stage3_max_reuse_distance": 1e9, 26 | "stage3_gather_16bit_weights_on_model_save": true 27 | } 28 | } -------------------------------------------------------------------------------- /SFT/train/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/llava-rlhf/LLaVA-RLHF/830a083fd83e607da3c35f2e6aef833523c1e722/SFT/train/__init__.py -------------------------------------------------------------------------------- /SFT/train/llava_trainer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | 4 | from transformers import Trainer 5 | from typing import Optional 6 | 7 | 8 | def maybe_zero_3(param, ignore_status=False, name=None): 9 | from deepspeed import zero 10 | from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus 11 | if hasattr(param, "ds_id"): 12 | if param.ds_status == ZeroParamStatus.NOT_AVAILABLE: 13 | if not ignore_status: 14 | print(name, 'no ignore status') 15 | with zero.GatheredParameters([param]): 16 | param = param.data.detach().cpu().clone() 17 | else: 18 | param = param.detach().cpu().clone() 19 | return param 20 | 21 | 22 | def get_mm_adapter_state_maybe_zero_3(named_params, keys_to_match): 23 | to_return = {k: t for k, t in named_params if any(key_match in k for key_match in keys_to_match)} 24 | to_return = {k: maybe_zero_3(v, ignore_status=True, name=k).cpu() for k, v in to_return.items()} 25 | return to_return 26 | 27 | 28 | class LLaVATrainer(Trainer): 29 | 30 | def _save_checkpoint(self, model, trial, metrics=None): 31 | if getattr(self.args, 'tune_mm_mlp_adapter', False): 32 | from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR 33 | checkpoint_folder = f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}" 34 | 35 | run_dir = self._get_output_dir(trial=trial) 36 | output_dir = os.path.join(run_dir, checkpoint_folder) 37 | 38 | # Only save Adapter 39 | keys_to_match = ['mm_projector'] 40 | if getattr(self.args, "use_im_start_end", False): 41 | keys_to_match.extend(['embed_tokens', 'embed_in']) 42 | 43 | weight_to_save = get_mm_adapter_state_maybe_zero_3(self.model.named_parameters(), keys_to_match) 44 | 45 | if self.args.local_rank == 0 or self.args.local_rank == -1: 46 | self.model.config.save_pretrained(output_dir) 47 | torch.save(weight_to_save, os.path.join(output_dir, f'mm_projector.bin')) 48 | else: 49 | super(LLaVATrainer, self)._save_checkpoint(model, trial, metrics) 50 | 51 | def _save(self, output_dir: Optional[str] = None, state_dict=None): 52 | if getattr(self.args, 'tune_mm_mlp_adapter', False): 53 | pass 54 | else: 55 | super(LLaVATrainer, self)._save(output_dir, state_dict) 56 | -------------------------------------------------------------------------------- /assets/images/llava_rlhf_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/llava-rlhf/LLaVA-RLHF/830a083fd83e607da3c35f2e6aef833523c1e722/assets/images/llava_rlhf_logo.png -------------------------------------------------------------------------------- /demo/README.md: -------------------------------------------------------------------------------- 1 | # Demo 2 | 3 | This is a minimal example to launch a LLaVA-RLHF demo. In order to download the model checkpoint, please check this [Hugging Face model hub link](https://huggingface.co/zhiqings/LLaVA-RLHF-13b-v1.5-336). 4 | 5 | ## Install LLaVA 6 | 7 | To run our demo, you need to install the LLaVA package. Please follow the instructions in the [original repository](https://github.com/haotian-liu/LLaVA/tree/main#install) to install LLaVA. 8 | 9 | ## Gradio Web UI 10 | 11 | To launch a Gradio demo locally, please run the following commands one by one. If you plan to launch multiple model workers to compare between different checkpoints, you only need to launch the controller and the web server *ONCE*. 12 | 13 | ### Launch a controller 14 | 15 | ```Shell 16 | python -m llava.serve.controller --host 0.0.0.0 --port 10000 17 | ``` 18 | 19 | ### Launch a gradio web server 20 | 21 | ```Shell 22 | python -m llava.serve.gradio_web_server --controller http://localhost:10000 --model-list-mode reload 23 | ``` 24 | 25 | You just launched the Gradio web interface. Now, you can open the web interface with the URL printed on the screen. You may notice that there is no model in the model list. Do not worry, as we have not launched any model worker yet. It will be automatically updated when you launch a model worker. 26 | 27 | ### Launch a model worker 28 | 29 | This is the actual *worker* that performs the inference on the GPU. Each worker is responsible for a single model specified in `--model-path`. 30 | 31 | ```Shell 32 | export CUDA_VISIBLE_DEVICES=0 33 | 34 | python -m model_worker --host 0.0.0.0 \ 35 | --controller http://localhost:10000 \ 36 | --port 40000 \ 37 | --worker http://localhost:40000 \ 38 | --load-bf16 \ 39 | --model-name llava-rlhf-13b-v1.5-336 \ 40 | --model-path /path/to/LLaVA-RLHF-13b-v1.5-336/sft_model \ 41 | --lora-path /path/to/LLaVA-RLHF-13b-v1.5-336/rlhf_lora_adapter_model 42 | ``` 43 | 44 | Wait until the process finishes loading the model and you see "Uvicorn running on ...". Now, refresh your Gradio web UI, and you will see the model you just launched in the model list. 45 | 46 | You can launch as many workers as you want, and compare between different model checkpoints in the same Gradio interface. Please keep the `--controller` the same, and modify the `--port` and `--worker` to a different port number for each worker. 47 | 48 | ```Shell 49 | export CUDA_VISIBLE_DEVICES=1 50 | 51 | python -m model_worker --host 0.0.0.0 \ 52 | --controller http://localhost:10000 \ 53 | --port \ 54 | --worker http://localhost: \ 55 | --load-bf16 \ 56 | --model-name llava-rlhf-13b-v1.5-336 \ 57 | --model-path /path/to/LLaVA-RLHF-13b-v1.5-336/sft_model \ 58 | --lora-path /path/to/LLaVA-RLHF-13b-v1.5-336/rlhf_lora_adapter_model 59 | ``` 60 | 61 | If you are using an Apple device with an M1 or M2 chip, you can specify the mps device by using the `--device` flag: `--device mps`. 62 | 63 | ### Launch a model worker (Multiple GPUs, when GPU VRAM <= 24GB) 64 | 65 | If the VRAM of your GPU is less than 24GB (e.g., RTX 3090, RTX 4090, etc.), you may try running it with multiple GPUs. Our latest code base will automatically try to use multiple GPUs if you have more than one GPU. You can specify which GPUs to use with `CUDA_VISIBLE_DEVICES`. Below is an example of running with the first two GPUs. 66 | 67 | ```Shell 68 | export CUDA_VISIBLE_DEVICES=0,1 69 | 70 | python -m model_worker --host 0.0.0.0 \ 71 | --controller http://localhost:10000 \ 72 | --port 40000 \ 73 | --worker http://localhost:40000 \ 74 | --load-bf16 \ 75 | --model-name llava-rlhf-13b-v1.5-336 \ 76 | --model-path /path/to/LLaVA-RLHF-13b-v1.5-336/sft_model \ 77 | --lora-path /path/to/LLaVA-RLHF-13b-v1.5-336/rlhf_lora_adapter_model 78 | ``` 79 | -------------------------------------------------------------------------------- /demo/model_builder.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Haotian Liu 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | import os 17 | import warnings 18 | import shutil 19 | 20 | from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, BitsAndBytesConfig 21 | import torch 22 | from llava.model import * 23 | from llava.constants import DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN 24 | 25 | 26 | def load_pretrained_model(model_path, model_base, model_name, load_8bit=False, load_4bit=False, device_map="auto", load_bf16=False): 27 | kwargs = {"device_map": device_map} 28 | 29 | if load_8bit: 30 | kwargs['load_in_8bit'] = True 31 | elif load_4bit: 32 | kwargs['load_in_4bit'] = True 33 | kwargs['quantization_config'] = BitsAndBytesConfig( 34 | load_in_4bit=True, 35 | bnb_4bit_compute_dtype=torch.float16, 36 | bnb_4bit_use_double_quant=True, 37 | bnb_4bit_quant_type='nf4' 38 | ) 39 | elif load_bf16: 40 | kwargs['torch_dtype'] = torch.bfloat16 41 | else: 42 | kwargs['torch_dtype'] = torch.float16 43 | 44 | if 'llava' in model_name.lower(): 45 | # Load LLaVA model 46 | if 'lora' in model_name.lower() and model_base is None: 47 | warnings.warn('There is `lora` in model name but no `model_base` is provided. If you are loading a LoRA model, please provide the `model_base` argument. Detailed instruction: https://github.com/haotian-liu/LLaVA#launch-a-model-worker-lora-weights-unmerged.') 48 | if 'lora' in model_name.lower() and model_base is not None: 49 | lora_cfg_pretrained = AutoConfig.from_pretrained(model_path) 50 | tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False) 51 | print('Loading LLaVA from base model...') 52 | model = LlavaLlamaForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=lora_cfg_pretrained, **kwargs) 53 | token_num, tokem_dim = model.lm_head.out_features, model.lm_head.in_features 54 | if model.lm_head.weight.shape[0] != token_num: 55 | model.lm_head.weight = torch.nn.Parameter(torch.empty(token_num, tokem_dim, device=model.device, dtype=model.dtype)) 56 | model.model.embed_tokens.weight = torch.nn.Parameter(torch.empty(token_num, tokem_dim, device=model.device, dtype=model.dtype)) 57 | 58 | print('Loading additional LLaVA weights...') 59 | if os.path.exists(os.path.join(model_path, 'non_lora_trainables.bin')): 60 | non_lora_trainables = torch.load(os.path.join(model_path, 'non_lora_trainables.bin'), map_location='cpu') 61 | else: 62 | # this is probably from HF Hub 63 | from huggingface_hub import hf_hub_download 64 | def load_from_hf(repo_id, filename, subfolder=None): 65 | cache_file = hf_hub_download( 66 | repo_id=repo_id, 67 | filename=filename, 68 | subfolder=subfolder) 69 | return torch.load(cache_file, map_location='cpu') 70 | non_lora_trainables = load_from_hf(model_path, 'non_lora_trainables.bin') 71 | non_lora_trainables = {(k[11:] if k.startswith('base_model.') else k): v for k, v in non_lora_trainables.items()} 72 | if any(k.startswith('model.model.') for k in non_lora_trainables): 73 | non_lora_trainables = {(k[6:] if k.startswith('model.') else k): v for k, v in non_lora_trainables.items()} 74 | model.load_state_dict(non_lora_trainables, strict=False) 75 | 76 | from peft import PeftModel 77 | print('Loading LoRA weights...') 78 | model = PeftModel.from_pretrained(model, model_path) 79 | print('Merging LoRA weights...') 80 | model = model.merge_and_unload() 81 | print('Model is loaded...') 82 | elif model_base is not None: 83 | # this may be mm projector only 84 | print('Loading LLaVA from base model...') 85 | if 'mpt' in model_name.lower(): 86 | if not os.path.isfile(os.path.join(model_path, 'configuration_mpt.py')): 87 | shutil.copyfile(os.path.join(model_base, 'configuration_mpt.py'), os.path.join(model_path, 'configuration_mpt.py')) 88 | tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=True) 89 | cfg_pretrained = AutoConfig.from_pretrained(model_path, trust_remote_code=True) 90 | model = LlavaMPTForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=cfg_pretrained, **kwargs) 91 | else: 92 | tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False) 93 | cfg_pretrained = AutoConfig.from_pretrained(model_path) 94 | model = LlavaLlamaForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=cfg_pretrained, **kwargs) 95 | 96 | mm_projector_weights = torch.load(os.path.join(model_path, 'mm_projector.bin'), map_location='cpu') 97 | mm_projector_weights = {k: v.to(torch.float16) for k, v in mm_projector_weights.items()} 98 | model.load_state_dict(mm_projector_weights, strict=False) 99 | else: 100 | if 'mpt' in model_name.lower(): 101 | tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True) 102 | model = LlavaMPTForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs) 103 | else: 104 | tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False) 105 | model = LlavaLlamaForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs) 106 | else: 107 | # Load language model 108 | if model_base is not None: 109 | # PEFT model 110 | from peft import PeftModel 111 | tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False) 112 | model = AutoModelForCausalLM.from_pretrained(model_base, torch_dtype=torch.float16, low_cpu_mem_usage=True, device_map="auto") 113 | print(f"Loading LoRA weights from {model_path}") 114 | model = PeftModel.from_pretrained(model, model_path) 115 | print(f"Merging weights") 116 | model = model.merge_and_unload() 117 | print('Convert to FP16...') 118 | model.to(torch.float16) 119 | else: 120 | use_fast = False 121 | if 'mpt' in model_name.lower(): 122 | tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True) 123 | model = AutoModelForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, trust_remote_code=True, **kwargs) 124 | else: 125 | tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False) 126 | model = AutoModelForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs) 127 | 128 | image_processor = None 129 | 130 | if 'llava' in model_name.lower(): 131 | mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False) 132 | mm_use_im_patch_token = getattr(model.config, "mm_use_im_patch_token", True) 133 | if mm_use_im_patch_token: 134 | tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True) 135 | if mm_use_im_start_end: 136 | tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True) 137 | model.resize_token_embeddings(len(tokenizer)) 138 | 139 | vision_tower = model.get_vision_tower() 140 | if not vision_tower.is_loaded: 141 | vision_tower.load_model() 142 | if load_bf16: 143 | vision_tower.to(device='cuda', dtype=torch.bfloat16) 144 | else: 145 | vision_tower.to(device='cuda', dtype=torch.float16) 146 | image_processor = vision_tower.image_processor 147 | 148 | if hasattr(model.config, "max_sequence_length"): 149 | context_len = model.config.max_sequence_length 150 | else: 151 | context_len = 2048 152 | 153 | return tokenizer, model, image_processor, context_len 154 | -------------------------------------------------------------------------------- /llava_setup/.gitignore: -------------------------------------------------------------------------------- 1 | LLaVA -------------------------------------------------------------------------------- /llava_setup/README.md: -------------------------------------------------------------------------------- 1 | # Install LLaVA 2 | 3 | We use LLaVA version `6cea223` for training the SFT and RLHF models. 4 | 5 | ## Apply the custom patch 6 | 7 | ```bash 8 | git clone https://github.com/haotian-liu/LLaVA.git 9 | 10 | cd LLaVA 11 | 12 | git reset --hard 6cea223 13 | 14 | git apply < ../fix_llava_padding.patch 15 | ``` 16 | 17 | ## Install LLaVA 18 | 19 | Next, please follow the instructions in the [original repository](https://github.com/haotian-liu/LLaVA/tree/6cea223532a7ab7bda8116336c59772faccdcbca#install) to install LLaVA. 20 | 21 | ## Update Packages 22 | 23 | Finally, please update the following packages: 24 | 25 | ```bash 26 | pip install torch==2.0.1+cu118 torchvision==0.15.2+cu118 torchaudio==2.0.2 --index-url https://download.pytorch.org/whl/cu118 27 | pip install deepspeed==0.9.3 28 | pip install peft==0.4.0 29 | pip install transformers==4.31.0 30 | pip install bitsandbytes==0.41.0 31 | pip install datasets 32 | ``` 33 | 34 | **Note:** please install Pytorch 2.0.1 following the guidelines [here](https://pytorch.org/get-started/previous-versions/#v201). We found that the flash-attention implementation in the newest Pytorch Stable (2.1.0) could lead to buggy results. The codebase is tested with `torch==2.0.1+cu118`. 35 | --------------------------------------------------------------------------------