├── GPT-4V ├── GPT4V_Seed.py ├── GPT4V_Whoops.py ├── GPT4V_mmbench.py ├── GPT4V_wino.py └── Sphinx_bench.py ├── InstructBLIP-13b ├── InstructBLIP_MMBench.py ├── InstructBLIP_Seed.py ├── InstructBLIP_Whoops.py └── InstructBLIP_wino.py ├── LICENSE ├── LLaVA-1.5-13b ├── LLaVA_MMBench.py ├── LLaVA_SEED.py ├── LLaVA_Whoops.py ├── LLaVA_bench.py ├── LLaVA_wino.py └── llava_seed.sh ├── README.md ├── Sphinx ├── Sphinx_SEED.py ├── Sphinx_Whoops.py ├── Sphinx_mmbench.py └── Sphinx_wino.py ├── data ├── filter_qs.py └── llava-seed-bench-filtered.jsonl ├── eval_winoground.py ├── images ├── fig1_v7.png └── fig2_v8.png └── parsed_winoground.jsonl /GPT-4V/GPT4V_Seed.py: -------------------------------------------------------------------------------- 1 | from PIL import Image 2 | import base64 3 | import requests 4 | import json 5 | from tqdm import tqdm 6 | from openai import OpenAI 7 | import json 8 | 9 | image_file = '' # Image Path 10 | question_path = "" #Question Path 11 | result_path = "" #File to store result 12 | result_file = open(result_path, 'w') 13 | api_key='' #Openai api key 14 | 15 | sgPrompt=''' 16 | For the provided image and its associated question, generate only a scene graph in JSON format that includes the following: 17 | 1. Objects that are relevant to answering the question 18 | 2. Object attributes that are relevant to answering the question 19 | 3. Object relationships that are relevant to answering the question 20 | ''' 21 | 22 | 23 | def encode_image(image_path): 24 | with open(image_path, "rb") as image_file: 25 | return base64.b64encode(image_file.read()).decode('utf-8') 26 | 27 | 28 | def create_payload(cur_image, cur_text): 29 | payload = { 30 | "model": "gpt-4-vision-preview", 31 | "messages": [ 32 | { 33 | "role": "user", 34 | "content": [ 35 | { 36 | "type": "image_url", 37 | "image_url": { 38 | "url": f"data:image/jpeg;base64,{cur_image}" 39 | } 40 | }, 41 | { 42 | "type": "text", 43 | "text": cur_text 44 | } 45 | ] 46 | } 47 | ], 48 | "max_tokens": 512, 49 | "temperature":0 50 | } 51 | return payload 52 | 53 | 54 | with open(question_path, 'r') as json_file: 55 | json_list = list(json_file) 56 | 57 | headers = { 58 | "Content-Type": "application/json", 59 | "Authorization": f"Bearer {api_key}" 60 | } 61 | 62 | 63 | for json_str in tqdm(json_list): 64 | is_done = False 65 | fail_count = 0 66 | 67 | while not is_done: 68 | try: 69 | result = json.loads(json_str) 70 | 71 | 72 | cur_image = encode_image(image_file + result["image"]) 73 | cur_question = result["text"] 74 | cur_id = result["question_id"] 75 | 76 | payload = create_payload(cur_image, "Question: " + cur_question.split("?")[0] + "?" + sgPrompt) 77 | response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload) 78 | cur_sg = response.json()["choices"][0]["message"]["content"] 79 | 80 | new_q = f"Scene Graph: {cur_sg}. 
Use the image and scene graph as context and answer the following question: {cur_question}" 81 | payload = create_payload(cur_image, new_q) 82 | 83 | response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload) 84 | final_ans = response.json()["choices"][0]["message"]["content"] 85 | 86 | 87 | temp_result = {"question_id":cur_id, "text":final_ans} 88 | result_file.write(json.dumps(temp_result) + "\n") 89 | is_done = True 90 | except: 91 | fail_count += 1 92 | if fail_count == 5: 93 | break 94 | is_done = False 95 | 96 | result_file.close() 97 | -------------------------------------------------------------------------------- /GPT-4V/GPT4V_Whoops.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import os 4 | import json 5 | from tqdm import tqdm 6 | import shortuuid 7 | import requests 8 | from PIL import Image 9 | import math 10 | from datasets import load_dataset 11 | import base64 12 | from io import BytesIO 13 | 14 | 15 | result_path="" ##path to store result 16 | api_key="" #Openai api key 17 | hf_key="" #Huggingface auth key 18 | 19 | 20 | sgPrompt=''' 21 | For the provided image and its associated question, generate only a scene graph in JSON format that includes the following: 22 | 1. Objects that are relevant to answering the question 23 | 2. Object attributes that are relevant to answering the question 24 | 3. Object relationships that are relevant to answering the question 25 | ''' 26 | answerPrompt=".Use the image and scene graph as context and answer the following question: " 27 | 28 | 29 | def get_ans(question, image_tensor, pred_sg=None): 30 | if pred_sg is None: 31 | cur_prompt = question + sgPrompt 32 | max_tokens = 512 33 | else: 34 | cur_prompt = "Scene Graph: " + pred_sg + answerPrompt + question 35 | max_tokens = 128 36 | 37 | buffered = BytesIO() 38 | image_tensor.convert('RGB').save(buffered, format="JPEG") 39 | img_str = base64.b64encode(buffered.getvalue()).decode('utf-8') 40 | 41 | 42 | headers = { 43 | "Content-Type": "application/json", 44 | "Authorization": f"Bearer {api_key}" 45 | } 46 | 47 | payload = { 48 | "model": "gpt-4-vision-preview", 49 | "messages": [ 50 | { 51 | "role": "user", 52 | "content": [ 53 | { 54 | "type": "image_url", 55 | "image_url": { 56 | "url": f"data:image/jpeg;base64,{img_str}" 57 | } 58 | }, 59 | { 60 | "type": "text", 61 | "text": cur_prompt 62 | } 63 | ] 64 | } 65 | ], 66 | "max_tokens": max_tokens, 67 | "temperature":0 68 | } 69 | 70 | response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload) 71 | cur_ans = response.json()["choices"][0]["message"]["content"] 72 | return cur_ans 73 | 74 | 75 | result_file = open(result_path, "w") 76 | examples = load_dataset('nlphuji/whoops', use_auth_token=hf_key) 77 | for item in tqdm(examples["test"]): 78 | 79 | is_done = False 80 | fail_count = 0 81 | 82 | while not is_done: 83 | try: 84 | image = item["image"] 85 | 86 | all_pred = [] 87 | for q_a in item["question_answering_pairs"]: 88 | 89 | question = q_a[0] 90 | pred_sg = get_ans(question, image) 91 | pred_ans = get_ans(question, image, pred_sg) 92 | all_pred.append(pred_ans) 93 | 94 | 95 | result_file.write(json.dumps({ "image_id": item["image_id"], 96 | "question_answering_pairs": item["question_answering_pairs"], 97 | "prediction": all_pred}) + "\n") 98 | is_done = True 99 | except: 100 | fail_count += 1 101 | if fail_count == 5: 102 | break 103 | is_done = False 104 | 105 | 
result_file.close() 106 | -------------------------------------------------------------------------------- /GPT-4V/GPT4V_mmbench.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import pandas as pd 4 | from tqdm import tqdm 5 | from PIL import Image 6 | import math 7 | import requests 8 | 9 | 10 | api_key='' #Openai api key 11 | question_path="" #MMbench tsv file path 12 | result_path="" #Path to store result 13 | 14 | sgPrompt=''' 15 | For the provided image and its associated question, generate only a scene graph in JSON format that includes the following: 16 | 1. Objects that are relevant to answering the question 17 | 2. Object attributes that are relevant to answering the question 18 | 3. Object relationships that are relevant to answering the question 19 | ''' 20 | answerPrompt="Use the image and scene graph as context and answer the following question: " 21 | all_options = ['A', 'B', 'C', 'D'] 22 | 23 | 24 | def create_payload(cur_image, cur_text): 25 | payload = { 26 | "model": "gpt-4-vision-preview", 27 | "messages": [ 28 | { 29 | "role": "user", 30 | "content": [ 31 | { 32 | "type": "image_url", 33 | "image_url": { 34 | "url": f"data:image/jpeg;base64,{cur_image}" 35 | } 36 | }, 37 | { 38 | "type": "text", 39 | "text": cur_text 40 | } 41 | ] 42 | } 43 | ], 44 | "max_tokens": 512, 45 | "temperature":0 46 | } 47 | return payload 48 | 49 | 50 | def split_list(lst, n): 51 | """Split a list into n (roughly) equal-sized chunks""" 52 | chunk_size = math.ceil(len(lst) / n) # integer division 53 | return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)] 54 | 55 | 56 | def get_chunk(lst, n, k): 57 | chunks = split_list(lst, n) 58 | return chunks[k] 59 | 60 | 61 | def is_none(value): 62 | if value is None: 63 | return True 64 | if type(value) is float and math.isnan(value): 65 | return True 66 | if type(value) is str and value.lower() == 'nan': 67 | return True 68 | if type(value) is str and value.lower() == 'none': 69 | return True 70 | return False 71 | 72 | def get_options(row, options): 73 | parsed_options = [] 74 | for option in options: 75 | option_value = row[option] 76 | if is_none(option_value): 77 | break 78 | parsed_options.append(option_value) 79 | return parsed_options 80 | 81 | headers = { 82 | "Content-Type": "application/json", 83 | "Authorization": f"Bearer {api_key}" 84 | } 85 | 86 | questions = pd.read_table(question_path) 87 | questions = get_chunk(questions, 1, 0) 88 | result_file = open(result_path, "w") 89 | 90 | 91 | for index, row in tqdm(questions.iterrows(), total=len(questions)): 92 | is_done = False 93 | fail_count = 0 94 | 95 | while not is_done: 96 | try: 97 | options = get_options(row, all_options) 98 | cur_option_char = all_options[:len(options)] 99 | 100 | 101 | idx = row['index'] 102 | question = row["question"] 103 | image = row['image'] 104 | 105 | 106 | for option_char, option in zip(all_options[:len(options)], options): 107 | question = question + '\n' + option_char + '. ' + option 108 | qs = question 109 | 110 | payload = create_payload(image, qs + sgPrompt) 111 | response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload) 112 | cur_sg = response.json()["choices"][0]["message"]["content"] 113 | 114 | 115 | new_q = f"Scene Graph: {cur_sg}\n\n{answerPrompt}{qs}\nAnswer with the option's letter from the given choices directly." 
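# Second GPT-4V call: the scene graph generated above is prepended to the question,
# and the model is asked to reply with just the option letter. Note that `image` comes
# straight from the MMBench TSV, which stores each image as a base64 string (the other
# scripts in this repo decode it with load_image_from_base64), so it can be dropped
# into the data URL without re-encoding.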
116 | payload = create_payload(image, new_q) 117 | response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload) 118 | final_ans = response.json()["choices"][0]["message"]["content"] 119 | 120 | 121 | temp_result = {"question_id":idx, "text":final_ans} 122 | result_file.write(json.dumps(temp_result) + "\n") 123 | 124 | 125 | is_done = True 126 | # rotate options 127 | options = options[1:] + options[:1] 128 | cur_option_char = cur_option_char[1:] + cur_option_char[:1] 129 | except: 130 | fail_count += 1 131 | if fail_count == 5: 132 | break 133 | is_done = False 134 | result_file.close() 135 | -------------------------------------------------------------------------------- /GPT-4V/GPT4V_wino.py: -------------------------------------------------------------------------------- 1 | from PIL import Image 2 | import base64 3 | import requests 4 | import json 5 | from tqdm import tqdm 6 | from openai import OpenAI 7 | import json 8 | 9 | image_file = "" # Image Path 10 | question_path = "" #Path to question 11 | result_path = "" #Path to store result 12 | result_file = open(result_path, 'w') 13 | api_key="" #Openai api key 14 | 15 | sgPrompt=''' 16 | For the provided image and its associated question, generate only a scene graph in JSON format that includes the following: 17 | 1. Objects that are relevant to answering the question 18 | 2. Object attributes that are relevant to answering the question 19 | 3. Object relationships that are relevant to answering the question 20 | ''' 21 | answerPrompt=".Use the image and scene graph as context and answer the following question: " 22 | 23 | def create_payload(cur_image, cur_text): 24 | payload = { 25 | "model": "gpt-4-vision-preview", 26 | "messages": [ 27 | { 28 | "role": "user", 29 | "content": [ 30 | { 31 | "type": "image_url", 32 | "image_url": { 33 | "url": f"data:image/jpeg;base64,{cur_image}" 34 | } 35 | }, 36 | { 37 | "type": "text", 38 | "text": cur_text 39 | } 40 | ] 41 | } 42 | ], 43 | "max_tokens": 512, 44 | "temperature":0 45 | } 46 | return payload 47 | 48 | 49 | def encode_image(image_path): 50 | with open(image_path, "rb") as image_file: 51 | return base64.b64encode(image_file.read()).decode('utf-8') 52 | 53 | 54 | headers = { 55 | "Content-Type": "application/json", 56 | "Authorization": f"Bearer {api_key}" 57 | } 58 | 59 | 60 | with open(question_path, 'r') as json_file: 61 | json_list = list(json_file) 62 | 63 | 64 | for json_str in tqdm(json_list): 65 | is_done = False 66 | fail_count = 0 67 | 68 | while not is_done: 69 | try: 70 | result = json.loads(json_str) 71 | cur_image = encode_image(image_file + result["image"] + ".png") 72 | cur_caption = result["caption"] 73 | cur_question = f"Does the given caption accurately describe the given image? Caption:{cur_caption}\n\n{sgPrompt}" 74 | 75 | payload = create_payload(cur_image, cur_question) 76 | response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload) 77 | cur_sg = response.json()["choices"][0]["message"]["content"] 78 | 79 | 80 | new_q = f"Question: Does the given caption accurately describe the given image? 
Caption:{cur_caption}.\n\nAnswer: Scene graph:{cur_sg}\n\nUse the scene graph and image to reason and answer the question:" 81 | payload = create_payload(cur_image, new_q) 82 | response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload) 83 | final_ans = response.json()["choices"][0]["message"]["content"] 84 | 85 | 86 | temp_result = {"text":final_ans} 87 | result_file.write(json.dumps(temp_result) + "\n") 88 | is_done = True 89 | except: 90 | fail_count += 1 91 | if fail_count == 5: 92 | break 93 | is_done = False 94 | result_file.close() 95 | -------------------------------------------------------------------------------- /GPT-4V/Sphinx_bench.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import os 4 | import json 5 | from tqdm import tqdm 6 | import shortuuid 7 | 8 | # from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN 9 | # from llava.conversation import conv_templates, SeparatorStyle 10 | # from llava.model.builder import load_pretrained_model 11 | # from llava.utils import disable_torch_init 12 | # from llava.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria 13 | 14 | from PIL import Image 15 | from SPHINX import SPHINXModel 16 | import math 17 | 18 | answerPrompt="Use the image and scene graph as context to improve the detail and clarity of the original answer: " 19 | 20 | sgPrompt=''' 21 | 22 | For the provided image and question-answer pair, generate a scene graph in JSON format to improve the quality and/or detail of the answer. The scene graph can include the following: 23 | 1. Objects that are relevant to answering the question. 24 | 2. Object attributes that are relevant to answering the question. 25 | 3. Object relationships that are relevant to answering the question. 
26 | 27 | Scene Graph: 28 | ''' 29 | 30 | def split_list(lst, n): 31 | """Split a list into n (roughly) equal-sized chunks""" 32 | chunk_size = math.ceil(len(lst) / n) # integer division 33 | return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)] 34 | 35 | 36 | def get_chunk(lst, n, k): 37 | chunks = split_list(lst, n) 38 | return chunks[k] 39 | 40 | 41 | def eval_model(args): 42 | # Model 43 | # disable_torch_init() 44 | # model_path = os.path.expanduser(args.model_path) 45 | # model_name = get_model_name_from_path(model_path) 46 | # tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name) 47 | 48 | model = SPHINXModel.from_pretrained(pretrained_path="/home/chancharikm/compVL/LLaMA2-Accessory/SPHINX/SPHINX-v2-1k-weights", with_visual=True).to(device='cuda') 49 | model_name = "Sphinx-v2-1k" 50 | 51 | questions = [json.loads(q) for q in open(os.path.expanduser(args.question_file), "r")] 52 | questions = get_chunk(questions, args.num_chunks, args.chunk_idx) 53 | answers_file = os.path.expanduser(args.answers_file) 54 | os.makedirs(os.path.dirname(answers_file), exist_ok=True) 55 | ans_file = open(answers_file, "w") 56 | for line in tqdm(questions): 57 | idx = line["question_id"] 58 | image_file = line["image"] 59 | 60 | #----ZS Ans generation------ 61 | qs = line["text"] 62 | # cur_prompt = qs 63 | # if model.config.mm_use_im_start_end: 64 | # qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs 65 | # else: 66 | # qs = DEFAULT_IMAGE_TOKEN + '\n' + qs 67 | 68 | # conv = conv_templates[args.conv_mode].copy() 69 | # conv.append_message(conv.roles[0], qs) 70 | # conv.append_message(conv.roles[1], None) 71 | # prompt = conv.get_prompt() 72 | 73 | # input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() 74 | 75 | image = Image.open(os.path.join(args.image_folder, image_file)).convert('RGB') 76 | # image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0] 77 | 78 | # stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2 79 | # keywords = [stop_str] 80 | # stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids) 81 | 82 | # with torch.inference_mode(): 83 | # output_ids = model.generate( 84 | # input_ids, 85 | # images=image_tensor.unsqueeze(0).half().cuda(), 86 | # do_sample=True if args.temperature > 0 else False, 87 | # temperature=args.temperature, 88 | # top_p=args.top_p, 89 | # num_beams=args.num_beams, 90 | # # no_repeat_ngram_size=3, 91 | # max_new_tokens=1024, 92 | # use_cache=True) 93 | with torch.cuda.amp.autocast(dtype=torch.float16): 94 | outputs = model.generate_reponse( #No, this is not a typo. This seems to be typo inherited from the Sphinx codebase - let them know via Github! 
95 | [[qs, None]], 96 | image, 97 | temperature=args.temperature, 98 | top_p=args.top_p, 99 | max_gen_len=1024, 100 | seed=0) 101 | outputs = outputs.strip() 102 | 103 | # input_token_len = input_ids.shape[1] 104 | # n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() 105 | # if n_diff_input_output > 0: 106 | # print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 107 | # outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 108 | # outputs = outputs.strip() 109 | # if outputs.endswith(stop_str): 110 | # outputs = outputs[:-len(stop_str)] 111 | # outputs = outputs.strip() 112 | 113 | #-----SG Generation------- 114 | og_ans = outputs 115 | 116 | qs = "Question: " + line["text"] + "\nAnswer: " + og_ans + "\n\n" + sgPrompt 117 | 118 | # cur_prompt = qs 119 | # if model.config.mm_use_im_start_end: 120 | # qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs 121 | # else: 122 | # qs = DEFAULT_IMAGE_TOKEN + '\n' + qs 123 | 124 | # conv = conv_templates[args.conv_mode].copy() 125 | # conv.append_message(conv.roles[0], qs) 126 | # conv.append_message(conv.roles[1], None) 127 | # prompt = conv.get_prompt() 128 | 129 | # input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() 130 | 131 | #image = Image.open(os.path.join(args.image_folder, image_file)) 132 | # image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0] 133 | 134 | # stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2 135 | # keywords = [stop_str] 136 | # stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids) 137 | 138 | # with torch.inference_mode(): 139 | # output_ids = model.generate( 140 | # input_ids, 141 | # images=image_tensor.unsqueeze(0).half().cuda(), 142 | # do_sample=True if args.temperature > 0 else False, 143 | # temperature=args.temperature, 144 | # top_p=args.top_p, 145 | # num_beams=args.num_beams, 146 | # # no_repeat_ngram_size=3, 147 | # max_new_tokens=1024, 148 | # use_cache=True) 149 | 150 | # input_token_len = input_ids.shape[1] 151 | # n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() 152 | # if n_diff_input_output > 0: 153 | # print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 154 | # outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 155 | # outputs = outputs.strip() 156 | # if outputs.endswith(stop_str): 157 | # outputs = outputs[:-len(stop_str)] 158 | # outputs = outputs.strip() 159 | with torch.cuda.amp.autocast(dtype=torch.float16): 160 | outputs = model.generate_reponse( #No, this is not a typo. This seems to be typo inherited from the Sphinx codebase - let them know via Github! 
161 | [[qs, None]], 162 | image, 163 | temperature=args.temperature, 164 | top_p=args.top_p, 165 | max_gen_len=256, 166 | seed=0) 167 | outputs = outputs.strip() 168 | 169 | sg = outputs 170 | #----Improved Answer----- 171 | qs = answerPrompt + "\n\nScene Graph: " + sg + "\n\nQuestion: " + line["text"] + "\nOriginal Answer: " + og_ans + "\nImproved Answer: " 172 | cur_prompt = qs 173 | # cur_prompt = qs 174 | # if model.config.mm_use_im_start_end: 175 | # qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs 176 | # else: 177 | # qs = DEFAULT_IMAGE_TOKEN + '\n' + qs 178 | 179 | # conv = conv_templates[args.conv_mode].copy() 180 | # conv.append_message(conv.roles[0], qs) 181 | # conv.append_message(conv.roles[1], None) 182 | # prompt = conv.get_prompt() 183 | 184 | # input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() 185 | 186 | #image = Image.open(os.path.join(args.image_folder, image_file)) 187 | # image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0] 188 | 189 | # stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2 190 | # keywords = [stop_str] 191 | # stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids) 192 | 193 | # with torch.inference_mode(): 194 | # output_ids = model.generate( 195 | # input_ids, 196 | # images=image_tensor.unsqueeze(0).half().cuda(), 197 | # do_sample=True if args.temperature > 0 else False, 198 | # temperature=args.temperature, 199 | # top_p=args.top_p, 200 | # num_beams=args.num_beams, 201 | # # no_repeat_ngram_size=3, 202 | # max_new_tokens=1024, 203 | # use_cache=True) 204 | 205 | # input_token_len = input_ids.shape[1] 206 | # n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() 207 | # if n_diff_input_output > 0: 208 | # print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 209 | # outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 210 | # outputs = outputs.strip() 211 | # if outputs.endswith(stop_str): 212 | # outputs = outputs[:-len(stop_str)] 213 | # outputs = outputs.strip() 214 | 215 | with torch.cuda.amp.autocast(dtype=torch.float16): 216 | outputs = model.generate_reponse( #No, this is not a typo. This seems to be typo inherited from the Sphinx codebase - let them know via Github! 
217 | [[qs, None]], 218 | image, 219 | temperature=args.temperature, 220 | top_p=args.top_p, 221 | max_gen_len=1024, 222 | seed=0) 223 | outputs = outputs.strip() 224 | ans_id = shortuuid.uuid() 225 | ans_file.write(json.dumps({"question_id": idx, 226 | "prompt": cur_prompt, 227 | "text": outputs, 228 | "answer_id": ans_id, 229 | "model_id": model_name, 230 | "metadata": {}}) + "\n") 231 | ans_file.flush() 232 | ans_file.close() 233 | 234 | if __name__ == "__main__": 235 | parser = argparse.ArgumentParser() 236 | parser.add_argument("--model-path", type=str, default="facebook/opt-350m") 237 | parser.add_argument("--model-base", type=str, default=None) 238 | parser.add_argument("--image-folder", type=str, default="") 239 | parser.add_argument("--question-file", type=str, default="tables/question.jsonl") 240 | parser.add_argument("--answers-file", type=str, default="answer.jsonl") 241 | parser.add_argument("--conv-mode", type=str, default="llava_v1") 242 | parser.add_argument("--num-chunks", type=int, default=1) 243 | parser.add_argument("--chunk-idx", type=int, default=0) 244 | parser.add_argument("--temperature", type=float, default=0.2) 245 | parser.add_argument("--top_p", type=float, default=None) 246 | parser.add_argument("--num_beams", type=int, default=1) 247 | args = parser.parse_args() 248 | 249 | eval_model(args) 250 | -------------------------------------------------------------------------------- /InstructBLIP-13b/InstructBLIP_MMBench.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import os 4 | import json 5 | import pandas as pd 6 | from tqdm import tqdm 7 | import shortuuid 8 | 9 | from llava.utils import disable_torch_init 10 | from llava.mm_utils import load_image_from_base64 11 | from PIL import Image 12 | import math 13 | from transformers import InstructBlipProcessor, InstructBlipForConditionalGeneration, InstructBlipConfig, AutoModelForVision2Seq 14 | import torch 15 | from PIL import Image 16 | import requests 17 | from accelerate import init_empty_weights, infer_auto_device_map 18 | import json 19 | import os 20 | 21 | 22 | processor = InstructBlipProcessor.from_pretrained("Salesforce/instructblip-vicuna-13b", device_map="auto") 23 | model = InstructBlipForConditionalGeneration.from_pretrained("Salesforce/instructblip-vicuna-13b") 24 | model.to("cuda:1") 25 | 26 | 27 | answerPrompt="Use the image and scene graph as context and answer the following question: " 28 | 29 | sgPrompt=''' 30 | For the provided image and its associated question, generate a scene graph in JSON format that includes the following: 31 | 1. Objects that are relevant to answering the question 32 | 2. Object attributes that are relevant to answering the question 33 | 3. 
Object relationships that are relevant to answering the question 34 | 35 | Scene Graph: 36 | ''' 37 | 38 | all_options = ['A', 'B', 'C', 'D'] 39 | 40 | 41 | def split_list(lst, n): 42 | """Split a list into n (roughly) equal-sized chunks""" 43 | chunk_size = math.ceil(len(lst) / n) # integer division 44 | return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)] 45 | 46 | 47 | def get_chunk(lst, n, k): 48 | chunks = split_list(lst, n) 49 | return chunks[k] 50 | 51 | 52 | def is_none(value): 53 | if value is None: 54 | return True 55 | if type(value) is float and math.isnan(value): 56 | return True 57 | if type(value) is str and value.lower() == 'nan': 58 | return True 59 | if type(value) is str and value.lower() == 'none': 60 | return True 61 | return False 62 | 63 | def get_options(row, options): 64 | parsed_options = [] 65 | for option in options: 66 | option_value = row[option] 67 | if is_none(option_value): 68 | break 69 | parsed_options.append(option_value) 70 | return parsed_options 71 | 72 | 73 | def eval_model(args): 74 | # Model 75 | disable_torch_init() 76 | 77 | questions = pd.read_table(os.path.expanduser(args.question_file)) 78 | questions = get_chunk(questions, args.num_chunks, args.chunk_idx) 79 | answers_file = os.path.expanduser(args.answers_file) 80 | os.makedirs(os.path.dirname(answers_file), exist_ok=True) 81 | ans_file = open(answers_file, "w") 82 | 83 | for index, row in tqdm(questions.iterrows(), total=len(questions)): 84 | options = get_options(row, all_options) 85 | cur_option_char = all_options[:len(options)] 86 | 87 | if args.all_rounds: 88 | num_rounds = len(options) 89 | else: 90 | num_rounds = 1 91 | 92 | for round_idx in range(num_rounds): 93 | idx = row['index'] 94 | 95 | question = row["question"] 96 | 97 | image = load_image_from_base64(row['image']) 98 | 99 | for option_char, option in zip(all_options[:len(options)], options): 100 | question = question + '\n' + option_char + '. ' + option 101 | qs = cur_prompt = question 102 | 103 | 104 | if args.single_pred_prompt: 105 | if args.lang == 'cn': 106 | qs = qs + '\n' + "请直接回答选项字母。" 107 | else: 108 | 109 | qs = qs + sgPrompt 110 | 111 | 112 | prompt = " " + qs 113 | inputs = processor(images=image, text=prompt, return_tensors="pt").to("cuda:1") 114 | outputs = model.generate( 115 | **inputs, 116 | do_sample=False, 117 | num_beams=5, 118 | max_length=256, 119 | min_length=1, 120 | top_p=0.9, 121 | repetition_penalty=1.5, 122 | length_penalty=0.5, 123 | temperature=1, 124 | ) 125 | generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip() 126 | 127 | 128 | prompt_score = "" + " Scene Graph: " + generated_text + '\n\n' + answerPrompt + cur_prompt + ". 
The correct letter is" 129 | inputs2 = processor(images=image, text=prompt_score, return_tensors="pt").to("cuda:1") 130 | outputs2 = model.generate( 131 | **inputs2, 132 | do_sample=False, 133 | num_beams=5, 134 | max_length=256, 135 | min_length=1, 136 | top_p=0.9, 137 | repetition_penalty=1.5, 138 | length_penalty=0.5, 139 | temperature=1, 140 | ) 141 | 142 | generated_text = processor.batch_decode(outputs2, skip_special_tokens=True)[0].strip() 143 | 144 | ans_file.write(json.dumps({"question_id": idx, 145 | "round_id": round_idx, 146 | "prompt": cur_prompt, 147 | "text": generated_text, 148 | "options": options, 149 | "option_char": cur_option_char, 150 | "metadata": {}}) + "\n") 151 | ans_file.flush() 152 | 153 | # rotate options 154 | options = options[1:] + options[:1] 155 | cur_option_char = cur_option_char[1:] + cur_option_char[:1] 156 | ans_file.close() 157 | 158 | if __name__ == "__main__": 159 | parser = argparse.ArgumentParser() 160 | parser.add_argument("--model-path", type=str, default="facebook/opt-350m") 161 | parser.add_argument("--model-base", type=str, default=None) 162 | parser.add_argument("--image-folder", type=str, default="") 163 | parser.add_argument("--question-file", type=str, default="") 164 | parser.add_argument("--answers-file", type=str, default="") 165 | parser.add_argument("--conv-mode", type=str, default="llava_v1") 166 | parser.add_argument("--num-chunks", type=int, default=1) 167 | parser.add_argument("--chunk-idx", type=int, default=0) 168 | parser.add_argument("--temperature", type=float, default=0.2) 169 | parser.add_argument("--top_p", type=float, default=None) 170 | parser.add_argument("--num_beams", type=int, default=1) 171 | parser.add_argument("--all-rounds", action="store_true") 172 | parser.add_argument("--single-pred-prompt", action="store_true") 173 | parser.add_argument("--lang", type=str, default="en") 174 | 175 | args = parser.parse_args() 176 | 177 | eval_model(args) 178 | -------------------------------------------------------------------------------- /InstructBLIP-13b/InstructBLIP_Seed.py: -------------------------------------------------------------------------------- 1 | from transformers import InstructBlipProcessor, InstructBlipForConditionalGeneration, InstructBlipConfig, AutoModelForVision2Seq 2 | import torch 3 | from PIL import Image 4 | import requests 5 | from accelerate import init_empty_weights, infer_auto_device_map 6 | import json 7 | import os 8 | from tqdm import tqdm 9 | 10 | 11 | # Determine if CUDA (GPU) is available. 12 | device = "cuda" if torch.cuda.is_available() else "cpu" 13 | 14 | 15 | # Load the model configuration. 16 | config = InstructBlipConfig.from_pretrained("Salesforce/instructblip-vicuna-13b") 17 | 18 | # Initialize the model with the given configuration. 19 | with init_empty_weights(): 20 | 21 | model = AutoModelForVision2Seq.from_config(config) 22 | model.tie_weights() 23 | 24 | # Infer device map based on the available resources. 25 | device_map = infer_auto_device_map(model, max_memory={7: "20GiB", 8: "20GiB", 9: "20GiB"}, 26 | no_split_module_classes=['InstructBlipEncoderLayer', 'InstructBlipQFormerLayer', 27 | 'LlamaDecoderLayer']) 28 | 29 | device_map['language_model.lm_head'] = device_map['language_projection'] = device_map[('language_model.model' 30 | '.embed_tokens')] 31 | 32 | offload = "" 33 | # Load the processor and model for image processing. 
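# Note: the model is built twice -- once above on empty ("meta") weights purely to
# infer a device map, and again below with real weights placed according to that map.
# `offload` is left as an empty string here; if any layers end up offloaded to disk,
# it should point to a writable folder (hypothetical example: offload = "./offload").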
34 | processor = InstructBlipProcessor.from_pretrained("Salesforce/instructblip-vicuna-13b", device_map="auto") 35 | model = InstructBlipForConditionalGeneration.from_pretrained("Salesforce/instructblip-vicuna-13b", 36 | device_map=device_map, 37 | offload_folder=offload, offload_state_dict=True) 38 | 39 | 40 | sgPrompt=''' 41 | For the provided image and its associated question, generate a scene graph in JSON format that includes the following: 42 | 1. Objects that are relevant to answering the question. 43 | 2. Object attributes that are relevant to answering the question. 44 | 3. Object relationships that are relevant to answering the question. 45 | 46 | Scene Graph: 47 | ''' 48 | 49 | 50 | qs_path = "" #Path to question 51 | ans_path = "" #Path to store result 52 | img_dir = "" #Path to image 53 | ans_file = open(ans_path, 'w') 54 | 55 | 56 | with open(qs_path, 'r') as json_file: 57 | json_list = list(json_file) 58 | 59 | 60 | count = 0 61 | for json_str in tqdm(json_list): 62 | result = json.loads(json_str) 63 | try: 64 | cur_image = img_dir + result["image"] 65 | image = Image.open(cur_image).convert("RGB") 66 | prompt = " " + result["text"].split("?")[0] + "?" + sgPrompt 67 | 68 | 69 | inputs = processor(images=image, text=prompt, return_tensors="pt").to("cuda") 70 | outputs = model.generate( 71 | **inputs, 72 | do_sample=False, 73 | num_beams=5, 74 | max_length=256, 75 | min_length=1, 76 | top_p=0.9, 77 | repetition_penalty=1.5, 78 | length_penalty=0.5, 79 | temperature=0, 80 | ) 81 | generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip() 82 | 83 | 84 | answerPrompt="Use the image and scene graph as context and answer the following question: " 85 | prompt_score = " Scene Graph: " + generated_text + '\n\n' + answerPrompt + result["text"] + ". The correct letter is" 86 | inputs2 = processor(images=image, text=prompt_score, return_tensors="pt").to("cuda") 87 | outputs2 = model.generate( 88 | **inputs2, 89 | do_sample=False, 90 | num_beams=5, 91 | max_length=256, 92 | min_length=1, 93 | top_p=0.9, 94 | repetition_penalty=1.5, 95 | length_penalty=0.5, 96 | temperature=0, 97 | ) 98 | generated_text = processor.batch_decode(outputs2, skip_special_tokens=True)[0].strip() 99 | except: 100 | generated_text = "None" 101 | 102 | temp_result = {"question_id":result["question_id"], "text":generated_text} 103 | ans_file.write(json.dumps(temp_result) + "\n") 104 | ans_file.close() 105 | -------------------------------------------------------------------------------- /InstructBLIP-13b/InstructBLIP_Whoops.py: -------------------------------------------------------------------------------- 1 | from transformers import InstructBlipProcessor, InstructBlipForConditionalGeneration, InstructBlipConfig, AutoModelForVision2Seq 2 | import torch 3 | from PIL import Image 4 | import requests 5 | from accelerate import init_empty_weights, infer_auto_device_map 6 | import json 7 | import os 8 | from tqdm import tqdm 9 | from datasets import load_dataset 10 | 11 | 12 | hf_key="" #Huggingface auth key 13 | result_path="" #Path to store result 14 | 15 | sgPrompt=''' 16 | For the provided image and its associated question, generate a scene graph in JSON format that includes the following: 17 | 1. Objects that are relevant to answering the question. 18 | 2. Object attributes that are relevant to answering the question. 19 | 3. Object relationships that are relevant to answering the question. 20 | 21 | Scene Graph: 22 | ''' 23 | 24 | 25 | # Determine if CUDA (GPU) is available. 
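# Pipeline sketch (two InstructBLIP passes per question, as in the other scripts):
# 1) get_ans(question, image) below prompts for a JSON scene graph (up to 256 tokens);
# 2) get_ans(question, image, pred_sg) feeds that scene graph back and asks for a short
#    answer (up to 64 tokens).
# The `device` variable below appears unused: inputs are moved with `model.device`
# and weight placement is handled by `device_map="auto"`.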
26 | device = "cuda" if torch.cuda.is_available() else "cpu" 27 | 28 | 29 | # Load the model configuration. 30 | config = InstructBlipConfig.from_pretrained("Salesforce/instructblip-vicuna-13b") 31 | 32 | # Initialize the model with the given configuration. 33 | with init_empty_weights(): 34 | 35 | model = AutoModelForVision2Seq.from_config(config) 36 | model.tie_weights() 37 | 38 | # Load the processor and model for image processing. 39 | processor = InstructBlipProcessor.from_pretrained("Salesforce/instructblip-vicuna-13b", device_map="auto") 40 | model = InstructBlipForConditionalGeneration.from_pretrained("Salesforce/instructblip-vicuna-13b", 41 | device_map="auto") 42 | 43 | 44 | def get_ans(question, image_tensor, pred_sg=None): 45 | if pred_sg is None: 46 | prompt = " " + question + "\n\n" + sgPrompt 47 | max_token = 256 48 | else: 49 | prompt = f" Question:{question} Scene Graph:{pred_sg}\n\n Use the image and scene graph to reason and provide a short answer:" 50 | max_token = 64 51 | inputs = processor(images=image_tensor, text=prompt, return_tensors="pt").to(model.device) 52 | outputs = model.generate( 53 | **inputs, 54 | do_sample=False, 55 | num_beams=5, 56 | max_length=max_token, 57 | min_length=1, 58 | top_p=0.9, 59 | repetition_penalty=1.5, 60 | length_penalty=0.5, 61 | temperature=0, 62 | ) 63 | generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip() 64 | 65 | return generated_text 66 | 67 | 68 | result_file = open(result_path, "w") 69 | examples = load_dataset('nlphuji/whoops', use_auth_token=hf_key) 70 | 71 | for item in tqdm(examples["test"]): 72 | 73 | image_tensor = item["image"].convert("RGB") 74 | all_pred = [] 75 | for q_a in item["question_answering_pairs"]: 76 | 77 | 78 | question = q_a[0] 79 | pred_sg = get_ans(question, image_tensor) 80 | pred_ans = get_ans(question, image_tensor, pred_sg) 81 | all_pred.append(pred_ans) 82 | 83 | 84 | result_file.write(json.dumps({ "image_id": item["image_id"], 85 | "question_answering_pairs": item["question_answering_pairs"], 86 | "prediction": all_pred}) + "\n") 87 | 88 | 89 | result_file.close() 90 | -------------------------------------------------------------------------------- /InstructBLIP-13b/InstructBLIP_wino.py: -------------------------------------------------------------------------------- 1 | from transformers import InstructBlipProcessor, InstructBlipForConditionalGeneration, InstructBlipConfig, AutoModelForVision2Seq 2 | import torch 3 | from PIL import Image 4 | import requests 5 | from accelerate import init_empty_weights, infer_auto_device_map 6 | import json 7 | import os 8 | from tqdm import tqdm 9 | 10 | 11 | # Determine if CUDA (GPU) is available. 12 | device = "cuda" if torch.cuda.is_available() else "cpu" 13 | 14 | 15 | # Load the model configuration. 16 | config = InstructBlipConfig.from_pretrained("Salesforce/instructblip-vicuna-13b") 17 | 18 | # Initialize the model with the given configuration. 19 | with init_empty_weights(): 20 | 21 | model = AutoModelForVision2Seq.from_config(config) 22 | model.tie_weights() 23 | 24 | # Load the processor and model for image processing. 25 | processor = InstructBlipProcessor.from_pretrained("Salesforce/instructblip-vicuna-13b", device_map="auto") 26 | model = InstructBlipForConditionalGeneration.from_pretrained("Salesforce/instructblip-vicuna-13b", 27 | device_map="auto") 28 | 29 | 30 | sgPrompt=''' 31 | For the provided image and its associated question, generate a scene graph in JSON format that includes the following: 32 | 1. 
Objects that are relevant to answering the question. 33 | 2. Object attributes that are relevant to answering the question. 34 | 3. Object relationships that are relevant to answering the question. 35 | 36 | Scene Graph: 37 | ''' 38 | 39 | image_file = "" #Path to image file 40 | question_path = "" #Path to question file 41 | result_path = "" #Path to store result 42 | result_file = open(result_path, 'w') 43 | 44 | 45 | with open(question_path, 'r') as json_file: 46 | json_list = list(json_file) 47 | 48 | 49 | for json_str in tqdm(json_list): 50 | 51 | 52 | result = json.loads(json_str) 53 | cur_image = image_file + result["image"] + ".png" 54 | image = Image.open(cur_image).convert("RGB") 55 | cur_caption = result["caption"] 56 | 57 | 58 | prompt = f" Does the given caption accurately describe the given image? Caption:{cur_caption}.\n\n{sgPrompt}" 59 | inputs = processor(images=image, text=prompt, return_tensors="pt").to("cuda") 60 | outputs = model.generate( 61 | **inputs, 62 | do_sample=False, 63 | num_beams=5, 64 | max_length=256, 65 | min_length=1, 66 | top_p=0.9, 67 | repetition_penalty=1.5, 68 | length_penalty=0.5, 69 | temperature=0, 70 | ) 71 | generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip() 72 | 73 | 74 | answerPrompt = "Use the image and scene graph to reason and answer the question." 75 | prompt = f" Question: Does the given caption accurately describe the given image? Caption:{cur_caption}. Scene Graph: {generated_text}\n\n{answerPrompt}" 76 | inputs = processor(images=image, text=prompt, return_tensors="pt").to("cuda") 77 | outputs = model.generate( 78 | **inputs, 79 | do_sample=False, 80 | num_beams=5, 81 | max_length=256, 82 | min_length=1, 83 | top_p=0.9, 84 | repetition_penalty=1.5, 85 | length_penalty=0.5, 86 | temperature=0, 87 | ) 88 | generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip() 89 | stored_result = {"text":generated_text} 90 | 91 | 92 | result_file.write(json.dumps(stored_result) + "\n") 93 | 94 | result_file.close() 95 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Chancharik Mitra 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /LLaVA-1.5-13b/LLaVA_MMBench.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import os 4 | import json 5 | import pandas as pd 6 | from tqdm import tqdm 7 | import shortuuid 8 | 9 | from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN 10 | from llava.conversation import conv_templates, SeparatorStyle 11 | from llava.model.builder import load_pretrained_model 12 | from llava.utils import disable_torch_init 13 | from llava.mm_utils import tokenizer_image_token, process_images, load_image_from_base64, get_model_name_from_path 14 | 15 | from PIL import Image 16 | import math 17 | 18 | answerPrompt="Use the image and scene graph as context and answer the following question: " 19 | answerPrompt2="Use the image and context to answer the following question: " 20 | sgPrompt=''' 21 | For the provided image and its associated question, generate a scene graph in JSON format that includes the following: 22 | 1. Objects that are relevant to answering the question 23 | 2. Object attributes that are relevant to answering the question 24 | 3. Object relationships that are relevant to answering the question 25 | 26 | Scene Graph: 27 | ''' 28 | 29 | all_options = ['A', 'B', 'C', 'D'] 30 | 31 | 32 | def split_list(lst, n): 33 | """Split a list into n (roughly) equal-sized chunks""" 34 | chunk_size = math.ceil(len(lst) / n) # integer division 35 | return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)] 36 | 37 | 38 | def get_chunk(lst, n, k): 39 | chunks = split_list(lst, n) 40 | return chunks[k] 41 | 42 | 43 | def is_none(value): 44 | if value is None: 45 | return True 46 | if type(value) is float and math.isnan(value): 47 | return True 48 | if type(value) is str and value.lower() == 'nan': 49 | return True 50 | if type(value) is str and value.lower() == 'none': 51 | return True 52 | return False 53 | 54 | def get_options(row, options): 55 | parsed_options = [] 56 | for option in options: 57 | option_value = row[option] 58 | if is_none(option_value): 59 | break 60 | parsed_options.append(option_value) 61 | return parsed_options 62 | 63 | 64 | #Scene-Graph Generation Step: 65 | def get_sg(row, model, image_processor, tokenizer): 66 | idx = row['index'] 67 | question = row['question'] 68 | hint = row['hint'] 69 | image = load_image_from_base64(row['image']) 70 | if not is_none(hint): 71 | question = hint + '\n' + question 72 | 73 | qs = question 74 | if model.config.mm_use_im_start_end: 75 | qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs 76 | else: 77 | qs = DEFAULT_IMAGE_TOKEN + '\n' + qs 78 | 79 | if args.single_pred_prompt: 80 | if args.lang == 'cn': 81 | qs = qs + '\n' + "请直接回答选项字母。" 82 | else: 83 | qs = qs + '\n' + sgPrompt 84 | # qs = qs + '\n' + "Let's think step by step:" 85 | # qs = "Provide a caption for the image." 
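# The two commented-out lines above appear to be alternative baseline prompts
# (chain-of-thought and plain captioning). The active branch appends `sgPrompt`, so this
# first LLaVA pass emits a JSON scene graph that get_sg() wraps into the second-stage
# question it returns.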
86 | 87 | conv = conv_templates[args.conv_mode].copy() 88 | conv.append_message(conv.roles[0], qs) 89 | conv.append_message(conv.roles[1], None) 90 | prompt = conv.get_prompt() 91 | 92 | input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() 93 | 94 | image_tensor = process_images([image], image_processor, model.config)[0] 95 | # image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0] 96 | 97 | stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2 98 | 99 | with torch.inference_mode(): 100 | output_ids = model.generate( 101 | input_ids, 102 | images=image_tensor.unsqueeze(0).half().cuda(), 103 | do_sample=True if args.temperature > 0 else False, 104 | temperature=args.temperature, 105 | top_p=args.top_p, 106 | num_beams=args.num_beams, 107 | # no_repeat_ngram_size=3, 108 | max_new_tokens=256, 109 | use_cache=True) 110 | 111 | input_token_len = input_ids.shape[1] 112 | n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() 113 | if n_diff_input_output > 0: 114 | print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 115 | outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 116 | outputs = outputs.strip() 117 | if outputs.endswith(stop_str): 118 | outputs = outputs[:-len(stop_str)] 119 | outputs = outputs.strip() 120 | 121 | return "Scene Graph: " + outputs + "\n\n" + answerPrompt + row["question"] 122 | # return row["question"] + "Let's think step by step:" + outputs + ".\n\n" 123 | # return "Context: " + outputs + "\n\n" + answerPrompt2 + row["question"] 124 | 125 | #Answer Extraction and Evaluation Step: 126 | def eval_model(args): 127 | # Model 128 | disable_torch_init() 129 | model_path = os.path.expanduser(args.model_path) 130 | model_name = get_model_name_from_path(model_path) 131 | tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name) 132 | 133 | questions = pd.read_table(os.path.expanduser(args.question_file)) 134 | questions = get_chunk(questions, args.num_chunks, args.chunk_idx) 135 | answers_file = os.path.expanduser(args.answers_file) 136 | os.makedirs(os.path.dirname(answers_file), exist_ok=True) 137 | ans_file = open(answers_file, "w") 138 | 139 | if 'plain' in model_name and 'finetune' not in model_name.lower() and 'mmtag' not in args.conv_mode: 140 | args.conv_mode = args.conv_mode + '_mmtag' 141 | print(f'It seems that this is a plain model, but it is not using a mmtag prompt, auto switching to {args.conv_mode}.') 142 | 143 | for index, row in tqdm(questions.iterrows(), total=len(questions)): 144 | options = get_options(row, all_options) 145 | cur_option_char = all_options[:len(options)] 146 | 147 | if args.all_rounds: 148 | num_rounds = len(options) 149 | else: 150 | num_rounds = 1 151 | 152 | for round_idx in range(num_rounds): 153 | idx = row['index'] 154 | question = get_sg(row, model, image_processor, tokenizer) 155 | # question = row["question"] 156 | 157 | image = load_image_from_base64(row['image']) 158 | 159 | for option_char, option in zip(all_options[:len(options)], options): 160 | question = question + '\n' + option_char + '. 
' + option 161 | qs = cur_prompt = question 162 | if model.config.mm_use_im_start_end: 163 | qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs 164 | else: 165 | qs = DEFAULT_IMAGE_TOKEN + '\n' + qs 166 | 167 | if args.single_pred_prompt: 168 | if args.lang == 'cn': 169 | qs = qs + '\n' + "请直接回答选项字母。" 170 | else: 171 | qs = qs + '\n' + "Answer with the option's letter from the given choices directly." 172 | # qs = qs + "Therefore, the answer with the option's letter from the given choices directly is " 173 | 174 | conv = conv_templates[args.conv_mode].copy() 175 | conv.append_message(conv.roles[0], qs) 176 | conv.append_message(conv.roles[1], None) 177 | prompt = conv.get_prompt() 178 | 179 | input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() 180 | 181 | image_tensor = process_images([image], image_processor, model.config)[0] 182 | # image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0] 183 | 184 | stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2 185 | 186 | with torch.inference_mode(): 187 | output_ids = model.generate( 188 | input_ids, 189 | images=image_tensor.unsqueeze(0).half().cuda(), 190 | do_sample=True if args.temperature > 0 else False, 191 | temperature=args.temperature, 192 | top_p=args.top_p, 193 | num_beams=args.num_beams, 194 | # no_repeat_ngram_size=3, 195 | max_new_tokens=1024, 196 | use_cache=True) 197 | 198 | input_token_len = input_ids.shape[1] 199 | n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() 200 | if n_diff_input_output > 0: 201 | print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 202 | outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 203 | outputs = outputs.strip() 204 | if outputs.endswith(stop_str): 205 | outputs = outputs[:-len(stop_str)] 206 | outputs = outputs.strip() 207 | 208 | ans_id = shortuuid.uuid() 209 | ans_file.write(json.dumps({"question_id": idx, 210 | "round_id": round_idx, 211 | "prompt": cur_prompt, 212 | "text": outputs, 213 | "options": options, 214 | "option_char": cur_option_char, 215 | "answer_id": ans_id, 216 | "model_id": model_name, 217 | "metadata": {}}) + "\n") 218 | ans_file.flush() 219 | 220 | # rotate options 221 | options = options[1:] + options[:1] 222 | cur_option_char = cur_option_char[1:] + cur_option_char[:1] 223 | ans_file.close() 224 | 225 | if __name__ == "__main__": 226 | parser = argparse.ArgumentParser() 227 | parser.add_argument("--model-path", type=str, default="facebook/opt-350m") 228 | parser.add_argument("--model-base", type=str, default=None) 229 | parser.add_argument("--image-folder", type=str, default="") 230 | parser.add_argument("--question-file", type=str, default="tables/question.jsonl") 231 | parser.add_argument("--answers-file", type=str, default="answer.jsonl") 232 | parser.add_argument("--conv-mode", type=str, default="llava_v1") 233 | parser.add_argument("--num-chunks", type=int, default=1) 234 | parser.add_argument("--chunk-idx", type=int, default=0) 235 | parser.add_argument("--temperature", type=float, default=0.2) 236 | parser.add_argument("--top_p", type=float, default=None) 237 | parser.add_argument("--num_beams", type=int, default=1) 238 | parser.add_argument("--all-rounds", action="store_true") 239 | parser.add_argument("--single-pred-prompt", action="store_true") 240 | parser.add_argument("--lang", type=str, default="en") 241 
| 242 | args = parser.parse_args() 243 | 244 | eval_model(args) -------------------------------------------------------------------------------- /LLaVA-1.5-13b/LLaVA_SEED.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import os 4 | import json 5 | from tqdm import tqdm 6 | import shortuuid 7 | 8 | from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN 9 | from llava.conversation import conv_templates, SeparatorStyle 10 | from llava.model.builder import load_pretrained_model 11 | from llava.utils import disable_torch_init 12 | from llava.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path 13 | from torch.utils.data import Dataset, DataLoader 14 | 15 | from PIL import Image 16 | import math 17 | 18 | 19 | answerPrompt="Use the image and scene graph as context and answer the following question: " 20 | 21 | sgPrompt=''' 22 | For the provided image and its associated question, generate a scene graph in JSON format that includes the following: 23 | 1. Objects that are relevant to answering the question 24 | 2. Object attributes that are relevant to answering the question 25 | 3. Object relationships that are relevant to answering the question 26 | 27 | Scene Graph: 28 | ''' 29 | 30 | 31 | 32 | 33 | 34 | def split_list(lst, n): 35 | """Split a list into n (roughly) equal-sized chunks""" 36 | chunk_size = math.ceil(len(lst) / n) # integer division 37 | return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)] 38 | 39 | 40 | def get_chunk(lst, n, k): 41 | chunks = split_list(lst, n) 42 | return chunks[k] 43 | 44 | 45 | # Custom dataset class 46 | class CustomDataset(Dataset): 47 | def __init__(self, questions, image_folder, tokenizer, image_processor, model_config, sg_prompt): 48 | self.questions = questions 49 | self.image_folder = image_folder 50 | self.tokenizer = tokenizer 51 | self.image_processor = image_processor 52 | self.model_config = model_config 53 | self.sg_prompt = sg_prompt 54 | def __getitem__(self, index): 55 | line = self.questions[index] 56 | image_file = line["image"] 57 | if self.sg_prompt == 1: 58 | 59 | qs = line["text"].split("?")[0] + sgPrompt 60 | else: 61 | qs = line["text"] 62 | if self.model_config.mm_use_im_start_end: 63 | qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs 64 | else: 65 | qs = DEFAULT_IMAGE_TOKEN + '\n' + qs 66 | 67 | conv = conv_templates[args.conv_mode].copy() 68 | conv.append_message(conv.roles[0], qs) 69 | conv.append_message(conv.roles[1], None) 70 | prompt = conv.get_prompt() 71 | 72 | image = Image.open(os.path.join(self.image_folder, image_file)).convert('RGB') 73 | image_tensor = process_images([image], self.image_processor, self.model_config)[0] 74 | 75 | input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt') 76 | 77 | return input_ids, image_tensor 78 | 79 | def __len__(self): 80 | return len(self.questions) 81 | 82 | 83 | # DataLoader 84 | def create_data_loader(questions, image_folder, tokenizer, image_processor, model_config, batch_size=1, num_workers=4, sg_prompt = 0): 85 | assert batch_size == 1, "batch_size must be 1" 86 | dataset = CustomDataset(questions, image_folder, tokenizer, image_processor, model_config, sg_prompt) 87 | data_loader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False) 88 | return data_loader 89 | 90 | #Scene-Graph Generation Step: 
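# get_sg_prompt() is the first of two passes: for every SEED question it generates a
# scene graph and writes an intermediate question file whose "text" field already reads
# "Scene Graph: ... <answerPrompt> <original question>". Running the script again with
# --scene_graph 0 on that file (eval_model below) produces the final answers.
# Minimal sketch of the two-stage invocation (paths are placeholders):
#   python LLaVA_SEED.py --scene_graph 1 --question-file seed.jsonl --image-folder imgs/ --answers-file seed_with_sg.jsonl
#   python LLaVA_SEED.py --scene_graph 0 --question-file seed_with_sg.jsonl --image-folder imgs/ --answers-file seed_answers.jsonl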
91 | def get_sg_prompt(args): 92 | 93 | disable_torch_init() 94 | model_path = os.path.expanduser(args.model_path) 95 | model_name = get_model_name_from_path(model_path) 96 | tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name) 97 | 98 | 99 | 100 | 101 | questions = [json.loads(q) for q in open(os.path.expanduser(args.question_file), "r")] 102 | 103 | 104 | 105 | #Add the prompt in dataloader instead 106 | 107 | 108 | questions = get_chunk(questions, args.num_chunks, args.chunk_idx) 109 | 110 | 111 | q_file = os.path.expanduser(args.answers_file) 112 | os.makedirs(os.path.dirname(q_file), exist_ok=True) 113 | q_file = open(q_file, "w") 114 | 115 | if 'plain' in model_name and 'finetune' not in model_name.lower() and 'mmtag' not in args.conv_mode: 116 | args.conv_mode = args.conv_mode + '_mmtag' 117 | print(f'It seems that this is a plain model, but it is not using a mmtag prompt, auto switching to {args.conv_mode}.') 118 | 119 | data_loader = create_data_loader(questions, args.image_folder, tokenizer, image_processor, model.config, sg_prompt = 1) 120 | 121 | 122 | for (input_ids, image_tensor), line in tqdm(zip(data_loader, questions), total=len(questions)): 123 | 124 | stop_str = conv_templates[args.conv_mode].sep if conv_templates[args.conv_mode].sep_style != SeparatorStyle.TWO else conv_templates[args.conv_mode].sep2 125 | input_ids = input_ids.to(device='cuda', non_blocking=True) 126 | 127 | with torch.inference_mode(): 128 | output_ids = model.generate( 129 | input_ids, 130 | images=image_tensor.to(dtype=torch.float16, device='cuda', non_blocking=True), 131 | do_sample=True if args.temperature > 0 else False, 132 | temperature=args.temperature, 133 | top_p=args.top_p, 134 | num_beams=args.num_beams, 135 | max_new_tokens=256, 136 | use_cache=True) 137 | 138 | input_token_len = input_ids.shape[1] 139 | n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() 140 | if n_diff_input_output > 0: 141 | print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 142 | outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 143 | outputs = outputs.strip() 144 | if outputs.endswith(stop_str): 145 | outputs = outputs[:-len(stop_str)] 146 | outputs = outputs.strip() 147 | 148 | 149 | q_file.write(json.dumps({ "image": line["image"], 150 | "text": " Scene Graph: " + outputs + '\n\n' + answerPrompt + line["text"], 151 | "question_id": line["question_id"]}) + "\n") 152 | 153 | 154 | q_file.close() 155 | 156 | #Answer Extraction and Evaluation Step: 157 | def eval_model(args): 158 | # Model 159 | disable_torch_init() 160 | model_path = os.path.expanduser(args.model_path) 161 | model_name = get_model_name_from_path(model_path) 162 | tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name) 163 | 164 | questions = [json.loads(q) for q in open(os.path.expanduser(args.question_file), "r")] 165 | 166 | 167 | 168 | questions = get_chunk(questions, args.num_chunks, args.chunk_idx) 169 | 170 | 171 | answers_file = os.path.expanduser(args.answers_file) 172 | os.makedirs(os.path.dirname(answers_file), exist_ok=True) 173 | ans_file = open(answers_file, "w") 174 | 175 | if 'plain' in model_name and 'finetune' not in model_name.lower() and 'mmtag' not in args.conv_mode: 176 | args.conv_mode = args.conv_mode + '_mmtag' 177 | print(f'It seems that this is a plain model, but it is not using a mmtag prompt, auto 
switching to {args.conv_mode}.') 178 | 179 | data_loader = create_data_loader(questions, args.image_folder, tokenizer, image_processor, model.config) 180 | 181 | for (input_ids, image_tensor), line in tqdm(zip(data_loader, questions), total=len(questions)): 182 | idx = line["question_id"] 183 | cur_prompt = line["text"] 184 | 185 | stop_str = conv_templates[args.conv_mode].sep if conv_templates[args.conv_mode].sep_style != SeparatorStyle.TWO else conv_templates[args.conv_mode].sep2 186 | input_ids = input_ids.to(device='cuda', non_blocking=True) 187 | 188 | 189 | with torch.inference_mode(): 190 | output_ids = model.generate( 191 | input_ids, 192 | images=image_tensor.to(dtype=torch.float16, device='cuda', non_blocking=True), 193 | do_sample=True if args.temperature > 0 else False, 194 | temperature=args.temperature, 195 | top_p=args.top_p, 196 | num_beams=args.num_beams, 197 | max_new_tokens=256, 198 | use_cache=True) 199 | 200 | input_token_len = input_ids.shape[1] 201 | n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() 202 | if n_diff_input_output > 0: 203 | print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 204 | outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 205 | outputs = outputs.strip() 206 | if outputs.endswith(stop_str): 207 | outputs = outputs[:-len(stop_str)] 208 | outputs = outputs.strip() 209 | 210 | ans_id = shortuuid.uuid() 211 | ans_file.write(json.dumps({"question_id": idx, 212 | "prompt": cur_prompt, 213 | "text": outputs, 214 | "answer_id": ans_id, 215 | "model_id": model_name, 216 | "metadata": {}}) + "\n") 217 | # ans_file.flush() 218 | ans_file.close() 219 | 220 | if __name__ == "__main__": 221 | parser = argparse.ArgumentParser() 222 | parser.add_argument("--model-path", type=str, default="facebook/opt-350m") 223 | parser.add_argument("--model-base", type=str, default=None) 224 | parser.add_argument("--image-folder", type=str, default="") 225 | parser.add_argument("--question-file", type=str, default="tables/question.jsonl") 226 | parser.add_argument("--answers-file", type=str, default="answer.jsonl") 227 | parser.add_argument("--conv-mode", type=str, default="llava_v1") 228 | parser.add_argument("--num-chunks", type=int, default=1) 229 | parser.add_argument("--chunk-idx", type=int, default=0) 230 | parser.add_argument("--temperature", type=float, default=0.2) 231 | parser.add_argument("--top_p", type=float, default=None) 232 | parser.add_argument("--num_beams", type=int, default=1) 233 | 234 | parser.add_argument("--scene_graph", type=int, default=0) 235 | 236 | args = parser.parse_args() 237 | 238 | 239 | if args.scene_graph == 0: 240 | eval_model(args) 241 | else: 242 | get_sg_prompt(args) -------------------------------------------------------------------------------- /LLaVA-1.5-13b/LLaVA_Whoops.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import os 4 | import json 5 | from tqdm import tqdm 6 | import shortuuid 7 | 8 | from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN 9 | from llava.conversation import conv_templates, SeparatorStyle 10 | from llava.model.builder import load_pretrained_model 11 | from llava.utils import disable_torch_init 12 | from llava.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path 13 | from torch.utils.data import Dataset, DataLoader 14 | 15 | from PIL 
import Image 16 | import math 17 | from datasets import load_dataset 18 | 19 | 20 | sgPrompt=''' 21 | For the provided image and its associated question, generate a scene graph in JSON format that includes the following: 22 | 1. Objects that are relevant to answering the question 23 | 2. Object attributes that are relevant to answering the question 24 | 3. Object relationships that are relevant to answering the question 25 | 26 | Scene Graph: 27 | ''' 28 | answerPrompt="\nUse the image and scene graph to reason and answer the question with a single phrase." 29 | 30 | 31 | disable_torch_init() 32 | model_path = os.path.expanduser("liuhaotian/llava-v1.5-13b") 33 | model_name = get_model_name_from_path(model_path) 34 | tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, None, model_name) 35 | 36 | 37 | def get_ans(question, image_tensor, sg = 0): 38 | 39 | if sg == 1: 40 | new_token = 256 41 | qs = DEFAULT_IMAGE_TOKEN + question + sgPrompt 42 | else: 43 | new_token = 64 44 | qs = DEFAULT_IMAGE_TOKEN + question 45 | 46 | conv = conv_templates["vicuna_v1"].copy() 47 | conv.append_message(conv.roles[0], qs) 48 | conv.append_message(conv.roles[1], None) 49 | prompt = conv.get_prompt() 50 | 51 | input_ids = torch.unsqueeze(tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt'), 0) 52 | input_ids = input_ids.to(device='cuda', non_blocking=True) 53 | 54 | image_tensor = torch.unsqueeze(image_tensor, 0) 55 | 56 | with torch.inference_mode(): 57 | output_ids = model.generate( 58 | input_ids, 59 | images=image_tensor.to(dtype=torch.float16, device='cuda', non_blocking=True), 60 | do_sample=True if 0 > 0 else False, 61 | temperature=0, 62 | top_p=None, 63 | num_beams=1, 64 | max_new_tokens=new_token, 65 | use_cache=True) 66 | 67 | 68 | input_token_len = input_ids.shape[1] 69 | n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() 70 | if n_diff_input_output > 0: 71 | print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 72 | outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 73 | outputs = outputs.strip() 74 | return outputs 75 | 76 | 77 | ans_file = open("", "w") 78 | examples = load_dataset('nlphuji/whoops', use_auth_token="") 79 | for item in tqdm(examples["test"]): 80 | 81 | 82 | image_tensor = process_images([item["image"]], image_processor, model.config)[0] 83 | all_pred = [] 84 | for q_a in item["question_answering_pairs"]: 85 | 86 | question = q_a[0] 87 | pred_ans = get_ans(question, image_tensor, sg = 1) 88 | 89 | 90 | question = q_a[0] + "Scene Graph:" + pred_ans + "\n\n" + answerPrompt 91 | pred_ans = get_ans(question, image_tensor) 92 | all_pred.append(pred_ans) 93 | 94 | ans_file.write(json.dumps({ "image_id": item["image_id"], 95 | "question_answering_pairs": item["question_answering_pairs"], 96 | "prediction": all_pred}) + "\n") 97 | ans_file.flush() 98 | 99 | ans_file.close() 100 | -------------------------------------------------------------------------------- /LLaVA-1.5-13b/LLaVA_bench.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import os 4 | import json 5 | from tqdm import tqdm 6 | import shortuuid 7 | 8 | from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN 9 | from llava.conversation import conv_templates, SeparatorStyle 10 | from llava.model.builder import load_pretrained_model 11 | 
from llava.utils import disable_torch_init 12 | from llava.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria 13 | 14 | from PIL import Image 15 | import math 16 | 17 | answerPrompt="Use the image and scene graph as context to improve the detail and clarity of the original answer: " 18 | 19 | sgPrompt=''' 20 | 21 | For the provided image and question-answer pair, generate a scene graph in JSON format to improve the quality and/or detail of the answer. The scene graph can include the following: 22 | 1. Objects that are relevant to answering the question. 23 | 2. Object attributes that are relevant to answering the question. 24 | 3. Object relationships that are relevant to answering the question. 25 | 26 | Scene Graph: 27 | ''' 28 | 29 | def split_list(lst, n): 30 | """Split a list into n (roughly) equal-sized chunks""" 31 | chunk_size = math.ceil(len(lst) / n) # integer division 32 | return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)] 33 | 34 | 35 | def get_chunk(lst, n, k): 36 | chunks = split_list(lst, n) 37 | return chunks[k] 38 | 39 | 40 | def eval_model(args): 41 | # Model 42 | disable_torch_init() 43 | model_path = os.path.expanduser(args.model_path) 44 | model_name = get_model_name_from_path(model_path) 45 | tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name) 46 | 47 | questions = [json.loads(q) for q in open(os.path.expanduser(args.question_file), "r")] 48 | questions = get_chunk(questions, args.num_chunks, args.chunk_idx) 49 | answers_file = os.path.expanduser(args.answers_file) 50 | os.makedirs(os.path.dirname(answers_file), exist_ok=True) 51 | ans_file = open(answers_file, "w") 52 | for line in tqdm(questions): 53 | idx = line["question_id"] 54 | image_file = line["image"] 55 | 56 | #----ZS Ans generation------ 57 | qs = line["text"] 58 | cur_prompt = qs 59 | if model.config.mm_use_im_start_end: 60 | qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs 61 | else: 62 | qs = DEFAULT_IMAGE_TOKEN + '\n' + qs 63 | 64 | conv = conv_templates[args.conv_mode].copy() 65 | conv.append_message(conv.roles[0], qs) 66 | conv.append_message(conv.roles[1], None) 67 | prompt = conv.get_prompt() 68 | 69 | input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() 70 | 71 | image = Image.open(os.path.join(args.image_folder, image_file)) 72 | image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0] 73 | 74 | stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2 75 | keywords = [stop_str] 76 | stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids) 77 | 78 | with torch.inference_mode(): 79 | output_ids = model.generate( 80 | input_ids, 81 | images=image_tensor.unsqueeze(0).half().cuda(), 82 | do_sample=True if args.temperature > 0 else False, 83 | temperature=args.temperature, 84 | top_p=args.top_p, 85 | num_beams=args.num_beams, 86 | # no_repeat_ngram_size=3, 87 | max_new_tokens=1024, 88 | use_cache=True) 89 | 90 | input_token_len = input_ids.shape[1] 91 | n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() 92 | if n_diff_input_output > 0: 93 | print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 94 | outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 95 | outputs = outputs.strip() 96 | if outputs.endswith(stop_str): 97 | 
outputs = outputs[:-len(stop_str)] 98 | outputs = outputs.strip() 99 | 100 | #-----SG Generation------- 101 | og_ans = outputs 102 | 103 | qs = "Question: " + line["text"] + "\nAnswer: " + og_ans + "\n\n" + sgPrompt 104 | 105 | cur_prompt = qs 106 | if model.config.mm_use_im_start_end: 107 | qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs 108 | else: 109 | qs = DEFAULT_IMAGE_TOKEN + '\n' + qs 110 | 111 | conv = conv_templates[args.conv_mode].copy() 112 | conv.append_message(conv.roles[0], qs) 113 | conv.append_message(conv.roles[1], None) 114 | prompt = conv.get_prompt() 115 | 116 | input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() 117 | 118 | image = Image.open(os.path.join(args.image_folder, image_file)) 119 | image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0] 120 | 121 | stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2 122 | keywords = [stop_str] 123 | stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids) 124 | 125 | with torch.inference_mode(): 126 | output_ids = model.generate( 127 | input_ids, 128 | images=image_tensor.unsqueeze(0).half().cuda(), 129 | do_sample=True if args.temperature > 0 else False, 130 | temperature=args.temperature, 131 | top_p=args.top_p, 132 | num_beams=args.num_beams, 133 | # no_repeat_ngram_size=3, 134 | max_new_tokens=256, 135 | use_cache=True) 136 | 137 | input_token_len = input_ids.shape[1] 138 | n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() 139 | if n_diff_input_output > 0: 140 | print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 141 | outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 142 | outputs = outputs.strip() 143 | if outputs.endswith(stop_str): 144 | outputs = outputs[:-len(stop_str)] 145 | outputs = outputs.strip() 146 | sg = outputs 147 | #----Improved Answer----- 148 | qs = answerPrompt + "\n\nScene Graph: " + sg + "\n\nQuestion: " + line["text"] + "\nOriginal Answer: " + og_ans + "\nImproved Answer: " 149 | 150 | cur_prompt = qs 151 | if model.config.mm_use_im_start_end: 152 | qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs 153 | else: 154 | qs = DEFAULT_IMAGE_TOKEN + '\n' + qs 155 | 156 | conv = conv_templates[args.conv_mode].copy() 157 | conv.append_message(conv.roles[0], qs) 158 | conv.append_message(conv.roles[1], None) 159 | prompt = conv.get_prompt() 160 | 161 | input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() 162 | 163 | image = Image.open(os.path.join(args.image_folder, image_file)) 164 | image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0] 165 | 166 | stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2 167 | keywords = [stop_str] 168 | stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids) 169 | 170 | with torch.inference_mode(): 171 | output_ids = model.generate( 172 | input_ids, 173 | images=image_tensor.unsqueeze(0).half().cuda(), 174 | do_sample=True if args.temperature > 0 else False, 175 | temperature=args.temperature, 176 | top_p=args.top_p, 177 | num_beams=args.num_beams, 178 | # no_repeat_ngram_size=3, 179 | max_new_tokens=1024, 180 | use_cache=True) 181 | 182 | input_token_len = input_ids.shape[1] 183 | n_diff_input_output = (input_ids != 
output_ids[:, :input_token_len]).sum().item() 184 | if n_diff_input_output > 0: 185 | print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 186 | outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 187 | outputs = outputs.strip() 188 | if outputs.endswith(stop_str): 189 | outputs = outputs[:-len(stop_str)] 190 | outputs = outputs.strip() 191 | ans_id = shortuuid.uuid() 192 | ans_file.write(json.dumps({"question_id": idx, 193 | "prompt": cur_prompt, 194 | "text": outputs, 195 | "answer_id": ans_id, 196 | "model_id": model_name, 197 | "metadata": {}}) + "\n") 198 | ans_file.flush() 199 | ans_file.close() 200 | 201 | if __name__ == "__main__": 202 | parser = argparse.ArgumentParser() 203 | parser.add_argument("--model-path", type=str, default="facebook/opt-350m") 204 | parser.add_argument("--model-base", type=str, default=None) 205 | parser.add_argument("--image-folder", type=str, default="") 206 | parser.add_argument("--question-file", type=str, default="tables/question.jsonl") 207 | parser.add_argument("--answers-file", type=str, default="answer.jsonl") 208 | parser.add_argument("--conv-mode", type=str, default="llava_v1") 209 | parser.add_argument("--num-chunks", type=int, default=1) 210 | parser.add_argument("--chunk-idx", type=int, default=0) 211 | parser.add_argument("--temperature", type=float, default=0.2) 212 | parser.add_argument("--top_p", type=float, default=None) 213 | parser.add_argument("--num_beams", type=int, default=1) 214 | args = parser.parse_args() 215 | 216 | eval_model(args) 217 | -------------------------------------------------------------------------------- /LLaVA-1.5-13b/LLaVA_wino.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import os 4 | import json 5 | from tqdm import tqdm 6 | import shortuuid 7 | 8 | from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN 9 | from llava.conversation import conv_templates, SeparatorStyle 10 | from llava.model.builder import load_pretrained_model 11 | from llava.utils import disable_torch_init 12 | from llava.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path 13 | from torch.utils.data import Dataset, DataLoader 14 | 15 | from PIL import Image 16 | import math 17 | from datasets import load_dataset 18 | 19 | 20 | sgPrompt=''' 21 | For the provided image and its associated question, generate a scene graph in JSON format that includes the following: 22 | 1. Objects that are relevant to answering the question 23 | 2. Object attributes that are relevant to answering the question 24 | 3. 
Object relationships that are relevant to answering the question 25 | 26 | Scene Graph: 27 | ''' 28 | 29 | 30 | disable_torch_init() 31 | model_path = os.path.expanduser("liuhaotian/llava-v1.5-13b") 32 | model_name = get_model_name_from_path(model_path) 33 | tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, None, model_name) 34 | 35 | 36 | def get_ans(question, image_tensor, sg = 0): 37 | 38 | if sg == 1: 39 | qs = DEFAULT_IMAGE_TOKEN + question + sgPrompt 40 | else: 41 | qs = DEFAULT_IMAGE_TOKEN + question 42 | 43 | conv = conv_templates["vicuna_v1"].copy() 44 | conv.append_message(conv.roles[0], qs) 45 | conv.append_message(conv.roles[1], None) 46 | prompt = conv.get_prompt() 47 | 48 | input_ids = torch.unsqueeze(tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt'), 0) 49 | input_ids = input_ids.to(device='cuda', non_blocking=True) 50 | 51 | image_tensor = torch.unsqueeze(image_tensor, 0) 52 | 53 | with torch.inference_mode(): 54 | output_ids = model.generate( 55 | input_ids, 56 | images=image_tensor.to(dtype=torch.float16, device='cuda', non_blocking=True), 57 | do_sample=True if 0 > 0 else False, 58 | temperature=0, 59 | top_p=None, 60 | num_beams=1, 61 | max_new_tokens=256, 62 | use_cache=True) 63 | 64 | 65 | input_token_len = input_ids.shape[1] 66 | n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() 67 | if n_diff_input_output > 0: 68 | print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 69 | outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 70 | outputs = outputs.strip() 71 | return outputs 72 | 73 | 74 | 75 | image_file = "" 76 | question_path = "" 77 | result_path = "" 78 | result_file = open(result_path, 'w') 79 | 80 | 81 | 82 | with open(question_path, 'r') as json_file: 83 | json_list = list(json_file) 84 | 85 | 86 | for json_str in tqdm(json_list): 87 | 88 | 89 | result = json.loads(json_str) 90 | cur_image = image_file + result["image"] + ".png" 91 | image = Image.open(cur_image).convert("RGB") 92 | 93 | prompt = "Does the given caption accurately describe the given image? Caption:" + result["caption"] + ".\n\n" + sgPrompt 94 | 95 | cur_sg = get_ans(prompt, image, sg=1) 96 | 97 | 98 | answerPrompt = "Use the image and scene graph to reason and answer the question." 99 | prompt = "Question: Does the given caption accurately describe the given image? Caption:" + result["caption"] + ". 
Scene Graph: " + cur_sg + '\n\n' + answerPrompt 100 | 101 | final_ans = get_ans(prompt, image) 102 | stored_result = {"text":final_ans} 103 | result_file.write(json.dumps(stored_result) + "\n") 104 | result_file.flush() 105 | 106 | result_file.close() 107 | -------------------------------------------------------------------------------- /LLaVA-1.5-13b/llava_seed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=8 7 | 8 | EXP_NAME=$1 9 | echo "Experiment Name: llava.eval.sg.$EXP_NAME" 10 | CKPT="llava-v1.5-13b-$EXP_NAME" 11 | 12 | WITH_SG=1 13 | 14 | #Step 1: Scene-Graph Generation: 15 | for IDX in $(seq 0 $((CHUNKS-1))); do 16 | echo ${GPULIST[$IDX]} 17 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.sg.$EXP_NAME \ 18 | --model-path liuhaotian/llava-v1.5-13b \ 19 | --question-file ./playground/data/eval/seed_bench/llava-seed-bench-filtered.jsonl \ 20 | --image-folder ./playground/data/eval/seed_bench \ 21 | --answers-file ./playground/data/eval/seed_bench/full_sg/$CKPT/${CHUNKS}_${IDX}.jsonl \ 22 | --num-chunks $CHUNKS \ 23 | --chunk-idx $IDX \ 24 | --temperature 0 \ 25 | --scene_graph $WITH_SG \ 26 | --conv-mode vicuna_v1 & 27 | #sleep 2 - only for Sphinx-V2 28 | done 29 | 30 | wait 31 | 32 | output_file=./playground/data/eval/seed_bench/full_sg/$CKPT/merge_$EXP_NAME.jsonl 33 | 34 | # Clear out the output file if it exists. 35 | > "$output_file" 36 | 37 | # Loop through the indices and concatenate each file. 38 | for IDX in $(seq 0 $((CHUNKS-1))); do 39 | cat ./playground/data/eval/seed_bench/full_sg/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 40 | done 41 | 42 | 43 | #Step 2: Answer Extraction and Evaluation: 44 | for IDX in $(seq 0 $((CHUNKS-1))); do 45 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.sg.$EXP_NAME \ 46 | --model-path liuhaotian/llava-v1.5-13b \ 47 | --question-file ./playground/data/eval/seed_bench/full_sg/$CKPT/merge_$EXP_NAME.jsonl \ 48 | --image-folder ./playground/data/eval/seed_bench \ 49 | --answers-file ./playground/data/eval/seed_bench/answers/$CKPT/${CHUNKS}_${IDX}.jsonl \ 50 | --num-chunks $CHUNKS \ 51 | --chunk-idx $IDX \ 52 | --temperature 0 \ 53 | --conv-mode vicuna_v1 & 54 | #sleep 2 - Sphinx-V2 55 | done 56 | 57 | wait 58 | 59 | output_file=./playground/data/eval/seed_bench/answers/$CKPT/merge_$EXP_NAME.jsonl 60 | 61 | # Clear out the output file if it exists. 62 | > "$output_file" 63 | 64 | # Loop through the indices and concatenate each file. 
65 | for IDX in $(seq 0 $((CHUNKS-1))); do 66 | cat ./playground/data/eval/seed_bench/answers/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 67 | done 68 | 69 | # Evaluate 70 | python scripts/convert_seed_for_submission.py \ 71 | --annotation-file ./playground/data/eval/seed_bench/SEED-Bench.json \ 72 | --result-file $output_file \ 73 | --result-upload-file ./playground/data/eval/seed_bench/answers_upload/llava-v1.5-13b-$EXP_NAME.jsonl 74 | 75 | # --result-file $output_file \ 76 | 77 | # --result-file ./playground/data/eval/seed_bench/gt_merge.jsonl \ -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CCoT 🧩 🧠 2 | Official Code for the Paper "Compositional Chain-of-Thought Prompting for Large Multimodal Models" 3 | --- 4 | We present **CCoT**, a novel **C**ompositional **C**hain-**o**f-**T**hought prompting method that utilizes scene-graph representations to extract compositional knowledge from an LMM. We find that this approach improves LMM performance not only on several compositional benchmarks but also on general multimodal benchmarks. 5 | 6 | A more thorough discussion of our work can be found in our [paper](https://arxiv.org/abs/2311.17076). 7 | 8 | 

9 | ![Figure 1: CCoT overview](images/fig1_v7.png) 10 | 

11 | 12 | ### Method Description 13 | --- 14 |

15 | ![Figure 2: CCoT two-step prompting pipeline](images/fig2_v8.png) 16 | 

17 | 18 | The first step in our prompting method is to generate a scene graph given both the image *and* the textual task as context. Following this, the answer is extracted by prompting the LMM with the image, scene graph, question, and an answer-extraction prompt. Prompt sections unique to our method are shown in **bold** in the above figure. Incorporating the scene graph in the prompt eliminates the need for fine-tuning and prevents forgetting. Another benefit of our method is that the generated scene graphs can describe any visual scene, making CCoT generally applicable to a wider range of VL tasks. Finally, because the generated scene graphs are compact linguistic representations of images, CCoT is a token-efficient prompting method. This is significant given the limited textual context lengths that LMMs often face due to processing both image and text inputs. 19 | 20 | ### 💻 Setup 21 | --- 22 | **Note** that because our method is a zero-shot prompting method that builds on the codebase of its respective LMM, there is ample flexibility when applying it to your particular model and use case. As such, you may find it *easier* to follow the general methodology shown in our figure and outlined in our scripts, adapting the prompt, implementation, and evaluation to suit your needs. 23 | 24 | #### Datasets 25 | Please retrieve all datasets from their respective official websites or repositories. We provide a filtered .jsonl containing just the SEEDBench-Image data points in our data folder. 26 | 27 | #### LLaVA-1.5-13b 28 | 1. First, clone the official **LLaVA** [repository](https://github.com/haotian-liu/LLaVA). 29 | ```bash 30 | git clone https://github.com/haotian-liu/LLaVA.git 31 | ``` 32 | 2. Follow the basic installation steps outlined in the repository. 33 | 3. Complete the *Evaluation* setup outlined in the repository. 34 | 4. Replace the corresponding scripts (both the Python and Bash scripts, where necessary) with those in this repository. 35 | 36 | *Note: Some users have reported input-processing issues when cloning the repo directly. This is likely because the LLaVA-1.6 update changed how inputs to the model are handled. One remedy is to check out the latest commit prior to the LLaVA-1.6 update.* 37 | 38 | #### GPT-4V 39 | 40 | 1. Install the openai library: 41 | ```bash 42 | pip install openai 43 | ``` 44 | 2. Set your OpenAI API key: 45 | ```bash 46 | export OPENAI_API_KEY= 47 | ``` 48 | 3. Run the script for your desired dataset. 49 | 50 | #### InstructBLIP-13b 51 | 52 | 1. First, clone the official **InstructBLIP** [repository](https://github.com/salesforce/LAVIS/tree/main/projects/instructblip). 53 | 2. Follow the basic installation steps outlined in the repository. 54 | 3. Run the script for your desired dataset. 55 | 56 | #### Sphinx 57 | 58 | 1. For SEEDBench and MMBench, we make use of the LLaVA codebase's setup. Simply follow the LLaVA-1.5 setup steps and replace the scripts with those of Sphinx. 59 | 2. For other datasets, follow the setup instructions from the official [repository](https://github.com/Alpha-VLLM/LLaMA2-Accessory/tree/main/SPHINX). 60 | 3. Run our provided script. 61 | 62 | ### 📝 Citation 63 | --- 64 | If you found our work useful, please consider starring and citing. Thank you! 
65 | ```latex 66 | @inproceedings{MitraCCoT, 67 | title={Compositional Chain of Thought Prompting for Large Multimodal Models}, 68 | author={Mitra, Chancharik and Huang, Brandon and Darrell, Trevor and Herzig, Roei}, 69 | booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, 70 | month={June}, 71 | year={2024} 72 | } 73 | ``` 74 | -------------------------------------------------------------------------------- /Sphinx/Sphinx_SEED.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import os 4 | import json 5 | from tqdm import tqdm 6 | import shortuuid 7 | 8 | # from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN 9 | # from llava.conversation import conv_templates, SeparatorStyle 10 | # from llava.model.builder import load_pretrained_model 11 | # from llava.utils import disable_torch_init 12 | # from llava.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path 13 | from torch.utils.data import Dataset, DataLoader 14 | 15 | from PIL import Image 16 | from SPHINX import SPHINXModel 17 | import math 18 | 19 | answerPrompt="Use the image and scene graph as context and answer the following question: " 20 | 21 | sgPrompt=''' 22 | 23 | For the provided image and its associated question, generate a scene graph in JSON format that includes the following: 24 | 1. Objects that are relevant to answering the question. 25 | 2. Object attributes that are relevant to answering the question. 26 | 3. Object relationships that are relevant to answering the question. 27 | 28 | Scene Graph: 29 | ''' 30 | def split_list(lst, n): 31 | """Split a list into n (roughly) equal-sized chunks""" 32 | chunk_size = math.ceil(len(lst) / n) # integer division 33 | return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)] 34 | 35 | 36 | def get_chunk(lst, n, k): 37 | chunks = split_list(lst, n) 38 | return chunks[k] 39 | 40 | 41 | # Custom dataset class 42 | class CustomDataset(Dataset): 43 | def __init__(self, questions, image_folder, sg_prompt):#, ans_folder: str='./playground/data/eval/MME/answers/llava-v1.5-13b.jsonl'): 44 | self.questions = questions 45 | self.image_folder = image_folder 46 | self.sg_prompt = sg_prompt 47 | 48 | def __getitem__(self, index): 49 | line = self.questions[index] 50 | image_file = line["image"] 51 | if self.sg_prompt == 1: 52 | lst = line['text'].split('\n') 53 | #lst.pop() 54 | #qs = '\n'.join(lst) 55 | qs = lst[0] + sgPrompt 56 | 57 | else: 58 | qs = line["text"] 59 | 60 | 61 | return qs, self.image_folder, image_file 62 | 63 | def __len__(self): 64 | return len(self.questions) 65 | 66 | 67 | # DataLoader 68 | def create_data_loader(questions, image_folder, batch_size=1, num_workers=4, sg_prompt = 0): 69 | assert batch_size == 1, "batch_size must be 1" 70 | dataset = CustomDataset(questions, image_folder, sg_prompt) 71 | data_loader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False) 72 | return data_loader 73 | 74 | def get_sg_prompt(args): 75 | 76 | model = SPHINXModel.from_pretrained(pretrained_path="/home/chancharikm/compVL/LLaMA2-Accessory/SPHINX/SPHINX-v2-1k-weights", with_visual=True).to(device='cuda') 77 | 78 | 79 | 80 | questions = [json.loads(q) for q in open(os.path.expanduser(args.question_file), "r")] 81 | 82 | 83 | 84 | questions = get_chunk(questions, args.num_chunks, args.chunk_idx) 85 | 86 | 87 | q_file = 
os.path.expanduser(args.answers_file) 88 | os.makedirs(os.path.dirname(q_file), exist_ok=True) 89 | q_file = open(q_file, "w") 90 | 91 | data_loader = create_data_loader(questions, args.image_folder, sg_prompt = 1) 92 | 93 | 94 | for (raw_text, image_folder, image_file), line in tqdm(zip(data_loader, questions), total=len(questions)): 95 | #No idea why raw_text, image_folder, image_file are being unpacked as tuples (value, __blank__), but I'm going with it for now... 96 | #It's fine in the data_loader but not here: weird, I suspect it has to do with this misaligned tqdm 97 | 98 | image = Image.open(os.path.join(image_folder[0], image_file[0])).convert('RGB') 99 | with torch.cuda.amp.autocast(dtype=torch.float16): 100 | outputs = model.generate_reponse( #No, this is not a typo. This seems to be typo inherited from the Sphinx codebase - let them know via Github! 101 | [[raw_text[0], None]], 102 | image, 103 | temperature=args.temperature, 104 | top_p=args.top_p, 105 | max_gen_len=256, 106 | seed=0) 107 | outputs = outputs.strip() 108 | 109 | 110 | 111 | 112 | q_file.write(json.dumps({ "image": line["image"], 113 | "text": "Scene Graph: " + outputs + '\n\n' + answerPrompt + line["text"], 114 | "category": line["category"], 115 | "question_id": line["question_id"]}) + "\n") 116 | 117 | 118 | q_file.close() 119 | 120 | 121 | def eval_model(args): 122 | 123 | model = SPHINXModel.from_pretrained(pretrained_path="/home/chancharikm/compVL/LLaMA2-Accessory/SPHINX/SPHINX-v2-1k-weights", with_visual=True).to(device='cuda') 124 | print(f'Chunk ID {args.chunk_idx} Loaded!') 125 | questions = [json.loads(q) for q in open(os.path.expanduser(args.question_file), "r")] 126 | 127 | 128 | 129 | questions = get_chunk(questions, args.num_chunks, args.chunk_idx) 130 | 131 | 132 | answers_file = os.path.expanduser(args.answers_file) 133 | os.makedirs(os.path.dirname(answers_file), exist_ok=True) 134 | ans_file = open(answers_file, "w") 135 | 136 | 137 | 138 | data_loader = create_data_loader(questions, args.image_folder) 139 | 140 | for (raw_text, image_folder, image_file), line in tqdm(zip(data_loader, questions), total=len(questions)): 141 | #No idea why raw_text, image_folder, image_file are being unpacked as tuples (value, __blank__), but I'm going with it for now... 
142 | #It's fine in the data_loader but not here: weird, I suspect it has to do with this misaligned tqdm 143 | idx = line["question_id"] 144 | cur_prompt = line["text"] 145 | model_name = "Sphinx-v2-1k" 146 | image = Image.open(os.path.join(image_folder[0], image_file[0])).convert('RGB') 147 | with torch.cuda.amp.autocast(dtype=torch.float16): 148 | outputs = model.generate_reponse( 149 | [[raw_text[0], None]], 150 | image, 151 | temperature=args.temperature, 152 | top_p=args.top_p, 153 | max_gen_len=256, 154 | seed=0) 155 | 156 | outputs = outputs.strip() 157 | ans_id = shortuuid.uuid() 158 | ans_file.write(json.dumps({"question_id": idx, 159 | "prompt": cur_prompt, 160 | "text": outputs, 161 | "answer_id": ans_id, 162 | "model_id": model_name, 163 | "metadata": {}}) + "\n") 164 | # ans_file.flush() 165 | ans_file.close() 166 | 167 | if __name__ == "__main__": 168 | parser = argparse.ArgumentParser() 169 | parser.add_argument("--model-path", type=str, default="facebook/opt-350m") 170 | parser.add_argument("--model-base", type=str, default=None) 171 | parser.add_argument("--image-folder", type=str, default="") 172 | parser.add_argument("--question-file", type=str, default="tables/question.jsonl") 173 | parser.add_argument("--answers-file", type=str, default="answer.jsonl") 174 | parser.add_argument("--conv-mode", type=str, default="llava_v1") 175 | parser.add_argument("--num-chunks", type=int, default=1) 176 | parser.add_argument("--chunk-idx", type=int, default=0) 177 | parser.add_argument("--temperature", type=float, default=0.2) 178 | parser.add_argument("--top_p", type=float, default=None) 179 | parser.add_argument("--num_beams", type=int, default=1) 180 | 181 | parser.add_argument("--scene_graph", type=int, default=0) 182 | 183 | args = parser.parse_args() 184 | 185 | if args.scene_graph == 0: 186 | eval_model(args) 187 | else: 188 | get_sg_prompt(args) 189 | -------------------------------------------------------------------------------- /Sphinx/Sphinx_Whoops.py: -------------------------------------------------------------------------------- 1 | import json 2 | from tqdm import tqdm 3 | from datasets import load_dataset 4 | from SPHINX import SPHINXModel 5 | 6 | result_path="" #Path to store the result 7 | hf_token="" #Huggingface auth token 8 | 9 | 10 | sgPrompt=''' 11 | For the provided image and its associated question, generate only a scene graph in JSON format that includes the following: 12 | 1. Objects that are relevant to answering the question 13 | 2. Object attributes that are relevant to answering the question 14 | 3. Object relationships that are relevant to answering the question 15 | 16 | Scene Graph: 17 | ''' 18 | answerPrompt="\nUse the image and scene graph to reason and answer the question with a single phrase." 
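# (The two helpers below implement the two CCoT passes for WHOOPS with SPHINX:
#  get_sg() first asks for a scene graph with sgPrompt (max_gen_len=256, temperature=0),
#  and get_ans() then answers the question conditioned on that scene graph plus answerPrompt
#  (max_gen_len=64). pretrained_path below is left blank and must point to a local copy of
#  the SPHINX weights before the script is run.)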
19 | 20 | 21 | model = SPHINXModel.from_pretrained(pretrained_path="", with_visual=True) 22 | 23 | 24 | def get_ans(question, pred_sg, image_tensor): 25 | 26 | final_ans = model.generate_response([[question + "Scene Graph:" + pred_sg + "\n\n" + answerPrompt, None]], image_tensor, max_gen_len=64, temperature=0) 27 | return final_ans 28 | 29 | 30 | def get_sg(question, image): 31 | final_ans = model.generate_response([[question + sgPrompt, None]], image, max_gen_len=256, temperature=0) 32 | return final_ans 33 | 34 | 35 | result_file = open(result_path, "w") 36 | examples = load_dataset('nlphuji/whoops', use_auth_token=hf_token) 37 | for item in tqdm(examples["test"]): 38 | 39 | image = item["image"] 40 | all_pred = [] 41 | all_sg = [] 42 | for q_a in item["question_answering_pairs"]: 43 | 44 | question = q_a[0] 45 | pred_sg = get_sg(question, image) 46 | pred_ans = get_ans(question, pred_sg, image) 47 | all_pred.append(pred_ans) 48 | 49 | result_file.write(json.dumps({ "image_id": item["image_id"], 50 | "question_answering_pairs": item["question_answering_pairs"], 51 | "prediction": all_pred}) + "\n") 52 | 53 | result_file.close() 54 | -------------------------------------------------------------------------------- /Sphinx/Sphinx_mmbench.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import os 4 | import json 5 | import pandas as pd 6 | from tqdm import tqdm 7 | import shortuuid 8 | 9 | from PIL import Image 10 | import math 11 | from io import BytesIO 12 | import base64 13 | 14 | from SPHINX import SPHINXModel 15 | 16 | 17 | all_options = ['A', 'B', 'C', 'D'] 18 | 19 | answerPrompt="Use the image and scene graph as context and answer the following question: " 20 | 21 | sgPrompt=''' 22 | 23 | For the provided image and its associated question, generate a scene graph in JSON format that includes the following: 24 | 1. Objects that are relevant to answering the question. 25 | 2. Object attributes that are relevant to answering the question. 26 | 3. Object relationships that are relevant to answering the question. 
27 | 28 | Scene Graph: 29 | ''' 30 | 31 | 32 | def split_list(lst, n): 33 | """Split a list into n (roughly) equal-sized chunks""" 34 | chunk_size = math.ceil(len(lst) / n) # integer division 35 | return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)] 36 | 37 | 38 | def get_chunk(lst, n, k): 39 | chunks = split_list(lst, n) 40 | return chunks[k] 41 | 42 | 43 | def is_none(value): 44 | if value is None: 45 | return True 46 | if type(value) is float and math.isnan(value): 47 | return True 48 | if type(value) is str and value.lower() == 'nan': 49 | return True 50 | if type(value) is str and value.lower() == 'none': 51 | return True 52 | return False 53 | 54 | def get_options(row, options): 55 | parsed_options = [] 56 | for option in options: 57 | option_value = row[option] 58 | if is_none(option_value): 59 | break 60 | parsed_options.append(option_value) 61 | return parsed_options 62 | 63 | def load_image_from_base64(image): 64 | return Image.open(BytesIO(base64.b64decode(image))) 65 | 66 | 67 | def eval_model(args): 68 | # Model 69 | model = SPHINXModel.from_pretrained(pretrained_path="/home/chancharikm/compVL/LLaMA2-Accessory/SPHINX/SPHINX-v2-1k-weights", with_visual=True).to(device='cuda') 70 | 71 | questions = pd.read_table(os.path.expanduser(args.question_file)) 72 | questions = get_chunk(questions, args.num_chunks, args.chunk_idx) 73 | answers_file = os.path.expanduser(args.answers_file) 74 | os.makedirs(os.path.dirname(answers_file), exist_ok=True) 75 | ans_file = open(answers_file, "w") 76 | 77 | for index, row in tqdm(questions.iterrows(), total=len(questions)): 78 | options = get_options(row, all_options) 79 | cur_option_char = all_options[:len(options)] 80 | 81 | if args.all_rounds: 82 | num_rounds = len(options) 83 | else: 84 | num_rounds = 1 85 | 86 | for round_idx in range(num_rounds): 87 | idx = row['index'] 88 | question = row['question'] 89 | hint = row['hint'] 90 | image = load_image_from_base64(row['image']) 91 | if not is_none(hint): 92 | question = hint + '\n' + question 93 | for option_char, option in zip(all_options[:len(options)], options): 94 | question = question + '\n' + option_char + '. ' + option 95 | qs = cur_prompt = question 96 | 97 | firstPrompt = question + sgPrompt 98 | #print(f'SG Prompt: {firstPrompt}') 99 | #print(f'qs {qs}') 100 | # if model.config.mm_use_im_start_end: 101 | # qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs 102 | # else: 103 | # qs = DEFAULT_IMAGE_TOKEN + '\n' + qs 104 | 105 | # if args.single_pred_prompt: 106 | # if args.lang == 'cn': 107 | # qs = qs + '\n' + "请直接回答选项字母。" 108 | # else: 109 | # qs = qs + '\n' + "Answer with the option's letter from the given choices directly." 110 | 111 | with torch.cuda.amp.autocast(dtype=torch.float16): 112 | outputs = model.generate_reponse( #No, this is not a typo. This seems to be typo inherited from the Sphinx codebase - let them know via Github! 113 | [[firstPrompt, None]], 114 | image, 115 | temperature=args.temperature, 116 | top_p=args.top_p, 117 | max_gen_len=256, 118 | seed=0) 119 | 120 | 121 | outputs = outputs.strip() 122 | 123 | sg = outputs 124 | 125 | secondPrompt = "Scene Graph: " + sg + '\n\n' + answerPrompt + qs + '\n' + "Answer with the option's letter from the given choices directly." 126 | #print(f'secondPrompt {secondPrompt}') 127 | 128 | with torch.cuda.amp.autocast(dtype=torch.float16): 129 | outputs = model.generate_reponse( #No, this is not a typo. 
This seems to be typo inherited from the Sphinx codebase - let them know via Github! 130 | [[secondPrompt, None]], 131 | image, 132 | temperature=args.temperature, 133 | top_p=args.top_p, 134 | max_gen_len=256, 135 | seed=0) 136 | 137 | outputs = outputs.strip() 138 | #print(f'Final Output: {outputs}') 139 | 140 | ans_id = shortuuid.uuid() 141 | ans_file.write(json.dumps({"question_id": idx, 142 | "round_id": round_idx, 143 | "prompt": cur_prompt, 144 | "text": outputs, 145 | "options": options, 146 | "option_char": cur_option_char, 147 | "answer_id": ans_id, 148 | "model_id": "Sphinx-v2-1k", 149 | "metadata": {}}) + "\n") 150 | ans_file.flush() 151 | 152 | # rotate options 153 | options = options[1:] + options[:1] 154 | cur_option_char = cur_option_char[1:] + cur_option_char[:1] 155 | ans_file.close() 156 | 157 | if __name__ == "__main__": 158 | parser = argparse.ArgumentParser() 159 | parser.add_argument("--model-path", type=str, default="facebook/opt-350m") 160 | parser.add_argument("--model-base", type=str, default=None) 161 | parser.add_argument("--image-folder", type=str, default="") 162 | parser.add_argument("--question-file", type=str, default="tables/question.jsonl") 163 | parser.add_argument("--answers-file", type=str, default="answer.jsonl") 164 | parser.add_argument("--conv-mode", type=str, default="llava_v1") 165 | parser.add_argument("--num-chunks", type=int, default=1) 166 | parser.add_argument("--chunk-idx", type=int, default=0) 167 | parser.add_argument("--temperature", type=float, default=0.2) 168 | parser.add_argument("--top_p", type=float, default=None) 169 | parser.add_argument("--num_beams", type=int, default=1) 170 | parser.add_argument("--all-rounds", action="store_true") 171 | parser.add_argument("--single-pred-prompt", action="store_true") 172 | parser.add_argument("--lang", type=str, default="en") 173 | args = parser.parse_args() 174 | 175 | eval_model(args) 176 | -------------------------------------------------------------------------------- /Sphinx/Sphinx_wino.py: -------------------------------------------------------------------------------- 1 | from SPHINX import SPHINXModel 2 | from PIL import Image 3 | import torch 4 | import json 5 | from tqdm import tqdm 6 | 7 | sgPrompt=''' 8 | For the provided image and its associated question, generate only a scene graph in JSON format that includes the following: 9 | 1. Objects that are relevant to answering the question 10 | 2. Object attributes that are relevant to answering the question 11 | 3. Object relationships that are relevant to answering the question 12 | ''' 13 | 14 | model = SPHINXModel.from_pretrained(pretrained_path="", with_visual=True) 15 | 16 | 17 | image_dir = "" #Directory containing image data 18 | question_path = "" #Path containing the question 19 | result_path = "" #Path to store the result 20 | result_file = open(result_path, 'w') 21 | 22 | 23 | with open(question_path, 'r') as json_file: 24 | json_list = list(json_file) 25 | 26 | 27 | for json_str in tqdm(json_list): 28 | 29 | cur_pair = json.loads(json_str) 30 | cur_image = Image.open(image_dir + cur_pair["image"] + ".png") 31 | cur_caption = cur_pair["caption"] 32 | 33 | cur_question = [[f"Does the given caption accurately describe the given image? Caption:{cur_caption}.\n\n{sgPrompt}", None]] 34 | cur_sg = model.generate_response(cur_question, cur_image, max_gen_len=256, temperature=0) 35 | 36 | new_question = [[f"Does the given caption accurately describe the given image? 
Caption:{cur_caption}.\n\nScene graph:{cur_sg}\n\nBased on the image and scene graph, provide a explanation to the answer.", None]] 37 | final_ans = model.generate_response(new_question, cur_image, max_gen_len=256, temperature=0) 38 | 39 | 40 | stored_response = {"text":final_ans} 41 | result_file.write(json.dumps(stored_response) + "\n") 42 | 43 | 44 | result_file.close() 45 | -------------------------------------------------------------------------------- /data/filter_qs.py: -------------------------------------------------------------------------------- 1 | import json 2 | import argparse 3 | 4 | def filter_by_category(input_file, output_file, filter_list): 5 | 6 | category_counters = {} 7 | 8 | with open(input_file, 'r') as f_in, open(output_file, 'w') as f_out: 9 | for line in f_in: 10 | obj = json.loads(line) 11 | category = obj.get('category') 12 | 13 | 14 | category_counters[category] = category_counters.get(category, 0) + 1 15 | 16 | if category in filter_list: 17 | if category_counters[category] % 1 == 0: 18 | f_out.write(json.dumps(obj)) 19 | f_out.write('\n') 20 | 21 | def main(): 22 | filter_list = ["Instances Counting", "Scene Understanding","Instance Identity", "Instance Attributes", "Instance Location", "Spatial Relation", "Visual Reasoning", "Text Understanding", "Instance Interaction"] 23 | filter_by_category("llava-seed-bench.jsonl", "llava-seed-bench-filtered.jsonl", filter_list) 24 | 25 | if __name__ == "__main__": 26 | main() 27 | -------------------------------------------------------------------------------- /eval_winoground.py: -------------------------------------------------------------------------------- 1 | from openai import OpenAI 2 | import time 3 | import json 4 | 5 | 6 | client = OpenAI( 7 | api_key='', 8 | ) 9 | 10 | 11 | def generate_response(prompt): 12 | # Call the OpenAI API to generate a response 13 | response = client.chat.completions.create( 14 | model="gpt-4-0125-preview", 15 | messages=[ 16 | {'role': 'user', 'content': prompt} 17 | ], 18 | max_tokens=512, 19 | n=1, 20 | temperature=0, 21 | top_p=0.1, 22 | frequency_penalty=0.0, 23 | presence_penalty=0, 24 | ) 25 | # Get the response text from the API response 26 | response_text = response.choices[0].message.content 27 | 28 | return response_text 29 | 30 | #Text score. Identify which caption-explanation paire matches better. 31 | def get_ans(caption1, caption2, exp1, exp2): 32 | #original 33 | PROMPT = "Caption A:" + caption1 + ". Explanation A:" + exp1 + "\n\nCaption B:" + caption2 + ". Explanation B:" + exp2 + "\n\n Each explanation tries to justify why an image match with the corresponding caption. Pick the most logical explanation and return only an alphabet letter." 34 | return generate_response(PROMPT) 35 | 36 | #Image score. Identify which explanation match better with the caption. 37 | def get_ans2(caption1, caption2, exp1, exp2): 38 | PROMPT = "Caption:" + caption1 + ".Explanation A:" + exp1 + "Explanation B:" + exp2 + "\n\n Pick the explanation with information that align with the caption and return only an alphabet letter." 
39 | return generate_response(PROMPT) 40 | 41 | 42 | ans_path = "" 43 | with open(ans_path, 'r') as json_file: 44 | json_list = list(json_file) 45 | qs_path = "" 46 | with open(qs_path, 'r') as q_file: 47 | q_list = list(q_file) 48 | 49 | 50 | count = 0 51 | correct_text = 0 52 | correct_img = 0 53 | correct_group = 0 54 | 55 | 56 | print("begin") 57 | while count < 1600: 58 | 59 | result1 = json.loads(json_list[count]) 60 | result2 = json.loads(json_list[count+1]) 61 | result3 = json.loads(json_list[count+2]) 62 | result4 = json.loads(json_list[count+3]) 63 | 64 | cap1 = json.loads(q_list[count])["caption"] 65 | cap2 = json.loads(q_list[count+1])["caption"] 66 | 67 | expain1 = result1["answer"] 68 | expain2 = result2["answer"] 69 | expain3 = result3["answer"] 70 | expain4 = result4["answer"] 71 | 72 | #Get text score 73 | text_result1 = get_ans(cap1, cap2, expain1, expain2) 74 | text_result2 = get_ans(cap1, cap2, expain3, expain4) 75 | 76 | #Get image score 77 | text_result3 = get_ans2(cap1, cap1, expain1, expain3) 78 | text_result4 = get_ans2(cap2, cap2, expain2, expain4) 79 | 80 | 81 | if (text_result1 == "A" and text_result2 == "B"): 82 | correct_text += 1 83 | if (text_result3 == "A" and text_result4 == "B"): 84 | correct_img += 1 85 | 86 | if (text_result1 == "A" and text_result2 == "B") and (text_result3 == "A" and text_result4 == "B"): 87 | correct_group += 1 88 | count += 4 89 | 90 | print("text score:", correct_text / 400) 91 | print("image score:", correct_img / 400) 92 | print("group score:", correct_group / 400) 93 | -------------------------------------------------------------------------------- /images/fig1_v7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chancharikmitra/CCoT/9ceecb7c3e9d337bf389e1c2af260b86bcc35a6b/images/fig1_v7.png -------------------------------------------------------------------------------- /images/fig2_v8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chancharikmitra/CCoT/9ceecb7c3e9d337bf389e1c2af260b86bcc35a6b/images/fig2_v8.png --------------------------------------------------------------------------------