├── GPT-4V ├── GPT4V_Seed.py ├── GPT4V_Whoops.py ├── GPT4V_mmbench.py ├── GPT4V_wino.py └── Sphinx_bench.py ├── InstructBLIP-13b ├── InstructBLIP_MMBench.py ├── InstructBLIP_Seed.py ├── InstructBLIP_Whoops.py └── InstructBLIP_wino.py ├── LICENSE ├── LLaVA-1.5-13b ├── LLaVA_MMBench.py ├── LLaVA_SEED.py ├── LLaVA_Whoops.py ├── LLaVA_bench.py ├── LLaVA_wino.py └── llava_seed.sh ├── README.md ├── Sphinx ├── Sphinx_SEED.py ├── Sphinx_Whoops.py ├── Sphinx_mmbench.py └── Sphinx_wino.py ├── data ├── filter_qs.py └── llava-seed-bench-filtered.jsonl ├── eval_winoground.py ├── images ├── fig1_v7.png └── fig2_v8.png └── parsed_winoground.jsonl /GPT-4V/GPT4V_Seed.py: -------------------------------------------------------------------------------- 1 | from PIL import Image 2 | import base64 3 | import requests 4 | import json 5 | from tqdm import tqdm 6 | from openai import OpenAI 7 | import json 8 | 9 | image_file = '' # Image Path 10 | question_path = "" #Question Path 11 | result_path = "" #File to store result 12 | result_file = open(result_path, 'w') 13 | api_key='' #Openai api key 14 | 15 | sgPrompt=''' 16 | For the provided image and its associated question, generate only a scene graph in JSON format that includes the following: 17 | 1. Objects that are relevant to answering the question 18 | 2. Object attributes that are relevant to answering the question 19 | 3. Object relationships that are relevant to answering the question 20 | ''' 21 | 22 | 23 | def encode_image(image_path): 24 | with open(image_path, "rb") as image_file: 25 | return base64.b64encode(image_file.read()).decode('utf-8') 26 | 27 | 28 | def create_payload(cur_image, cur_text): 29 | payload = { 30 | "model": "gpt-4-vision-preview", 31 | "messages": [ 32 | { 33 | "role": "user", 34 | "content": [ 35 | { 36 | "type": "image_url", 37 | "image_url": { 38 | "url": f"data:image/jpeg;base64,{cur_image}" 39 | } 40 | }, 41 | { 42 | "type": "text", 43 | "text": cur_text 44 | } 45 | ] 46 | } 47 | ], 48 | "max_tokens": 512, 49 | "temperature":0 50 | } 51 | return payload 52 | 53 | 54 | with open(question_path, 'r') as json_file: 55 | json_list = list(json_file) 56 | 57 | headers = { 58 | "Content-Type": "application/json", 59 | "Authorization": f"Bearer {api_key}" 60 | } 61 | 62 | 63 | for json_str in tqdm(json_list): 64 | is_done = False 65 | fail_count = 0 66 | 67 | while not is_done: 68 | try: 69 | result = json.loads(json_str) 70 | 71 | 72 | cur_image = encode_image(image_file + result["image"]) 73 | cur_question = result["text"] 74 | cur_id = result["question_id"] 75 | 76 | payload = create_payload(cur_image, "Question: " + cur_question.split("?")[0] + "?" + sgPrompt) 77 | response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload) 78 | cur_sg = response.json()["choices"][0]["message"]["content"] 79 | 80 | new_q = f"Scene Graph: {cur_sg}. 
Use the image and scene graph as context and answer the following question: {cur_question}" 81 | payload = create_payload(cur_image, new_q) 82 | 83 | response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload) 84 | final_ans = response.json()["choices"][0]["message"]["content"] 85 | 86 | 87 | temp_result = {"question_id":cur_id, "text":final_ans} 88 | result_file.write(json.dumps(temp_result) + "\n") 89 | is_done = True 90 | except: 91 | fail_count += 1 92 | if fail_count == 5: 93 | break 94 | is_done = False 95 | 96 | result_file.close() 97 | -------------------------------------------------------------------------------- /GPT-4V/GPT4V_Whoops.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import os 4 | import json 5 | from tqdm import tqdm 6 | import shortuuid 7 | import requests 8 | from PIL import Image 9 | import math 10 | from datasets import load_dataset 11 | import base64 12 | from io import BytesIO 13 | 14 | 15 | result_path="" ##path to store result 16 | api_key="" #Openai api key 17 | hf_key="" #Huggingface auth key 18 | 19 | 20 | sgPrompt=''' 21 | For the provided image and its associated question, generate only a scene graph in JSON format that includes the following: 22 | 1. Objects that are relevant to answering the question 23 | 2. Object attributes that are relevant to answering the question 24 | 3. Object relationships that are relevant to answering the question 25 | ''' 26 | answerPrompt=".Use the image and scene graph as context and answer the following question: " 27 | 28 | 29 | def get_ans(question, image_tensor, pred_sg=None): 30 | if pred_sg is None: 31 | cur_prompt = question + sgPrompt 32 | max_tokens = 512 33 | else: 34 | cur_prompt = "Scene Graph: " + pred_sg + answerPrompt + question 35 | max_tokens = 128 36 | 37 | buffered = BytesIO() 38 | image_tensor.convert('RGB').save(buffered, format="JPEG") 39 | img_str = base64.b64encode(buffered.getvalue()).decode('utf-8') 40 | 41 | 42 | headers = { 43 | "Content-Type": "application/json", 44 | "Authorization": f"Bearer {api_key}" 45 | } 46 | 47 | payload = { 48 | "model": "gpt-4-vision-preview", 49 | "messages": [ 50 | { 51 | "role": "user", 52 | "content": [ 53 | { 54 | "type": "image_url", 55 | "image_url": { 56 | "url": f"data:image/jpeg;base64,{img_str}" 57 | } 58 | }, 59 | { 60 | "type": "text", 61 | "text": cur_prompt 62 | } 63 | ] 64 | } 65 | ], 66 | "max_tokens": max_tokens, 67 | "temperature":0 68 | } 69 | 70 | response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload) 71 | cur_ans = response.json()["choices"][0]["message"]["content"] 72 | return cur_ans 73 | 74 | 75 | result_file = open(result_path, "w") 76 | examples = load_dataset('nlphuji/whoops', use_auth_token=hf_key) 77 | for item in tqdm(examples["test"]): 78 | 79 | is_done = False 80 | fail_count = 0 81 | 82 | while not is_done: 83 | try: 84 | image = item["image"] 85 | 86 | all_pred = [] 87 | for q_a in item["question_answering_pairs"]: 88 | 89 | question = q_a[0] 90 | pred_sg = get_ans(question, image) 91 | pred_ans = get_ans(question, image, pred_sg) 92 | all_pred.append(pred_ans) 93 | 94 | 95 | result_file.write(json.dumps({ "image_id": item["image_id"], 96 | "question_answering_pairs": item["question_answering_pairs"], 97 | "prediction": all_pred}) + "\n") 98 | is_done = True 99 | except: 100 | fail_count += 1 101 | if fail_count == 5: 102 | break 103 | is_done = False 104 | 105 | 
result_file.close() 106 | -------------------------------------------------------------------------------- /GPT-4V/GPT4V_mmbench.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import pandas as pd 4 | from tqdm import tqdm 5 | from PIL import Image 6 | import math 7 | import requests 8 | 9 | 10 | api_key='' #Openai api key 11 | question_path="" #MMbench tsv file path 12 | result_path="" #Path to store result 13 | 14 | sgPrompt=''' 15 | For the provided image and its associated question, generate only a scene graph in JSON format that includes the following: 16 | 1. Objects that are relevant to answering the question 17 | 2. Object attributes that are relevant to answering the question 18 | 3. Object relationships that are relevant to answering the question 19 | ''' 20 | answerPrompt="Use the image and scene graph as context and answer the following question: " 21 | all_options = ['A', 'B', 'C', 'D'] 22 | 23 | 24 | def create_payload(cur_image, cur_text): 25 | payload = { 26 | "model": "gpt-4-vision-preview", 27 | "messages": [ 28 | { 29 | "role": "user", 30 | "content": [ 31 | { 32 | "type": "image_url", 33 | "image_url": { 34 | "url": f"data:image/jpeg;base64,{cur_image}" 35 | } 36 | }, 37 | { 38 | "type": "text", 39 | "text": cur_text 40 | } 41 | ] 42 | } 43 | ], 44 | "max_tokens": 512, 45 | "temperature":0 46 | } 47 | return payload 48 | 49 | 50 | def split_list(lst, n): 51 | """Split a list into n (roughly) equal-sized chunks""" 52 | chunk_size = math.ceil(len(lst) / n) # integer division 53 | return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)] 54 | 55 | 56 | def get_chunk(lst, n, k): 57 | chunks = split_list(lst, n) 58 | return chunks[k] 59 | 60 | 61 | def is_none(value): 62 | if value is None: 63 | return True 64 | if type(value) is float and math.isnan(value): 65 | return True 66 | if type(value) is str and value.lower() == 'nan': 67 | return True 68 | if type(value) is str and value.lower() == 'none': 69 | return True 70 | return False 71 | 72 | def get_options(row, options): 73 | parsed_options = [] 74 | for option in options: 75 | option_value = row[option] 76 | if is_none(option_value): 77 | break 78 | parsed_options.append(option_value) 79 | return parsed_options 80 | 81 | headers = { 82 | "Content-Type": "application/json", 83 | "Authorization": f"Bearer {api_key}" 84 | } 85 | 86 | questions = pd.read_table(question_path) 87 | questions = get_chunk(questions, 1, 0) 88 | result_file = open(result_path, "w") 89 | 90 | 91 | for index, row in tqdm(questions.iterrows(), total=len(questions)): 92 | is_done = False 93 | fail_count = 0 94 | 95 | while not is_done: 96 | try: 97 | options = get_options(row, all_options) 98 | cur_option_char = all_options[:len(options)] 99 | 100 | 101 | idx = row['index'] 102 | question = row["question"] 103 | image = row['image'] 104 | 105 | 106 | for option_char, option in zip(all_options[:len(options)], options): 107 | question = question + '\n' + option_char + '. ' + option 108 | qs = question 109 | 110 | payload = create_payload(image, qs + sgPrompt) 111 | response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload) 112 | cur_sg = response.json()["choices"][0]["message"]["content"] 113 | 114 | 115 | new_q = f"Scene Graph: {cur_sg}\n\n{answerPrompt}{qs}\nAnswer with the option's letter from the given choices directly." 
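# Second GPT-4V call: the scene graph generated above is prepended to the question,
# and the model is asked to reply with just the option letter. Note that `image` comes
# straight from the MMBench TSV, which stores each image as a base64 string (the other
# scripts in this repo decode it with load_image_from_base64), so it can be dropped
# into the data URL without re-encoding.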
116 | payload = create_payload(image, new_q) 117 | response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload) 118 | final_ans = response.json()["choices"][0]["message"]["content"] 119 | 120 | 121 | temp_result = {"question_id":idx, "text":final_ans} 122 | result_file.write(json.dumps(temp_result) + "\n") 123 | 124 | 125 | is_done = True 126 | # rotate options 127 | options = options[1:] + options[:1] 128 | cur_option_char = cur_option_char[1:] + cur_option_char[:1] 129 | except: 130 | fail_count += 1 131 | if fail_count == 5: 132 | break 133 | is_done = False 134 | result_file.close() 135 | -------------------------------------------------------------------------------- /GPT-4V/GPT4V_wino.py: -------------------------------------------------------------------------------- 1 | from PIL import Image 2 | import base64 3 | import requests 4 | import json 5 | from tqdm import tqdm 6 | from openai import OpenAI 7 | import json 8 | 9 | image_file = "" # Image Path 10 | question_path = "" #Path to question 11 | result_path = "" #Path to store result 12 | result_file = open(result_path, 'w') 13 | api_key="" #Openai api key 14 | 15 | sgPrompt=''' 16 | For the provided image and its associated question, generate only a scene graph in JSON format that includes the following: 17 | 1. Objects that are relevant to answering the question 18 | 2. Object attributes that are relevant to answering the question 19 | 3. Object relationships that are relevant to answering the question 20 | ''' 21 | answerPrompt=".Use the image and scene graph as context and answer the following question: " 22 | 23 | def create_payload(cur_image, cur_text): 24 | payload = { 25 | "model": "gpt-4-vision-preview", 26 | "messages": [ 27 | { 28 | "role": "user", 29 | "content": [ 30 | { 31 | "type": "image_url", 32 | "image_url": { 33 | "url": f"data:image/jpeg;base64,{cur_image}" 34 | } 35 | }, 36 | { 37 | "type": "text", 38 | "text": cur_text 39 | } 40 | ] 41 | } 42 | ], 43 | "max_tokens": 512, 44 | "temperature":0 45 | } 46 | return payload 47 | 48 | 49 | def encode_image(image_path): 50 | with open(image_path, "rb") as image_file: 51 | return base64.b64encode(image_file.read()).decode('utf-8') 52 | 53 | 54 | headers = { 55 | "Content-Type": "application/json", 56 | "Authorization": f"Bearer {api_key}" 57 | } 58 | 59 | 60 | with open(question_path, 'r') as json_file: 61 | json_list = list(json_file) 62 | 63 | 64 | for json_str in tqdm(json_list): 65 | is_done = False 66 | fail_count = 0 67 | 68 | while not is_done: 69 | try: 70 | result = json.loads(json_str) 71 | cur_image = encode_image(image_file + result["image"] + ".png") 72 | cur_caption = result["caption"] 73 | cur_question = f"Does the given caption accurately describe the given image? Caption:{cur_caption}\n\n{sgPrompt}" 74 | 75 | payload = create_payload(cur_image, cur_question) 76 | response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload) 77 | cur_sg = response.json()["choices"][0]["message"]["content"] 78 | 79 | 80 | new_q = f"Question: Does the given caption accurately describe the given image? 
Caption:{cur_caption}.\n\nAnswer: Scene graph:{cur_sg}\n\nUse the scene graph and image to reason and answer the question:" 81 | payload = create_payload(cur_image, new_q) 82 | response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload) 83 | final_ans = response.json()["choices"][0]["message"]["content"] 84 | 85 | 86 | temp_result = {"text":final_ans} 87 | result_file.write(json.dumps(temp_result) + "\n") 88 | is_done = True 89 | except: 90 | fail_count += 1 91 | if fail_count == 5: 92 | break 93 | is_done = False 94 | result_file.close() 95 | -------------------------------------------------------------------------------- /GPT-4V/Sphinx_bench.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import os 4 | import json 5 | from tqdm import tqdm 6 | import shortuuid 7 | 8 | # from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN 9 | # from llava.conversation import conv_templates, SeparatorStyle 10 | # from llava.model.builder import load_pretrained_model 11 | # from llava.utils import disable_torch_init 12 | # from llava.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria 13 | 14 | from PIL import Image 15 | from SPHINX import SPHINXModel 16 | import math 17 | 18 | answerPrompt="Use the image and scene graph as context to improve the detail and clarity of the original answer: " 19 | 20 | sgPrompt=''' 21 | 22 | For the provided image and question-answer pair, generate a scene graph in JSON format to improve the quality and/or detail of the answer. The scene graph can include the following: 23 | 1. Objects that are relevant to answering the question. 24 | 2. Object attributes that are relevant to answering the question. 25 | 3. Object relationships that are relevant to answering the question. 
26 | 27 | Scene Graph: 28 | ''' 29 | 30 | def split_list(lst, n): 31 | """Split a list into n (roughly) equal-sized chunks""" 32 | chunk_size = math.ceil(len(lst) / n) # integer division 33 | return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)] 34 | 35 | 36 | def get_chunk(lst, n, k): 37 | chunks = split_list(lst, n) 38 | return chunks[k] 39 | 40 | 41 | def eval_model(args): 42 | # Model 43 | # disable_torch_init() 44 | # model_path = os.path.expanduser(args.model_path) 45 | # model_name = get_model_name_from_path(model_path) 46 | # tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name) 47 | 48 | model = SPHINXModel.from_pretrained(pretrained_path="/home/chancharikm/compVL/LLaMA2-Accessory/SPHINX/SPHINX-v2-1k-weights", with_visual=True).to(device='cuda') 49 | model_name = "Sphinx-v2-1k" 50 | 51 | questions = [json.loads(q) for q in open(os.path.expanduser(args.question_file), "r")] 52 | questions = get_chunk(questions, args.num_chunks, args.chunk_idx) 53 | answers_file = os.path.expanduser(args.answers_file) 54 | os.makedirs(os.path.dirname(answers_file), exist_ok=True) 55 | ans_file = open(answers_file, "w") 56 | for line in tqdm(questions): 57 | idx = line["question_id"] 58 | image_file = line["image"] 59 | 60 | #----ZS Ans generation------ 61 | qs = line["text"] 62 | # cur_prompt = qs 63 | # if model.config.mm_use_im_start_end: 64 | # qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs 65 | # else: 66 | # qs = DEFAULT_IMAGE_TOKEN + '\n' + qs 67 | 68 | # conv = conv_templates[args.conv_mode].copy() 69 | # conv.append_message(conv.roles[0], qs) 70 | # conv.append_message(conv.roles[1], None) 71 | # prompt = conv.get_prompt() 72 | 73 | # input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() 74 | 75 | image = Image.open(os.path.join(args.image_folder, image_file)).convert('RGB') 76 | # image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0] 77 | 78 | # stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2 79 | # keywords = [stop_str] 80 | # stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids) 81 | 82 | # with torch.inference_mode(): 83 | # output_ids = model.generate( 84 | # input_ids, 85 | # images=image_tensor.unsqueeze(0).half().cuda(), 86 | # do_sample=True if args.temperature > 0 else False, 87 | # temperature=args.temperature, 88 | # top_p=args.top_p, 89 | # num_beams=args.num_beams, 90 | # # no_repeat_ngram_size=3, 91 | # max_new_tokens=1024, 92 | # use_cache=True) 93 | with torch.cuda.amp.autocast(dtype=torch.float16): 94 | outputs = model.generate_reponse( #No, this is not a typo. This seems to be typo inherited from the Sphinx codebase - let them know via Github! 
95 | [[qs, None]], 96 | image, 97 | temperature=args.temperature, 98 | top_p=args.top_p, 99 | max_gen_len=1024, 100 | seed=0) 101 | outputs = outputs.strip() 102 | 103 | # input_token_len = input_ids.shape[1] 104 | # n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() 105 | # if n_diff_input_output > 0: 106 | # print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 107 | # outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 108 | # outputs = outputs.strip() 109 | # if outputs.endswith(stop_str): 110 | # outputs = outputs[:-len(stop_str)] 111 | # outputs = outputs.strip() 112 | 113 | #-----SG Generation------- 114 | og_ans = outputs 115 | 116 | qs = "Question: " + line["text"] + "\nAnswer: " + og_ans + "\n\n" + sgPrompt 117 | 118 | # cur_prompt = qs 119 | # if model.config.mm_use_im_start_end: 120 | # qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs 121 | # else: 122 | # qs = DEFAULT_IMAGE_TOKEN + '\n' + qs 123 | 124 | # conv = conv_templates[args.conv_mode].copy() 125 | # conv.append_message(conv.roles[0], qs) 126 | # conv.append_message(conv.roles[1], None) 127 | # prompt = conv.get_prompt() 128 | 129 | # input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() 130 | 131 | #image = Image.open(os.path.join(args.image_folder, image_file)) 132 | # image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0] 133 | 134 | # stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2 135 | # keywords = [stop_str] 136 | # stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids) 137 | 138 | # with torch.inference_mode(): 139 | # output_ids = model.generate( 140 | # input_ids, 141 | # images=image_tensor.unsqueeze(0).half().cuda(), 142 | # do_sample=True if args.temperature > 0 else False, 143 | # temperature=args.temperature, 144 | # top_p=args.top_p, 145 | # num_beams=args.num_beams, 146 | # # no_repeat_ngram_size=3, 147 | # max_new_tokens=1024, 148 | # use_cache=True) 149 | 150 | # input_token_len = input_ids.shape[1] 151 | # n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() 152 | # if n_diff_input_output > 0: 153 | # print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 154 | # outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 155 | # outputs = outputs.strip() 156 | # if outputs.endswith(stop_str): 157 | # outputs = outputs[:-len(stop_str)] 158 | # outputs = outputs.strip() 159 | with torch.cuda.amp.autocast(dtype=torch.float16): 160 | outputs = model.generate_reponse( #No, this is not a typo. This seems to be typo inherited from the Sphinx codebase - let them know via Github! 
161 | [[qs, None]], 162 | image, 163 | temperature=args.temperature, 164 | top_p=args.top_p, 165 | max_gen_len=256, 166 | seed=0) 167 | outputs = outputs.strip() 168 | 169 | sg = outputs 170 | #----Improved Answer----- 171 | qs = answerPrompt + "\n\nScene Graph: " + sg + "\n\nQuestion: " + line["text"] + "\nOriginal Answer: " + og_ans + "\nImproved Answer: " 172 | cur_prompt = qs 173 | # cur_prompt = qs 174 | # if model.config.mm_use_im_start_end: 175 | # qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs 176 | # else: 177 | # qs = DEFAULT_IMAGE_TOKEN + '\n' + qs 178 | 179 | # conv = conv_templates[args.conv_mode].copy() 180 | # conv.append_message(conv.roles[0], qs) 181 | # conv.append_message(conv.roles[1], None) 182 | # prompt = conv.get_prompt() 183 | 184 | # input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() 185 | 186 | #image = Image.open(os.path.join(args.image_folder, image_file)) 187 | # image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0] 188 | 189 | # stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2 190 | # keywords = [stop_str] 191 | # stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids) 192 | 193 | # with torch.inference_mode(): 194 | # output_ids = model.generate( 195 | # input_ids, 196 | # images=image_tensor.unsqueeze(0).half().cuda(), 197 | # do_sample=True if args.temperature > 0 else False, 198 | # temperature=args.temperature, 199 | # top_p=args.top_p, 200 | # num_beams=args.num_beams, 201 | # # no_repeat_ngram_size=3, 202 | # max_new_tokens=1024, 203 | # use_cache=True) 204 | 205 | # input_token_len = input_ids.shape[1] 206 | # n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() 207 | # if n_diff_input_output > 0: 208 | # print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 209 | # outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 210 | # outputs = outputs.strip() 211 | # if outputs.endswith(stop_str): 212 | # outputs = outputs[:-len(stop_str)] 213 | # outputs = outputs.strip() 214 | 215 | with torch.cuda.amp.autocast(dtype=torch.float16): 216 | outputs = model.generate_reponse( #No, this is not a typo. This seems to be typo inherited from the Sphinx codebase - let them know via Github! 
217 | [[qs, None]], 218 | image, 219 | temperature=args.temperature, 220 | top_p=args.top_p, 221 | max_gen_len=1024, 222 | seed=0) 223 | outputs = outputs.strip() 224 | ans_id = shortuuid.uuid() 225 | ans_file.write(json.dumps({"question_id": idx, 226 | "prompt": cur_prompt, 227 | "text": outputs, 228 | "answer_id": ans_id, 229 | "model_id": model_name, 230 | "metadata": {}}) + "\n") 231 | ans_file.flush() 232 | ans_file.close() 233 | 234 | if __name__ == "__main__": 235 | parser = argparse.ArgumentParser() 236 | parser.add_argument("--model-path", type=str, default="facebook/opt-350m") 237 | parser.add_argument("--model-base", type=str, default=None) 238 | parser.add_argument("--image-folder", type=str, default="") 239 | parser.add_argument("--question-file", type=str, default="tables/question.jsonl") 240 | parser.add_argument("--answers-file", type=str, default="answer.jsonl") 241 | parser.add_argument("--conv-mode", type=str, default="llava_v1") 242 | parser.add_argument("--num-chunks", type=int, default=1) 243 | parser.add_argument("--chunk-idx", type=int, default=0) 244 | parser.add_argument("--temperature", type=float, default=0.2) 245 | parser.add_argument("--top_p", type=float, default=None) 246 | parser.add_argument("--num_beams", type=int, default=1) 247 | args = parser.parse_args() 248 | 249 | eval_model(args) 250 | -------------------------------------------------------------------------------- /InstructBLIP-13b/InstructBLIP_MMBench.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import os 4 | import json 5 | import pandas as pd 6 | from tqdm import tqdm 7 | import shortuuid 8 | 9 | from llava.utils import disable_torch_init 10 | from llava.mm_utils import load_image_from_base64 11 | from PIL import Image 12 | import math 13 | from transformers import InstructBlipProcessor, InstructBlipForConditionalGeneration, InstructBlipConfig, AutoModelForVision2Seq 14 | import torch 15 | from PIL import Image 16 | import requests 17 | from accelerate import init_empty_weights, infer_auto_device_map 18 | import json 19 | import os 20 | 21 | 22 | processor = InstructBlipProcessor.from_pretrained("Salesforce/instructblip-vicuna-13b", device_map="auto") 23 | model = InstructBlipForConditionalGeneration.from_pretrained("Salesforce/instructblip-vicuna-13b") 24 | model.to("cuda:1") 25 | 26 | 27 | answerPrompt="Use the image and scene graph as context and answer the following question: " 28 | 29 | sgPrompt=''' 30 | For the provided image and its associated question, generate a scene graph in JSON format that includes the following: 31 | 1. Objects that are relevant to answering the question 32 | 2. Object attributes that are relevant to answering the question 33 | 3. 
Object relationships that are relevant to answering the question 34 | 35 | Scene Graph: 36 | ''' 37 | 38 | all_options = ['A', 'B', 'C', 'D'] 39 | 40 | 41 | def split_list(lst, n): 42 | """Split a list into n (roughly) equal-sized chunks""" 43 | chunk_size = math.ceil(len(lst) / n) # integer division 44 | return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)] 45 | 46 | 47 | def get_chunk(lst, n, k): 48 | chunks = split_list(lst, n) 49 | return chunks[k] 50 | 51 | 52 | def is_none(value): 53 | if value is None: 54 | return True 55 | if type(value) is float and math.isnan(value): 56 | return True 57 | if type(value) is str and value.lower() == 'nan': 58 | return True 59 | if type(value) is str and value.lower() == 'none': 60 | return True 61 | return False 62 | 63 | def get_options(row, options): 64 | parsed_options = [] 65 | for option in options: 66 | option_value = row[option] 67 | if is_none(option_value): 68 | break 69 | parsed_options.append(option_value) 70 | return parsed_options 71 | 72 | 73 | def eval_model(args): 74 | # Model 75 | disable_torch_init() 76 | 77 | questions = pd.read_table(os.path.expanduser(args.question_file)) 78 | questions = get_chunk(questions, args.num_chunks, args.chunk_idx) 79 | answers_file = os.path.expanduser(args.answers_file) 80 | os.makedirs(os.path.dirname(answers_file), exist_ok=True) 81 | ans_file = open(answers_file, "w") 82 | 83 | for index, row in tqdm(questions.iterrows(), total=len(questions)): 84 | options = get_options(row, all_options) 85 | cur_option_char = all_options[:len(options)] 86 | 87 | if args.all_rounds: 88 | num_rounds = len(options) 89 | else: 90 | num_rounds = 1 91 | 92 | for round_idx in range(num_rounds): 93 | idx = row['index'] 94 | 95 | question = row["question"] 96 | 97 | image = load_image_from_base64(row['image']) 98 | 99 | for option_char, option in zip(all_options[:len(options)], options): 100 | question = question + '\n' + option_char + '. ' + option 101 | qs = cur_prompt = question 102 | 103 | 104 | if args.single_pred_prompt: 105 | if args.lang == 'cn': 106 | qs = qs + '\n' + "请直接回答选项字母。" 107 | else: 108 | 109 | qs = qs + sgPrompt 110 | 111 | 112 | prompt = " " + qs 113 | inputs = processor(images=image, text=prompt, return_tensors="pt").to("cuda:1") 114 | outputs = model.generate( 115 | **inputs, 116 | do_sample=False, 117 | num_beams=5, 118 | max_length=256, 119 | min_length=1, 120 | top_p=0.9, 121 | repetition_penalty=1.5, 122 | length_penalty=0.5, 123 | temperature=1, 124 | ) 125 | generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip() 126 | 127 | 128 | prompt_score = "" + " Scene Graph: " + generated_text + '\n\n' + answerPrompt + cur_prompt + ". 
The correct letter is" 129 | inputs2 = processor(images=image, text=prompt_score, return_tensors="pt").to("cuda:1") 130 | outputs2 = model.generate( 131 | **inputs2, 132 | do_sample=False, 133 | num_beams=5, 134 | max_length=256, 135 | min_length=1, 136 | top_p=0.9, 137 | repetition_penalty=1.5, 138 | length_penalty=0.5, 139 | temperature=1, 140 | ) 141 | 142 | generated_text = processor.batch_decode(outputs2, skip_special_tokens=True)[0].strip() 143 | 144 | ans_file.write(json.dumps({"question_id": idx, 145 | "round_id": round_idx, 146 | "prompt": cur_prompt, 147 | "text": generated_text, 148 | "options": options, 149 | "option_char": cur_option_char, 150 | "metadata": {}}) + "\n") 151 | ans_file.flush() 152 | 153 | # rotate options 154 | options = options[1:] + options[:1] 155 | cur_option_char = cur_option_char[1:] + cur_option_char[:1] 156 | ans_file.close() 157 | 158 | if __name__ == "__main__": 159 | parser = argparse.ArgumentParser() 160 | parser.add_argument("--model-path", type=str, default="facebook/opt-350m") 161 | parser.add_argument("--model-base", type=str, default=None) 162 | parser.add_argument("--image-folder", type=str, default="") 163 | parser.add_argument("--question-file", type=str, default="") 164 | parser.add_argument("--answers-file", type=str, default="") 165 | parser.add_argument("--conv-mode", type=str, default="llava_v1") 166 | parser.add_argument("--num-chunks", type=int, default=1) 167 | parser.add_argument("--chunk-idx", type=int, default=0) 168 | parser.add_argument("--temperature", type=float, default=0.2) 169 | parser.add_argument("--top_p", type=float, default=None) 170 | parser.add_argument("--num_beams", type=int, default=1) 171 | parser.add_argument("--all-rounds", action="store_true") 172 | parser.add_argument("--single-pred-prompt", action="store_true") 173 | parser.add_argument("--lang", type=str, default="en") 174 | 175 | args = parser.parse_args() 176 | 177 | eval_model(args) 178 | -------------------------------------------------------------------------------- /InstructBLIP-13b/InstructBLIP_Seed.py: -------------------------------------------------------------------------------- 1 | from transformers import InstructBlipProcessor, InstructBlipForConditionalGeneration, InstructBlipConfig, AutoModelForVision2Seq 2 | import torch 3 | from PIL import Image 4 | import requests 5 | from accelerate import init_empty_weights, infer_auto_device_map 6 | import json 7 | import os 8 | from tqdm import tqdm 9 | 10 | 11 | # Determine if CUDA (GPU) is available. 12 | device = "cuda" if torch.cuda.is_available() else "cpu" 13 | 14 | 15 | # Load the model configuration. 16 | config = InstructBlipConfig.from_pretrained("Salesforce/instructblip-vicuna-13b") 17 | 18 | # Initialize the model with the given configuration. 19 | with init_empty_weights(): 20 | 21 | model = AutoModelForVision2Seq.from_config(config) 22 | model.tie_weights() 23 | 24 | # Infer device map based on the available resources. 25 | device_map = infer_auto_device_map(model, max_memory={7: "20GiB", 8: "20GiB", 9: "20GiB"}, 26 | no_split_module_classes=['InstructBlipEncoderLayer', 'InstructBlipQFormerLayer', 27 | 'LlamaDecoderLayer']) 28 | 29 | device_map['language_model.lm_head'] = device_map['language_projection'] = device_map[('language_model.model' 30 | '.embed_tokens')] 31 | 32 | offload = "" 33 | # Load the processor and model for image processing. 
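# Note: the model is built twice -- once above on empty ("meta") weights purely to
# infer a device map, and again below with real weights placed according to that map.
# `offload` is left as an empty string here; if any layers end up offloaded to disk,
# it should point to a writable folder (hypothetical example: offload = "./offload").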
34 | processor = InstructBlipProcessor.from_pretrained("Salesforce/instructblip-vicuna-13b", device_map="auto") 35 | model = InstructBlipForConditionalGeneration.from_pretrained("Salesforce/instructblip-vicuna-13b", 36 | device_map=device_map, 37 | offload_folder=offload, offload_state_dict=True) 38 | 39 | 40 | sgPrompt=''' 41 | For the provided image and its associated question, generate a scene graph in JSON format that includes the following: 42 | 1. Objects that are relevant to answering the question. 43 | 2. Object attributes that are relevant to answering the question. 44 | 3. Object relationships that are relevant to answering the question. 45 | 46 | Scene Graph: 47 | ''' 48 | 49 | 50 | qs_path = "" #Path to question 51 | ans_path = "" #Path to store result 52 | img_dir = "" #Path to image 53 | ans_file = open(ans_path, 'w') 54 | 55 | 56 | with open(qs_path, 'r') as json_file: 57 | json_list = list(json_file) 58 | 59 | 60 | count = 0 61 | for json_str in tqdm(json_list): 62 | result = json.loads(json_str) 63 | try: 64 | cur_image = img_dir + result["image"] 65 | image = Image.open(cur_image).convert("RGB") 66 | prompt = " " + result["text"].split("?")[0] + "?" + sgPrompt 67 | 68 | 69 | inputs = processor(images=image, text=prompt, return_tensors="pt").to("cuda") 70 | outputs = model.generate( 71 | **inputs, 72 | do_sample=False, 73 | num_beams=5, 74 | max_length=256, 75 | min_length=1, 76 | top_p=0.9, 77 | repetition_penalty=1.5, 78 | length_penalty=0.5, 79 | temperature=0, 80 | ) 81 | generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip() 82 | 83 | 84 | answerPrompt="Use the image and scene graph as context and answer the following question: " 85 | prompt_score = " Scene Graph: " + generated_text + '\n\n' + answerPrompt + result["text"] + ". The correct letter is" 86 | inputs2 = processor(images=image, text=prompt_score, return_tensors="pt").to("cuda") 87 | outputs2 = model.generate( 88 | **inputs2, 89 | do_sample=False, 90 | num_beams=5, 91 | max_length=256, 92 | min_length=1, 93 | top_p=0.9, 94 | repetition_penalty=1.5, 95 | length_penalty=0.5, 96 | temperature=0, 97 | ) 98 | generated_text = processor.batch_decode(outputs2, skip_special_tokens=True)[0].strip() 99 | except: 100 | generated_text = "None" 101 | 102 | temp_result = {"question_id":result["question_id"], "text":generated_text} 103 | ans_file.write(json.dumps(temp_result) + "\n") 104 | ans_file.close() 105 | -------------------------------------------------------------------------------- /InstructBLIP-13b/InstructBLIP_Whoops.py: -------------------------------------------------------------------------------- 1 | from transformers import InstructBlipProcessor, InstructBlipForConditionalGeneration, InstructBlipConfig, AutoModelForVision2Seq 2 | import torch 3 | from PIL import Image 4 | import requests 5 | from accelerate import init_empty_weights, infer_auto_device_map 6 | import json 7 | import os 8 | from tqdm import tqdm 9 | from datasets import load_dataset 10 | 11 | 12 | hf_key="" #Huggingface auth key 13 | result_path="" #Path to store result 14 | 15 | sgPrompt=''' 16 | For the provided image and its associated question, generate a scene graph in JSON format that includes the following: 17 | 1. Objects that are relevant to answering the question. 18 | 2. Object attributes that are relevant to answering the question. 19 | 3. Object relationships that are relevant to answering the question. 20 | 21 | Scene Graph: 22 | ''' 23 | 24 | 25 | # Determine if CUDA (GPU) is available. 
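# Pipeline sketch (two InstructBLIP passes per question, as in the other scripts):
# 1) get_ans(question, image) below prompts for a JSON scene graph (up to 256 tokens);
# 2) get_ans(question, image, pred_sg) feeds that scene graph back and asks for a short
#    answer (up to 64 tokens).
# The `device` variable below appears unused: inputs are moved with `model.device`
# and weight placement is handled by `device_map="auto"`.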
26 | device = "cuda" if torch.cuda.is_available() else "cpu" 27 | 28 | 29 | # Load the model configuration. 30 | config = InstructBlipConfig.from_pretrained("Salesforce/instructblip-vicuna-13b") 31 | 32 | # Initialize the model with the given configuration. 33 | with init_empty_weights(): 34 | 35 | model = AutoModelForVision2Seq.from_config(config) 36 | model.tie_weights() 37 | 38 | # Load the processor and model for image processing. 39 | processor = InstructBlipProcessor.from_pretrained("Salesforce/instructblip-vicuna-13b", device_map="auto") 40 | model = InstructBlipForConditionalGeneration.from_pretrained("Salesforce/instructblip-vicuna-13b", 41 | device_map="auto") 42 | 43 | 44 | def get_ans(question, image_tensor, pred_sg=None): 45 | if pred_sg is None: 46 | prompt = " " + question + "\n\n" + sgPrompt 47 | max_token = 256 48 | else: 49 | prompt = f" Question:{question} Scene Graph:{pred_sg}\n\n Use the image and scene graph to reason and provide a short answer:" 50 | max_token = 64 51 | inputs = processor(images=image_tensor, text=prompt, return_tensors="pt").to(model.device) 52 | outputs = model.generate( 53 | **inputs, 54 | do_sample=False, 55 | num_beams=5, 56 | max_length=max_token, 57 | min_length=1, 58 | top_p=0.9, 59 | repetition_penalty=1.5, 60 | length_penalty=0.5, 61 | temperature=0, 62 | ) 63 | generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip() 64 | 65 | return generated_text 66 | 67 | 68 | result_file = open(result_path, "w") 69 | examples = load_dataset('nlphuji/whoops', use_auth_token=hf_key) 70 | 71 | for item in tqdm(examples["test"]): 72 | 73 | image_tensor = item["image"].convert("RGB") 74 | all_pred = [] 75 | for q_a in item["question_answering_pairs"]: 76 | 77 | 78 | question = q_a[0] 79 | pred_sg = get_ans(question, image_tensor) 80 | pred_ans = get_ans(question, image_tensor, pred_sg) 81 | all_pred.append(pred_ans) 82 | 83 | 84 | result_file.write(json.dumps({ "image_id": item["image_id"], 85 | "question_answering_pairs": item["question_answering_pairs"], 86 | "prediction": all_pred}) + "\n") 87 | 88 | 89 | result_file.close() 90 | -------------------------------------------------------------------------------- /InstructBLIP-13b/InstructBLIP_wino.py: -------------------------------------------------------------------------------- 1 | from transformers import InstructBlipProcessor, InstructBlipForConditionalGeneration, InstructBlipConfig, AutoModelForVision2Seq 2 | import torch 3 | from PIL import Image 4 | import requests 5 | from accelerate import init_empty_weights, infer_auto_device_map 6 | import json 7 | import os 8 | from tqdm import tqdm 9 | 10 | 11 | # Determine if CUDA (GPU) is available. 12 | device = "cuda" if torch.cuda.is_available() else "cpu" 13 | 14 | 15 | # Load the model configuration. 16 | config = InstructBlipConfig.from_pretrained("Salesforce/instructblip-vicuna-13b") 17 | 18 | # Initialize the model with the given configuration. 19 | with init_empty_weights(): 20 | 21 | model = AutoModelForVision2Seq.from_config(config) 22 | model.tie_weights() 23 | 24 | # Load the processor and model for image processing. 25 | processor = InstructBlipProcessor.from_pretrained("Salesforce/instructblip-vicuna-13b", device_map="auto") 26 | model = InstructBlipForConditionalGeneration.from_pretrained("Salesforce/instructblip-vicuna-13b", 27 | device_map="auto") 28 | 29 | 30 | sgPrompt=''' 31 | For the provided image and its associated question, generate a scene graph in JSON format that includes the following: 32 | 1. 
Objects that are relevant to answering the question. 33 | 2. Object attributes that are relevant to answering the question. 34 | 3. Object relationships that are relevant to answering the question. 35 | 36 | Scene Graph: 37 | ''' 38 | 39 | image_file = "" #Path to image file 40 | question_path = "" #Path to question file 41 | result_path = "" #Path to store result 42 | result_file = open(result_path, 'w') 43 | 44 | 45 | with open(question_path, 'r') as json_file: 46 | json_list = list(json_file) 47 | 48 | 49 | for json_str in tqdm(json_list): 50 | 51 | 52 | result = json.loads(json_str) 53 | cur_image = image_file + result["image"] + ".png" 54 | image = Image.open(cur_image).convert("RGB") 55 | cur_caption = result["caption"] 56 | 57 | 58 | prompt = f" Does the given caption accurately describe the given image? Caption:{cur_caption}.\n\n{sgPrompt}" 59 | inputs = processor(images=image, text=prompt, return_tensors="pt").to("cuda") 60 | outputs = model.generate( 61 | **inputs, 62 | do_sample=False, 63 | num_beams=5, 64 | max_length=256, 65 | min_length=1, 66 | top_p=0.9, 67 | repetition_penalty=1.5, 68 | length_penalty=0.5, 69 | temperature=0, 70 | ) 71 | generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip() 72 | 73 | 74 | answerPrompt = "Use the image and scene graph to reason and answer the question." 75 | prompt = f" Question: Does the given caption accurately describe the given image? Caption:{cur_caption}. Scene Graph: {generated_text}\n\n{answerPrompt}" 76 | inputs = processor(images=image, text=prompt, return_tensors="pt").to("cuda") 77 | outputs = model.generate( 78 | **inputs, 79 | do_sample=False, 80 | num_beams=5, 81 | max_length=256, 82 | min_length=1, 83 | top_p=0.9, 84 | repetition_penalty=1.5, 85 | length_penalty=0.5, 86 | temperature=0, 87 | ) 88 | generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip() 89 | stored_result = {"text":generated_text} 90 | 91 | 92 | result_file.write(json.dumps(stored_result) + "\n") 93 | 94 | result_file.close() 95 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Chancharik Mitra 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /LLaVA-1.5-13b/LLaVA_MMBench.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import os 4 | import json 5 | import pandas as pd 6 | from tqdm import tqdm 7 | import shortuuid 8 | 9 | from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN 10 | from llava.conversation import conv_templates, SeparatorStyle 11 | from llava.model.builder import load_pretrained_model 12 | from llava.utils import disable_torch_init 13 | from llava.mm_utils import tokenizer_image_token, process_images, load_image_from_base64, get_model_name_from_path 14 | 15 | from PIL import Image 16 | import math 17 | 18 | answerPrompt="Use the image and scene graph as context and answer the following question: " 19 | answerPrompt2="Use the image and context to answer the following question: " 20 | sgPrompt=''' 21 | For the provided image and its associated question, generate a scene graph in JSON format that includes the following: 22 | 1. Objects that are relevant to answering the question 23 | 2. Object attributes that are relevant to answering the question 24 | 3. Object relationships that are relevant to answering the question 25 | 26 | Scene Graph: 27 | ''' 28 | 29 | all_options = ['A', 'B', 'C', 'D'] 30 | 31 | 32 | def split_list(lst, n): 33 | """Split a list into n (roughly) equal-sized chunks""" 34 | chunk_size = math.ceil(len(lst) / n) # integer division 35 | return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)] 36 | 37 | 38 | def get_chunk(lst, n, k): 39 | chunks = split_list(lst, n) 40 | return chunks[k] 41 | 42 | 43 | def is_none(value): 44 | if value is None: 45 | return True 46 | if type(value) is float and math.isnan(value): 47 | return True 48 | if type(value) is str and value.lower() == 'nan': 49 | return True 50 | if type(value) is str and value.lower() == 'none': 51 | return True 52 | return False 53 | 54 | def get_options(row, options): 55 | parsed_options = [] 56 | for option in options: 57 | option_value = row[option] 58 | if is_none(option_value): 59 | break 60 | parsed_options.append(option_value) 61 | return parsed_options 62 | 63 | 64 | #Scene-Graph Generation Step: 65 | def get_sg(row, model, image_processor, tokenizer): 66 | idx = row['index'] 67 | question = row['question'] 68 | hint = row['hint'] 69 | image = load_image_from_base64(row['image']) 70 | if not is_none(hint): 71 | question = hint + '\n' + question 72 | 73 | qs = question 74 | if model.config.mm_use_im_start_end: 75 | qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs 76 | else: 77 | qs = DEFAULT_IMAGE_TOKEN + '\n' + qs 78 | 79 | if args.single_pred_prompt: 80 | if args.lang == 'cn': 81 | qs = qs + '\n' + "请直接回答选项字母。" 82 | else: 83 | qs = qs + '\n' + sgPrompt 84 | # qs = qs + '\n' + "Let's think step by step:" 85 | # qs = "Provide a caption for the image." 
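# The two commented-out lines above appear to be alternative baseline prompts
# (chain-of-thought and plain captioning). The active branch appends `sgPrompt`, so this
# first LLaVA pass emits a JSON scene graph that get_sg() wraps into the second-stage
# question it returns.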
86 | 87 | conv = conv_templates[args.conv_mode].copy() 88 | conv.append_message(conv.roles[0], qs) 89 | conv.append_message(conv.roles[1], None) 90 | prompt = conv.get_prompt() 91 | 92 | input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() 93 | 94 | image_tensor = process_images([image], image_processor, model.config)[0] 95 | # image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0] 96 | 97 | stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2 98 | 99 | with torch.inference_mode(): 100 | output_ids = model.generate( 101 | input_ids, 102 | images=image_tensor.unsqueeze(0).half().cuda(), 103 | do_sample=True if args.temperature > 0 else False, 104 | temperature=args.temperature, 105 | top_p=args.top_p, 106 | num_beams=args.num_beams, 107 | # no_repeat_ngram_size=3, 108 | max_new_tokens=256, 109 | use_cache=True) 110 | 111 | input_token_len = input_ids.shape[1] 112 | n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() 113 | if n_diff_input_output > 0: 114 | print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 115 | outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 116 | outputs = outputs.strip() 117 | if outputs.endswith(stop_str): 118 | outputs = outputs[:-len(stop_str)] 119 | outputs = outputs.strip() 120 | 121 | return "Scene Graph: " + outputs + "\n\n" + answerPrompt + row["question"] 122 | # return row["question"] + "Let's think step by step:" + outputs + ".\n\n" 123 | # return "Context: " + outputs + "\n\n" + answerPrompt2 + row["question"] 124 | 125 | #Answer Extraction and Evaluation Step: 126 | def eval_model(args): 127 | # Model 128 | disable_torch_init() 129 | model_path = os.path.expanduser(args.model_path) 130 | model_name = get_model_name_from_path(model_path) 131 | tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name) 132 | 133 | questions = pd.read_table(os.path.expanduser(args.question_file)) 134 | questions = get_chunk(questions, args.num_chunks, args.chunk_idx) 135 | answers_file = os.path.expanduser(args.answers_file) 136 | os.makedirs(os.path.dirname(answers_file), exist_ok=True) 137 | ans_file = open(answers_file, "w") 138 | 139 | if 'plain' in model_name and 'finetune' not in model_name.lower() and 'mmtag' not in args.conv_mode: 140 | args.conv_mode = args.conv_mode + '_mmtag' 141 | print(f'It seems that this is a plain model, but it is not using a mmtag prompt, auto switching to {args.conv_mode}.') 142 | 143 | for index, row in tqdm(questions.iterrows(), total=len(questions)): 144 | options = get_options(row, all_options) 145 | cur_option_char = all_options[:len(options)] 146 | 147 | if args.all_rounds: 148 | num_rounds = len(options) 149 | else: 150 | num_rounds = 1 151 | 152 | for round_idx in range(num_rounds): 153 | idx = row['index'] 154 | question = get_sg(row, model, image_processor, tokenizer) 155 | # question = row["question"] 156 | 157 | image = load_image_from_base64(row['image']) 158 | 159 | for option_char, option in zip(all_options[:len(options)], options): 160 | question = question + '\n' + option_char + '. 
' + option 161 | qs = cur_prompt = question 162 | if model.config.mm_use_im_start_end: 163 | qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs 164 | else: 165 | qs = DEFAULT_IMAGE_TOKEN + '\n' + qs 166 | 167 | if args.single_pred_prompt: 168 | if args.lang == 'cn': 169 | qs = qs + '\n' + "请直接回答选项字母。" 170 | else: 171 | qs = qs + '\n' + "Answer with the option's letter from the given choices directly." 172 | # qs = qs + "Therefore, the answer with the option's letter from the given choices directly is " 173 | 174 | conv = conv_templates[args.conv_mode].copy() 175 | conv.append_message(conv.roles[0], qs) 176 | conv.append_message(conv.roles[1], None) 177 | prompt = conv.get_prompt() 178 | 179 | input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() 180 | 181 | image_tensor = process_images([image], image_processor, model.config)[0] 182 | # image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0] 183 | 184 | stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2 185 | 186 | with torch.inference_mode(): 187 | output_ids = model.generate( 188 | input_ids, 189 | images=image_tensor.unsqueeze(0).half().cuda(), 190 | do_sample=True if args.temperature > 0 else False, 191 | temperature=args.temperature, 192 | top_p=args.top_p, 193 | num_beams=args.num_beams, 194 | # no_repeat_ngram_size=3, 195 | max_new_tokens=1024, 196 | use_cache=True) 197 | 198 | input_token_len = input_ids.shape[1] 199 | n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() 200 | if n_diff_input_output > 0: 201 | print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 202 | outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 203 | outputs = outputs.strip() 204 | if outputs.endswith(stop_str): 205 | outputs = outputs[:-len(stop_str)] 206 | outputs = outputs.strip() 207 | 208 | ans_id = shortuuid.uuid() 209 | ans_file.write(json.dumps({"question_id": idx, 210 | "round_id": round_idx, 211 | "prompt": cur_prompt, 212 | "text": outputs, 213 | "options": options, 214 | "option_char": cur_option_char, 215 | "answer_id": ans_id, 216 | "model_id": model_name, 217 | "metadata": {}}) + "\n") 218 | ans_file.flush() 219 | 220 | # rotate options 221 | options = options[1:] + options[:1] 222 | cur_option_char = cur_option_char[1:] + cur_option_char[:1] 223 | ans_file.close() 224 | 225 | if __name__ == "__main__": 226 | parser = argparse.ArgumentParser() 227 | parser.add_argument("--model-path", type=str, default="facebook/opt-350m") 228 | parser.add_argument("--model-base", type=str, default=None) 229 | parser.add_argument("--image-folder", type=str, default="") 230 | parser.add_argument("--question-file", type=str, default="tables/question.jsonl") 231 | parser.add_argument("--answers-file", type=str, default="answer.jsonl") 232 | parser.add_argument("--conv-mode", type=str, default="llava_v1") 233 | parser.add_argument("--num-chunks", type=int, default=1) 234 | parser.add_argument("--chunk-idx", type=int, default=0) 235 | parser.add_argument("--temperature", type=float, default=0.2) 236 | parser.add_argument("--top_p", type=float, default=None) 237 | parser.add_argument("--num_beams", type=int, default=1) 238 | parser.add_argument("--all-rounds", action="store_true") 239 | parser.add_argument("--single-pred-prompt", action="store_true") 240 | parser.add_argument("--lang", type=str, default="en") 241 
| 242 | args = parser.parse_args() 243 | 244 | eval_model(args) -------------------------------------------------------------------------------- /LLaVA-1.5-13b/LLaVA_SEED.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import os 4 | import json 5 | from tqdm import tqdm 6 | import shortuuid 7 | 8 | from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN 9 | from llava.conversation import conv_templates, SeparatorStyle 10 | from llava.model.builder import load_pretrained_model 11 | from llava.utils import disable_torch_init 12 | from llava.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path 13 | from torch.utils.data import Dataset, DataLoader 14 | 15 | from PIL import Image 16 | import math 17 | 18 | 19 | answerPrompt="Use the image and scene graph as context and answer the following question: " 20 | 21 | sgPrompt=''' 22 | For the provided image and its associated question, generate a scene graph in JSON format that includes the following: 23 | 1. Objects that are relevant to answering the question 24 | 2. Object attributes that are relevant to answering the question 25 | 3. Object relationships that are relevant to answering the question 26 | 27 | Scene Graph: 28 | ''' 29 | 30 | 31 | 32 | 33 | 34 | def split_list(lst, n): 35 | """Split a list into n (roughly) equal-sized chunks""" 36 | chunk_size = math.ceil(len(lst) / n) # integer division 37 | return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)] 38 | 39 | 40 | def get_chunk(lst, n, k): 41 | chunks = split_list(lst, n) 42 | return chunks[k] 43 | 44 | 45 | # Custom dataset class 46 | class CustomDataset(Dataset): 47 | def __init__(self, questions, image_folder, tokenizer, image_processor, model_config, sg_prompt): 48 | self.questions = questions 49 | self.image_folder = image_folder 50 | self.tokenizer = tokenizer 51 | self.image_processor = image_processor 52 | self.model_config = model_config 53 | self.sg_prompt = sg_prompt 54 | def __getitem__(self, index): 55 | line = self.questions[index] 56 | image_file = line["image"] 57 | if self.sg_prompt == 1: 58 | 59 | qs = line["text"].split("?")[0] + sgPrompt 60 | else: 61 | qs = line["text"] 62 | if self.model_config.mm_use_im_start_end: 63 | qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs 64 | else: 65 | qs = DEFAULT_IMAGE_TOKEN + '\n' + qs 66 | 67 | conv = conv_templates[args.conv_mode].copy() 68 | conv.append_message(conv.roles[0], qs) 69 | conv.append_message(conv.roles[1], None) 70 | prompt = conv.get_prompt() 71 | 72 | image = Image.open(os.path.join(self.image_folder, image_file)).convert('RGB') 73 | image_tensor = process_images([image], self.image_processor, self.model_config)[0] 74 | 75 | input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt') 76 | 77 | return input_ids, image_tensor 78 | 79 | def __len__(self): 80 | return len(self.questions) 81 | 82 | 83 | # DataLoader 84 | def create_data_loader(questions, image_folder, tokenizer, image_processor, model_config, batch_size=1, num_workers=4, sg_prompt = 0): 85 | assert batch_size == 1, "batch_size must be 1" 86 | dataset = CustomDataset(questions, image_folder, tokenizer, image_processor, model_config, sg_prompt) 87 | data_loader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False) 88 | return data_loader 89 | 90 | #Scene-Graph Generation Step: 
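# get_sg_prompt() is the first of two passes: for every SEED question it generates a
# scene graph and writes an intermediate question file whose "text" field already reads
# "Scene Graph: ... <answerPrompt> <original question>". Running the script again with
# --scene_graph 0 on that file (eval_model below) produces the final answers.
# Minimal sketch of the two-stage invocation (paths are placeholders):
#   python LLaVA_SEED.py --scene_graph 1 --question-file seed.jsonl --image-folder imgs/ --answers-file seed_with_sg.jsonl
#   python LLaVA_SEED.py --scene_graph 0 --question-file seed_with_sg.jsonl --image-folder imgs/ --answers-file seed_answers.jsonl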
91 | def get_sg_prompt(args): 92 | 93 | disable_torch_init() 94 | model_path = os.path.expanduser(args.model_path) 95 | model_name = get_model_name_from_path(model_path) 96 | tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name) 97 | 98 | 99 | 100 | 101 | questions = [json.loads(q) for q in open(os.path.expanduser(args.question_file), "r")] 102 | 103 | 104 | 105 | #Add the prompt in dataloader instead 106 | 107 | 108 | questions = get_chunk(questions, args.num_chunks, args.chunk_idx) 109 | 110 | 111 | q_file = os.path.expanduser(args.answers_file) 112 | os.makedirs(os.path.dirname(q_file), exist_ok=True) 113 | q_file = open(q_file, "w") 114 | 115 | if 'plain' in model_name and 'finetune' not in model_name.lower() and 'mmtag' not in args.conv_mode: 116 | args.conv_mode = args.conv_mode + '_mmtag' 117 | print(f'It seems that this is a plain model, but it is not using a mmtag prompt, auto switching to {args.conv_mode}.') 118 | 119 | data_loader = create_data_loader(questions, args.image_folder, tokenizer, image_processor, model.config, sg_prompt = 1) 120 | 121 | 122 | for (input_ids, image_tensor), line in tqdm(zip(data_loader, questions), total=len(questions)): 123 | 124 | stop_str = conv_templates[args.conv_mode].sep if conv_templates[args.conv_mode].sep_style != SeparatorStyle.TWO else conv_templates[args.conv_mode].sep2 125 | input_ids = input_ids.to(device='cuda', non_blocking=True) 126 | 127 | with torch.inference_mode(): 128 | output_ids = model.generate( 129 | input_ids, 130 | images=image_tensor.to(dtype=torch.float16, device='cuda', non_blocking=True), 131 | do_sample=True if args.temperature > 0 else False, 132 | temperature=args.temperature, 133 | top_p=args.top_p, 134 | num_beams=args.num_beams, 135 | max_new_tokens=256, 136 | use_cache=True) 137 | 138 | input_token_len = input_ids.shape[1] 139 | n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() 140 | if n_diff_input_output > 0: 141 | print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 142 | outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 143 | outputs = outputs.strip() 144 | if outputs.endswith(stop_str): 145 | outputs = outputs[:-len(stop_str)] 146 | outputs = outputs.strip() 147 | 148 | 149 | q_file.write(json.dumps({ "image": line["image"], 150 | "text": " Scene Graph: " + outputs + '\n\n' + answerPrompt + line["text"], 151 | "question_id": line["question_id"]}) + "\n") 152 | 153 | 154 | q_file.close() 155 | 156 | #Answer Extraction and Evaluation Step: 157 | def eval_model(args): 158 | # Model 159 | disable_torch_init() 160 | model_path = os.path.expanduser(args.model_path) 161 | model_name = get_model_name_from_path(model_path) 162 | tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name) 163 | 164 | questions = [json.loads(q) for q in open(os.path.expanduser(args.question_file), "r")] 165 | 166 | 167 | 168 | questions = get_chunk(questions, args.num_chunks, args.chunk_idx) 169 | 170 | 171 | answers_file = os.path.expanduser(args.answers_file) 172 | os.makedirs(os.path.dirname(answers_file), exist_ok=True) 173 | ans_file = open(answers_file, "w") 174 | 175 | if 'plain' in model_name and 'finetune' not in model_name.lower() and 'mmtag' not in args.conv_mode: 176 | args.conv_mode = args.conv_mode + '_mmtag' 177 | print(f'It seems that this is a plain model, but it is not using a mmtag prompt, auto 
switching to {args.conv_mode}.') 178 | 179 | data_loader = create_data_loader(questions, args.image_folder, tokenizer, image_processor, model.config) 180 | 181 | for (input_ids, image_tensor), line in tqdm(zip(data_loader, questions), total=len(questions)): 182 | idx = line["question_id"] 183 | cur_prompt = line["text"] 184 | 185 | stop_str = conv_templates[args.conv_mode].sep if conv_templates[args.conv_mode].sep_style != SeparatorStyle.TWO else conv_templates[args.conv_mode].sep2 186 | input_ids = input_ids.to(device='cuda', non_blocking=True) 187 | 188 | 189 | with torch.inference_mode(): 190 | output_ids = model.generate( 191 | input_ids, 192 | images=image_tensor.to(dtype=torch.float16, device='cuda', non_blocking=True), 193 | do_sample=True if args.temperature > 0 else False, 194 | temperature=args.temperature, 195 | top_p=args.top_p, 196 | num_beams=args.num_beams, 197 | max_new_tokens=256, 198 | use_cache=True) 199 | 200 | input_token_len = input_ids.shape[1] 201 | n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() 202 | if n_diff_input_output > 0: 203 | print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 204 | outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 205 | outputs = outputs.strip() 206 | if outputs.endswith(stop_str): 207 | outputs = outputs[:-len(stop_str)] 208 | outputs = outputs.strip() 209 | 210 | ans_id = shortuuid.uuid() 211 | ans_file.write(json.dumps({"question_id": idx, 212 | "prompt": cur_prompt, 213 | "text": outputs, 214 | "answer_id": ans_id, 215 | "model_id": model_name, 216 | "metadata": {}}) + "\n") 217 | # ans_file.flush() 218 | ans_file.close() 219 | 220 | if __name__ == "__main__": 221 | parser = argparse.ArgumentParser() 222 | parser.add_argument("--model-path", type=str, default="facebook/opt-350m") 223 | parser.add_argument("--model-base", type=str, default=None) 224 | parser.add_argument("--image-folder", type=str, default="") 225 | parser.add_argument("--question-file", type=str, default="tables/question.jsonl") 226 | parser.add_argument("--answers-file", type=str, default="answer.jsonl") 227 | parser.add_argument("--conv-mode", type=str, default="llava_v1") 228 | parser.add_argument("--num-chunks", type=int, default=1) 229 | parser.add_argument("--chunk-idx", type=int, default=0) 230 | parser.add_argument("--temperature", type=float, default=0.2) 231 | parser.add_argument("--top_p", type=float, default=None) 232 | parser.add_argument("--num_beams", type=int, default=1) 233 | 234 | parser.add_argument("--scene_graph", type=int, default=0) 235 | 236 | args = parser.parse_args() 237 | 238 | 239 | if args.scene_graph == 0: 240 | eval_model(args) 241 | else: 242 | get_sg_prompt(args) -------------------------------------------------------------------------------- /LLaVA-1.5-13b/LLaVA_Whoops.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import os 4 | import json 5 | from tqdm import tqdm 6 | import shortuuid 7 | 8 | from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN 9 | from llava.conversation import conv_templates, SeparatorStyle 10 | from llava.model.builder import load_pretrained_model 11 | from llava.utils import disable_torch_init 12 | from llava.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path 13 | from torch.utils.data import Dataset, DataLoader 14 | 15 | from PIL 
import Image 16 | import math 17 | from datasets import load_dataset 18 | 19 | 20 | sgPrompt=''' 21 | For the provided image and its associated question, generate a scene graph in JSON format that includes the following: 22 | 1. Objects that are relevant to answering the question 23 | 2. Object attributes that are relevant to answering the question 24 | 3. Object relationships that are relevant to answering the question 25 | 26 | Scene Graph: 27 | ''' 28 | answerPrompt="\nUse the image and scene graph to reason and answer the question with a single phrase." 29 | 30 | 31 | disable_torch_init() 32 | model_path = os.path.expanduser("liuhaotian/llava-v1.5-13b") 33 | model_name = get_model_name_from_path(model_path) 34 | tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, None, model_name) 35 | 36 | 37 | def get_ans(question, image_tensor, sg = 0): 38 | 39 | if sg == 1: 40 | new_token = 256 41 | qs = DEFAULT_IMAGE_TOKEN + question + sgPrompt 42 | else: 43 | new_token = 64 44 | qs = DEFAULT_IMAGE_TOKEN + question 45 | 46 | conv = conv_templates["vicuna_v1"].copy() 47 | conv.append_message(conv.roles[0], qs) 48 | conv.append_message(conv.roles[1], None) 49 | prompt = conv.get_prompt() 50 | 51 | input_ids = torch.unsqueeze(tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt'), 0) 52 | input_ids = input_ids.to(device='cuda', non_blocking=True) 53 | 54 | image_tensor = torch.unsqueeze(image_tensor, 0) 55 | 56 | with torch.inference_mode(): 57 | output_ids = model.generate( 58 | input_ids, 59 | images=image_tensor.to(dtype=torch.float16, device='cuda', non_blocking=True), 60 | do_sample=True if 0 > 0 else False, 61 | temperature=0, 62 | top_p=None, 63 | num_beams=1, 64 | max_new_tokens=new_token, 65 | use_cache=True) 66 | 67 | 68 | input_token_len = input_ids.shape[1] 69 | n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() 70 | if n_diff_input_output > 0: 71 | print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 72 | outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 73 | outputs = outputs.strip() 74 | return outputs 75 | 76 | 77 | ans_file = open("", "w") 78 | examples = load_dataset('nlphuji/whoops', use_auth_token="") 79 | for item in tqdm(examples["test"]): 80 | 81 | 82 | image_tensor = process_images([item["image"]], image_processor, model.config)[0] 83 | all_pred = [] 84 | for q_a in item["question_answering_pairs"]: 85 | 86 | question = q_a[0] 87 | pred_ans = get_ans(question, image_tensor, sg = 1) 88 | 89 | 90 | question = q_a[0] + "Scene Graph:" + pred_ans + "\n\n" + answerPrompt 91 | pred_ans = get_ans(question, image_tensor) 92 | all_pred.append(pred_ans) 93 | 94 | ans_file.write(json.dumps({ "image_id": item["image_id"], 95 | "question_answering_pairs": item["question_answering_pairs"], 96 | "prediction": all_pred}) + "\n") 97 | ans_file.flush() 98 | 99 | ans_file.close() 100 | -------------------------------------------------------------------------------- /LLaVA-1.5-13b/LLaVA_bench.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import os 4 | import json 5 | from tqdm import tqdm 6 | import shortuuid 7 | 8 | from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN 9 | from llava.conversation import conv_templates, SeparatorStyle 10 | from llava.model.builder import load_pretrained_model 11 | 
from llava.utils import disable_torch_init 12 | from llava.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria 13 | 14 | from PIL import Image 15 | import math 16 | 17 | answerPrompt="Use the image and scene graph as context to improve the detail and clarity of the original answer: " 18 | 19 | sgPrompt=''' 20 | 21 | For the provided image and question-answer pair, generate a scene graph in JSON format to improve the quality and/or detail of the answer. The scene graph can include the following: 22 | 1. Objects that are relevant to answering the question. 23 | 2. Object attributes that are relevant to answering the question. 24 | 3. Object relationships that are relevant to answering the question. 25 | 26 | Scene Graph: 27 | ''' 28 | 29 | def split_list(lst, n): 30 | """Split a list into n (roughly) equal-sized chunks""" 31 | chunk_size = math.ceil(len(lst) / n) # integer division 32 | return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)] 33 | 34 | 35 | def get_chunk(lst, n, k): 36 | chunks = split_list(lst, n) 37 | return chunks[k] 38 | 39 | 40 | def eval_model(args): 41 | # Model 42 | disable_torch_init() 43 | model_path = os.path.expanduser(args.model_path) 44 | model_name = get_model_name_from_path(model_path) 45 | tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name) 46 | 47 | questions = [json.loads(q) for q in open(os.path.expanduser(args.question_file), "r")] 48 | questions = get_chunk(questions, args.num_chunks, args.chunk_idx) 49 | answers_file = os.path.expanduser(args.answers_file) 50 | os.makedirs(os.path.dirname(answers_file), exist_ok=True) 51 | ans_file = open(answers_file, "w") 52 | for line in tqdm(questions): 53 | idx = line["question_id"] 54 | image_file = line["image"] 55 | 56 | #----ZS Ans generation------ 57 | qs = line["text"] 58 | cur_prompt = qs 59 | if model.config.mm_use_im_start_end: 60 | qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs 61 | else: 62 | qs = DEFAULT_IMAGE_TOKEN + '\n' + qs 63 | 64 | conv = conv_templates[args.conv_mode].copy() 65 | conv.append_message(conv.roles[0], qs) 66 | conv.append_message(conv.roles[1], None) 67 | prompt = conv.get_prompt() 68 | 69 | input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() 70 | 71 | image = Image.open(os.path.join(args.image_folder, image_file)) 72 | image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0] 73 | 74 | stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2 75 | keywords = [stop_str] 76 | stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids) 77 | 78 | with torch.inference_mode(): 79 | output_ids = model.generate( 80 | input_ids, 81 | images=image_tensor.unsqueeze(0).half().cuda(), 82 | do_sample=True if args.temperature > 0 else False, 83 | temperature=args.temperature, 84 | top_p=args.top_p, 85 | num_beams=args.num_beams, 86 | # no_repeat_ngram_size=3, 87 | max_new_tokens=1024, 88 | use_cache=True) 89 | 90 | input_token_len = input_ids.shape[1] 91 | n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() 92 | if n_diff_input_output > 0: 93 | print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 94 | outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 95 | outputs = outputs.strip() 96 | if outputs.endswith(stop_str): 97 | 
outputs = outputs[:-len(stop_str)] 98 | outputs = outputs.strip() 99 | 100 | #-----SG Generation------- 101 | og_ans = outputs 102 | 103 | qs = "Question: " + line["text"] + "\nAnswer: " + og_ans + "\n\n" + sgPrompt 104 | 105 | cur_prompt = qs 106 | if model.config.mm_use_im_start_end: 107 | qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs 108 | else: 109 | qs = DEFAULT_IMAGE_TOKEN + '\n' + qs 110 | 111 | conv = conv_templates[args.conv_mode].copy() 112 | conv.append_message(conv.roles[0], qs) 113 | conv.append_message(conv.roles[1], None) 114 | prompt = conv.get_prompt() 115 | 116 | input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() 117 | 118 | image = Image.open(os.path.join(args.image_folder, image_file)) 119 | image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0] 120 | 121 | stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2 122 | keywords = [stop_str] 123 | stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids) 124 | 125 | with torch.inference_mode(): 126 | output_ids = model.generate( 127 | input_ids, 128 | images=image_tensor.unsqueeze(0).half().cuda(), 129 | do_sample=True if args.temperature > 0 else False, 130 | temperature=args.temperature, 131 | top_p=args.top_p, 132 | num_beams=args.num_beams, 133 | # no_repeat_ngram_size=3, 134 | max_new_tokens=256, 135 | use_cache=True) 136 | 137 | input_token_len = input_ids.shape[1] 138 | n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() 139 | if n_diff_input_output > 0: 140 | print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 141 | outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 142 | outputs = outputs.strip() 143 | if outputs.endswith(stop_str): 144 | outputs = outputs[:-len(stop_str)] 145 | outputs = outputs.strip() 146 | sg = outputs 147 | #----Improved Answer----- 148 | qs = answerPrompt + "\n\nScene Graph: " + sg + "\n\nQuestion: " + line["text"] + "\nOriginal Answer: " + og_ans + "\nImproved Answer: " 149 | 150 | cur_prompt = qs 151 | if model.config.mm_use_im_start_end: 152 | qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs 153 | else: 154 | qs = DEFAULT_IMAGE_TOKEN + '\n' + qs 155 | 156 | conv = conv_templates[args.conv_mode].copy() 157 | conv.append_message(conv.roles[0], qs) 158 | conv.append_message(conv.roles[1], None) 159 | prompt = conv.get_prompt() 160 | 161 | input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() 162 | 163 | image = Image.open(os.path.join(args.image_folder, image_file)) 164 | image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0] 165 | 166 | stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2 167 | keywords = [stop_str] 168 | stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids) 169 | 170 | with torch.inference_mode(): 171 | output_ids = model.generate( 172 | input_ids, 173 | images=image_tensor.unsqueeze(0).half().cuda(), 174 | do_sample=True if args.temperature > 0 else False, 175 | temperature=args.temperature, 176 | top_p=args.top_p, 177 | num_beams=args.num_beams, 178 | # no_repeat_ngram_size=3, 179 | max_new_tokens=1024, 180 | use_cache=True) 181 | 182 | input_token_len = input_ids.shape[1] 183 | n_diff_input_output = (input_ids != 
output_ids[:, :input_token_len]).sum().item() 184 | if n_diff_input_output > 0: 185 | print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 186 | outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 187 | outputs = outputs.strip() 188 | if outputs.endswith(stop_str): 189 | outputs = outputs[:-len(stop_str)] 190 | outputs = outputs.strip() 191 | ans_id = shortuuid.uuid() 192 | ans_file.write(json.dumps({"question_id": idx, 193 | "prompt": cur_prompt, 194 | "text": outputs, 195 | "answer_id": ans_id, 196 | "model_id": model_name, 197 | "metadata": {}}) + "\n") 198 | ans_file.flush() 199 | ans_file.close() 200 | 201 | if __name__ == "__main__": 202 | parser = argparse.ArgumentParser() 203 | parser.add_argument("--model-path", type=str, default="facebook/opt-350m") 204 | parser.add_argument("--model-base", type=str, default=None) 205 | parser.add_argument("--image-folder", type=str, default="") 206 | parser.add_argument("--question-file", type=str, default="tables/question.jsonl") 207 | parser.add_argument("--answers-file", type=str, default="answer.jsonl") 208 | parser.add_argument("--conv-mode", type=str, default="llava_v1") 209 | parser.add_argument("--num-chunks", type=int, default=1) 210 | parser.add_argument("--chunk-idx", type=int, default=0) 211 | parser.add_argument("--temperature", type=float, default=0.2) 212 | parser.add_argument("--top_p", type=float, default=None) 213 | parser.add_argument("--num_beams", type=int, default=1) 214 | args = parser.parse_args() 215 | 216 | eval_model(args) 217 | -------------------------------------------------------------------------------- /LLaVA-1.5-13b/LLaVA_wino.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import os 4 | import json 5 | from tqdm import tqdm 6 | import shortuuid 7 | 8 | from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN 9 | from llava.conversation import conv_templates, SeparatorStyle 10 | from llava.model.builder import load_pretrained_model 11 | from llava.utils import disable_torch_init 12 | from llava.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path 13 | from torch.utils.data import Dataset, DataLoader 14 | 15 | from PIL import Image 16 | import math 17 | from datasets import load_dataset 18 | 19 | 20 | sgPrompt=''' 21 | For the provided image and its associated question, generate a scene graph in JSON format that includes the following: 22 | 1. Objects that are relevant to answering the question 23 | 2. Object attributes that are relevant to answering the question 24 | 3. 
Object relationships that are relevant to answering the question 25 | 26 | Scene Graph: 27 | ''' 28 | 29 | 30 | disable_torch_init() 31 | model_path = os.path.expanduser("liuhaotian/llava-v1.5-13b") 32 | model_name = get_model_name_from_path(model_path) 33 | tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, None, model_name) 34 | 35 | 36 | def get_ans(question, image_tensor, sg = 0): 37 | 38 | if sg == 1: 39 | qs = DEFAULT_IMAGE_TOKEN + question + sgPrompt 40 | else: 41 | qs = DEFAULT_IMAGE_TOKEN + question 42 | 43 | conv = conv_templates["vicuna_v1"].copy() 44 | conv.append_message(conv.roles[0], qs) 45 | conv.append_message(conv.roles[1], None) 46 | prompt = conv.get_prompt() 47 | 48 | input_ids = torch.unsqueeze(tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt'), 0) 49 | input_ids = input_ids.to(device='cuda', non_blocking=True) 50 | 51 | image_tensor = torch.unsqueeze(image_tensor, 0) 52 | 53 | with torch.inference_mode(): 54 | output_ids = model.generate( 55 | input_ids, 56 | images=image_tensor.to(dtype=torch.float16, device='cuda', non_blocking=True), 57 | do_sample=True if 0 > 0 else False, 58 | temperature=0, 59 | top_p=None, 60 | num_beams=1, 61 | max_new_tokens=256, 62 | use_cache=True) 63 | 64 | 65 | input_token_len = input_ids.shape[1] 66 | n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() 67 | if n_diff_input_output > 0: 68 | print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 69 | outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 70 | outputs = outputs.strip() 71 | return outputs 72 | 73 | 74 | 75 | image_file = "" 76 | question_path = "" 77 | result_path = "" 78 | result_file = open(result_path, 'w') 79 | 80 | 81 | 82 | with open(question_path, 'r') as json_file: 83 | json_list = list(json_file) 84 | 85 | 86 | for json_str in tqdm(json_list): 87 | 88 | 89 | result = json.loads(json_str) 90 | cur_image = image_file + result["image"] + ".png" 91 | image = Image.open(cur_image).convert("RGB") 92 | 93 | prompt = "Does the given caption accurately describe the given image? Caption:" + result["caption"] + ".\n\n" + sgPrompt 94 | 95 | cur_sg = get_ans(prompt, image, sg=1) 96 | 97 | 98 | answerPrompt = "Use the image and scene graph to reason and answer the question." 99 | prompt = "Question: Does the given caption accurately describe the given image? Caption:" + result["caption"] + ". 
Scene Graph: " + cur_sg + '\n\n' + answerPrompt 100 | 101 | final_ans = get_ans(prompt, image) 102 | stored_result = {"text":final_ans} 103 | result_file.write(json.dumps(stored_result) + "\n") 104 | result_file.flush() 105 | 106 | result_file.close() 107 | -------------------------------------------------------------------------------- /LLaVA-1.5-13b/llava_seed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=8 7 | 8 | EXP_NAME=$1 9 | echo "Experiment Name: llava.eval.sg.$EXP_NAME" 10 | CKPT="llava-v1.5-13b-$EXP_NAME" 11 | 12 | WITH_SG=1 13 | 14 | #Step 1: Scene-Graph Generation: 15 | for IDX in $(seq 0 $((CHUNKS-1))); do 16 | echo ${GPULIST[$IDX]} 17 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.sg.$EXP_NAME \ 18 | --model-path liuhaotian/llava-v1.5-13b \ 19 | --question-file ./playground/data/eval/seed_bench/llava-seed-bench-filtered.jsonl \ 20 | --image-folder ./playground/data/eval/seed_bench \ 21 | --answers-file ./playground/data/eval/seed_bench/full_sg/$CKPT/${CHUNKS}_${IDX}.jsonl \ 22 | --num-chunks $CHUNKS \ 23 | --chunk-idx $IDX \ 24 | --temperature 0 \ 25 | --scene_graph $WITH_SG \ 26 | --conv-mode vicuna_v1 & 27 | #sleep 2 - only for Sphinx-V2 28 | done 29 | 30 | wait 31 | 32 | output_file=./playground/data/eval/seed_bench/full_sg/$CKPT/merge_$EXP_NAME.jsonl 33 | 34 | # Clear out the output file if it exists. 35 | > "$output_file" 36 | 37 | # Loop through the indices and concatenate each file. 38 | for IDX in $(seq 0 $((CHUNKS-1))); do 39 | cat ./playground/data/eval/seed_bench/full_sg/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 40 | done 41 | 42 | 43 | #Step 2: Answer Extraction and Evaluation: 44 | for IDX in $(seq 0 $((CHUNKS-1))); do 45 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.sg.$EXP_NAME \ 46 | --model-path liuhaotian/llava-v1.5-13b \ 47 | --question-file ./playground/data/eval/seed_bench/full_sg/$CKPT/merge_$EXP_NAME.jsonl \ 48 | --image-folder ./playground/data/eval/seed_bench \ 49 | --answers-file ./playground/data/eval/seed_bench/answers/$CKPT/${CHUNKS}_${IDX}.jsonl \ 50 | --num-chunks $CHUNKS \ 51 | --chunk-idx $IDX \ 52 | --temperature 0 \ 53 | --conv-mode vicuna_v1 & 54 | #sleep 2 - Sphinx-V2 55 | done 56 | 57 | wait 58 | 59 | output_file=./playground/data/eval/seed_bench/answers/$CKPT/merge_$EXP_NAME.jsonl 60 | 61 | # Clear out the output file if it exists. 62 | > "$output_file" 63 | 64 | # Loop through the indices and concatenate each file. 
65 | for IDX in $(seq 0 $((CHUNKS-1))); do 66 | cat ./playground/data/eval/seed_bench/answers/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 67 | done 68 | 69 | # Evaluate 70 | python scripts/convert_seed_for_submission.py \ 71 | --annotation-file ./playground/data/eval/seed_bench/SEED-Bench.json \ 72 | --result-file $output_file \ 73 | --result-upload-file ./playground/data/eval/seed_bench/answers_upload/llava-v1.5-13b-$EXP_NAME.jsonl 74 | 75 | # --result-file $output_file \ 76 | 77 | # --result-file ./playground/data/eval/seed_bench/gt_merge.jsonl \ -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CCoT 🧩 🧠 2 | Official Code for the Paper "Compositional Chain-of-Thought Prompting for Large Multimodal Models" 3 | --- 4 | We present **CCoT**, a novel **C**ompositional **C**hain-**o**f-**T**hought prompting method that utilizes scene-graph representations to extract compositional knowledge from an LMM. We find that this approach improves LMM performance not only on several compositional benchmarks but also on general multimodal benchmarks. 5 | 6 | A more thorough discussion of our work can be found in our [paper](https://arxiv.org/abs/2311.17076). 7 | 8 | 

9 | ![Figure 1: CCoT overview](images/fig1_v7.png) 10 | 

11 | 12 | ### Method Description 13 | --- 14 |

15 | ![Figure 2: CCoT two-step prompting pipeline](images/fig2_v8.png) 16 | 

17 | 18 | The first step in our prompting method is to generate a scene graph given both the image *and* the textual task as context. Following this, the answer is extracted by prompting the LMM with the image, scene graph, question, and an answer-extraction prompt. Prompt sections unique to our method are shown in **bold** in the above figure. Incorporating the scene graph in the prompt eliminates the need for fine-tuning and prevents forgetting. Another benefit of our method is that the generated scene graphs can describe any visual scene, making CCoT generally applicable to a wider range of VL tasks. Finally, because the generated scene graphs are compact linguistic representations of images, CCoT is a token-efficient prompting method. This is significant given the limited textual context lengths that LMMs often face due to processing both image and text inputs. 19 | 20 | ### 💻 Setup 21 | --- 22 | **Note** that because our method is a zero-shot prompting method that builds on the codebase of its respective LMM, there is ample flexibility when applying it to your particular model and use case. As such, you may find it *easier* to follow the general methodology shown in our figure and outlined in our scripts, adapting the prompt, implementation, and evaluation to suit your needs. 23 | 24 | #### Datasets 25 | Please retrieve all datasets from their respective official websites or repositories. We provide a filtered .jsonl containing just the SEEDBench-Image data points in our data folder. 26 | 27 | #### LLaVA-1.5-13b 28 | 1. First, clone the official **LLaVA** [repository](https://github.com/haotian-liu/LLaVA). 29 | ```bash 30 | git clone https://github.com/haotian-liu/LLaVA.git 31 | ``` 32 | 2. Follow the basic installation steps outlined in the repository. 33 | 3. Complete the *Evaluation* setup outlined in the repository. 34 | 4. Replace the corresponding scripts (both the Python and Bash scripts, where necessary) with those in this repository. 35 | 36 | *Note: Some users have reported input-processing issues when cloning the repo directly. This is likely because the LLaVA-1.6 update changed how inputs to the model are handled. One remedy is to check out the latest commit prior to the LLaVA-1.6 update.* 37 | 38 | #### GPT-4V 39 | 40 | 1. Install the openai library: 41 | ```bash 42 | pip install openai 43 | ``` 44 | 2. Set your OpenAI API key: 45 | ```bash 46 | export OPENAI_API_KEY= 47 | ``` 48 | 3. Run the script for your desired dataset. 49 | 50 | #### InstructBLIP-13b 51 | 52 | 1. First, clone the official **InstructBLIP** [repository](https://github.com/salesforce/LAVIS/tree/main/projects/instructblip). 53 | 2. Follow the basic installation steps outlined in the repository. 54 | 3. Run the script for your desired dataset. 55 | 56 | #### Sphinx 57 | 58 | 1. For SEEDBench and MMBench, we make use of the LLaVA codebase's setup. Simply follow the LLaVA-1.5 setup steps and replace the scripts with those of Sphinx. 59 | 2. For other datasets, follow the setup instructions from the official [repository](https://github.com/Alpha-VLLM/LLaMA2-Accessory/tree/main/SPHINX). 60 | 3. Run our provided script. 61 | 62 | ### 📝 Citation 63 | --- 64 | If you found our work useful, please consider starring and citing. Thank you! 
65 | ```latex 66 | @inproceedings{MitraCCoT, 67 | title={Compositional Chain of Thought Prompting for Large Multimodal Models}, 68 | author={Mitra, Chancharik and Huang, Brandon and Darrell, Trevor and Herzig, Roei}, 69 | booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, 70 | month={June}, 71 | year={2024} 72 | } 73 | ``` 74 | -------------------------------------------------------------------------------- /Sphinx/Sphinx_SEED.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import os 4 | import json 5 | from tqdm import tqdm 6 | import shortuuid 7 | 8 | # from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN 9 | # from llava.conversation import conv_templates, SeparatorStyle 10 | # from llava.model.builder import load_pretrained_model 11 | # from llava.utils import disable_torch_init 12 | # from llava.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path 13 | from torch.utils.data import Dataset, DataLoader 14 | 15 | from PIL import Image 16 | from SPHINX import SPHINXModel 17 | import math 18 | 19 | answerPrompt="Use the image and scene graph as context and answer the following question: " 20 | 21 | sgPrompt=''' 22 | 23 | For the provided image and its associated question, generate a scene graph in JSON format that includes the following: 24 | 1. Objects that are relevant to answering the question. 25 | 2. Object attributes that are relevant to answering the question. 26 | 3. Object relationships that are relevant to answering the question. 27 | 28 | Scene Graph: 29 | ''' 30 | def split_list(lst, n): 31 | """Split a list into n (roughly) equal-sized chunks""" 32 | chunk_size = math.ceil(len(lst) / n) # integer division 33 | return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)] 34 | 35 | 36 | def get_chunk(lst, n, k): 37 | chunks = split_list(lst, n) 38 | return chunks[k] 39 | 40 | 41 | # Custom dataset class 42 | class CustomDataset(Dataset): 43 | def __init__(self, questions, image_folder, sg_prompt):#, ans_folder: str='./playground/data/eval/MME/answers/llava-v1.5-13b.jsonl'): 44 | self.questions = questions 45 | self.image_folder = image_folder 46 | self.sg_prompt = sg_prompt 47 | 48 | def __getitem__(self, index): 49 | line = self.questions[index] 50 | image_file = line["image"] 51 | if self.sg_prompt == 1: 52 | lst = line['text'].split('\n') 53 | #lst.pop() 54 | #qs = '\n'.join(lst) 55 | qs = lst[0] + sgPrompt 56 | 57 | else: 58 | qs = line["text"] 59 | 60 | 61 | return qs, self.image_folder, image_file 62 | 63 | def __len__(self): 64 | return len(self.questions) 65 | 66 | 67 | # DataLoader 68 | def create_data_loader(questions, image_folder, batch_size=1, num_workers=4, sg_prompt = 0): 69 | assert batch_size == 1, "batch_size must be 1" 70 | dataset = CustomDataset(questions, image_folder, sg_prompt) 71 | data_loader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False) 72 | return data_loader 73 | 74 | def get_sg_prompt(args): 75 | 76 | model = SPHINXModel.from_pretrained(pretrained_path="/home/chancharikm/compVL/LLaMA2-Accessory/SPHINX/SPHINX-v2-1k-weights", with_visual=True).to(device='cuda') 77 | 78 | 79 | 80 | questions = [json.loads(q) for q in open(os.path.expanduser(args.question_file), "r")] 81 | 82 | 83 | 84 | questions = get_chunk(questions, args.num_chunks, args.chunk_idx) 85 | 86 | 87 | q_file = 
os.path.expanduser(args.answers_file) 88 | os.makedirs(os.path.dirname(q_file), exist_ok=True) 89 | q_file = open(q_file, "w") 90 | 91 | data_loader = create_data_loader(questions, args.image_folder, sg_prompt = 1) 92 | 93 | 94 | for (raw_text, image_folder, image_file), line in tqdm(zip(data_loader, questions), total=len(questions)): 95 | #No idea why raw_text, image_folder, image_file are being unpacked as tuples (value, __blank__), but I'm going with it for now... 96 | #It's fine in the data_loader but not here: weird, I suspect it has to do with this misaligned tqdm 97 | 98 | image = Image.open(os.path.join(image_folder[0], image_file[0])).convert('RGB') 99 | with torch.cuda.amp.autocast(dtype=torch.float16): 100 | outputs = model.generate_reponse( #No, this is not a typo. This seems to be typo inherited from the Sphinx codebase - let them know via Github! 101 | [[raw_text[0], None]], 102 | image, 103 | temperature=args.temperature, 104 | top_p=args.top_p, 105 | max_gen_len=256, 106 | seed=0) 107 | outputs = outputs.strip() 108 | 109 | 110 | 111 | 112 | q_file.write(json.dumps({ "image": line["image"], 113 | "text": "Scene Graph: " + outputs + '\n\n' + answerPrompt + line["text"], 114 | "category": line["category"], 115 | "question_id": line["question_id"]}) + "\n") 116 | 117 | 118 | q_file.close() 119 | 120 | 121 | def eval_model(args): 122 | 123 | model = SPHINXModel.from_pretrained(pretrained_path="/home/chancharikm/compVL/LLaMA2-Accessory/SPHINX/SPHINX-v2-1k-weights", with_visual=True).to(device='cuda') 124 | print(f'Chunk ID {args.chunk_idx} Loaded!') 125 | questions = [json.loads(q) for q in open(os.path.expanduser(args.question_file), "r")] 126 | 127 | 128 | 129 | questions = get_chunk(questions, args.num_chunks, args.chunk_idx) 130 | 131 | 132 | answers_file = os.path.expanduser(args.answers_file) 133 | os.makedirs(os.path.dirname(answers_file), exist_ok=True) 134 | ans_file = open(answers_file, "w") 135 | 136 | 137 | 138 | data_loader = create_data_loader(questions, args.image_folder) 139 | 140 | for (raw_text, image_folder, image_file), line in tqdm(zip(data_loader, questions), total=len(questions)): 141 | #No idea why raw_text, image_folder, image_file are being unpacked as tuples (value, __blank__), but I'm going with it for now... 
142 | #It's fine in the data_loader but not here: weird, I suspect it has to do with this misaligned tqdm 143 | idx = line["question_id"] 144 | cur_prompt = line["text"] 145 | model_name = "Sphinx-v2-1k" 146 | image = Image.open(os.path.join(image_folder[0], image_file[0])).convert('RGB') 147 | with torch.cuda.amp.autocast(dtype=torch.float16): 148 | outputs = model.generate_reponse( 149 | [[raw_text[0], None]], 150 | image, 151 | temperature=args.temperature, 152 | top_p=args.top_p, 153 | max_gen_len=256, 154 | seed=0) 155 | 156 | outputs = outputs.strip() 157 | ans_id = shortuuid.uuid() 158 | ans_file.write(json.dumps({"question_id": idx, 159 | "prompt": cur_prompt, 160 | "text": outputs, 161 | "answer_id": ans_id, 162 | "model_id": model_name, 163 | "metadata": {}}) + "\n") 164 | # ans_file.flush() 165 | ans_file.close() 166 | 167 | if __name__ == "__main__": 168 | parser = argparse.ArgumentParser() 169 | parser.add_argument("--model-path", type=str, default="facebook/opt-350m") 170 | parser.add_argument("--model-base", type=str, default=None) 171 | parser.add_argument("--image-folder", type=str, default="") 172 | parser.add_argument("--question-file", type=str, default="tables/question.jsonl") 173 | parser.add_argument("--answers-file", type=str, default="answer.jsonl") 174 | parser.add_argument("--conv-mode", type=str, default="llava_v1") 175 | parser.add_argument("--num-chunks", type=int, default=1) 176 | parser.add_argument("--chunk-idx", type=int, default=0) 177 | parser.add_argument("--temperature", type=float, default=0.2) 178 | parser.add_argument("--top_p", type=float, default=None) 179 | parser.add_argument("--num_beams", type=int, default=1) 180 | 181 | parser.add_argument("--scene_graph", type=int, default=0) 182 | 183 | args = parser.parse_args() 184 | 185 | if args.scene_graph == 0: 186 | eval_model(args) 187 | else: 188 | get_sg_prompt(args) 189 | -------------------------------------------------------------------------------- /Sphinx/Sphinx_Whoops.py: -------------------------------------------------------------------------------- 1 | import json 2 | from tqdm import tqdm 3 | from datasets import load_dataset 4 | from SPHINX import SPHINXModel 5 | 6 | result_path="" #Path to store the result 7 | hf_token="" #Huggingface auth token 8 | 9 | 10 | sgPrompt=''' 11 | For the provided image and its associated question, generate only a scene graph in JSON format that includes the following: 12 | 1. Objects that are relevant to answering the question 13 | 2. Object attributes that are relevant to answering the question 14 | 3. Object relationships that are relevant to answering the question 15 | 16 | Scene Graph: 17 | ''' 18 | answerPrompt="\nUse the image and scene graph to reason and answer the question with a single phrase." 
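# (The two helpers below implement the two CCoT passes for WHOOPS with SPHINX:
#  get_sg() first asks for a scene graph with sgPrompt (max_gen_len=256, temperature=0),
#  and get_ans() then answers the question conditioned on that scene graph plus answerPrompt
#  (max_gen_len=64). pretrained_path below is left blank and must point to a local copy of
#  the SPHINX weights before the script is run.)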
19 | 20 | 21 | model = SPHINXModel.from_pretrained(pretrained_path="", with_visual=True) 22 | 23 | 24 | def get_ans(question, pred_sg, image_tensor): 25 | 26 | final_ans = model.generate_response([[question + "Scene Graph:" + pred_sg + "\n\n" + answerPrompt, None]], image_tensor, max_gen_len=64, temperature=0) 27 | return final_ans 28 | 29 | 30 | def get_sg(question, image): 31 | final_ans = model.generate_response([[question + sgPrompt, None]], image, max_gen_len=256, temperature=0) 32 | return final_ans 33 | 34 | 35 | result_file = open(result_path, "w") 36 | examples = load_dataset('nlphuji/whoops', use_auth_token=hf_token) 37 | for item in tqdm(examples["test"]): 38 | 39 | image = item["image"] 40 | all_pred = [] 41 | all_sg = [] 42 | for q_a in item["question_answering_pairs"]: 43 | 44 | question = q_a[0] 45 | pred_sg = get_sg(question, image) 46 | pred_ans = get_ans(question, pred_sg, image) 47 | all_pred.append(pred_ans) 48 | 49 | result_file.write(json.dumps({ "image_id": item["image_id"], 50 | "question_answering_pairs": item["question_answering_pairs"], 51 | "prediction": all_pred}) + "\n") 52 | 53 | result_file.close() 54 | -------------------------------------------------------------------------------- /Sphinx/Sphinx_mmbench.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import os 4 | import json 5 | import pandas as pd 6 | from tqdm import tqdm 7 | import shortuuid 8 | 9 | from PIL import Image 10 | import math 11 | from io import BytesIO 12 | import base64 13 | 14 | from SPHINX import SPHINXModel 15 | 16 | 17 | all_options = ['A', 'B', 'C', 'D'] 18 | 19 | answerPrompt="Use the image and scene graph as context and answer the following question: " 20 | 21 | sgPrompt=''' 22 | 23 | For the provided image and its associated question, generate a scene graph in JSON format that includes the following: 24 | 1. Objects that are relevant to answering the question. 25 | 2. Object attributes that are relevant to answering the question. 26 | 3. Object relationships that are relevant to answering the question. 
27 | 28 | Scene Graph: 29 | ''' 30 | 31 | 32 | def split_list(lst, n): 33 | """Split a list into n (roughly) equal-sized chunks""" 34 | chunk_size = math.ceil(len(lst) / n) # integer division 35 | return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)] 36 | 37 | 38 | def get_chunk(lst, n, k): 39 | chunks = split_list(lst, n) 40 | return chunks[k] 41 | 42 | 43 | def is_none(value): 44 | if value is None: 45 | return True 46 | if type(value) is float and math.isnan(value): 47 | return True 48 | if type(value) is str and value.lower() == 'nan': 49 | return True 50 | if type(value) is str and value.lower() == 'none': 51 | return True 52 | return False 53 | 54 | def get_options(row, options): 55 | parsed_options = [] 56 | for option in options: 57 | option_value = row[option] 58 | if is_none(option_value): 59 | break 60 | parsed_options.append(option_value) 61 | return parsed_options 62 | 63 | def load_image_from_base64(image): 64 | return Image.open(BytesIO(base64.b64decode(image))) 65 | 66 | 67 | def eval_model(args): 68 | # Model 69 | model = SPHINXModel.from_pretrained(pretrained_path="/home/chancharikm/compVL/LLaMA2-Accessory/SPHINX/SPHINX-v2-1k-weights", with_visual=True).to(device='cuda') 70 | 71 | questions = pd.read_table(os.path.expanduser(args.question_file)) 72 | questions = get_chunk(questions, args.num_chunks, args.chunk_idx) 73 | answers_file = os.path.expanduser(args.answers_file) 74 | os.makedirs(os.path.dirname(answers_file), exist_ok=True) 75 | ans_file = open(answers_file, "w") 76 | 77 | for index, row in tqdm(questions.iterrows(), total=len(questions)): 78 | options = get_options(row, all_options) 79 | cur_option_char = all_options[:len(options)] 80 | 81 | if args.all_rounds: 82 | num_rounds = len(options) 83 | else: 84 | num_rounds = 1 85 | 86 | for round_idx in range(num_rounds): 87 | idx = row['index'] 88 | question = row['question'] 89 | hint = row['hint'] 90 | image = load_image_from_base64(row['image']) 91 | if not is_none(hint): 92 | question = hint + '\n' + question 93 | for option_char, option in zip(all_options[:len(options)], options): 94 | question = question + '\n' + option_char + '. ' + option 95 | qs = cur_prompt = question 96 | 97 | firstPrompt = question + sgPrompt 98 | #print(f'SG Prompt: {firstPrompt}') 99 | #print(f'qs {qs}') 100 | # if model.config.mm_use_im_start_end: 101 | # qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs 102 | # else: 103 | # qs = DEFAULT_IMAGE_TOKEN + '\n' + qs 104 | 105 | # if args.single_pred_prompt: 106 | # if args.lang == 'cn': 107 | # qs = qs + '\n' + "请直接回答选项字母。" 108 | # else: 109 | # qs = qs + '\n' + "Answer with the option's letter from the given choices directly." 110 | 111 | with torch.cuda.amp.autocast(dtype=torch.float16): 112 | outputs = model.generate_reponse( #No, this is not a typo. This seems to be typo inherited from the Sphinx codebase - let them know via Github! 113 | [[firstPrompt, None]], 114 | image, 115 | temperature=args.temperature, 116 | top_p=args.top_p, 117 | max_gen_len=256, 118 | seed=0) 119 | 120 | 121 | outputs = outputs.strip() 122 | 123 | sg = outputs 124 | 125 | secondPrompt = "Scene Graph: " + sg + '\n\n' + answerPrompt + qs + '\n' + "Answer with the option's letter from the given choices directly." 126 | #print(f'secondPrompt {secondPrompt}') 127 | 128 | with torch.cuda.amp.autocast(dtype=torch.float16): 129 | outputs = model.generate_reponse( #No, this is not a typo. 
This seems to be typo inherited from the Sphinx codebase - let them know via Github! 130 | [[secondPrompt, None]], 131 | image, 132 | temperature=args.temperature, 133 | top_p=args.top_p, 134 | max_gen_len=256, 135 | seed=0) 136 | 137 | outputs = outputs.strip() 138 | #print(f'Final Output: {outputs}') 139 | 140 | ans_id = shortuuid.uuid() 141 | ans_file.write(json.dumps({"question_id": idx, 142 | "round_id": round_idx, 143 | "prompt": cur_prompt, 144 | "text": outputs, 145 | "options": options, 146 | "option_char": cur_option_char, 147 | "answer_id": ans_id, 148 | "model_id": "Sphinx-v2-1k", 149 | "metadata": {}}) + "\n") 150 | ans_file.flush() 151 | 152 | # rotate options 153 | options = options[1:] + options[:1] 154 | cur_option_char = cur_option_char[1:] + cur_option_char[:1] 155 | ans_file.close() 156 | 157 | if __name__ == "__main__": 158 | parser = argparse.ArgumentParser() 159 | parser.add_argument("--model-path", type=str, default="facebook/opt-350m") 160 | parser.add_argument("--model-base", type=str, default=None) 161 | parser.add_argument("--image-folder", type=str, default="") 162 | parser.add_argument("--question-file", type=str, default="tables/question.jsonl") 163 | parser.add_argument("--answers-file", type=str, default="answer.jsonl") 164 | parser.add_argument("--conv-mode", type=str, default="llava_v1") 165 | parser.add_argument("--num-chunks", type=int, default=1) 166 | parser.add_argument("--chunk-idx", type=int, default=0) 167 | parser.add_argument("--temperature", type=float, default=0.2) 168 | parser.add_argument("--top_p", type=float, default=None) 169 | parser.add_argument("--num_beams", type=int, default=1) 170 | parser.add_argument("--all-rounds", action="store_true") 171 | parser.add_argument("--single-pred-prompt", action="store_true") 172 | parser.add_argument("--lang", type=str, default="en") 173 | args = parser.parse_args() 174 | 175 | eval_model(args) 176 | -------------------------------------------------------------------------------- /Sphinx/Sphinx_wino.py: -------------------------------------------------------------------------------- 1 | from SPHINX import SPHINXModel 2 | from PIL import Image 3 | import torch 4 | import json 5 | from tqdm import tqdm 6 | 7 | sgPrompt=''' 8 | For the provided image and its associated question, generate only a scene graph in JSON format that includes the following: 9 | 1. Objects that are relevant to answering the question 10 | 2. Object attributes that are relevant to answering the question 11 | 3. Object relationships that are relevant to answering the question 12 | ''' 13 | 14 | model = SPHINXModel.from_pretrained(pretrained_path="", with_visual=True) 15 | 16 | 17 | image_dir = "" #Directory containing image data 18 | question_path = "" #Path containing the question 19 | result_path = "" #Path to store the result 20 | result_file = open(result_path, 'w') 21 | 22 | 23 | with open(question_path, 'r') as json_file: 24 | json_list = list(json_file) 25 | 26 | 27 | for json_str in tqdm(json_list): 28 | 29 | cur_pair = json.loads(json_str) 30 | cur_image = Image.open(image_dir + cur_pair["image"] + ".png") 31 | cur_caption = cur_pair["caption"] 32 | 33 | cur_question = [[f"Does the given caption accurately describe the given image? Caption:{cur_caption}.\n\n{sgPrompt}", None]] 34 | cur_sg = model.generate_response(cur_question, cur_image, max_gen_len=256, temperature=0) 35 | 36 | new_question = [[f"Does the given caption accurately describe the given image? 
Caption:{cur_caption}.\n\nScene graph:{cur_sg}\n\nBased on the image and scene graph, provide a explanation to the answer.", None]] 37 | final_ans = model.generate_response(new_question, cur_image, max_gen_len=256, temperature=0) 38 | 39 | 40 | stored_response = {"text":final_ans} 41 | result_file.write(json.dumps(stored_response) + "\n") 42 | 43 | 44 | result_file.close() 45 | -------------------------------------------------------------------------------- /data/filter_qs.py: -------------------------------------------------------------------------------- 1 | import json 2 | import argparse 3 | 4 | def filter_by_category(input_file, output_file, filter_list): 5 | 6 | category_counters = {} 7 | 8 | with open(input_file, 'r') as f_in, open(output_file, 'w') as f_out: 9 | for line in f_in: 10 | obj = json.loads(line) 11 | category = obj.get('category') 12 | 13 | 14 | category_counters[category] = category_counters.get(category, 0) + 1 15 | 16 | if category in filter_list: 17 | if category_counters[category] % 1 == 0: 18 | f_out.write(json.dumps(obj)) 19 | f_out.write('\n') 20 | 21 | def main(): 22 | filter_list = ["Instances Counting", "Scene Understanding","Instance Identity", "Instance Attributes", "Instance Location", "Spatial Relation", "Visual Reasoning", "Text Understanding", "Instance Interaction"] 23 | filter_by_category("llava-seed-bench.jsonl", "llava-seed-bench-filtered.jsonl", filter_list) 24 | 25 | if __name__ == "__main__": 26 | main() 27 | -------------------------------------------------------------------------------- /eval_winoground.py: -------------------------------------------------------------------------------- 1 | from openai import OpenAI 2 | import time 3 | import json 4 | 5 | 6 | client = OpenAI( 7 | api_key='', 8 | ) 9 | 10 | 11 | def generate_response(prompt): 12 | # Call the OpenAI API to generate a response 13 | response = client.chat.completions.create( 14 | model="gpt-4-0125-preview", 15 | messages=[ 16 | {'role': 'user', 'content': prompt} 17 | ], 18 | max_tokens=512, 19 | n=1, 20 | temperature=0, 21 | top_p=0.1, 22 | frequency_penalty=0.0, 23 | presence_penalty=0, 24 | ) 25 | # Get the response text from the API response 26 | response_text = response.choices[0].message.content 27 | 28 | return response_text 29 | 30 | #Text score. Identify which caption-explanation paire matches better. 31 | def get_ans(caption1, caption2, exp1, exp2): 32 | #original 33 | PROMPT = "Caption A:" + caption1 + ". Explanation A:" + exp1 + "\n\nCaption B:" + caption2 + ". Explanation B:" + exp2 + "\n\n Each explanation tries to justify why an image match with the corresponding caption. Pick the most logical explanation and return only an alphabet letter." 34 | return generate_response(PROMPT) 35 | 36 | #Image score. Identify which explanation match better with the caption. 37 | def get_ans2(caption1, caption2, exp1, exp2): 38 | PROMPT = "Caption:" + caption1 + ".Explanation A:" + exp1 + "Explanation B:" + exp2 + "\n\n Pick the explanation with information that align with the caption and return only an alphabet letter." 
39 | return generate_response(PROMPT) 40 | 41 | 42 | ans_path = "" 43 | with open(ans_path, 'r') as json_file: 44 | json_list = list(json_file) 45 | qs_path = "" 46 | with open(qs_path, 'r') as q_file: 47 | q_list = list(q_file) 48 | 49 | 50 | count = 0 51 | correct_text = 0 52 | correct_img = 0 53 | correct_group = 0 54 | 55 | 56 | print("begin") 57 | while count < 1600: 58 | 59 | result1 = json.loads(json_list[count]) 60 | result2 = json.loads(json_list[count+1]) 61 | result3 = json.loads(json_list[count+2]) 62 | result4 = json.loads(json_list[count+3]) 63 | 64 | cap1 = json.loads(q_list[count])["caption"] 65 | cap2 = json.loads(q_list[count+1])["caption"] 66 | 67 | expain1 = result1["answer"] 68 | expain2 = result2["answer"] 69 | expain3 = result3["answer"] 70 | expain4 = result4["answer"] 71 | 72 | #Get text score 73 | text_result1 = get_ans(cap1, cap2, expain1, expain2) 74 | text_result2 = get_ans(cap1, cap2, expain3, expain4) 75 | 76 | #Get image score 77 | text_result3 = get_ans2(cap1, cap1, expain1, expain3) 78 | text_result4 = get_ans2(cap2, cap2, expain2, expain4) 79 | 80 | 81 | if (text_result1 == "A" and text_result2 == "B"): 82 | correct_text += 1 83 | if (text_result3 == "A" and text_result4 == "B"): 84 | correct_img += 1 85 | 86 | if (text_result1 == "A" and text_result2 == "B") and (text_result3 == "A" and text_result4 == "B"): 87 | correct_group += 1 88 | count += 4 89 | 90 | print("text score:", correct_text / 400) 91 | print("image score:", correct_img / 400) 92 | print("group score:", correct_group / 400) 93 | -------------------------------------------------------------------------------- /images/fig1_v7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chancharikmitra/CCoT/9ceecb7c3e9d337bf389e1c2af260b86bcc35a6b/images/fig1_v7.png -------------------------------------------------------------------------------- /images/fig2_v8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chancharikmitra/CCoT/9ceecb7c3e9d337bf389e1c2af260b86bcc35a6b/images/fig2_v8.png --------------------------------------------------------------------------------