├── __init__.py ├── tools ├── __init__.py ├── ocr.py ├── scene_graph.py ├── com.py ├── diff.py └── grding.py ├── assets ├── case.png ├── table-1.png ├── table-2.png ├── framework.png ├── overview.png └── table-40p.png ├── qwen.py ├── spear.py ├── README.md ├── agent ├── openai.py ├── prompt_vlm.py └── prompt_agent.py ├── environment.yml ├── util.py ├── evaluate_vie.py ├── evaluate.py └── run_test_40p.py /__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tools/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /assets/case.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HITsz-TMG/Agentic-CIGEval/HEAD/assets/case.png -------------------------------------------------------------------------------- /assets/table-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HITsz-TMG/Agentic-CIGEval/HEAD/assets/table-1.png -------------------------------------------------------------------------------- /assets/table-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HITsz-TMG/Agentic-CIGEval/HEAD/assets/table-2.png -------------------------------------------------------------------------------- /assets/framework.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HITsz-TMG/Agentic-CIGEval/HEAD/assets/framework.png -------------------------------------------------------------------------------- /assets/overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HITsz-TMG/Agentic-CIGEval/HEAD/assets/overview.png -------------------------------------------------------------------------------- /assets/table-40p.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HITsz-TMG/Agentic-CIGEval/HEAD/assets/table-40p.png -------------------------------------------------------------------------------- /tools/ocr.py: -------------------------------------------------------------------------------- 1 | from paddleocr import PaddleOCR,draw_ocr 2 | ocr = PaddleOCR(lang='en') # need to run only once to download and load model into memory 3 | 4 | def OCR(image): 5 | result = ocr.ocr(image, cls=False) 6 | texts = [] 7 | for idx in range(len(result)): 8 | res = result[idx] 9 | for line in res: 10 | texts.append(line[1][0]) 11 | return texts 12 | 13 | -------------------------------------------------------------------------------- /tools/scene_graph.py: -------------------------------------------------------------------------------- 1 | from agent.openai import GPT4o,QWen25 2 | from util import clean_text 3 | 4 | sgPrompt=''' 5 | For the provided image, generate only a scene graph in JSON format that includes the following: 6 | 1. Objects that are relevant to describing the image content in detail 7 | 2. Object attributes that are relevant to describing the image content in detail 8 | 3. 
Object relationships that are relevant to describing the image content in detail 9 | ''' 10 | 11 | # SG_generator = QWen25() 12 | SG_generator = GPT4o() 13 | 14 | def sg_generate(img_links): 15 | prompt_content = SG_generator.prepare_prompt(img_links, sgPrompt) 16 | sg = SG_generator.get_result(prompt_content) 17 | print(sg) 18 | return clean_text(sg) -------------------------------------------------------------------------------- /tools/com.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from PIL import Image, ImageDraw 3 | 4 | 5 | def mark(image, bbox_list): 6 | if isinstance(image, str): 7 | image = Image.open(image) 8 | image_copy = image.copy() 9 | draw = ImageDraw.Draw(image_copy) 10 | for bbox in bbox_list: 11 | draw.rectangle(bbox, outline='red', width=3) 12 | return image_copy 13 | 14 | 15 | def highlight(image, bbox_list): 16 | if isinstance(image, str): 17 | image = Image.open(image) 18 | np_img = np.array(image) 19 | np_ori = np_img.copy() 20 | if len(bbox_list)>0: 21 | np_img //= 4 22 | for bbox in bbox_list: 23 | np_img[bbox[1]:bbox[3], bbox[0]:bbox[2]] = np_ori[bbox[1]:bbox[3], bbox[0]:bbox[2]] 24 | image_h = Image.fromarray(np_img) 25 | return image_h 26 | 27 | 28 | def segment(image, bbox): 29 | if isinstance(image, str): 30 | image = Image.open(image) 31 | np_img = np.array(image) 32 | bbox_area = np_img[bbox[1]:bbox[3], bbox[0]:bbox[2]] 33 | image_s = Image.fromarray(bbox_area) 34 | return image_s 35 | 36 | 37 | def split2part(image): 38 | if isinstance(image, str): 39 | img = Image.open(image) 40 | else: 41 | img = image 42 | w = img.width 43 | h = img.height 44 | box_L = (0,0,w*0.5,h) 45 | box_R = (w*0.5,0,w,h) 46 | img_L = img.crop(box_L) 47 | img_R = img.crop(box_R) 48 | return img_L,img_R -------------------------------------------------------------------------------- /qwen.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | os.environ["CUDA_VISIBLE_DEVICES"] = "6,5" 4 | from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor 5 | from modelscope import Qwen2VLForConditionalGeneration 6 | from peft import PeftModel 7 | from qwen_vl_utils import process_vision_info 8 | import torch 9 | import flask 10 | from flask import Flask, request, jsonify 11 | 12 | # torch.cuda.empty_cache() 13 | app = Flask(__name__) 14 | model_dir = "path_to/Qwen2-VL-7B-Instruct" 15 | # model_dir = "path_to/Qwen2.5-VL-7B-Instruct" 16 | # model_dir = "path_to/Qwen2.5-VL-7B-Instruct-sft" 17 | # model_dir = "path_to/Qwen2-VL-7B-Instruct-sft" 18 | # lora_path = "" 19 | 20 | 21 | 22 | 23 | model = Qwen2VLForConditionalGeneration.from_pretrained( 24 | model_dir, 25 | torch_dtype=torch.bfloat16, 26 | attn_implementation="flash_attention_2", 27 | device_map="auto", 28 | ) 29 | 30 | 31 | # model = Qwen2_5_VLForConditionalGeneration.from_pretrained( 32 | # model_dir, 33 | # torch_dtype=torch.bfloat16, 34 | # attn_implementation="flash_attention_2", 35 | # device_map="auto", 36 | # ) 37 | # if lora_path: 38 | # model = PeftModel.from_pretrained(model, lora_path, torch_dtype=torch.bfloat16, device_map="auto") 39 | 40 | model.eval() 41 | processor = AutoProcessor.from_pretrained(model_dir) 42 | 43 | 44 | def format(images, text): 45 | content = [] 46 | for img in images: 47 | content.append({"type": "image", "image": img}) 48 | content.append({"type": "text", "text": text}) 49 | messages = [ 50 | { 51 | "role": "user", 52 | "content": content, 53 | } 54 | 
] 55 | return messages 56 | 57 | 58 | @app.route('/generate', methods=['POST']) 59 | def generate(): 60 | msg = flask.request.get_json(force=True) 61 | imgs = msg['imgs'] 62 | text = msg['text'] 63 | messages = format(imgs, text) 64 | ######## 65 | text = processor.apply_chat_template( 66 | messages, tokenize=False, add_generation_prompt=True,add_vision_id=True 67 | ) 68 | image_inputs, video_inputs = process_vision_info(messages) 69 | inputs = processor( 70 | text=[text], 71 | images=image_inputs, 72 | videos=video_inputs, 73 | padding=True, 74 | return_tensors="pt", 75 | ) 76 | inputs = inputs.to(model.device) 77 | 78 | # Inference: Generation of the output 79 | with torch.no_grad(): 80 | generated_ids = model.generate(**inputs, max_new_tokens=1024) 81 | generated_ids_trimmed = [ 82 | out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids) 83 | ] 84 | output_text = processor.batch_decode( 85 | generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False 86 | )[0].strip() 87 | 88 | return jsonify({"response": output_text}) 89 | 90 | # except Exception as e: 91 | # return jsonify({"error": str(e)}) 92 | 93 | 94 | 95 | if __name__ == "__main__": 96 | app.run(host='0.0.0.0', port=8080) -------------------------------------------------------------------------------- /tools/diff.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | from PIL import Image 4 | from com import mark, highlight 5 | 6 | threshold = 0.5 7 | top_n = 1 8 | 9 | def detect_diff(src_img, dst_img): 10 | 11 | height, width = dst_img.shape[:2] 12 | total_area = height * width 13 | # Calculate area threshold (0.5% of total image area) 14 | area_threshold = total_area * (threshold / 100) 15 | 16 | src_img = cv2.GaussianBlur(src_img, [5, 5], 0) 17 | dst_img = cv2.GaussianBlur(dst_img, [5, 5], 0) 18 | diff = cv2.absdiff(src_img, dst_img) 19 | gray = cv2.cvtColor(diff, cv2.COLOR_BGR2GRAY) 20 | # Apply thresholding to get a binary image (45) 21 | _, result = cv2.threshold(gray, 45, 255, cv2.THRESH_BINARY) 22 | 23 | result = cv2.dilate(result, np.ones([3, 3])) 24 | contours, _ = cv2.findContours(result, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) 25 | rect_pos = [] 26 | for c in contours: 27 | x, y, w, h = cv2.boundingRect(c) 28 | area = w * h 29 | if area > area_threshold: 30 | rect_pos.append((x, y, w, h)) 31 | 32 | 33 | # Merge overlapping rectangles 34 | merged_rects = [] 35 | while rect_pos: 36 | rect = rect_pos.pop(0) 37 | x, y, w, h = rect 38 | 39 | merge = False 40 | for other_rect in rect_pos[:]: 41 | ox, oy, ow, oh = other_rect 42 | if x < ox + ow and x + w > ox and y < oy + oh and y + h > oy: 43 | x = min(x, ox) 44 | y = min(y, oy) 45 | w = max(x + w, ox + ow) - x 46 | h = max(y + h, oy + oh) - y 47 | 48 | rect_pos.remove(other_rect) 49 | merge = True 50 | 51 | merged_rects.append((x, y, w, h)) 52 | 53 | if not merge: 54 | break 55 | 56 | areas = [] 57 | for C in merged_rects: 58 | x, y, w, h = C 59 | area = w * h 60 | areas.append((x, y, w, h, area)) 61 | areas.sort(key=lambda x: x[4], reverse=True) 62 | 63 | pos = [] 64 | for i in range(min(top_n, len(areas))): 65 | x, y, w, h, _ = areas[i] 66 | pos.append((x, y, x+w, y+h)) 67 | 68 | return pos 69 | 70 | 71 | def imgs_diff(src_img_path, dst_img_path, function="highlight"): 72 | src_img = cv2.imread(src_img_path) 73 | dst_img = cv2.imread(dst_img_path) 74 | if src_img.shape != dst_img.shape: 75 | dst_img = cv2.resize(dst_img, (src_img.shape[1], 
src_img.shape[0])) 76 | 77 | # Get the rectangular coordinates of the difference area 78 | rects = detect_diff(src_img, dst_img) 79 | 80 | if function == "highlight": 81 | src_highlight_img = highlight(src_img_path, rects) 82 | dst_highlight_img = highlight(dst_img_path, rects) 83 | return src_highlight_img, dst_highlight_img 84 | 85 | else: 86 | # Mark the difference areas on the image 87 | src_mark_img = mark(src_img_path, rects) 88 | dst_mark_img = mark(dst_img_path, rects) 89 | return src_mark_img, dst_mark_img -------------------------------------------------------------------------------- /spear.py: -------------------------------------------------------------------------------- 1 | from util import * 2 | 3 | human_ratings = "path_to_ImagenHub_human_eval_results" 4 | 5 | 6 | def read_human_sc(task, model, sample): 7 | import pandas as pd 8 | import numpy as np 9 | import ast 10 | 11 | mm = task.replace("ImagenHub_","") 12 | 13 | df1 = pd.read_csv(os.path.join(human_ratings,f"{task}/{mm}_rater1.tsv"), sep="\t") 14 | df2 = pd.read_csv(os.path.join(human_ratings,f"{task}/{mm}_rater2.tsv"), sep="\t") 15 | df3 = pd.read_csv(os.path.join(human_ratings,f"{task}/{mm}_rater3.tsv"), sep="\t") 16 | 17 | cell_value_1 = df1.loc[df1['uid'] == sample, model].values 18 | cell_value_2 = df2.loc[df2['uid'] == sample, model].values 19 | cell_value_3 = df3.loc[df3['uid'] == sample, model].values 20 | 21 | sc_1 = ast.literal_eval(cell_value_1[0])[0] 22 | sc_2 = ast.literal_eval(cell_value_2[0])[0] 23 | sc_3 = ast.literal_eval(cell_value_3[0])[0] 24 | 25 | 26 | return np.mean([sc_1,sc_2,sc_3]) 27 | # return [sc_1,sc_2,sc_3] 28 | 29 | 30 | def preprocess(_list): 31 | temp_list = [] 32 | for scores in _list: 33 | if isinstance(scores, (int, float)): 34 | temp_list.append(map_to_nearest_higher(scores/10.0)) 35 | else: 36 | scores = [int(score) for score in scores] 37 | # temp_list.append(map_to_nearest_higher(min(scores))) 38 | temp_list.append(map_to_nearest_higher(min(scores)/10.0)) 39 | return temp_list 40 | 41 | 42 | def sigfig(number, sigfigs=4, digit_mode=True): 43 | if digit_mode: 44 | string_mode = '{:#.{sigfigs}f}' 45 | else: 46 | string_mode = '{:#.{sigfigs}g}' 47 | if isinstance(number, list): 48 | new_numbers = [] 49 | for num in number: 50 | new_num = string_mode.format(num, sigfigs=sigfigs) 51 | new_numbers.append(float(new_num)) 52 | return new_numbers 53 | else: 54 | return float(string_mode.format(number, sigfigs=sigfigs)) 55 | 56 | 57 | def map_to_nearest_higher(number, target_numbers=[0.0, 0.17, 0.33, 0.5, 0.67, 0.83, 1.0], not_mapping=True): 58 | if not_mapping: 59 | if number > 1.0: 60 | return 1.0 61 | if number < 0.0: 62 | return 0.0 63 | return number 64 | 65 | # Find the nearest higher number 66 | for target in target_numbers: 67 | if target >= number: 68 | return target 69 | return target_numbers[-1] # Return the maximum if no higher number is found 70 | 71 | 72 | def average_correlation(z_scores): 73 | import math 74 | # Calculate the average Z score 75 | z_avg = sum(z_scores) / len(z_scores) 76 | 77 | # Convert the average Z score back to a correlation coefficient 78 | r_avg = (math.exp(2 * z_avg) - 1) / (math.exp(2 * z_avg) + 1) 79 | return r_avg 80 | 81 | 82 | 83 | 84 | from scipy.stats import spearmanr 85 | import numpy as np 86 | import ast 87 | task="" 88 | model="" 89 | # Read the identifiers of the evaluation images under each task/model and organize them into a list 90 | keys=[] 91 | # Read the automated evaluation results and organize them into a list 92 | SC_gpt4o = [] 
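# --- Illustrative sketch (editor-added commentary, not part of the original spear.py) --------
# One possible way to populate `keys` and `SC_gpt4o` from the per-model JSON written by
# evaluate.py / evaluate_vie.py (entries shaped like {uid: {"score": ..., ...}}); the file
# path and score parsing below are assumptions -- adapt them to your own output files.
#
#   eva = read_json("path_to_task/model/SC_eva_qwen25_72b_vie.json")
#   keys = list(eva.keys())
#   SC_gpt4o = preprocess([ast.literal_eval(str(eva[k]["score"])) for k in keys])
# ---------------------------------------------------------------------------------------------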
93 | 94 | SC_human = [read_human_sc(task, model, key) for key in keys] 95 | SC_rho, _ = spearmanr(SC_gpt4o, SC_human) 96 | print(task, model, "SC|", sigfig(SC_rho)) 97 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CIGEval 2 |

3 | [📃 Paper](https://arxiv.org/abs/2504.07046) 4 |

5 | In this work, we propose CIGEval, a unified agentic framework for comprehensive evaluation of conditional image generation tasks. CIGEval utilizes large multimodal models (LMMs) as its core, integrating a multi-functional toolbox to enable fine-grained evaluation. Please check out our paper "A Unified Agentic Framework for Evaluating Conditional Image Generation". 6 | 7 | 8 | ## 🌟 Framework 9 | CIGEval adopts a divide-and-conquer scheme for evaluating images generated under multiple conditions. For each sub-question, CIGEval selects the most suitable tool from its toolbox, focusing on the specific aspect of evaluation. Then, the LMM analyzes the tool outputs and assigns scores. 10 |
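As a rough illustration of this loop, the sketch below shows how the pieces could fit together. It is not code from this repository: `cigeval_score` and its control flow are simplified assumptions, and only `prepare_prompt`/`get_result` mirror the actual `GPT4o` interface in `agent/openai.py`; the real pipeline lives in `evaluate.py`, `agent/prompt_agent.py`, and `tools/`.
```
# Minimal sketch, NOT the repository's evaluate.py: hypothetical glue around the GPT4o client.
from agent.openai import GPT4o

def cigeval_score(images, sub_question_prompts, toolbox):
    """images: list of image paths; sub_question_prompts: one evaluation prompt per aspect
    (see agent/prompt_agent.py); toolbox: dict mapping tool names to callables (see tools/)."""
    lmm = GPT4o()
    scores = []
    for sub_task in sub_question_prompts:
        # 1. Ask the LMM whether a tool (Highlight / SceneGraph / MaskFocus) would help here.
        decision = lmm.get_result(lmm.prepare_prompt(images, "Which tool, if any?\n" + sub_task))
        tool = toolbox.get((decision or "").strip())
        # 2. Run the chosen tool to produce focused visual evidence for this sub-question.
        evidence = tool(images) if tool else images
        # 3. The LMM analyzes the evidence and assigns a 0-10 score for this aspect.
        answer = lmm.get_result(lmm.prepare_prompt(evidence, sub_task))
        scores.append(int(answer) if answer and answer.strip().isdigit() else 0)
    return min(scores) / 10.0 if scores else 0.0  # min-aggregation mirrors preprocess() in spear.py
```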

*(Framework figure omitted here; see `assets/framework.png` in the repository.)*
14 | 15 | 16 | 17 | ## ⚡️ Installation 18 | Run the following command to set up the environment. 19 | ``` 20 | conda env create -f environment.yml 21 | ``` 22 | The toolbox contains a grounding tool. Please refer to [GroundingDINO](https://github.com/IDEA-Research/GroundingDINO) to configure its environment and model. In addition, you need to download the evaluation images and human ratings from [ImagenHub](https://tiger-ai-lab.github.io/ImagenHub/). 23 | 24 | 25 | ## ✨ Conditional Image Synthesis Evaluation 26 | Run the following command to use CIGEval to evaluate the 7 conditional image synthesis tasks: 27 | ``` 28 | python evaluate.py 29 | ``` 30 | We also provide the [VIEScore](https://github.com/TIGER-AI-Lab/VIEScore) evaluation method: 31 | ``` 32 | python evaluate_vie.py 33 | ``` 34 | Then you can use `spear.py` to calculate the Spearman correlation between the evaluation results and human ratings. 35 | 36 | ## 📌 SFT 37 |

To empower smaller LMMs as effective evaluators, we perform supervised fine-tuning on 7B models to equip them with agentic capabilities. The ImagenHub data is randomly split into training and test sets at a 6:4 ratio. 38 |

39 |

For the training set, we employ GPT-4o to carry out CIGEval's evaluation process, then keep only the trajectories whose evaluation results differ from human scores by less than 0.3, resulting in 2,274 high-quality trajectories for supervised fine-tuning. Using this structured trajectory data, we fine-tune Qwen2-VL-7B-Instruct and Qwen2.5-VL-7B-Instruct. You can visit trajectories to get the SFT data, and visit Qwen2-VL-7B-Instruct-sft and Qwen2.5-VL-7B-Instruct-sft to get our fine-tuned models. 40 |

41 |

The test set is stored in `test_40p.json`. You can run `run_test_40p.py` to obtain a locally hosted model's results on the test set. 42 |
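When evaluating with a locally hosted model, `qwen.py` exposes it as a small Flask service on port 8080 (the `LlavaNext` client in `agent/openai.py` posts to the same endpoint). Below is a minimal sketch of that round trip; the image path and prompt are placeholders:
```
# Sketch: query the local server started with `python qwen.py` (listens on 0.0.0.0:8080).
import requests

payload = {
    "imgs": ["/path/to/generated_image.png"],  # placeholder path; forwarded to the Qwen processor
    "text": "Describe the edited region.",      # placeholder prompt
}
resp = requests.post("http://127.0.0.1:8080/generate", json=payload, timeout=600)
print(resp.json()["response"])                  # qwen.py returns {"response": <decoded text>}
```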

43 | 44 | ## ⚖️ Paper Results 45 |

*(Result tables omitted here; see the `assets/table-*.png` files in the repository.)*
51 | 52 | 53 | 54 | ## 📚 Citation 55 | 56 | If you found this repository useful, please consider cite our paper: 57 | 58 | ```bibtex 59 | @misc{wang2025cigeval, 60 | title={A Unified Agentic Framework for Evaluating Conditional Image Generation}, 61 | author={Jifang Wang and Xue Yang and Longyue Wang and Zhenran Xu and Yiyu Wang and Yaowei Wang and Weihua Luo and Kaifu Zhang and Baotian Hu and Min Zhang}, 62 | year={2025}, 63 | eprint={2504.07046}, 64 | archivePrefix={arXiv}, 65 | primaryClass={cs.CV}, 66 | url={https://arxiv.org/abs/2504.07046}, 67 | } 68 | ``` 69 | -------------------------------------------------------------------------------- /agent/openai.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import requests 3 | from io import BytesIO 4 | from PIL import Image, ImageOps 5 | import os 6 | import openai 7 | from openai import OpenAI 8 | 9 | 10 | class GPT4o(): 11 | def __init__(self, model_name="gpt-4o-2024-05-13", api_key = "API_KEY", base_url = "BASE_URL"): 12 | self.model_name = model_name 13 | self.client = OpenAI(api_key = api_key, base_url = base_url) 14 | 15 | def prepare_prompt(self, image_links = [], text_prompt = ""): 16 | if not isinstance(image_links, list): 17 | image_links = [image_links] 18 | 19 | prompt_content = [] 20 | text_dict = { 21 | "type": "text", 22 | "text": text_prompt 23 | } 24 | prompt_content.append(text_dict) 25 | 26 | for image_link in image_links: 27 | if "base64" not in image_link: 28 | img = load_image(image_link) 29 | image_link = f"data:image/jpeg;base64,{encode_pil_image(img)}" 30 | visual_dict = { 31 | "type": "image_url", 32 | "image_url": {"url": image_link} 33 | } 34 | prompt_content.append(visual_dict) 35 | return prompt_content 36 | 37 | 38 | def get_result(self, prompt): 39 | try: 40 | response = self.client.chat.completions.create( 41 | model = self.model_name, 42 | messages = [ 43 | { 44 | "role": "user", 45 | "content": prompt 46 | } 47 | ] 48 | ) 49 | 50 | out = response.choices[0].message.content 51 | return out 52 | except Exception as e: 53 | print(f"Error: {e}") 54 | return None 55 | 56 | 57 | 58 | 59 | 60 | class QWen25(GPT4o): 61 | def __init__(self, model_name="qwen2.5-vl-72b-instruct", api_key = "API_KEY", base_url = "BASE_URL"): 62 | super().__init__(model_name, api_key, base_url) 63 | 64 | 65 | 66 | 67 | 68 | class LlavaNext(GPT4o): 69 | def __init__(self): 70 | self.address = "http://127.0.0.1:8080/generate" 71 | 72 | def get_result(self, images, text): 73 | if not isinstance(images, list): 74 | images = [images] 75 | data = {'imgs':images, 'text':text} 76 | try: 77 | response = requests.post(self.address, json = data).json() 78 | out = response['response'] 79 | return out 80 | 81 | except Exception as e: 82 | print(f"Error: {e}") 83 | return None 84 | 85 | 86 | 87 | ############################################################### Image Processing Functions 88 | ############################################################### 89 | def load_image(image, format = "RGB"): 90 | if isinstance(image, str): 91 | if image.startswith("http://") or image.startswith("https://"): 92 | image = Image.open(requests.get(image, stream=True).raw) 93 | elif os.path.isfile(image): 94 | image = Image.open(image) 95 | else: 96 | raise ValueError( 97 | f"{image} is not a valid path or url." 98 | ) 99 | elif isinstance(image, Image.Image): 100 | image = image 101 | else: 102 | raise ValueError( 103 | "Incorrect format used for image. 
Should be an url linking to an image, a local path, or a PIL image." 104 | ) 105 | image = ImageOps.exif_transpose(image) 106 | image = image.convert(format) 107 | return image 108 | 109 | 110 | def encode_pil_image(pil_image, format="JPEG"): 111 | image_stream = BytesIO() 112 | pil_image.save(image_stream, format=format) 113 | image_data = image_stream.getvalue() 114 | base64_image = base64.b64encode(image_data).decode('utf-8') 115 | return base64_image 116 | 117 | ############################################################### Image Processing Functions 118 | ############################################################### -------------------------------------------------------------------------------- /tools/grding.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import re 4 | import torch 5 | from collections import defaultdict 6 | from groundingdino.util.inference import load_model, load_image, predict 7 | from torchvision.ops import box_convert 8 | from PIL import Image, ImageDraw 9 | 10 | 11 | os.environ["CUDA_VISIBLE_DEVICES"] = "6" 12 | 13 | class Grounding_Module(): 14 | def __init__(self, base_dir): 15 | self.model = load_model( 16 | os.path.join(base_dir, "groundingdino/config/GroundingDINO_SwinT_OGC.py"), 17 | os.path.join(base_dir, "weights/groundingdino_swint_ogc.pth") 18 | ) 19 | 20 | def forward(self, img, prompt, bbox_thrd, text_thrd, do_clean=True): 21 | img_source, img = load_image(image_path=img) 22 | w, h = img_source.shape[1], img_source.shape[0] 23 | boxes, logits, phrases = predict( 24 | model=self.model, 25 | image=img, 26 | caption=prompt, 27 | box_threshold=bbox_thrd, 28 | text_threshold=text_thrd, 29 | ) 30 | boxes = boxes * torch.Tensor([w, h, w, h]) 31 | boxes = box_convert(boxes=boxes, in_fmt="cxcywh", out_fmt="xyxy").numpy() 32 | boxes = list(boxes) 33 | logits = logits.numpy() 34 | logits = list(logits) 35 | res = [] 36 | for bbox, logit, phrase in zip(boxes, logits, phrases): 37 | res.append((list([int(xy) for xy in bbox]), logit, phrase)) 38 | if do_clean: 39 | res = self._clean_bbox(res) 40 | return sorted(res, key=lambda x: x[1], reverse=True) 41 | 42 | def _clean_bbox(self, bbox_list): 43 | def get_range(bbox): 44 | return (bbox[2]-bbox[0]) * (bbox[3]-bbox[1]) 45 | def check_recap(bbox1, bbox2): 46 | if bbox2[0]bbox1[2] and bbox2[3]>bbox1[3]: 47 | return True 48 | return False 49 | 50 | bbox_list = sorted(bbox_list, key=lambda x: get_range(x[0])) 51 | cleaned_bbox_list = [] 52 | for bbox in bbox_list: 53 | if len(bbox_list) == 0: 54 | cleaned_bbox_list.append(bbox) 55 | continue 56 | 57 | flag = True 58 | for cleaned_bbox in cleaned_bbox_list: 59 | if check_recap(cleaned_bbox[0], bbox[0]): 60 | flag = False 61 | break 62 | if flag: 63 | cleaned_bbox_list.append(bbox) 64 | return cleaned_bbox_list 65 | 66 | 67 | 68 | 69 | 70 | def filter_bboxes_by_max_logit(a): 71 | phrase_dict = defaultdict(lambda: (None, float('-inf'))) 72 | for bbox, logit, phrase in a: 73 | if logit > phrase_dict[phrase][1]: 74 | phrase_dict[phrase] = (bbox, logit) 75 | filtered_list = [bbox for phrase, (bbox, logit) in phrase_dict.items()] 76 | return filtered_list 77 | 78 | 79 | 80 | def tolist(a): 81 | box_list = [] 82 | for bbox, logit, phrase in a: 83 | box_list.append(bbox) 84 | return box_list 85 | 86 | 87 | grounding_module = Grounding_Module("PATH_TO_GroundingDINO") 88 | 89 | 90 | def grding(img_path, text, function): 91 | res = grounding_module.forward(img_path, text, bbox_thrd=0.2, text_thrd=0.2, 
do_clean=True) # (bbox, logit, phrase) 92 | 93 | if len(res) == 0: 94 | print("Grounding: no result") 95 | if isinstance(img_path, str): 96 | img_path = Image.open(image_path).convert("RGB") 97 | if function=="highlight++": 98 | return(img_path, res) 99 | return img_path 100 | else: 101 | if function=="mark": 102 | res = filter_bboxes_by_max_logit(res) 103 | return(mark(img_path, res)) 104 | if function=="highlight": 105 | res = filter_bboxes_by_max_logit(res) 106 | return(highlight(img_path, res)) 107 | if function=="segment": 108 | return(segment(img_path, res[-1][0])) 109 | if function=="highlight++": 110 | res = filter_bboxes_by_max_logit(res) 111 | return(highlight(img_path, res), res) 112 | 113 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: pige 2 | channels: 3 | - http://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/msys2 4 | - http://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/pro 5 | - http://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/r 6 | - http://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free 7 | - http://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/main 8 | - defaults 9 | dependencies: 10 | - _libgcc_mutex=0.1=main 11 | - _openmp_mutex=5.1=1_gnu 12 | - ca-certificates=2024.11.26=h06a4308_0 13 | - ld_impl_linux-64=2.40=h12ee557_0 14 | - libffi=3.4.4=h6a678d5_1 15 | - libgcc-ng=11.2.0=h1234567_1 16 | - libgomp=11.2.0=h1234567_1 17 | - libstdcxx-ng=11.2.0=h1234567_1 18 | - ncurses=6.4=h6a678d5_0 19 | - openssl=3.0.15=h5eee18b_0 20 | - pip=24.2=py39h06a4308_0 21 | - python=3.9.18=h955ad1f_0 22 | - readline=8.2=h5eee18b_0 23 | - setuptools=75.1.0=py39h06a4308_0 24 | - sqlite=3.45.3=h5eee18b_0 25 | - tk=8.6.14=h39e8969_0 26 | - wheel=0.44.0=py39h06a4308_0 27 | - xz=5.4.6=h5eee18b_1 28 | - zlib=1.2.13=h5eee18b_1 29 | - pip: 30 | - absl-py==2.1.0 31 | - accelerate==1.2.1 32 | - addict==2.4.0 33 | - albucore==0.0.23 34 | - albumentations==2.0.5 35 | - annotated-types==0.7.0 36 | - antlr4-python3-runtime==4.8 37 | - anyio==4.7.0 38 | - anytree==2.12.1 39 | - appdirs==1.4.4 40 | - astor==0.8.1 41 | - asttokens==3.0.0 42 | - av==14.1.0 43 | - beautifulsoup4==4.13.3 44 | - black==21.4b2 45 | - blinker==1.9.0 46 | - boto3==1.35.87 47 | - botocore==1.35.87 48 | - certifi==2024.12.14 49 | - charset-normalizer==3.4.0 50 | - click==8.1.8 51 | - cloudpickle==3.1.0 52 | - cmake==3.31.2 53 | - colorama==0.4.6 54 | - contourpy==1.3.0 55 | - cycler==0.12.1 56 | - cython==3.0.11 57 | - dataclasses==0.6 58 | - decorator==5.2.1 59 | - decord==0.6.0 60 | - deepspeed==0.7.0 61 | - defusedxml==0.7.1 62 | - distro==1.9.0 63 | - einops==0.4.1 64 | - eval-type-backport==0.2.2 65 | - exceptiongroup==1.2.2 66 | - executing==2.1.0 67 | - fasttext==0.9.3 68 | - filelock==3.16.1 69 | - fire==0.7.0 70 | - flash-attn==2.7.3 71 | - flask==3.1.0 72 | - fonttools==4.55.3 73 | - fsspec==2024.10.0 74 | - ftfy==6.3.1 75 | - future==1.0.0 76 | - fvcore==0.1.5.post20221221 77 | - grpcio==1.68.1 78 | - h11==0.14.0 79 | - hjson==3.1.0 80 | - httpcore==1.0.7 81 | - httpx==0.28.1 82 | - huggingface-hub==0.27.0 83 | - hydra-core==1.1.2 84 | - icecream==2.1.3 85 | - idna==3.10 86 | - imageio==2.36.1 87 | - importlib-metadata==8.5.0 88 | - importlib-resources==6.4.5 89 | - inflect==7.4.0 90 | - iopath==0.1.9 91 | - itsdangerous==2.2.0 92 | - jinja2==3.1.4 93 | - jiter==0.8.2 94 | - jmespath==1.0.1 95 | - joblib==1.4.2 96 | - kiwisolver==1.4.7 97 | - lazy-loader==0.4 98 | - levenshtein==0.26.1 
99 | - lit==18.1.8 100 | - lmdb==1.6.2 101 | - lvis==0.5.3 102 | - lxml==5.3.1 103 | - markdown==3.7 104 | - markupsafe==3.0.2 105 | - matplotlib==3.9.4 106 | - modelscope==1.22.3 107 | - more-itertools==10.5.0 108 | - mpmath==1.3.0 109 | - mss==10.0.0 110 | - mypy-extensions==1.0.0 111 | - networkx==3.2.1 112 | - ninja==1.11.1.3 113 | - nltk==3.9.1 114 | - numpy==2.0.2 115 | - nvidia-cublas-cu11==11.10.3.66 116 | - nvidia-cublas-cu12==12.4.5.8 117 | - nvidia-cuda-cupti-cu11==11.7.101 118 | - nvidia-cuda-cupti-cu12==12.4.127 119 | - nvidia-cuda-nvrtc-cu11==11.7.99 120 | - nvidia-cuda-nvrtc-cu12==12.4.127 121 | - nvidia-cuda-runtime-cu11==11.7.99 122 | - nvidia-cuda-runtime-cu12==12.4.127 123 | - nvidia-cudnn-cu11==8.5.0.96 124 | - nvidia-cudnn-cu12==9.1.0.70 125 | - nvidia-cufft-cu11==10.9.0.58 126 | - nvidia-cufft-cu12==11.2.1.3 127 | - nvidia-curand-cu11==10.2.10.91 128 | - nvidia-curand-cu12==10.3.5.147 129 | - nvidia-cusolver-cu11==11.4.0.1 130 | - nvidia-cusolver-cu12==11.6.1.9 131 | - nvidia-cusparse-cu11==11.7.4.91 132 | - nvidia-cusparse-cu12==12.3.1.170 133 | - nvidia-nccl-cu11==2.14.3 134 | - nvidia-nccl-cu12==2.21.5 135 | - nvidia-nvjitlink-cu12==12.4.127 136 | - nvidia-nvtx-cu11==11.7.91 137 | - nvidia-nvtx-cu12==12.4.127 138 | - omegaconf==2.1.2 139 | - openai==1.58.1 140 | - opencv-contrib-python==4.11.0.86 141 | - opencv-python==4.5.5.64 142 | - opencv-python-headless==4.11.0.86 143 | - opt-einsum==3.3.0 144 | - packaging==24.2 145 | - paddleocr==2.10.0 146 | - paddlepaddle==3.0.0rc1 147 | - pandas==2.2.3 148 | - pathspec==0.12.1 149 | - peft==0.14.0 150 | - pillow==11.0.0 151 | - platformdirs==4.3.6 152 | - portalocker==3.0.0 153 | - protobuf==6.30.1 154 | - psutil==6.1.1 155 | - py-cpuinfo==9.0.0 156 | - pyarrow==19.0.0 157 | - pybind11==2.13.6 158 | - pyclipper==1.3.0.post6 159 | - pycocotools==2.0.8 160 | - pydantic==2.10.3 161 | - pydantic-core==2.27.1 162 | - pydot==3.0.3 163 | - pygments==2.18.0 164 | - pyparsing==3.2.0 165 | - python-dateutil==2.9.0.post0 166 | - python-docx==1.1.2 167 | - pytz==2025.1 168 | - pyyaml==6.0.2 169 | - qwen-vl-utils==0.0.8 170 | - rapidfuzz==3.11.0 171 | - regex==2024.11.6 172 | - requests==2.32.3 173 | - s3transfer==0.10.4 174 | - safetensors==0.4.5 175 | - scikit-image==0.24.0 176 | - scikit-learn==1.6.0 177 | - scipy==1.13.1 178 | - shapely==2.0.7 179 | - simsimd==6.2.1 180 | - six==1.17.0 181 | - sniffio==1.3.1 182 | - soupsieve==2.6 183 | - stringzilla==3.12.3 184 | - supervision==0.25.1 185 | - sympy==1.13.1 186 | - tabulate==0.9.0 187 | - tensorboard==2.18.0 188 | - tensorboard-data-server==0.7.2 189 | - termcolor==2.5.0 190 | - threadpoolctl==3.5.0 191 | - tifffile==2024.8.30 192 | - timm==1.0.12 193 | - tokenizers==0.21.0 194 | - toml==0.10.2 195 | - tomli==2.2.1 196 | - torch==2.0.0 197 | - torchvision==0.15.1 198 | - tqdm==4.67.1 199 | - transformers==4.47.1 200 | - triton==2.0.0 201 | - typeguard==4.4.1 202 | - typing-extensions==4.12.2 203 | - tzdata==2025.1 204 | - urllib3==1.26.20 205 | - wcwidth==0.2.13 206 | - werkzeug==3.1.3 207 | - yacs==0.1.8 208 | - yapf==0.43.0 209 | - zipp==3.21.0 210 | 211 | -------------------------------------------------------------------------------- /util.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import re 4 | from agent.openai import GPT4o 5 | from typing import Union 6 | import base64 7 | import requests 8 | import Levenshtein 9 | from io import BytesIO 10 | from PIL import Image, ImageOps 11 | 12 | gpt = GPT4o() 
13 | 14 | 15 | def url2path(url, root): 16 | its = url.split("/") 17 | return os.path.join(root, its[-3], its[-2], its[-1]) 18 | 19 | 20 | def merge_images(images): 21 | if len(images) == 0: 22 | return None 23 | if len(images) == 1: 24 | return images[0] 25 | widths, heights = zip(*(i.size for i in images)) 26 | average_height = sum(heights) // len(heights) 27 | for i, im in enumerate(images): 28 | # scale in proportion 29 | images[i] = im.resize((int(im.size[0] * average_height / im.size[1]), average_height)) 30 | widths, heights = zip(*(i.size for i in images)) 31 | total_width = sum(widths) 32 | max_height = max(heights) 33 | new_im = Image.new("RGB", (total_width + 10 * (len(images) - 1), max_height)) 34 | x_offset = 0 35 | for i, im in enumerate(images): 36 | if i > 0: 37 | # past a column of 1 pixel starting from x_offset width being black, 8 pixels being white, and 1 pixel being black 38 | new_im.paste(Image.new("RGB", (1, max_height), (0, 0, 0)), (x_offset, 0)) 39 | x_offset += 1 40 | new_im.paste(Image.new("RGB", (8, max_height), (255, 255, 255)), (x_offset, 0)) 41 | x_offset += 8 42 | new_im.paste(Image.new("RGB", (1, max_height), (0, 0, 0)), (x_offset, 0)) 43 | x_offset += 1 44 | new_im.paste(im, (x_offset, 0)) 45 | x_offset += im.size[0] 46 | return new_im 47 | 48 | 49 | # Function to encode a PIL image 50 | def encode_pil_image(pil_image, format="JPEG"): 51 | image_stream = BytesIO() 52 | pil_image.save(image_stream, format=format) 53 | image_data = image_stream.getvalue() 54 | base64_image = base64.b64encode(image_data).decode('utf-8') 55 | return base64_image 56 | 57 | 58 | def load_image(image: Union[str, Image.Image], format = "RGB") -> Image.Image: 59 | if isinstance(image, str): 60 | if image.startswith("http://") or image.startswith("https://"): 61 | image = Image.open(requests.get(image, stream=True).raw) 62 | elif os.path.isfile(image): 63 | image = Image.open(image) 64 | else: 65 | raise ValueError( 66 | f"{image} is not a valid path or url." 67 | ) 68 | elif isinstance(image, Image.Image): 69 | image = image 70 | else: 71 | raise ValueError( 72 | "Incorrect format used for image. Should be an url linking to an image, a local path, or a PIL image." 
73 | ) 74 | image = ImageOps.exif_transpose(image) 75 | image = image.convert(format) 76 | return image 77 | 78 | 79 | def read_json(input_path): 80 | with open(input_path, 'r', encoding='utf-8') as f: 81 | return json.load(f) 82 | 83 | 84 | def write_json(output_path, output_data): 85 | with open(output_path, 'w', encoding='utf-8') as f: 86 | json.dump(output_data, f, ensure_ascii=False) 87 | 88 | 89 | def log_prompt(prompt_log_path, input): 90 | if not isinstance(input, str): 91 | input = toString(input) 92 | with open(prompt_log_path, "a", encoding="utf-8") as log_file: 93 | log_file.write(f"{input}\n") 94 | log_file.write("#######################################################\n") 95 | 96 | 97 | def cretae_new_path(path, filetype): 98 | files = os.listdir(path) 99 | if len(files) == 0: 100 | return os.path.join(path, f'0.{filetype}') 101 | else: 102 | max = 0 103 | for file in files: 104 | max = max if max>=int(os.path.splitext(file)[0]) else int(os.path.splitext(file)[0]) 105 | return os.path.join(path, str(max+1) + f'.{filetype}') 106 | 107 | def find_latest_file(path): 108 | files = os.listdir(path) 109 | max = 0 110 | f = "" 111 | for file in files: 112 | if max < int(os.path.splitext(file)[0]): 113 | max = int(os.path.splitext(file)[0]) 114 | f = file 115 | 116 | return os.path.join(path, f) 117 | 118 | def toString(input): 119 | return json.dumps(input, ensure_ascii=False, separators=(",", ":")) 120 | 121 | 122 | def prompt_format(prompt, params): 123 | text = prompt 124 | for key, value in params.items(): 125 | if isinstance(value, (dict, list)): 126 | value = toString(value) 127 | if isinstance(value, (int, float)): 128 | value = str(value) 129 | text = text.replace(key, value) 130 | return text 131 | 132 | 133 | def calculate_similarity(str1, str2): 134 | distance = Levenshtein.distance(str1.lower(), str2.lower()) 135 | max_len = max(len(str1), len(str2)) 136 | similarity = 1 - distance / max_len 137 | return similarity 138 | 139 | 140 | def return_most_similar(string, string_list): 141 | max_similarity = 0 142 | tgt = 0 143 | for id,item in enumerate(string_list): 144 | current_similarity = calculate_similarity(string, item) 145 | if current_similarity > max_similarity: 146 | max_similarity = current_similarity 147 | tgt = id 148 | 149 | return string_list[tgt] 150 | 151 | 152 | def check(src, keys): 153 | if isinstance(src, list): 154 | dst = [] 155 | for item in src: 156 | dst_item = {} 157 | for item_key in item.keys(): 158 | key = return_most_similar(item_key, keys) 159 | dst_item[key] = item[item_key] 160 | for _key in keys: 161 | if _key not in item.keys(): 162 | dst_item[_key] = "None" 163 | dst.append(dst_item) 164 | else: 165 | dst = {} 166 | for _key in src.keys(): 167 | key = return_most_similar(_key, keys) 168 | dst[key] = src[_key] 169 | 170 | return dst 171 | 172 | def GPTResponse2JSON(response): 173 | json_string = clean_text(response) 174 | # json_string = firstjson(clean_text(response)) 175 | prompt = f"Modify the following string so that it can be correctly parsed by the json.loads() method:\n{json_string}\n\nYou should just return the modified string." 
176 | print(json_string) 177 | if "```json" in json_string: 178 | json_string = json_string.replace("```","") 179 | json_string = json_string.replace("json","") 180 | json_string = json_string.strip() 181 | try: 182 | result = json.loads(json_string) 183 | except: 184 | _prompt = gpt.prepare_prompt(text_prompt=prompt) 185 | result = json.loads(clean_text(gpt.get_result(_prompt))) 186 | 187 | return result 188 | 189 | 190 | def clean_text(text): 191 | # Only keep json content 192 | pattern = r"```json(.*?)```" 193 | match = re.search(pattern, text, re.DOTALL) 194 | if match: 195 | text = match.group(1) 196 | return text 197 | 198 | 199 | def get_number(text): 200 | if isinstance(text, str): 201 | pattern = r'[^0-9]' 202 | text = re.sub(pattern, '', text) 203 | return int(text) 204 | else: 205 | return text 206 | 207 | 208 | def firstjson(text): 209 | match = re.search(r'\{([^}]*)\}', text) 210 | if match: 211 | return "{" + match.group(1) + "}" 212 | else: 213 | return text 214 | 215 | 216 | def matchkey(json, key): 217 | for k in json.keys(): 218 | if key in k: 219 | return k 220 | return None 221 | -------------------------------------------------------------------------------- /evaluate_vie.py: -------------------------------------------------------------------------------- 1 | from agent.openai import GPT4o,QWen25 2 | from agent.prompt_mm import * 3 | from util import * 4 | from tqdm import tqdm 5 | import os 6 | 7 | 8 | 9 | def test_Text_Guided_IE_evaluate(in_path, out_path): 10 | data = read_json(in_path) 11 | out = {} 12 | for key in tqdm(list(data.keys())): 13 | try: 14 | instruction = data[key]["prompt_input"].replace("Editing instruction:", "").strip() 15 | task = Text_Guided_IE.replace("{instruction}", instruction) 16 | img_links = [url2path(data[key]["vision_input"][0], Image_Root), url2path(data[key]["vision_input"][1], Image_Root)] 17 | json_1 = GPTResponse2JSON(agent_run.get_result(img_links, task)) 18 | 19 | out[key] = {} 20 | out[key]['score'] = json_1['score'] 21 | out[key]['reasoning'] = json_1['reasoning'] 22 | out[key]['prompt_input'] = task 23 | out[key]['vision_input'] = data[key]["vision_input"] 24 | print(f"{key} over") 25 | except Exception as e: 26 | print(f"Error: {key} evaluation failed: {e}") 27 | 28 | write_json(out_path, out) 29 | 30 | 31 | 32 | def test_Subject_Driven_IE_evaluate(in_path, out_path): 33 | data = read_json(in_path) 34 | out = {} 35 | for key in tqdm(list(data.keys())): 36 | try: 37 | subject = data[key]["prompt_input"].replace("Subject:", "").strip() 38 | task = Subject_Driven_IE.replace("{subject}", subject) 39 | im_1 = load_image(url2path(data[key]["vision_input"][0], Image_Root)) 40 | im_2 = load_image(url2path(data[key]["vision_input"][1], Image_Root)) 41 | im = merge_images([im_1, im_2]) 42 | img_links = [f"data:image/jpeg;base64,{encode_pil_image(im)}", url2path(data[key]["vision_input"][2], Image_Root)] 43 | json_1 = GPTResponse2JSON(agent_run.get_result(img_links, task)) 44 | out[key] = {} 45 | out[key]['score'] = json_1['score'] 46 | out[key]['reasoning'] = json_1['reasoning'] 47 | out[key]['prompt_input'] = task 48 | out[key]['vision_input'] = data[key]["vision_input"] 49 | print(f"{key} over") 50 | except Exception as e: 51 | print(f"Error: {key} evaluation failed: {e}") 52 | 53 | write_json(out_path, out) 54 | 55 | 56 | 57 | def test_Mask_Guided_IE_evaluate(in_path, out_path): 58 | data = read_json(in_path) 59 | out = {} 60 | for key in tqdm(list(data.keys())): 61 | try: 62 | instruction = data[key]["prompt_input"].replace("Editing 
instruction:", "").strip() 63 | task = Mask_Guided_IE.replace("{instruction}", instruction) 64 | img_links = [url2path(data[key]["vision_input"][0], Image_Root), url2path(data[key]["vision_input"][1], Image_Root)] 65 | json_1 = GPTResponse2JSON(agent_run.get_result(img_links, task)) 66 | 67 | out[key] = {} 68 | out[key]['score'] = json_1['score'] 69 | out[key]['reasoning'] = json_1['reasoning'] 70 | out[key]['prompt_input'] = task 71 | out[key]['vision_input'] = data[key]["vision_input"] 72 | print(f"{key} over") 73 | except Exception as e: 74 | print(f"Error: {key} evaluation failed: {e}") 75 | 76 | write_json(out_path, out) 77 | 78 | 79 | 80 | def test_Multi_Concept_IC_evaluate(in_path, out_path): 81 | data = read_json(in_path) 82 | out = {} 83 | for key in tqdm(list(data.keys())): 84 | try: 85 | text = data[key]["prompt_input"].replace("Text Prompt:", "").strip() 86 | task = Multi_Concept_IC.replace("{text}", text) 87 | img_links = [url2path(data[key]["vision_input"][0], Image_Root), url2path(data[key]["vision_input"][1], Image_Root)] 88 | json_1 = GPTResponse2JSON(agent_run.get_result(img_links, task)) 89 | out[key] = {} 90 | out[key]['score'] = json_1['score'] 91 | out[key]['reasoning'] = json_1['reasoning'] 92 | out[key]['prompt_input'] = task 93 | out[key]['vision_input'] = data[key]["vision_input"] 94 | print(f"{key} over") 95 | except Exception as e: 96 | print(f"Error: {key} evaluation failed: {e}") 97 | 98 | write_json(out_path, out) 99 | 100 | 101 | 102 | def test_Text_Guided_IG_evaluate(in_path, out_path): 103 | data = read_json(in_path) 104 | out = {} 105 | for key in tqdm(list(data.keys())): 106 | try: 107 | text = data[key]["prompt_input"].replace("Text Prompt:", "").strip() 108 | task = Text_Guided_IG.replace("{text}", text) 109 | img_links = [url2path(data[key]["vision_input"][0], Image_Root)] 110 | json_1 = GPTResponse2JSON(agent_run.get_result(img_links, task)) 111 | 112 | out[key] = {} 113 | out[key]['score'] = json_1['score'] 114 | out[key]['reasoning'] = json_1['reasoning'] 115 | out[key]['prompt_input'] = task 116 | out[key]['vision_input'] = data[key]["vision_input"] 117 | print(f"{key} over") 118 | except Exception as e: 119 | print(f"Error: {key} evaluation failed: {e}") 120 | 121 | write_json(out_path, out) 122 | 123 | 124 | 125 | def test_Control_Guided_IG_evaluate(in_path, out_path): 126 | data = read_json(in_path) 127 | out = {} 128 | for key in tqdm(list(data.keys())): 129 | try: 130 | text = data[key]["prompt_input"].replace("Text Prompt:", "").strip() 131 | task = Control_Guided_IG.replace("{text}", text) 132 | img_links = [url2path(data[key]["vision_input"][0], Image_Root), url2path(data[key]["vision_input"][1], Image_Root)] 133 | 134 | json_2 = GPTResponse2JSON(agent_run.get_result(img_links, task)) 135 | out[key] = {} 136 | out[key]['score'] = json_2['score'] 137 | out[key]['reasoning'] = json_2['reasoning'] 138 | out[key]['prompt_input'] = task 139 | out[key]['vision_input'] = data[key]["vision_input"] 140 | print(f"{key} over") 141 | except Exception as e: 142 | print(f"Error: {key} evaluation failed: {e}") 143 | 144 | write_json(out_path, out) 145 | 146 | 147 | 148 | def test_Subject_Driven_IG_evaluate(in_path, out_path): 149 | data = read_json(in_path) 150 | out = {} 151 | for key in tqdm(list(data.keys())): 152 | try: 153 | text = data[key]["prompt_input"].replace("Text Prompt:", "").strip() 154 | task = Subject_Driven_IG.replace("{text}", text) 155 | img_links = [url2path(data[key]["vision_input"][0], Image_Root), 
url2path(data[key]["vision_input"][1], Image_Root)] 156 | json_1 = GPTResponse2JSON(agent_run.get_result(img_links, task)) 157 | out[key] = {} 158 | out[key]['score'] = json_1['score'] 159 | out[key]['reasoning'] = json_1['reasoning'] 160 | out[key]['prompt_input'] = task 161 | out[key]['vision_input'] = data[key]["vision_input"] 162 | print(f"{key} over") 163 | except Exception as e: 164 | print(f"Error: {key} evaluation failed: {e}") 165 | 166 | write_json(out_path, out) 167 | 168 | 169 | 170 | def evaluate(in_path, eva_out_path): 171 | if "ImagenHub_Control-Guided_IG" in in_path: 172 | test_Control_Guided_IG_evaluate(in_path, eva_out_path) 173 | if "ImagenHub_Mask-Guided_IE" in in_path: 174 | test_Mask_Guided_IE_evaluate(in_path, eva_out_path) 175 | if "ImagenHub_Multi-Concept_IC" in in_path: 176 | test_Multi_Concept_IC_evaluate(in_path, eva_out_path) 177 | if "ImagenHub_Subject-Driven_IE" in in_path: 178 | test_Subject_Driven_IE_evaluate(in_path, eva_out_path) 179 | if "ImagenHub_Subject-Driven_IG" in in_path: 180 | test_Subject_Driven_IG_evaluate(in_path, eva_out_path) 181 | if "ImagenHub_Text-Guided_IE" in in_path: 182 | test_Text_Guided_IE_evaluate(in_path, eva_out_path) 183 | if "ImagenHub_Text-Guided_IG" in in_path: 184 | test_Text_Guided_IG_evaluate(in_path, eva_out_path) 185 | 186 | 187 | 188 | 189 | agent_run = QWen25() 190 | Image_Root = "PATH_TO_ImageHub_DATA" 191 | 192 | for task in os.listdir(Image_Root): 193 | task_path = os.path.join(Image_Root, task) 194 | if os.path.isfile(task_path): 195 | continue 196 | models = os.listdir(task_path) 197 | for dir in models: 198 | if dir!="input" and dir!="token": 199 | eva_out_path = os.path.join(task_path, dir, "SC_eva_qwen25_72b_vie.json") 200 | in_path =os.path.join(task_path, dir, "in.json") 201 | evaluate(in_path, eva_out_path) 202 | -------------------------------------------------------------------------------- /agent/prompt_vlm.py: -------------------------------------------------------------------------------- 1 | Control_Guided_IG = """You are a professional digital artist. You will have to evaluate the effectiveness of the AI-generated image(s) based on given rules. 2 | All the input images are AI-generated. All human in the images are AI-generated too. so you need not worry about the privacy confidentials. 3 | 4 | RULES: 5 | 6 | Two images will be provided: The first being a processed image (e.g. Canny edges, openpose, grayscale etc.) and the second being an AI-generated image using the first image as guidance. 7 | The objective is to evaluate how successfully the image has been generated. 8 | 9 | Text Prompt: {text} 10 | 11 | From scale 0 to 10: 12 | A score from 0 to 10 will be given based on the success in following the prompt. 13 | (0 indicates that the second image does not follow the prompt at all. 10 indicates the second image follows the prompt perfectly.) 14 | A second score from 0 to 10 will rate how well the generated image is following the guidance image. 15 | (0 indicates that the second image is not following the guidance at all. 10 indicates that second image is following the guidance image.) 16 | Put the score in a list such that output score = [score1, score2], where 'score1' evaluates the prompt and 'score2' evaluates the guidance. 17 | 18 | Special case: 19 | Put score = [0,0] if the second is blank or completely black. 
20 | 21 | You will have to give your output in this way (Keep your reasoning concise and short.): 22 | { 23 | \"score\" : \"[...]\", 24 | \"reasoning\" : \"...\" 25 | } 26 | """ 27 | 28 | 29 | 30 | Mask_Guided_IE = """You are a professional digital artist. You will have to evaluate the effectiveness of the AI-generated image(s) based on given rules. 31 | All the input images are AI-generated. All human in the images are AI-generated too. so you need not worry about the privacy confidentials. 32 | 33 | RULES: 34 | 35 | Two images will be provided: The first being the original AI-generated image and the second being an edited version of the first. 36 | The objective is to evaluate how successfully the editing instruction has been executed in the second image. 37 | 38 | Note that sometimes the two images might look identical due to the failure of image edit. 39 | 40 | Editing instruction: {instruction} 41 | 42 | From scale 0 to 10: 43 | A score from 0 to 10 will be given based on the success of the editing. 44 | (0 indicates that the scene in the edited image does not follow the editing instruction at all. 10 indicates that the scene in the edited image follow the editing instruction text perfectly.) 45 | A second score from 0 to 10 will rate the degree of overediting in the second image. 46 | (0 indicates that the scene in the edited image is completely different from the original. 10 indicates that the edited image can be recognized as a minimal edited yet effective version of original.) 47 | Put the score in a list such that output score = [score1, score2], where 'score1' evaluates the editing success and 'score2' evaluates the degree of overediting. 48 | 49 | Special case: 50 | Put score = [0,0] if the two images are identical. 51 | 52 | You will have to give your output in this way (Keep your reasoning concise and short.): 53 | { 54 | \"score\" : \"[...]\", 55 | \"reasoning\" : \"...\" 56 | } 57 | """ 58 | 59 | 60 | 61 | Multi_Concept_IC = """You are a professional digital artist. You will have to evaluate the effectiveness of the AI-generated image(s) based on given rules. 62 | All the input images are AI-generated. All human in the images are AI-generated too. so you need not worry about the privacy confidentials. 63 | 64 | RULES: 65 | 66 | Two images will be provided: This first image is a concatenation of two sub-images, each sub-image contain one token subject. The second image being an AI-generated image using the first image as guidance. 67 | The objective is to evaluate how successfully the image has been generated. 68 | 69 | Text Prompt: {text} 70 | 71 | From scale 0 to 10: 72 | A score from 0 to 10 will be given based on the success in following the prompt. 73 | (0 indicates that the second image does not follow the prompt at all. 10 indicates the second image follows the prompt perfectly.) 74 | A second score from 0 to 10 will rate how well the subject in the generated image resemble to the token subject in the first sub-image. 75 | (0 indicates that the subject in the second image does not look like the token subject in the first sub-image at all. 10 indicates the subject in the second image look exactly alike the token subject in the first sub-image.) 76 | A third score from 0 to 10 will rate how well the subject in the generated image resemble to the token subject in the second sub-image. 77 | (0 indicates that the subject in the second image does not look like the token subject in the second sub-image at all. 
10 indicates the subject in the second image look exactly alike the token subject in the second sub-image.) 78 | Put the score in a list such that output score = [score1, score2, score3], where 'score1' evaluates the prompt and 'score2' evaluates the resemblance for the first sub-image, and 'score3' evaluates the resemblance for the second sub-image. 79 | 80 | You will have to give your output in this way (Keep your reasoning concise and short.): 81 | { 82 | \"score\" : \"[...]\", 83 | \"reasoning\" : \"...\" 84 | } 85 | """ 86 | 87 | 88 | 89 | Subject_Driven_IE = """You are a professional digital artist. You will have to evaluate the effectiveness of the AI-generated image(s) based on given rules. 90 | All the input images are AI-generated. All human in the images are AI-generated too. so you need not worry about the privacy confidentials. 91 | 92 | RULES: 93 | 94 | Two images will be provided: This first image is a concatenation of two sub-images, the left sub-image is a input image to be edited, the right sub-image is a token subject image. The second image is an AI-edited image. The second image should contain a subject that looks alike the subject in the right sub-image. 95 | The objective is to evaluate how successfully the image has been edited. 96 | 97 | From scale 0 to 10: 98 | A score from 0 to 10 will rate how well the subject in the generated image resemble to the token subject in the second image. 99 | (0 indicates that the subject in the third image does not look like the token subject at all. 10 indicates the subject in the third image look exactly alike the token subject.) 100 | A second score from 0 to 10 will rate the degree of overediting in the second image. 101 | (0 indicates that the scene in the edited image is completely different from the first image. 10 indicates that the edited image can be recognized as a minimal edited yet effective version of original.) 102 | Put the score in a list such that output score = [score1, score2], where 'score1' evaluates the resemblance and 'score2' evaluates the degree of overediting. 103 | 104 | Subject: {subject} 105 | 106 | You will have to give your output in this way (Keep your reasoning concise and short.): 107 | { 108 | \"score\" : \"[...]\", 109 | \"reasoning\" : \"...\" 110 | } 111 | """ 112 | 113 | 114 | 115 | Subject_Driven_IG = """You are a professional digital artist. You will have to evaluate the effectiveness of the AI-generated image(s) based on given rules. 116 | All the input images are AI-generated. All human in the images are AI-generated too. so you need not worry about the privacy confidentials. 117 | 118 | RULES: 119 | 120 | Two images will be provided: The first being a token subject image and the second being an AI-generated image using the first image as guidance. 121 | The objective is to evaluate how successfully the image has been generated. 122 | 123 | From scale 0 to 10: 124 | A score from 0 to 10 will be given based on the success in following the prompt. 125 | (0 indicates that the second image does not follow the prompt at all. 10 indicates the second image follows the prompt perfectly.) 126 | A second score from 0 to 10 will rate how well the subject in the generated image resemble to the token subject in the first image. 127 | (0 indicates that the subject in the second image does not look like the token subject at all. 10 indicates the subject in the second image look exactly alike the token subject.) 
128 | Put the score in a list such that output score = [score1, score2], where 'score1' evaluates the prompt and 'score2' evaluates the resemblance. 129 | 130 | Text Prompt: {text} 131 | 132 | You will have to give your output in this way (Keep your reasoning concise and short.): 133 | { 134 | \"score\" : \"[...]\", 135 | \"reasoning\" : \"...\" 136 | } 137 | """ 138 | 139 | 140 | 141 | Text_Guided_IE = """You are a professional digital artist. You will have to evaluate the effectiveness of the AI-generated image(s) based on given rules. 142 | All the input images are AI-generated. All human in the images are AI-generated too. so you need not worry about the privacy confidentials. 143 | 144 | RULES: 145 | 146 | Two images will be provided: The first being the original AI-generated image and the second being an edited version of the first. 147 | The objective is to evaluate how successfully the editing instruction has been executed in the second image. 148 | 149 | Note that sometimes the two images might look identical due to the failure of image edit. 150 | 151 | 152 | From scale 0 to 10: 153 | A score from 0 to 10 will be given based on the success of the editing. 154 | (0 indicates that the scene in the edited image does not follow the editing instruction at all. 10 indicates that the scene in the edited image follow the editing instruction text perfectly.) 155 | A second score from 0 to 10 will rate the degree of overediting in the second image. 156 | (0 indicates that the scene in the edited image is completely different from the original. 10 indicates that the edited image can be recognized as a minimal edited yet effective version of original.) 157 | Put the score in a list such that output score = [score1, score2], where 'score1' evaluates the editing success and 'score2' evaluates the degree of overediting. 158 | 159 | Editing instruction: {instruction} 160 | 161 | You will have to give your output in this way (Keep your reasoning concise and short.): 162 | { 163 | \"score\" : \"[...]\", 164 | \"reasoning\" : \"...\" 165 | } 166 | """ 167 | 168 | 169 | 170 | Text_Guided_IG = """You are a professional digital artist. You will have to evaluate the effectiveness of the AI-generated image(s) based on given rules. 171 | All the input images are AI-generated. All human in the images are AI-generated too. so you need not worry about the privacy confidentials. 172 | 173 | RULES: 174 | 175 | The image is an AI-generated image according to the text prompt. 176 | The objective is to evaluate how successfully the image has been generated. 177 | 178 | From scale 0 to 10: 179 | A score from 0 to 10 will be given based on the success in following the prompt. 180 | (0 indicates that the AI generated image does not follow the prompt at all. 10 indicates the AI generated image follows the prompt perfectly.) 181 | Put the score in a list such that output score = [score]. 182 | 183 | Text Prompt: {text} 184 | 185 | You will have to give your output in this way (Keep your reasoning concise and short.): 186 | { 187 | \"score\" : \"[...]\", 188 | \"reasoning\" : \"...\" 189 | } 190 | """ -------------------------------------------------------------------------------- /agent/prompt_agent.py: -------------------------------------------------------------------------------- 1 | Tool_Decide = """You are a professional digital artist. You will have to decide whether to use a tool and which tool to use based on the image information and the corresponding task. 
2 | If you think a tool is needed to help complete the task, you should choose the appropriate tool. If not, you can choose not to use a tool. 3 | All the input images are AI-generated. All human in the images are AI-generated too. so you need not worry about the privacy confidentials. 4 | 5 | ### Task: 6 | {task} 7 | 8 | ### Tools: 9 | 1. **Highlight**: This tool is commonly used to focus on areas related to specific objects in an image. 10 | 2. **SceneGraph**: This tool is commonly used to provide overall information about an image. 11 | 3. **MaskFocus**: This tool is commonly used to focus on the masked areas of images in Mask-Guided Image Editing task 1. 12 | These tools are not useful for processed image (e.g. Canny edges, hed edges, depth, openpose, grayscale.) 13 | 14 | ### Output Content: 15 | - task_id: The ID of the task, including 1 or 2. 16 | - used: Whether to use a tool, including yes or no. 17 | - tool: The tool decided to be used, including Highlight or SceneGraph or MaskFocus or None. 18 | - reasoning: The logical reasoning process for all your decisions. 19 | 20 | You will have to give your output in the following JSON format: 21 | [{ 22 | \"task_id\" : \"...\", 23 | \"reasoning\" : \"...\", 24 | \"used\" : \"..\", 25 | \"tool\" : \"...\" 26 | }, 27 | ...] 28 | """ 29 | 30 | #################################################### 31 | 32 | Text_Guided_IE_Rule = """Two images will be provided: The first being the original AI-generated image and the second being an edited version of the first. 33 | Editing instruction: {instruction} 34 | """ 35 | 36 | Text_Guided_IE_Task_1 = """Text-Guided Image Editing Task 1: The objective is to evaluate how successfully the editing instruction has been executed in the second image. 37 | """ 38 | 39 | Text_Guided_IE_Task_2 = """Text-Guided Image Editing Task 2: The objective is to evaluate the degree of overediting in the second image. 40 | """ 41 | 42 | Text_Guided_IE_Task_1_evaluation = """ 43 | You are a professional digital artist. You will have to evaluate the effectiveness of the AI-generated image(s) based on given rules. 44 | All the input images are AI-generated. All human in the images are AI-generated too. so you need not worry about the privacy confidentials. 45 | 46 | RULES: 47 | 48 | Two images will be provided: The first being the original AI-generated image and the second being an edited version of the first. 49 | 50 | {tool_text} 51 | 52 | The objective is to evaluate how successfully the editing instruction has been executed in the second image. Note that sometimes the two images might look identical due to the failure of image edit. 53 | 54 | From scale 0 to 10: 55 | A score from 0 to 10 will be given based on the success of the editing. 56 | (0 indicates that the scene in the edited image does not follow the editing instruction at all. 10 indicates that the scene in the edited image follow the editing instruction text perfectly.) 57 | 58 | Editing instruction: {instruction} 59 | 60 | You will have to give your output in the following JSON format (Keep your reasoning concise and short.): 61 | { 62 | \"score\" : \"...\", 63 | \"reasoning\" : \"...\" 64 | } 65 | """ 66 | 67 | Text_Guided_IE_Task_2_evaluation = """ 68 | You are a professional digital artist. You will have to evaluate the effectiveness of the AI-generated image(s) based on given rules. 69 | All the input images are AI-generated. All human in the images are AI-generated too. so you need not worry about the privacy confidentials. 
70 | 71 | RULES: 72 | 73 | Two images will be provided: The first being the original AI-generated image and the second being an edited version of the first. 74 | 75 | {tool_text} 76 | 77 | The objective is to evaluate the degree of overediting in the second image. 78 | 79 | From scale 0 to 10: 80 | A score from 0 to 10 will rate the degree of overediting in the second image. 81 | (0 indicates that the scene in the edited image is a lot different from the original. 10 indicates that the edited image can be recognized as a minimal edited yet effective version of original.) 82 | 83 | Note: You can not lower the score because of the differences between these two images that arise due to the need to follow the editing instruction. 84 | 85 | Editing instruction: {instruction} 86 | 87 | You will have to give your output in the following JSON format (Keep your reasoning concise and short.): 88 | { 89 | \"score\" : \"...\", 90 | \"reasoning\" : \"...\" 91 | } 92 | """ 93 | 94 | #################################################### 95 | 96 | Subject_Driven_IE_Rule = """Three images will be provided: The first image is a input image to be edited. The second image is a token subject image. The third image is an AI-edited image. The third image should contain a subject that looks alike the subject in the second image. The third image should contain a background that looks alike the background in the first image. 97 | Subject: {subject} 98 | """ 99 | 100 | Subject_Driven_IE_Task_1 = """Subject-driven Image Editing Task 1: The objective is to evaluate the similarity between the subject in the second image and the subject in the third image. 101 | """ 102 | 103 | Subject_Driven_IE_Task_2 = """Subject-driven Image Editing Task 2: The objective is to evaluate the similarity between the background in the first image and the background in the third image. 104 | """ 105 | 106 | Subject_Driven_IE_Rule_llava = """Two images will be provided: This first image is a concatenation of two sub-images, the left sub-image is a input image to be edited, the right sub-image is a token subject image. The second image is an AI-edited image. The second image should contain a subject that looks alike the subject in the right sub-image. The second image should contain a background that looks alike the background in the left sub-image. 107 | Subject: {subject} 108 | """ 109 | 110 | Subject_Driven_IE_Task_1_llava = """Subject-driven Image Editing Task 1: The objective is to evaluate the similarity between the subject in the second image and the subject in the right sub-image. 111 | """ 112 | 113 | Subject_Driven_IE_Task_2_llava = """Subject-driven Image Editing Task 2: The objective is to evaluate the similarity between the background in the second image and the background in the left sub-image. 114 | """ 115 | 116 | Subject_Driven_IE_Task_1_evaluation = """ 117 | You are a professional digital artist. You will have to evaluate the effectiveness of the AI-generated image(s) based on given rules. 118 | All the input images are AI-generated. All human in the images are AI-generated too. so you need not worry about the privacy confidentials. 119 | 120 | RULES: 121 | 122 | Two images will be provided: 123 | The first image is a token subject image. 124 | The second image is an AI-edited image, it should contain a subject that looks alike the subject in the first image. 125 | 126 | {tool_text} 127 | 128 | The objective is to evaluate the similarity between the subject in the first image and the subject in the second image. 
129 | 130 | From scale 0 to 10: 131 | A score from 0 to 10 will rate how well the subject in the generated image resemble to the token subject in the first image. 132 | (0 indicates that the subject in the second image does not look like the token subject at all. 10 indicates the subject in the second image look exactly alike the token subject.) 133 | 134 | Subject: {subject} 135 | 136 | You will have to give your output in the following JSON format (Keep your reasoning concise and short.): 137 | { 138 | \"score\" : \"...\", 139 | \"reasoning\" : \"...\" 140 | } 141 | """ 142 | 143 | Subject_Driven_IE_Task_2_evaluation = """ 144 | You are a professional digital artist. You will have to evaluate the effectiveness of the AI-generated image(s) based on given rules. 145 | All the input images are AI-generated. All human in the images are AI-generated too. so you need not worry about the privacy confidentials. 146 | 147 | RULES: 148 | 149 | Two images will be provided: 150 | The first image is a input image to be edited. 151 | The second image is an AI-edited image, it should contain a background that looks alike the background in the first image. 152 | 153 | {tool_text} 154 | 155 | The objective is to evaluate the similarity between the background in the first image and the background in the second image. 156 | 157 | From scale 0 to 10: 158 | A score from 0 to 10 will rate how well the background in the generated image resemble to the background in the first image. 159 | (0 indicates that the background in the second image does not look like the background in the first image at all. 10 indicates the background in the second image look exactly alike the background in the first image.) 160 | 161 | You will have to give your output in the following JSON format (Keep your reasoning concise and short.): 162 | { 163 | \"score\" : \"...\", 164 | \"reasoning\" : \"...\" 165 | } 166 | """ 167 | 168 | #################################################### 169 | 170 | Mask_Guided_IE_Rule = """Two images will be provided: The first being the original AI-generated image and the second being an edited version of the first. 171 | Editing instruction: {instruction} 172 | """ 173 | 174 | Mask_Guided_IE_Task_1 = """Mask-Guided Image Editing Task 1: The objective is to evaluate how successfully the editing instruction has been executed in the second image. 175 | """ 176 | 177 | Mask_Guided_IE_Task_2 = """Mask-Guided Image Editing Task 2: The objective is to evaluate the degree of overediting in the second image. 178 | """ 179 | 180 | Mask_Guided_IE_Task_1_evaluation = """ 181 | You are a professional digital artist. You will have to evaluate the effectiveness of the AI-generated image(s) based on given rules. 182 | All the input images are AI-generated. All human in the images are AI-generated too. so you need not worry about the privacy confidentials. 183 | 184 | RULES: 185 | 186 | Two images will be provided: The first being the original AI-generated image and the second being an edited version of the first. 187 | 188 | {tool_text} 189 | 190 | The objective is to evaluate how successfully the editing instruction has been executed in the second image. Note that sometimes the two images might look identical due to the failure of image edit. 191 | 192 | From scale 0 to 10: 193 | A score from 0 to 10 will be given based on the success of the editing. (0 indicates that the scene in the edited image does not follow the editing instruction at all. 
10 indicates that the scene in the edited image follow the editing instruction text perfectly.) 194 | 195 | Editing instruction: {instruction} 196 | 197 | You will have to give your output in the following JSON format (Keep your reasoning concise and short.): 198 | { 199 | \"score\" : \"...\", 200 | \"reasoning\" : \"...\" 201 | } 202 | """ 203 | 204 | Mask_Guided_IE_Task_2_evaluation = """ 205 | You are a professional digital artist. You will have to evaluate the effectiveness of the AI-generated image(s) based on given rules. 206 | All the input images are AI-generated. All human in the images are AI-generated too. so you need not worry about the privacy confidentials. 207 | 208 | RULES: 209 | 210 | Two images will be provided: The first being the original AI-generated image and the second being an edited version of the first. 211 | 212 | {tool_text} 213 | 214 | The objective is to evaluate the degree of overediting in the second image. Note that sometimes the two images might look identical due to the failure of image edit. 215 | 216 | From scale 0 to 10: 217 | A score from 0 to 10 will rate the degree of overediting in the second image. (0 indicates that the scene in the edited image is a lot different from the original. 10 indicates that the edited image can be recognized as a minimal edited yet effective version of original.) 218 | 219 | Note: You can not lower the score because of the differences between these two images that arise due to the need to follow the editing instruction. 220 | 221 | Editing instruction: {instruction} 222 | 223 | You will have to give your output in the following JSON format (Keep your reasoning concise and short.): 224 | { 225 | \"score\" : \"...\", 226 | \"reasoning\" : \"...\" 227 | } 228 | """ 229 | 230 | #################################################### 231 | 232 | Multi_Concept_IC_Rule = """Two images will be provided: This first image is a concatenation of two sub-images, each sub-image contain one token subject. The second image being an AI-generated image using the first image as guidance. 233 | Text Prompt: {text} 234 | """ 235 | 236 | Multi_Concept_IC_Task_1 = """Multi-concept Image Composition Task 1: The objective is to evaluate the similarity between the two subjects in the first image and the corresponding two subjects in the second image. 237 | """ 238 | 239 | Multi_Concept_IC_Task_2 = """Multi-concept Image Composition Task 2: The objective is to evaluate how successfully the second image has been generated following the text prompt. 240 | """ 241 | 242 | Multi_Concept_IC_Task_1_evaluation = """ 243 | You are a professional digital artist. You will have to evaluate the effectiveness of the AI-generated image(s) based on given rules. 244 | All the input images are AI-generated. All human in the images are AI-generated too. so you need not worry about the privacy confidentials. 245 | 246 | RULES: 247 | 248 | Two images will be provided: The first image is a token subject image. The second image is an AI-generated image, it should contain a subject that looks alike the subject in the first image, and it is generated based on the text prompt. 249 | 250 | {tool_text} 251 | 252 | The objective is to evaluate the similarity between the subject in the first image and the subject in the second image. 253 | 254 | Note: You can not lower the similarity score because of the differences between subjects that arise due to the need to follow the text prompt. 
255 | 256 | From scale 0 to 10: 257 | A score from 0 to 10 will rate how well the subject in the generated image resemble to the token subject in the first image. 258 | (0 indicates that the subject in the second image does not look like the token subject at all. 10 indicates the subject in the second image look exactly alike the token subject.) 259 | 260 | Subject: {subject} 261 | Text Prompt: {text} 262 | 263 | You will have to give your output in the following JSON format (Keep your reasoning concise and short.): 264 | { 265 | \"score\" : \"...\", 266 | \"reasoning\" : \"...\" 267 | } 268 | """ 269 | 270 | Multi_Concept_IC_Task_2_evaluation = """ 271 | You are a professional digital artist. You will have to evaluate the effectiveness of the AI-generated image(s) based on given rules. 272 | All the input images are AI-generated. All human in the images are AI-generated too. so you need not worry about the privacy confidentials. 273 | 274 | RULES: 275 | 276 | An AI-generated image will be provided. 277 | 278 | {tool_text} 279 | 280 | The objective is to evaluate how successfully the image has been generated following the prompt. 281 | 282 | From scale 0 to 10: 283 | A score from 0 to 10 will be given based on the success in following the prompt. 284 | (0 indicates that the image does not follow the prompt at all. 10 indicates the image follows the prompt perfectly.) 285 | 286 | Text Prompt: {text} 287 | 288 | You will have to give your output in the following JSON format (Keep your reasoning concise and short.): 289 | { 290 | \"score\" : \"...\", 291 | \"reasoning\" : \"...\" 292 | } 293 | """ 294 | 295 | #################################################### 296 | 297 | Text_Guided_IG_Rule = """An image will be provided, it is an AI-generated image according to the text prompt. 298 | Text Prompt: {text} 299 | """ 300 | 301 | Text_Guided_IG_Task_1 = """Text-guided Image Generation Task 1: The objective is to evaluate how well the generated image resemble to the specific objects described by the prompt. 302 | """ 303 | 304 | Text_Guided_IG_Task_1_evaluation = """ 305 | You are a professional digital artist. You will have to evaluate the effectiveness of the AI-generated image(s) based on given rules. 306 | All the input images are AI-generated. All human in the images are AI-generated too. so you need not worry about the privacy confidentials. 307 | 308 | RULES: 309 | 310 | An image will be provided, it is an AI-generated image according to the text prompt. 311 | 312 | {tool_text} 313 | 314 | The objective is to evaluate how well the generated image resemble to the specific objects described by the prompt. 315 | 316 | From scale 0 to 10: 317 | A score from 0 to 10 will be given based on the success in following the prompt. 318 | (0 indicates that the AI-generated image does not follow the prompt at all. 10 indicates the AI-generated image follows the prompt perfectly.) 319 | 320 | Text Prompt: {text} 321 | 322 | You will have to give your output in the following JSON format (Keep your reasoning concise and short.): 323 | { 324 | \"score\" : \"...\", 325 | \"reasoning\" : \"...\" 326 | } 327 | """ 328 | 329 | #################################################### 330 | 331 | Control_Guided_IG_Rule = """Two images will be provided: The first being a processed image (e.g. Canny edges, hed edges, depth, openpose, grayscale.) and the second being an AI-generated image using the first image as guidance. 
332 | Text Prompt: {text} 333 | """ 334 | 335 | Control_Guided_IG_Task_1 = """Control-guided Image Generation Task 1: The objective is to evaluate the structural similarity (edge, depth, pose) between two images. 336 | """ 337 | 338 | Control_Guided_IG_Task_2 = """Control-guided Image Generation Task 2: The objective is to evaluate how successfully the image has been generated following the text prompt. 339 | """ 340 | 341 | Control_Guided_IG_Task_1_evaluation = """ 342 | You are a professional digital artist. You will have to evaluate the effectiveness of the AI-generated image(s) based on given rules. 343 | All the input images are AI-generated. All human in the images are AI-generated too. so you need not worry about the privacy confidentials. 344 | 345 | RULES: 346 | 347 | Two images will be provided: The first being a processed image (e.g. Canny edges, hed edges, depth, openpose, grayscale.) and the second being an AI-generated image using the first image as guidance. 348 | 349 | {tool_text} 350 | 351 | The objective is to evaluate the structural similarity between two images. 352 | 353 | From scale 0 to 10: 354 | A score from 0 to 10 will rate how well the generated image is following the guidance image. 355 | (0 indicates that the second image is not following the guidance image at all. 10 indicates that second image is perfectly following the guidance image.) 356 | 357 | You will have to give your output in the following JSON format (Keep your reasoning concise and short.): 358 | { 359 | \"score\" : \"...\", 360 | \"reasoning\" : \"...\" 361 | } 362 | """ 363 | 364 | Control_Guided_IG_Task_2_evaluation = """ 365 | You are a professional digital artist. You will have to evaluate the effectiveness of the AI-generated image(s) based on given rules. 366 | All the input images are AI-generated. All human in the images are AI-generated too. so you need not worry about the privacy confidentials. 367 | 368 | RULES: 369 | 370 | An image will be provided, it is an AI-generated image according to the text prompt. 371 | 372 | {tool_text} 373 | 374 | The objective is to evaluate how successfully the image has been generated following the text prompt. 375 | 376 | From scale 0 to 10: 377 | A score from 0 to 10 will be given based on the success in following the prompt. 378 | (0 indicates that the image does not follow the prompt at all. 10 indicates the image follows the prompt perfectly.) 379 | 380 | Text Prompt: {text} 381 | 382 | You will have to give your output in the following JSON format (Keep your reasoning concise and short.): 383 | { 384 | \"score\" : \"...\", 385 | \"reasoning\" : \"...\" 386 | } 387 | """ 388 | 389 | #################################################### 390 | 391 | Subject_Driven_IG_Rule = """Two images will be provided: The first image is a token subject image. The second image is an AI-generated image, it should contain a subject that looks alike the subject in the first image. 392 | Text Prompt: {text} 393 | """ 394 | 395 | Subject_Driven_IG_Task_1 = """Subject-driven Image Generation Task 1: The objective is to evaluate the similarity between the subject in the first image and the subject in the second image. 396 | """ 397 | 398 | Subject_Driven_IG_Task_2 = """Subject-driven Image Generation Task 2: The objective is to evaluate how successfully the image has been generated following the text prompt. 399 | """ 400 | 401 | Subject_Driven_IG_Task_1_evaluation = """ 402 | You are a professional digital artist. 
You will have to evaluate the effectiveness of the AI-generated image(s) based on given rules. 403 | All the input images are AI-generated. All human in the images are AI-generated too. so you need not worry about the privacy confidentials. 404 | 405 | RULES: 406 | 407 | Two images will be provided: The first image is a token subject image. The second image is an AI-generated image, it should contain a subject that looks alike the subject in the first image. 408 | 409 | {tool_text} 410 | 411 | The objective is to evaluate the similarity between the subject in the first image and the subject in the second image. 412 | 413 | From scale 0 to 10: 414 | A score from 0 to 10 will rate how well the subject in the generated image resemble to the token subject in the first image. 415 | (0 indicates that the subject in the second image does not look like the token subject at all. 10 indicates the subject in the second image look exactly alike the token subject.) 416 | 417 | Subject: {subject} 418 | 419 | You will have to give your output in the following JSON format (Keep your reasoning concise and short.): 420 | { 421 | \"score\" : \"...\", 422 | \"reasoning\" : \"...\" 423 | } 424 | """ 425 | 426 | Subject_Driven_IG_Task_2_evaluation = """ 427 | You are a professional digital artist. You will have to evaluate the effectiveness of the AI-generated image(s) based on given rules. 428 | All the input images are AI-generated. All human in the images are AI-generated too. so you need not worry about the privacy confidentials. 429 | 430 | RULES: 431 | 432 | An image will be provided, it is an AI-generated image according to the text prompt. 433 | 434 | {tool_text} 435 | 436 | The objective is to evaluate how successfully the image has been generated following the text prompt. 437 | 438 | From scale 0 to 10: 439 | A score from 0 to 10 will be given based on the success in following the prompt. 440 | (0 indicates that the image does not follow the prompt at all. 10 indicates the image follows the prompt perfectly.) 441 | 442 | Text Prompt: {text} 443 | 444 | You will have to give your output in the following JSON format (Keep your reasoning concise and short.): 445 | { 446 | \"score\" : \"...\", 447 | \"reasoning\" : \"...\" 448 | } 449 | """ -------------------------------------------------------------------------------- /evaluate.py: -------------------------------------------------------------------------------- 1 | from agent.openai import GPT4o,QWen25 2 | from agent.prompt_agent import * 3 | from util import * 4 | from tools.scene_graph import sg_generate 5 | from tools.grding import grding 6 | from tools.diff import imgs_diff 7 | from tools.com import split2part 8 | from tqdm import tqdm 9 | import os 10 | 11 | 12 | def get_tool_text(tool, img_links, log_path): 13 | if tool=="None": 14 | return "" 15 | if tool=="Highlight" or tool=="MaskFocus": 16 | return "Focus on the highlighted parts of the image." 
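# Illustrative sketch, not the repository's actual implementation:
# `prompt_format` is imported from util.py via `from util import *` and its source is not
# shown in this dump. Judging from the SceneGraph branch just below, it presumably performs
# a plain substring substitution of placeholder keys; a minimal, hypothetical equivalent is
# sketched here for reference only.
def prompt_format_sketch(template, params):
    """Replace every placeholder key (e.g. "{scene_graph_1}") in `template` with its value."""
    for placeholder, value in params.items():
        template = template.replace(placeholder, value)
    return template
# Example: prompt_format_sketch("SG:\n{scene_graph}", {"{scene_graph}": '{"objects": []}'})
# returns 'SG:\n{"objects": []}'.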
17 | if tool=="SceneGraph": 18 | text = "" 19 | if len(img_links) == 2: 20 | text = """Two scene graphs in JSON format generated from two images will be provided:\nThe first scene graph:\n{scene_graph_1}\nThe second scene graph:\n{scene_graph_2}""" 21 | sg_1 = sg_generate(img_links[0]) 22 | sg_2 = sg_generate(img_links[1]) 23 | params = {"{scene_graph_1}": sg_1, "{scene_graph_2}": sg_2} 24 | text = prompt_format(text, params) 25 | if len(img_links) == 1: 26 | text = """The scene graph in JSON format generated from this image is as follows:\n{scene_graph}""" 27 | sg_1 = sg_generate(img_links[0]) 28 | params = {"{scene_graph}": sg_1} 29 | text = prompt_format(text, params) 30 | return text 31 | 32 | 33 | 34 | def test_Text_Guided_IE_tool(in_path, out_path, log_path): 35 | data = read_json(in_path) 36 | out = {} 37 | for key in tqdm(list(data.keys())): 38 | counter = 0 39 | while counter < 3: 40 | try: 41 | instruction = data[key]["prompt_input"].replace("Editing instruction:", "").strip() 42 | # tool 43 | task_rule = Text_Guided_IE_Rule.replace("{instruction}", instruction) 44 | task = task_rule + Text_Guided_IE_Task_1 + Text_Guided_IE_Task_2 45 | prompt_tool = Tool_Decide.replace("{task}", task) 46 | img_links = [data[key]["vision_input"][0], data[key]["vision_input"][1]] 47 | prompt = agent_run.prepare_prompt(img_links, prompt_tool) 48 | result = GPTResponse2JSON(agent_run.get_result(prompt)) 49 | result = check(result, ["task_id", "reasoning", "used", "tool"]) 50 | log_prompt(log_path, prompt_tool) 51 | log_prompt(log_path, result) 52 | # tool 53 | 54 | out[key] = {} 55 | out[key]['tool_plan'] = result 56 | print(f"{key} over") 57 | break 58 | except Exception as e: 59 | print(f"Error: {key} evaluation failed: {e}") 60 | counter += 1 61 | 62 | write_json(out_path, out) 63 | 64 | 65 | 66 | def test_Text_Guided_IE_evaluate(in_path, tool_path, out_path, log_path): 67 | data = read_json(in_path) 68 | data_tool = read_json(tool_path) 69 | out = {} 70 | for key in tqdm(list(data.keys())): 71 | counter = 0 72 | while counter < 3: 73 | try: 74 | instruction = data[key]["prompt_input"].replace("Editing instruction:", "").strip() 75 | # Task 1 76 | ## prompt 77 | if data_tool[key]["tool_plan"][0]["tool"] in ["Highlight", "SceneGraph"]: 78 | tool = data_tool[key]["tool_plan"][0]["tool"] 79 | else: 80 | tool = "None" 81 | task_1_eva = Text_Guided_IE_Task_1_evaluation.replace("{instruction}", instruction) 82 | tool_text = get_tool_text(tool, data[key]["vision_input"], log_path) 83 | task_1_eva = task_1_eva.replace("{tool_text}", tool_text) 84 | ## prompt 85 | ## image 86 | img_links = data[key]["vision_input"] 87 | if tool == "Highlight": 88 | img_1 = load_image(url2path(data[key]["vision_input"][0], Image_task_path)) 89 | img_2 = load_image(url2path(data[key]["vision_input"][1], Image_task_path)) 90 | img_1 = grding(img_1, instruction, "highlight") 91 | img_2 = grding(img_2, instruction, "highlight") 92 | img_links = [f"data:image/jpeg;base64,{encode_pil_image(img_1)}", f"data:image/jpeg;base64,{encode_pil_image(img_2)}"] 93 | ## image 94 | prompt = agent_run.prepare_prompt(img_links, task_1_eva) 95 | json_1 = GPTResponse2JSON(agent_run.get_result(prompt)) 96 | json_1['score'] = get_number(json_1['score']) 97 | log_prompt(log_path, task_1_eva) 98 | log_prompt(log_path, json_1) 99 | # Task 1 100 | 101 | # Task 2 102 | ## prompt 103 | if data_tool[key]["tool_plan"][1]["tool"] in ["Highlight", "SceneGraph"]: 104 | tool = data_tool[key]["tool_plan"][1]["tool"] 105 | else: 106 | tool = "None" 107 | 
task_2_eva = Text_Guided_IE_Task_2_evaluation.replace("{instruction}", instruction) 108 | tool_text = get_tool_text(tool, data[key]["vision_input"], log_path) 109 | task_2_eva = task_2_eva.replace("{tool_text}", tool_text) 110 | ## prompt 111 | ## image 112 | img_links = data[key]["vision_input"] 113 | if tool == "Highlight": 114 | img_1 = load_image(url2path(data[key]["vision_input"][0], Image_task_path)) 115 | img_2 = load_image(url2path(data[key]["vision_input"][1], Image_task_path)) 116 | img_1 = grding(img_1, instruction, "highlight") 117 | img_2 = grding(img_2, instruction, "highlight") 118 | img_links = [f"data:image/jpeg;base64,{encode_pil_image(img_1)}", f"data:image/jpeg;base64,{encode_pil_image(img_2)}"] 119 | 120 | ## image 121 | prompt = agent_run.prepare_prompt(img_links, task_2_eva) 122 | json_2 = GPTResponse2JSON(agent_run.get_result(prompt)) 123 | json_2['score'] = get_number(json_2['score']) 124 | log_prompt(log_path, task_2_eva) 125 | log_prompt(log_path, json_2) 126 | # Task 2 127 | 128 | out[key] = {} 129 | out[key]['score'] = [json_1['score'], json_2['score']] 130 | out[key]['reasoning'] = [json_1['reasoning'], json_2['reasoning']] 131 | out[key]['prompt_input'] = [task_1_eva, task_2_eva] 132 | out[key]['vision_input'] = data[key]["vision_input"] 133 | print(f"{key} over") 134 | break 135 | except Exception as e: 136 | print(f"Error: {key} evaluation failed: {e}") 137 | counter += 1 138 | 139 | write_json(out_path, out) 140 | 141 | 142 | 143 | def test_Subject_Driven_IE_tool(in_path, out_path, log_path): 144 | data = read_json(in_path) 145 | out = {} 146 | for key in tqdm(list(data.keys())): 147 | 148 | counter = 0 149 | while counter < 3: 150 | try: 151 | subject = data[key]["prompt_input"].replace("Subject:", "").strip() 152 | # tool 153 | task_rule = Subject_Driven_IE_Rule.replace("{subject}", subject) 154 | task = task_rule + Subject_Driven_IE_Task_1 + Subject_Driven_IE_Task_2 155 | prompt_tool = Tool_Decide.replace("{task}", task) 156 | img_links = [data[key]["vision_input"][0], data[key]["vision_input"][1], data[key]["vision_input"][2]] 157 | prompt = agent_run.prepare_prompt(img_links, prompt_tool) 158 | result = GPTResponse2JSON(agent_run.get_result(prompt)) 159 | result = check(result, ["task_id", "reasoning", "used", "tool"]) 160 | log_prompt(log_path, prompt_tool) 161 | log_prompt(log_path, result) 162 | # tool 163 | 164 | out[key] = {} 165 | out[key]['tool_plan'] = result 166 | print(f"{key} over") 167 | break 168 | except Exception as e: 169 | print(f"Error: {key} evaluation failed: {e}") 170 | counter += 1 171 | 172 | write_json(out_path, out) 173 | 174 | 175 | 176 | def test_Subject_Driven_IE_evaluate(in_path, tool_path, out_path, log_path): 177 | data = read_json(in_path) 178 | data_tool = read_json(tool_path) 179 | out = {} 180 | for key in tqdm(list(data.keys())): 181 | 182 | counter = 0 183 | while counter < 3: 184 | try: 185 | subject = data[key]["prompt_input"].replace("Subject:", "").strip() 186 | # Task 1 187 | ## prompt 188 | if data_tool[key]["tool_plan"][0]["tool"] in ["Highlight", "SceneGraph"]: 189 | tool = data_tool[key]["tool_plan"][0]["tool"] 190 | else: 191 | tool = "None" 192 | task_1_eva = Subject_Driven_IE_Task_1_evaluation.replace("{subject}", subject) 193 | tool_text = get_tool_text(tool, [data[key]["vision_input"][1], data[key]["vision_input"][2]], log_path) 194 | task_1_eva = task_1_eva.replace("{tool_text}", tool_text) 195 | ## prompt 196 | ## image 197 | img_links = [data[key]["vision_input"][1], data[key]["vision_input"][2]] 
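# Illustrative sketch, not the repository's actual implementation:
# `encode_pil_image` (from util.py, not shown in this dump) is what turns the highlighted
# PIL images below into the base64 payloads used in the "data:image/jpeg;base64,..." links.
# A minimal, hypothetical equivalent, assuming JPEG serialization plus base64 encoding:
import base64
from io import BytesIO

def encode_pil_image_sketch(pil_image):
    """Serialize a PIL image to JPEG bytes and return them base64-encoded as a str."""
    buffer = BytesIO()
    pil_image.convert("RGB").save(buffer, format="JPEG")
    return base64.b64encode(buffer.getvalue()).decode("utf-8")
# Usage mirrors the surrounding code:
#   f"data:image/jpeg;base64,{encode_pil_image_sketch(img_1)}"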
198 | if tool == "Highlight": 199 | img_1 = load_image(url2path(data[key]["vision_input"][1], Image_task_path)) 200 | img_2 = load_image(url2path(data[key]["vision_input"][2], Image_task_path)) 201 | img_1 = grding(img_1, subject, "highlight") 202 | img_2 = grding(img_2, subject, "highlight") 203 | img_links = [f"data:image/jpeg;base64,{encode_pil_image(img_1)}", f"data:image/jpeg;base64,{encode_pil_image(img_2)}"] 204 | ## image 205 | prompt = agent_run.prepare_prompt(img_links, task_1_eva) 206 | json_1 = GPTResponse2JSON(agent_run.get_result(prompt)) 207 | json_1['score'] = get_number(json_1['score']) 208 | log_prompt(log_path, task_1_eva) 209 | log_prompt(log_path, json_1) 210 | # Task 1 211 | 212 | # Task 2 213 | ## prompt 214 | if data_tool[key]["tool_plan"][1]["tool"] in ["Highlight", "SceneGraph"]: 215 | tool = data_tool[key]["tool_plan"][1]["tool"] 216 | else: 217 | tool = "None" 218 | tool_text = get_tool_text(tool, [data[key]["vision_input"][0], data[key]["vision_input"][2]], log_path) 219 | task_2_eva = Subject_Driven_IE_Task_2_evaluation.replace("{tool_text}", tool_text) 220 | ## prompt 221 | ## image 222 | img_links = [data[key]["vision_input"][0], data[key]["vision_input"][2]] 223 | if tool == "Highlight": 224 | img_1 = load_image(url2path(data[key]["vision_input"][0], Image_task_path)) 225 | img_2 = load_image(url2path(data[key]["vision_input"][2], Image_task_path)) 226 | img_1 = grding(img_1, "background", "highlight") 227 | img_2 = grding(img_2, "background", "highlight") 228 | img_links = [f"data:image/jpeg;base64,{encode_pil_image(img_1)}", f"data:image/jpeg;base64,{encode_pil_image(img_2)}"] 229 | 230 | ## image 231 | prompt = agent_run.prepare_prompt(img_links, task_2_eva) 232 | json_2 = GPTResponse2JSON(agent_run.get_result(prompt)) 233 | json_2['score'] = get_number(json_2['score']) 234 | log_prompt(log_path, task_2_eva) 235 | log_prompt(log_path, json_2) 236 | # Task 2 237 | 238 | out[key] = {} 239 | out[key]['score'] = [json_1['score'], json_2['score']] 240 | out[key]['reasoning'] = [json_1['reasoning'], json_2['reasoning']] 241 | out[key]['prompt_input'] = [task_1_eva, task_2_eva] 242 | out[key]['vision_input'] = data[key]["vision_input"] 243 | print(f"{key} over") 244 | break 245 | except Exception as e: 246 | print(f"Error: {key} evaluation failed: {e}") 247 | counter += 1 248 | 249 | write_json(out_path, out) 250 | 251 | 252 | 253 | def test_Mask_Guided_IE_tool(in_path, out_path, log_path): 254 | data = read_json(in_path) 255 | out = {} 256 | for key in tqdm(list(data.keys())): 257 | 258 | counter = 0 259 | while counter < 3: 260 | try: 261 | instruction = data[key]["prompt_input"].replace("Editing instruction:", "").strip() 262 | # tool 263 | task_rule = Mask_Guided_IE_Rule.replace("{instruction}", instruction) 264 | task = task_rule + Mask_Guided_IE_Task_1 + Mask_Guided_IE_Task_2 265 | prompt_tool = Tool_Decide.replace("{task}", task) 266 | img_links = [data[key]["vision_input"][0], data[key]["vision_input"][1]] 267 | prompt = agent_run.prepare_prompt(img_links, prompt_tool) 268 | result = GPTResponse2JSON(agent_run.get_result(prompt)) 269 | result = check(result, ["task_id", "reasoning", "used", "tool"]) 270 | log_prompt(log_path, prompt_tool) 271 | log_prompt(log_path, result) 272 | # tool 273 | 274 | out[key] = {} 275 | out[key]['tool_plan'] = result 276 | print(f"{key} over") 277 | break 278 | except Exception as e: 279 | print(f"Error: {key} evaluation failed: {e}") 280 | counter += 1 281 | 282 | write_json(out_path, out) 283 | 284 | 285 | 286 | def 
test_Mask_Guided_IE_evaluate(in_path, tool_path, out_path, log_path): 287 | data = read_json(in_path) 288 | data_tool = read_json(tool_path) 289 | out = {} 290 | for key in tqdm(list(data.keys())): 291 | 292 | counter = 0 293 | while counter < 3: 294 | try: 295 | instruction = data[key]["prompt_input"].replace("Editing instruction:", "").strip() 296 | # Task 1 297 | ## prompt 298 | if data_tool[key]["tool_plan"][0]["tool"] in ["Highlight", "SceneGraph", "MaskFocus"]: 299 | tool = data_tool[key]["tool_plan"][0]["tool"] 300 | else: 301 | tool = "None" 302 | task_1_eva = Mask_Guided_IE_Task_1_evaluation.replace("{instruction}", instruction) 303 | tool_text = get_tool_text(tool, data[key]["vision_input"], log_path) 304 | task_1_eva = task_1_eva.replace("{tool_text}", tool_text) 305 | ## prompt 306 | ## image 307 | img_links = data[key]["vision_input"] 308 | if tool == "Highlight": 309 | img_1 = load_image(url2path(data[key]["vision_input"][0], Image_task_path)) 310 | img_2 = load_image(url2path(data[key]["vision_input"][1], Image_task_path)) 311 | img_1 = grding(img_1, instruction, "highlight") 312 | img_2 = grding(img_2, instruction, "highlight") 313 | img_links = [f"data:image/jpeg;base64,{encode_pil_image(img_1)}", f"data:image/jpeg;base64,{encode_pil_image(img_2)}"] 314 | if tool == "MaskFocus": 315 | img_1, img_2 = imgs_diff(url2path(data[key]["vision_input"][0], Image_task_path), url2path(data[key]["vision_input"][1], Image_task_path), "highlight") 316 | img_links = [f"data:image/jpeg;base64,{encode_pil_image(img_1)}", f"data:image/jpeg;base64,{encode_pil_image(img_2)}"] 317 | ## image 318 | prompt = agent_run.prepare_prompt(img_links, task_1_eva) 319 | json_1 = GPTResponse2JSON(agent_run.get_result(prompt)) 320 | json_1['score'] = get_number(json_1['score']) 321 | log_prompt(log_path, task_1_eva) 322 | log_prompt(log_path, json_1) 323 | # Task 1 324 | 325 | # Task 2 326 | ## prompt 327 | if data_tool[key]["tool_plan"][1]["tool"] in ["Highlight", "SceneGraph", "MaskFocus"]: 328 | tool = data_tool[key]["tool_plan"][1]["tool"] 329 | else: 330 | tool = "None" 331 | task_2_eva = Mask_Guided_IE_Task_2_evaluation.replace("{instruction}", instruction) 332 | tool_text = get_tool_text(tool, data[key]["vision_input"], log_path) 333 | task_2_eva = task_2_eva.replace("{tool_text}", tool_text) 334 | ## prompt 335 | ## image 336 | img_links = data[key]["vision_input"] 337 | if tool == "Highlight": 338 | img_1 = load_image(url2path(data[key]["vision_input"][0], Image_task_path)) 339 | img_2 = load_image(url2path(data[key]["vision_input"][1], Image_task_path)) 340 | img_1 = grding(img_1, instruction, "highlight") 341 | img_2 = grding(img_2, instruction, "highlight") 342 | img_links = [f"data:image/jpeg;base64,{encode_pil_image(img_1)}", f"data:image/jpeg;base64,{encode_pil_image(img_2)}"] 343 | if tool == "MaskFocus": 344 | img_1, img_2 = imgs_diff(url2path(data[key]["vision_input"][0], Image_task_path), url2path(data[key]["vision_input"][1], Image_task_path), "highlight") 345 | img_links = [f"data:image/jpeg;base64,{encode_pil_image(img_1)}", f"data:image/jpeg;base64,{encode_pil_image(img_2)}"] 346 | ## image 347 | prompt = agent_run.prepare_prompt(img_links, task_2_eva) 348 | json_2 = GPTResponse2JSON(agent_run.get_result(prompt)) 349 | json_2['score'] = get_number(json_2['score']) 350 | log_prompt(log_path, task_2_eva) 351 | log_prompt(log_path, json_2) 352 | # Task 2 353 | 354 | out[key] = {} 355 | out[key]['score'] = [json_1['score'], json_2['score']] 356 | out[key]['reasoning'] = 
[json_1['reasoning'], json_2['reasoning']] 357 | out[key]['prompt_input'] = [task_1_eva, task_2_eva] 358 | out[key]['vision_input'] = data[key]["vision_input"] 359 | print(f"{key} over") 360 | break 361 | except Exception as e: 362 | print(f"Error: {key} evaluation failed: {e}") 363 | counter += 1 364 | 365 | write_json(out_path, out) 366 | 367 | 368 | 369 | def test_Multi_Concept_IC_tool(in_path, out_path, log_path): 370 | data = read_json(in_path) 371 | out = {} 372 | for key in tqdm(list(data.keys())): 373 | 374 | counter = 0 375 | while counter < 3: 376 | try: 377 | text = data[key]["prompt_input"].replace("Text Prompt:", "").strip() 378 | # tool 379 | task_rule = Multi_Concept_IC_Rule.replace("{text}", text) 380 | task = task_rule + Multi_Concept_IC_Task_1 + Multi_Concept_IC_Task_2 381 | prompt_tool = Tool_Decide.replace("{task}", task) 382 | img_links = [data[key]["vision_input"][0], data[key]["vision_input"][1]] 383 | prompt = agent_run.prepare_prompt(img_links, prompt_tool) 384 | result = GPTResponse2JSON(agent_run.get_result(prompt)) 385 | result = check(result, ["task_id", "reasoning", "used", "tool"]) 386 | log_prompt(log_path, prompt_tool) 387 | log_prompt(log_path, result) 388 | # tool 389 | 390 | out[key] = {} 391 | out[key]['tool_plan'] = result 392 | print(f"{key} over") 393 | break 394 | except Exception as e: 395 | print(f"Error: {key} evaluation failed: {e}") 396 | counter += 1 397 | 398 | write_json(out_path, out) 399 | 400 | 401 | 402 | def test_Multi_Concept_IC_evaluate(in_path, tool_path, out_path, log_path): 403 | data = read_json(in_path) 404 | data_tool = read_json(tool_path) 405 | out = {} 406 | for key in tqdm(list(data.keys())): 407 | 408 | counter = 0 409 | while counter < 3: 410 | try: 411 | text = data[key]["prompt_input"].replace("Text Prompt:", "").strip() 412 | img_L, img_R = split2part(url2path(data[key]["vision_input"][0], Image_task_path)) 413 | img_LR_links = [f"data:image/jpeg;base64,{encode_pil_image(img_L)}", f"data:image/jpeg;base64,{encode_pil_image(img_R)}"] 414 | subject_L, subject_R = data[key]["concepts"][0], data[key]["concepts"][1] 415 | # Task 1 416 | ## prompt 417 | if data_tool[key]["tool_plan"][0]["tool"] in ["Highlight", "SceneGraph"]: 418 | tool = data_tool[key]["tool_plan"][0]["tool"] 419 | else: 420 | tool = "None" 421 | task_1_eva = Multi_Concept_IC_Task_1_evaluation.replace("{text}", text) 422 | task_1_eva = task_1_eva.replace("{subject}", subject_L) 423 | tool_text = get_tool_text(tool, [img_LR_links[0], data[key]["vision_input"][1]], log_path) 424 | task_1_eva = task_1_eva.replace("{tool_text}", tool_text) 425 | ## prompt 426 | ## image 427 | img_links = [img_LR_links[0], data[key]["vision_input"][1]] 428 | if tool == "Highlight": 429 | img_1 = img_L 430 | img_2 = load_image(url2path(data[key]["vision_input"][1], Image_task_path)) 431 | img_1 = grding(img_1, subject_L, "highlight") 432 | img_2 = grding(img_2, subject_L, "highlight") 433 | img_links = [f"data:image/jpeg;base64,{encode_pil_image(img_1)}", f"data:image/jpeg;base64,{encode_pil_image(img_2)}"] 434 | ## image 435 | prompt = agent_run.prepare_prompt(img_links, task_1_eva) 436 | json_1 = GPTResponse2JSON(agent_run.get_result(prompt)) 437 | json_1['score'] = get_number(json_1['score']) 438 | log_prompt(log_path, task_1_eva) 439 | log_prompt(log_path, json_1) 440 | # Task 1 441 | 442 | # Task 1 443 | ## prompt 444 | if data_tool[key]["tool_plan"][0]["tool"] in ["Highlight", "SceneGraph"]: 445 | tool = data_tool[key]["tool_plan"][0]["tool"] 446 | else: 447 | tool = "None" 
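# Illustrative sketch, not the repository's actual implementation:
# `GPTResponse2JSON` and `get_number` come from util.py and are not shown in this dump.
# Judging from how they are used throughout this file, they presumably (1) parse the
# model's reply into a dict or list even when it is wrapped in markdown code fences and
# (2) coerce the "score" field to a number. Hypothetical, simplified equivalents:
import json
import re

def gpt_response_to_json_sketch(response_text):
    """Strip optional markdown code fences and parse the remainder as JSON."""
    cleaned = response_text.strip()
    if cleaned.startswith("```"):
        cleaned = cleaned.strip("`").strip()
        if cleaned.lower().startswith("json"):
            cleaned = cleaned[4:]
    return json.loads(cleaned.strip())

def get_number_sketch(score_field):
    """Return the first numeric value found in a score string such as "[7]" or "7.5"."""
    match = re.search(r"-?\d+(?:\.\d+)?", str(score_field))
    return float(match.group(0)) if match else None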
448 | task_1_eva_ = Multi_Concept_IC_Task_1_evaluation.replace("{text}", text) 449 | task_1_eva_ = task_1_eva_.replace("{subject}", subject_R) 450 | tool_text = get_tool_text(tool, [img_LR_links[1], data[key]["vision_input"][1]], log_path) 451 | task_1_eva_ = task_1_eva_.replace("{tool_text}", tool_text) 452 | ## prompt 453 | ## image 454 | img_links = [img_LR_links[1], data[key]["vision_input"][1]] 455 | if tool == "Highlight": 456 | img_1 = img_R 457 | img_2 = load_image(url2path(data[key]["vision_input"][1], Image_task_path)) 458 | img_1 = grding(img_1, subject_R, "highlight") 459 | img_2 = grding(img_2, subject_R, "highlight") 460 | img_links = [f"data:image/jpeg;base64,{encode_pil_image(img_1)}", f"data:image/jpeg;base64,{encode_pil_image(img_2)}"] 461 | ## image 462 | prompt = agent_run.prepare_prompt(img_links, task_1_eva_) 463 | json_2 = GPTResponse2JSON(agent_run.get_result(prompt)) 464 | json_2['score'] = get_number(json_2['score']) 465 | log_prompt(log_path, task_1_eva_) 466 | log_prompt(log_path, json_2) 467 | # Task 1 468 | 469 | # Task 2 470 | ## prompt 471 | if data_tool[key]["tool_plan"][1]["tool"] in ["Highlight", "SceneGraph"]: 472 | tool = data_tool[key]["tool_plan"][1]["tool"] 473 | else: 474 | tool = "None" 475 | task_2_eva = Multi_Concept_IC_Task_2_evaluation.replace("{text}", text) 476 | tool_text = get_tool_text(tool, [data[key]["vision_input"][1]], log_path) 477 | task_2_eva = task_2_eva.replace("{tool_text}", tool_text) 478 | ## prompt 479 | ## image 480 | img_links = [data[key]["vision_input"][1]] 481 | if tool == "Highlight": 482 | img_2 = load_image(url2path(data[key]["vision_input"][1], Image_task_path)) 483 | img_2 = grding(img_2, text, "highlight") 484 | img_links = [f"data:image/jpeg;base64,{encode_pil_image(img_2)}"] 485 | ## image 486 | prompt = agent_run.prepare_prompt(img_links, task_2_eva) 487 | json_3 = GPTResponse2JSON(agent_run.get_result(prompt)) 488 | json_3['score'] = get_number(json_3['score']) 489 | log_prompt(log_path, task_2_eva) 490 | log_prompt(log_path, json_3) 491 | # Task 2 492 | 493 | out[key] = {} 494 | out[key]['score'] = [json_1['score'], json_2['score'], json_3['score']] 495 | out[key]['reasoning'] = [json_1['reasoning'], json_2['reasoning'], json_3['reasoning']] 496 | out[key]['prompt_input'] = [task_1_eva, task_1_eva_, task_2_eva] 497 | out[key]['vision_input'] = data[key]["vision_input"] 498 | print(f"{key} over") 499 | break 500 | except Exception as e: 501 | print(f"Error: {key} evaluation failed: {e}") 502 | counter += 1 503 | 504 | write_json(out_path, out) 505 | 506 | 507 | 508 | def test_Text_Guided_IG_tool(in_path, out_path, log_path): 509 | data = read_json(in_path) 510 | out = {} 511 | for key in tqdm(list(data.keys())): 512 | 513 | counter = 0 514 | while counter < 3: 515 | try: 516 | text = data[key]["prompt_input"].replace("Text Prompt:", "").strip() 517 | # tool 518 | task_rule = Text_Guided_IG_Rule.replace("{text}", text) 519 | task = task_rule + Text_Guided_IG_Task_1 520 | prompt_tool = Tool_Decide.replace("{task}", task) 521 | img_links = data[key]["vision_input"] 522 | prompt = agent_run.prepare_prompt(img_links, prompt_tool) 523 | result = GPTResponse2JSON(agent_run.get_result(prompt)) 524 | result = check(result, ["task_id", "reasoning", "used", "tool"]) 525 | log_prompt(log_path, prompt_tool) 526 | log_prompt(log_path, result) 527 | # tool 528 | 529 | out[key] = {} 530 | out[key]['tool_plan'] = result 531 | print(f"{key} over") 532 | break 533 | except Exception as e: 534 | print(f"Error: {key} evaluation 
failed: {e}") 535 | counter += 1 536 | 537 | write_json(out_path, out) 538 | 539 | 540 | 541 | def test_Text_Guided_IG_evaluate(in_path, tool_path, out_path, log_path): 542 | data = read_json(in_path) 543 | data_tool = read_json(tool_path) 544 | out = {} 545 | for key in tqdm(list(data.keys())): 546 | 547 | counter = 0 548 | while counter < 3: 549 | try: 550 | text = data[key]["prompt_input"].replace("Text Prompt:", "").strip() 551 | # Task 1 552 | ## prompt 553 | if data_tool[key]["tool_plan"][0]["tool"] in ["Highlight", "SceneGraph"]: 554 | tool = data_tool[key]["tool_plan"][0]["tool"] 555 | else: 556 | tool = "None" 557 | task_1_eva = Text_Guided_IG_Task_1_evaluation.replace("{text}", text) 558 | tool_text = get_tool_text(tool, data[key]["vision_input"], log_path) 559 | task_1_eva = task_1_eva.replace("{tool_text}", tool_text) 560 | ## prompt 561 | ## image 562 | img_links = data[key]["vision_input"] 563 | if tool == "Highlight": 564 | img_1 = load_image(url2path(data[key]["vision_input"][0], Image_task_path)) 565 | img_1 = grding(img_1, text, "highlight") 566 | img_links = [f"data:image/jpeg;base64,{encode_pil_image(img_1)}"] 567 | 568 | ## image 569 | prompt = agent_run.prepare_prompt(img_links, task_1_eva) 570 | json_1 = GPTResponse2JSON(agent_run.get_result(prompt)) 571 | json_1['score'] = get_number(json_1['score']) 572 | log_prompt(log_path, task_1_eva) 573 | log_prompt(log_path, json_1) 574 | # Task 1 575 | 576 | out[key] = {} 577 | out[key]['score'] = [json_1['score']] 578 | out[key]['reasoning'] = [json_1['reasoning']] 579 | out[key]['prompt_input'] = [task_1_eva] 580 | out[key]['vision_input'] = data[key]["vision_input"] 581 | print(f"{key} over") 582 | break 583 | except Exception as e: 584 | print(f"Error: {key} evaluation failed: {e}") 585 | counter += 1 586 | 587 | write_json(out_path, out) 588 | 589 | 590 | 591 | def test_Control_Guided_IG_tool(in_path, out_path, log_path): 592 | data = read_json(in_path) 593 | out = {} 594 | for key in tqdm(list(data.keys())): 595 | 596 | counter = 0 597 | while counter < 3: 598 | try: 599 | text = data[key]["prompt_input"].replace("Text Prompt:", "").strip() 600 | # tool 601 | task_rule = Control_Guided_IG_Rule.replace("{text}", text) 602 | task = task_rule + Control_Guided_IG_Task_1 + Control_Guided_IG_Task_2 603 | prompt_tool = Tool_Decide.replace("{task}", task) 604 | img_links = data[key]["vision_input"] 605 | prompt = agent_run.prepare_prompt(img_links, prompt_tool) 606 | result = GPTResponse2JSON(agent_run.get_result(prompt)) 607 | result = check(result, ["task_id", "reasoning", "used", "tool"]) 608 | log_prompt(log_path, prompt_tool) 609 | log_prompt(log_path, result) 610 | # tool 611 | 612 | out[key] = {} 613 | out[key]['tool_plan'] = result 614 | print(f"{key} over") 615 | break 616 | except Exception as e: 617 | print(f"Error: {key} evaluation failed: {e}") 618 | counter += 1 619 | 620 | write_json(out_path, out) 621 | 622 | 623 | 624 | def test_Control_Guided_IG_evaluate(in_path, tool_path, out_path, log_path): 625 | data = read_json(in_path) 626 | data_tool = read_json(tool_path) 627 | out = {} 628 | for key in tqdm(list(data.keys())): 629 | 630 | counter = 0 631 | while counter < 3: 632 | try: 633 | text = data[key]["prompt_input"].replace("Text Prompt:", "").strip() 634 | # Task 1 635 | ## prompt 636 | if data_tool[key]["tool_plan"][0]["tool"] in ["Highlight", "SceneGraph"]: 637 | tool = data_tool[key]["tool_plan"][0]["tool"] 638 | else: 639 | tool = "None" 640 | task_1_eva = Control_Guided_IG_Task_1_evaluation 641 | 
tool_text = get_tool_text(tool, data[key]["vision_input"], log_path) 642 | task_1_eva = task_1_eva.replace("{tool_text}", tool_text) 643 | ## prompt 644 | ## image 645 | img_links = data[key]["vision_input"] 646 | if tool == "Highlight": 647 | img_1 = load_image(url2path(data[key]["vision_input"][0], Image_task_path)) 648 | img_2 = load_image(url2path(data[key]["vision_input"][1], Image_task_path)) 649 | img_1 = grding(img_1, text, "highlight") 650 | img_2 = grding(img_2, text, "highlight") 651 | img_links = [f"data:image/jpeg;base64,{encode_pil_image(img_1)}", f"data:image/jpeg;base64,{encode_pil_image(img_2)}"] 652 | ## image 653 | prompt = agent_run.prepare_prompt(img_links, task_1_eva) 654 | json_1 = GPTResponse2JSON(agent_run.get_result(prompt)) 655 | json_1['score'] = get_number(json_1['score']) 656 | log_prompt(log_path, task_1_eva) 657 | log_prompt(log_path, json_1) 658 | # Task 1 659 | 660 | # Task 2 661 | ## prompt 662 | if data_tool[key]["tool_plan"][1]["tool"] in ["Highlight", "SceneGraph"]: 663 | tool = data_tool[key]["tool_plan"][1]["tool"] 664 | else: 665 | tool = "None" 666 | task_2_eva = Control_Guided_IG_Task_2_evaluation.replace("{text}", text) 667 | tool_text = get_tool_text(tool, [data[key]["vision_input"][1]], log_path) 668 | task_2_eva = task_2_eva.replace("{tool_text}", tool_text) 669 | ## prompt 670 | ## image 671 | img_links = [data[key]["vision_input"][1]] 672 | if tool == "Highlight": 673 | img_2 = load_image(url2path(data[key]["vision_input"][1], Image_task_path)) 674 | img_2 = grding(img_2, text, "highlight") 675 | img_links = [f"data:image/jpeg;base64,{encode_pil_image(img_2)}"] 676 | ## image 677 | prompt = agent_run.prepare_prompt(img_links, task_2_eva) 678 | json_2 = GPTResponse2JSON(agent_run.get_result(prompt)) 679 | json_2['score'] = get_number(json_2['score']) 680 | log_prompt(log_path, task_2_eva) 681 | log_prompt(log_path, json_2) 682 | # Task 2 683 | 684 | out[key] = {} 685 | out[key]['score'] = [json_1['score'], json_2['score']] 686 | out[key]['reasoning'] = [json_1['reasoning'], json_2['reasoning']] 687 | out[key]['prompt_input'] = [task_1_eva, task_2_eva] 688 | out[key]['vision_input'] = data[key]["vision_input"] 689 | print(f"{key} over") 690 | break 691 | except Exception as e: 692 | print(f"Error: {key} evaluation failed: {e}") 693 | counter += 1 694 | 695 | write_json(out_path, out) 696 | 697 | 698 | 699 | def test_Subject_Driven_IG_tool(in_path, out_path, log_path): 700 | data = read_json(in_path) 701 | out = {} 702 | for key in tqdm(list(data.keys())): 703 | 704 | counter = 0 705 | while counter < 3: 706 | try: 707 | text = data[key]["prompt_input"].replace("Text Prompt:", "").strip() 708 | # tool 709 | task_rule = Subject_Driven_IG_Rule.replace("{text}", text) 710 | task = task_rule + Subject_Driven_IG_Task_1 + Subject_Driven_IG_Task_2 711 | prompt_tool = Tool_Decide.replace("{task}", task) 712 | img_links = data[key]["vision_input"] 713 | prompt = agent_run.prepare_prompt(img_links, prompt_tool) 714 | result = GPTResponse2JSON(agent_run.get_result(prompt)) 715 | result = check(result, ["task_id", "reasoning", "used", "tool"]) 716 | log_prompt(log_path, prompt_tool) 717 | log_prompt(log_path, result) 718 | # tool 719 | 720 | out[key] = {} 721 | out[key]['tool_plan'] = result 722 | print(f"{key} over") 723 | break 724 | except Exception as e: 725 | print(f"Error: {key} evaluation failed: {e}") 726 | counter += 1 727 | 728 | write_json(out_path, out) 729 | 730 | 731 | 732 | def test_Subject_Driven_IG_evaluate(in_path, tool_path, out_path, 
log_path): 733 | data = read_json(in_path) 734 | data_tool = read_json(tool_path) 735 | out = {} 736 | for key in tqdm(list(data.keys())): 737 | 738 | counter = 0 739 | while counter < 3: 740 | try: 741 | text = data[key]["prompt_input"].replace("Text Prompt:", "").strip() 742 | subject = data[key]["subject"] 743 | # Task 1 744 | ## prompt 745 | if data_tool[key]["tool_plan"][0]["tool"] in ["Highlight", "SceneGraph"]: 746 | tool = data_tool[key]["tool_plan"][0]["tool"] 747 | else: 748 | tool = "None" 749 | task_1_eva = Subject_Driven_IG_Task_1_evaluation.replace("{subject}", subject) 750 | tool_text = get_tool_text(tool, data[key]["vision_input"], log_path) 751 | task_1_eva = task_1_eva.replace("{tool_text}", tool_text) 752 | ## prompt 753 | ## image 754 | img_links = data[key]["vision_input"] 755 | if tool == "Highlight": 756 | img_1 = load_image(url2path(data[key]["vision_input"][0], Image_task_path)) 757 | img_2 = load_image(url2path(data[key]["vision_input"][1], Image_task_path)) 758 | img_1 = grding(img_1, text, "highlight") 759 | img_2 = grding(img_2, text, "highlight") 760 | img_links = [f"data:image/jpeg;base64,{encode_pil_image(img_1)}", f"data:image/jpeg;base64,{encode_pil_image(img_2)}"] 761 | ## image 762 | prompt = agent_run.prepare_prompt(img_links, task_1_eva) 763 | json_1 = GPTResponse2JSON(agent_run.get_result(prompt)) 764 | json_1['score'] = get_number(json_1['score']) 765 | log_prompt(log_path, task_1_eva) 766 | log_prompt(log_path, json_1) 767 | # Task 1 768 | 769 | # Task 2 770 | ## prompt 771 | if data_tool[key]["tool_plan"][1]["tool"] in ["Highlight", "SceneGraph"]: 772 | tool = data_tool[key]["tool_plan"][1]["tool"] 773 | else: 774 | tool = "None" 775 | task_2_eva = Subject_Driven_IG_Task_2_evaluation.replace("{text}", text) 776 | tool_text = get_tool_text(tool, [data[key]["vision_input"][1]], log_path) 777 | task_2_eva = task_2_eva.replace("{tool_text}", tool_text) 778 | ## prompt 779 | ## image 780 | img_links = [data[key]["vision_input"][1]] 781 | if tool == "Highlight": 782 | img_2 = load_image(url2path(data[key]["vision_input"][1], Image_task_path)) 783 | img_2 = grding(img_2, text, "highlight") 784 | img_links = [f"data:image/jpeg;base64,{encode_pil_image(img_2)}"] 785 | ## image 786 | prompt = agent_run.prepare_prompt(img_links, task_2_eva) 787 | json_2 = GPTResponse2JSON(agent_run.get_result(prompt)) 788 | json_2['score'] = get_number(json_2['score']) 789 | log_prompt(log_path, task_2_eva) 790 | log_prompt(log_path, json_2) 791 | # Task 2 792 | 793 | out[key] = {} 794 | out[key]['score'] = [json_1['score'], json_2['score']] 795 | out[key]['reasoning'] = [json_1['reasoning'], json_2['reasoning']] 796 | out[key]['prompt_input'] = [task_1_eva, task_2_eva] 797 | out[key]['vision_input'] = data[key]["vision_input"] 798 | print(f"{key} over") 799 | break 800 | except Exception as e: 801 | print(f"Error: {key} evaluation failed: {e}") 802 | counter += 1 803 | 804 | write_json(out_path, out) 805 | 806 | 807 | 808 | def evaluate(in_path, tool_out_path, tool_log_path, eva_out_path, eva_log_path): 809 | if "ImagenHub_Control-Guided_IG" in in_path: 810 | test_Control_Guided_IG_tool(in_path, tool_out_path, tool_log_path) 811 | test_Control_Guided_IG_evaluate(in_path, tool_out_path, eva_out_path, eva_log_path) 812 | if "ImagenHub_Mask-Guided_IE" in in_path: 813 | test_Mask_Guided_IE_tool(in_path, tool_out_path, tool_log_path) 814 | test_Mask_Guided_IE_evaluate(in_path, tool_out_path, eva_out_path, eva_log_path) 815 | if "ImagenHub_Multi-Concept_IC" in in_path: 816 | 
test_Multi_Concept_IC_tool(in_path, tool_out_path, tool_log_path) 817 | test_Multi_Concept_IC_evaluate(in_path, tool_out_path, eva_out_path, eva_log_path) 818 | if "ImagenHub_Subject-Driven_IE" in in_path: 819 | test_Subject_Driven_IE_tool(in_path, tool_out_path, tool_log_path) 820 | test_Subject_Driven_IE_evaluate(in_path, tool_out_path, eva_out_path, eva_log_path) 821 | if "ImagenHub_Subject-Driven_IG" in in_path: 822 | test_Subject_Driven_IG_tool(in_path, tool_out_path, tool_log_path) 823 | test_Subject_Driven_IG_evaluate(in_path, tool_out_path, eva_out_path, eva_log_path) 824 | if "ImagenHub_Text-Guided_IE" in in_path: 825 | test_Text_Guided_IE_tool(in_path, tool_out_path, tool_log_path) 826 | test_Text_Guided_IE_evaluate(in_path, tool_out_path, eva_out_path, eva_log_path) 827 | if "ImagenHub_Text-Guided_IG" in in_path: 828 | test_Text_Guided_IG_tool(in_path, tool_out_path, tool_log_path) 829 | test_Text_Guided_IG_evaluate(in_path, tool_out_path, eva_out_path, eva_log_path) 830 | 831 | 832 | 833 | 834 | agent_run = QWen25() 835 | Image_task_path = "PATH_TO_ImageHub_DATA" 836 | for task in os.listdir(Image_task_path): 837 | task_path = os.path.join(Image_task_path, task) 838 | models = os.listdir(task_path) 839 | print(models) 840 | for dir in models: 841 | if dir!="input" and dir!="token": 842 | tool_log_path = f"{task_path}_{dir}_tool_qwen25_72b.txt" 843 | tool_out_path = f"{task_path}/{dir}/SC_tool_qwen25_72b.json" 844 | eva_log_path = f"{task_path}_{dir}_eva_qwen25_72b.txt" 845 | eva_out_path = f"{task_path}/{dir}/SC_eva_qwen25_72b.json" 846 | in_path = f"{task_path}/{dir}/in.json" 847 | evaluate(in_path, tool_out_path, tool_log_path, eva_out_path, eva_log_path) 848 | 849 | -------------------------------------------------------------------------------- /run_test_40p.py: -------------------------------------------------------------------------------- 1 | from agent.llava_next import LlavaNext 2 | from agent.prompt_agent import * 3 | from util import * 4 | from tools.scene_graph import sg_generate 5 | from tools.grding import grding, highlight 6 | from tools.diff import imgs_diff 7 | from tools.split import split2part 8 | from tqdm import tqdm 9 | 10 | 11 | def get_tool_text(tool, img_links, log_path): 12 | if tool=="None": 13 | return "" 14 | if tool=="Highlight" or tool=="MaskFocus": 15 | return "Focus on the highlighted parts of the image." 
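# Illustrative sketch, not the repository's actual implementation:
# `merge_images` (from util.py, not shown in this dump) is used further down in
# test_Subject_Driven_IE_tool to build the llava-style input described by
# Subject_Driven_IE_Rule_llava: the image to edit and the token-subject image pasted
# side by side into a single picture. A hypothetical, simplified equivalent:
from PIL import Image

def merge_images_sketch(images):
    """Paste the given PIL images left-to-right on a single white canvas."""
    total_width = sum(im.width for im in images)
    max_height = max(im.height for im in images)
    canvas = Image.new("RGB", (total_width, max_height), "white")
    x_offset = 0
    for im in images:
        canvas.paste(im, (x_offset, 0))
        x_offset += im.width
    return canvas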
16 | if tool=="SceneGraph": 17 | text = "" 18 | if len(img_links) == 2: 19 | text = """Two scene graphs in JSON format generated from two images will be provided:\nThe first scene graph:\n{scene_graph_1}\nThe second scene graph:\n{scene_graph_2}""" 20 | sg_1 = sg_generate(img_links[0], 0, log_path) 21 | sg_2 = sg_generate(img_links[1], 0, log_path) 22 | params = {"{scene_graph_1}": sg_1, "{scene_graph_2}": sg_2} 23 | text = prompt_format(text, params) 24 | if len(img_links) == 1: 25 | text = """The scene graph in JSON format generated from this image is as follows:\n{scene_graph}""" 26 | sg_1 = sg_generate(img_links[0], 0, log_path) 27 | params = {"{scene_graph}": sg_1} 28 | text = prompt_format(text, params) 29 | return text 30 | 31 | 32 | 33 | def test_Text_Guided_IE_tool(in_path, out_path, log_path, test=None): 34 | data = read_json(in_path) 35 | out = {} 36 | for key in tqdm(list(data.keys())): 37 | if test: 38 | its = in_path.split("/") 39 | if key not in test[its[-3]][its[-2]]['sample']: 40 | continue 41 | counter = 0 42 | while counter < 3: 43 | try: 44 | instruction = data[key]["prompt_input"].replace("Editing instruction:", "").strip() 45 | # tool 46 | task_rule = Text_Guided_IE_Rule.replace("{instruction}", instruction) 47 | task = task_rule + Text_Guided_IE_Task_1 + Text_Guided_IE_Task_2 48 | prompt_tool = Tool_Decide.replace("{task}", task) 49 | img_links = [url2path(data[key]["vision_input"][0], Image_Root), url2path(data[key]["vision_input"][1], Image_Root)] 50 | 51 | result = GPTResponse2JSON(agent_run.get_result(img_links, prompt_tool)) 52 | result = check(result, ["task_id", "reasoning", "used", "tool"]) 53 | log_prompt(log_path, prompt_tool) 54 | log_prompt(log_path, result) 55 | # tool 56 | 57 | out[key] = {} 58 | out[key]['tool_plan'] = result 59 | print(f"{key} over") 60 | break 61 | except Exception as e: 62 | print(f"Error: {key} evaluation failed: {e}") 63 | counter += 1 64 | 65 | write_json(out_path, out) 66 | 67 | 68 | 69 | def test_Text_Guided_IE_evaluate(in_path, tool_path, out_path, log_path, test=None): 70 | data = read_json(in_path) 71 | data_tool = read_json(tool_path) 72 | out = {} 73 | for key in tqdm(list(data.keys())): 74 | if test: 75 | its = in_path.split("/") 76 | if key not in test[its[-3]][its[-2]]['sample']: 77 | continue 78 | counter = 0 79 | while counter < 3: 80 | try: 81 | instruction = data[key]["prompt_input"].replace("Editing instruction:", "").strip() 82 | # Task 1 83 | ## prompt 84 | if data_tool[key]["tool_plan"][0]["tool"] in ["Highlight", "SceneGraph"]: 85 | tool = data_tool[key]["tool_plan"][0]["tool"] 86 | else: 87 | tool = "None" 88 | task_1_eva = Text_Guided_IE_Task_1_evaluation.replace("{instruction}", instruction) 89 | tool_text = get_tool_text(tool, data[key]["vision_input"], log_path) 90 | task_1_eva = task_1_eva.replace("{tool_text}", tool_text) 91 | ## prompt 92 | ## image 93 | img_links = [url2path(data[key]["vision_input"][0], Image_Root), url2path(data[key]["vision_input"][1], Image_Root)] 94 | if tool == "Highlight": 95 | img_1 = load_image(url2path(data[key]["vision_input"][0], Image_Root)) 96 | img_2 = load_image(url2path(data[key]["vision_input"][1], Image_Root)) 97 | img_1 = grding(img_1, instruction, "highlight") 98 | img_2 = grding(img_2, instruction, "highlight") 99 | img_links = [f"data:image/jpeg;base64,{encode_pil_image(img_1)}", f"data:image/jpeg;base64,{encode_pil_image(img_2)}"] 100 | ## image 101 | 102 | json_1 = GPTResponse2JSON(agent_run.get_result(img_links, task_1_eva)) 103 | json_1['score'] = 
get_number(json_1['score']) 104 | log_prompt(log_path, task_1_eva) 105 | log_prompt(log_path, json_1) 106 | # Task 1 107 | 108 | # Task 2 109 | ## prompt 110 | if data_tool[key]["tool_plan"][1]["tool"] in ["Highlight", "SceneGraph"]: 111 | tool = data_tool[key]["tool_plan"][1]["tool"] 112 | else: 113 | tool = "None" 114 | task_2_eva = Text_Guided_IE_Task_2_evaluation.replace("{instruction}", instruction) 115 | tool_text = get_tool_text(tool, data[key]["vision_input"], log_path) 116 | task_2_eva = task_2_eva.replace("{tool_text}", tool_text) 117 | ## prompt 118 | ## image 119 | img_links = [url2path(data[key]["vision_input"][0], Image_Root), url2path(data[key]["vision_input"][1], Image_Root)] 120 | if tool == "Highlight": 121 | img_1 = load_image(url2path(data[key]["vision_input"][0], Image_Root)) 122 | img_2 = load_image(url2path(data[key]["vision_input"][1], Image_Root)) 123 | img_1 = grding(img_1, instruction, "highlight") 124 | img_2 = grding(img_2, instruction, "highlight") 125 | img_links = [f"data:image/jpeg;base64,{encode_pil_image(img_1)}", f"data:image/jpeg;base64,{encode_pil_image(img_2)}"] 126 | 127 | ## image 128 | 129 | json_2 = GPTResponse2JSON(agent_run.get_result(img_links, task_2_eva)) 130 | json_2['score'] = get_number(json_2['score']) 131 | log_prompt(log_path, task_2_eva) 132 | log_prompt(log_path, json_2) 133 | # Task 2 134 | 135 | out[key] = {} 136 | out[key]['score'] = [json_1['score'], json_2['score']] 137 | out[key]['reasoning'] = [json_1['reasoning'], json_2['reasoning']] 138 | out[key]['prompt_input'] = [task_1_eva, task_2_eva] 139 | out[key]['vision_input'] = data[key]["vision_input"] 140 | print(f"{key} over") 141 | break 142 | except Exception as e: 143 | print(f"Error: {key} evaluation failed: {e}") 144 | counter += 1 145 | 146 | write_json(out_path, out) 147 | 148 | 149 | 150 | def test_Subject_Driven_IE_tool(in_path, out_path, log_path, test=None): 151 | data = read_json(in_path) 152 | out = {} 153 | for key in tqdm(list(data.keys())): 154 | if test: 155 | its = in_path.split("/") 156 | if key not in test[its[-3]][its[-2]]['sample']: 157 | continue 158 | counter = 0 159 | while counter < 3: 160 | try: 161 | subject = data[key]["prompt_input"].replace("Subject:", "").strip() 162 | # tool 163 | task_rule = Subject_Driven_IE_Rule_llava.replace("{subject}", subject) 164 | task = task_rule + Subject_Driven_IE_Task_1_llava + Subject_Driven_IE_Task_2_llava 165 | prompt_tool = Tool_Decide.replace("{task}", task) 166 | im_1 = load_image(url2path(data[key]["vision_input"][0], Image_Root)) 167 | im_2 = load_image(url2path(data[key]["vision_input"][1], Image_Root)) 168 | im = merge_images([im_1, im_2]) 169 | 170 | img_links = [f"data:image/jpeg;base64,{encode_pil_image(im)}", url2path(data[key]["vision_input"][2], Image_Root)] 171 | 172 | result = GPTResponse2JSON(agent_run.get_result(img_links, prompt_tool)) 173 | result = check(result, ["task_id", "reasoning", "used", "tool"]) 174 | log_prompt(log_path, prompt_tool) 175 | log_prompt(log_path, result) 176 | # tool 177 | 178 | out[key] = {} 179 | out[key]['tool_plan'] = result 180 | print(f"{key} over") 181 | break 182 | except Exception as e: 183 | print(f"Error: {key} evaluation failed: {e}") 184 | counter += 1 185 | 186 | write_json(out_path, out) 187 | 188 | 189 | 190 | def test_Subject_Driven_IE_evaluate(in_path, tool_path, out_path, log_path, test=None): 191 | data = read_json(in_path) 192 | data_tool = read_json(tool_path) 193 | out = {} 194 | for key in tqdm(list(data.keys())): 195 | if test: 196 | its = 
in_path.split("/") 197 | if key not in test[its[-3]][its[-2]]['sample']: 198 | continue 199 | counter = 0 200 | while counter < 3: 201 | try: 202 | subject = data[key]["prompt_input"].replace("Subject:", "").strip() 203 | # Task 1 204 | ## prompt 205 | if data_tool[key]["tool_plan"][0]["tool"] in ["Highlight", "SceneGraph"]: 206 | tool = data_tool[key]["tool_plan"][0]["tool"] 207 | else: 208 | tool = "None" 209 | task_1_eva = Subject_Driven_IE_Task_1_evaluation.replace("Subject: {subject}", "") 210 | tool_text = get_tool_text(tool, [data[key]["vision_input"][1], data[key]["vision_input"][2]], log_path) 211 | task_1_eva = task_1_eva.replace("{tool_text}", tool_text) 212 | ## prompt 213 | ## image 214 | img_links = [url2path(data[key]["vision_input"][1], Image_Root), url2path(data[key]["vision_input"][2], Image_Root)] 215 | if tool == "Highlight": 216 | img_1 = load_image(url2path(data[key]["vision_input"][1], Image_Root)) 217 | img_2 = load_image(url2path(data[key]["vision_input"][2], Image_Root)) 218 | img_1 = grding(img_1, subject, "highlight") 219 | img_2 = grding(img_2, subject, "highlight") 220 | img_links = [f"data:image/jpeg;base64,{encode_pil_image(img_1)}", f"data:image/jpeg;base64,{encode_pil_image(img_2)}"] 221 | ## image 222 | 223 | json_1 = GPTResponse2JSON(agent_run.get_result(img_links, task_1_eva)) 224 | json_1['score'] = get_number(json_1['score']) 225 | log_prompt(log_path, task_1_eva) 226 | log_prompt(log_path, json_1) 227 | # Task 1 228 | 229 | # Task 2 230 | ## prompt 231 | if data_tool[key]["tool_plan"][1]["tool"] in ["Highlight", "SceneGraph"]: 232 | tool = data_tool[key]["tool_plan"][1]["tool"] 233 | else: 234 | tool = "None" 235 | tool_text = get_tool_text(tool, [data[key]["vision_input"][0], data[key]["vision_input"][2]], log_path) 236 | task_2_eva = Subject_Driven_IE_Task_2_evaluation.replace("{tool_text}", tool_text) 237 | ## prompt 238 | ## image 239 | img_links = [url2path(data[key]["vision_input"][0], Image_Root), url2path(data[key]["vision_input"][2], Image_Root)] 240 | if tool == "Highlight": 241 | img_1 = load_image(url2path(data[key]["vision_input"][0], Image_Root)) 242 | img_2 = load_image(url2path(data[key]["vision_input"][2], Image_Root)) 243 | img_1 = grding(img_1, "background", "highlight") 244 | img_2 = grding(img_2, "background", "highlight") 245 | img_links = [f"data:image/jpeg;base64,{encode_pil_image(img_1)}", f"data:image/jpeg;base64,{encode_pil_image(img_2)}"] 246 | 247 | ## image 248 | 249 | json_2 = GPTResponse2JSON(agent_run.get_result(img_links, task_2_eva)) 250 | json_2['score'] = get_number(json_2['score']) 251 | log_prompt(log_path, task_2_eva) 252 | log_prompt(log_path, json_2) 253 | # Task 2 254 | 255 | out[key] = {} 256 | out[key]['score'] = [json_1['score'], json_2['score']] 257 | out[key]['reasoning'] = [json_1['reasoning'], json_2['reasoning']] 258 | out[key]['prompt_input'] = [task_1_eva, task_2_eva] 259 | out[key]['vision_input'] = data[key]["vision_input"] 260 | print(f"{key} over") 261 | break 262 | except Exception as e: 263 | print(f"Error: {key} evaluation failed: {e}") 264 | counter += 1 265 | 266 | write_json(out_path, out) 267 | 268 | 269 | 270 | def test_Mask_Guided_IE_tool(in_path, out_path, log_path, test=None): 271 | data = read_json(in_path) 272 | out = {} 273 | st = False 274 | for key in tqdm(list(data.keys())): 275 | if test: 276 | its = in_path.split("/") 277 | if key not in test[its[-3]][its[-2]]['sample']: 278 | continue 279 | counter = 0 280 | while counter < 3: 281 | try: 282 | instruction = 
data[key]["prompt_input"].replace("Editing instruction:", "").strip() 283 | # tool 284 | task_rule = Mask_Guided_IE_Rule.replace("{instruction}", instruction) 285 | task = task_rule + Mask_Guided_IE_Task_1 + Mask_Guided_IE_Task_2 286 | prompt_tool = Tool_Decide.replace("{task}", task) 287 | img_links = [url2path(data[key]["vision_input"][0], Image_Root), url2path(data[key]["vision_input"][1], Image_Root)] 288 | 289 | result = GPTResponse2JSON(agent_run.get_result(img_links, prompt_tool)) 290 | result = check(result, ["task_id", "reasoning", "used", "tool"]) 291 | log_prompt(log_path, prompt_tool) 292 | log_prompt(log_path, result) 293 | # tool 294 | 295 | out[key] = {} 296 | out[key]['tool_plan'] = result 297 | print(f"{key} over") 298 | break 299 | except Exception as e: 300 | print(f"Error: {key} evaluation failed: {e}") 301 | counter += 1 302 | 303 | write_json(out_path, out) 304 | 305 | 306 | 307 | def test_Mask_Guided_IE_evaluate(in_path, tool_path, out_path, log_path, test=None): 308 | data = read_json(in_path) 309 | data_tool = read_json(tool_path) 310 | out = {} 311 | for key in tqdm(list(data.keys())): 312 | if test: 313 | its = in_path.split("/") 314 | if key not in test[its[-3]][its[-2]]['sample']: 315 | continue 316 | counter = 0 317 | while counter < 3: 318 | try: 319 | instruction = data[key]["prompt_input"].replace("Editing instruction:", "").strip() 320 | # Task 1 321 | ## prompt 322 | if data_tool[key]["tool_plan"][0]["tool"] in ["Highlight", "SceneGraph", "MaskFocus"]: 323 | tool = data_tool[key]["tool_plan"][0]["tool"] 324 | else: 325 | tool = "None" 326 | task_1_eva = Mask_Guided_IE_Task_1_evaluation.replace("{instruction}", instruction) 327 | tool_text = get_tool_text(tool, data[key]["vision_input"], log_path) 328 | task_1_eva = task_1_eva.replace("{tool_text}", tool_text) 329 | ## prompt 330 | ## image 331 | img_links = [url2path(data[key]["vision_input"][0], Image_Root), url2path(data[key]["vision_input"][1], Image_Root)] 332 | if tool == "Highlight": 333 | img_1 = load_image(url2path(data[key]["vision_input"][0], Image_Root)) 334 | img_2 = load_image(url2path(data[key]["vision_input"][1], Image_Root)) 335 | img_1 = grding(img_1, instruction, "highlight") 336 | img_2 = grding(img_2, instruction, "highlight") 337 | img_links = [f"data:image/jpeg;base64,{encode_pil_image(img_1)}", f"data:image/jpeg;base64,{encode_pil_image(img_2)}"] 338 | if tool == "MaskFocus": 339 | img_1, img_2 = imgs_diff(url2path(data[key]["vision_input"][0], Image_Root), url2path(data[key]["vision_input"][1], Image_Root), "highlight") 340 | img_links = [f"data:image/jpeg;base64,{encode_pil_image(img_1)}", f"data:image/jpeg;base64,{encode_pil_image(img_2)}"] 341 | ## image 342 | 343 | json_1 = GPTResponse2JSON(agent_run.get_result(img_links, task_1_eva)) 344 | json_1['score'] = get_number(json_1['score']) 345 | log_prompt(log_path, task_1_eva) 346 | log_prompt(log_path, json_1) 347 | # Task 1 348 | 349 | # Task 2 350 | ## prompt 351 | if data_tool[key]["tool_plan"][1]["tool"] in ["Highlight", "SceneGraph", "MaskFocus"]: 352 | tool = data_tool[key]["tool_plan"][1]["tool"] 353 | else: 354 | tool = "None" 355 | task_2_eva = Mask_Guided_IE_Task_2_evaluation.replace("{instruction}", instruction) 356 | tool_text = get_tool_text(tool, data[key]["vision_input"], log_path) 357 | task_2_eva = task_2_eva.replace("{tool_text}", tool_text) 358 | ## prompt 359 | ## image 360 | img_links = [url2path(data[key]["vision_input"][0], Image_Root), url2path(data[key]["vision_input"][1], Image_Root)] 361 | if tool == 
"Highlight": 362 | img_1 = load_image(url2path(data[key]["vision_input"][0], Image_Root)) 363 | img_2 = load_image(url2path(data[key]["vision_input"][1], Image_Root)) 364 | img_1 = grding(img_1, instruction, "highlight") 365 | img_2 = grding(img_2, instruction, "highlight") 366 | img_links = [f"data:image/jpeg;base64,{encode_pil_image(img_1)}", f"data:image/jpeg;base64,{encode_pil_image(img_2)}"] 367 | if tool == "MaskFocus": 368 | img_1, img_2 = imgs_diff(url2path(data[key]["vision_input"][0], Image_Root), url2path(data[key]["vision_input"][1], Image_Root), "highlight") 369 | img_links = [f"data:image/jpeg;base64,{encode_pil_image(img_1)}", f"data:image/jpeg;base64,{encode_pil_image(img_2)}"] 370 | ## image 371 | 372 | json_2 = GPTResponse2JSON(agent_run.get_result(img_links, task_2_eva)) 373 | json_2['score'] = get_number(json_2['score']) 374 | log_prompt(log_path, task_2_eva) 375 | log_prompt(log_path, json_2) 376 | # Task 2 377 | 378 | out[key] = {} 379 | out[key]['score'] = [json_1['score'], json_2['score']] 380 | out[key]['reasoning'] = [json_1['reasoning'], json_2['reasoning']] 381 | out[key]['prompt_input'] = [task_1_eva, task_2_eva] 382 | out[key]['vision_input'] = data[key]["vision_input"] 383 | print(f"{key} over") 384 | break 385 | except Exception as e: 386 | print(f"Error: {key} evaluation failed: {e}") 387 | counter += 1 388 | 389 | write_json(out_path, out) 390 | 391 | 392 | 393 | def test_Multi_Concept_IC_tool(in_path, out_path, log_path, test=None): 394 | data = read_json(in_path) 395 | out = {} 396 | for key in tqdm(list(data.keys())): 397 | if test: 398 | its = in_path.split("/") 399 | if key not in test[its[-3]][its[-2]]['sample']: 400 | continue 401 | counter = 0 402 | while counter < 3: 403 | try: 404 | text = data[key]["prompt_input"].replace("Text Prompt:", "").strip() 405 | # tool 406 | task_rule = Multi_Concept_IC_Rule.replace("{text}", text) 407 | task = task_rule + Multi_Concept_IC_Task_1 + Multi_Concept_IC_Task_2 408 | prompt_tool = Tool_Decide.replace("{task}", task) 409 | img_links = [url2path(data[key]["vision_input"][0], Image_Root), url2path(data[key]["vision_input"][1], Image_Root)] 410 | 411 | result = GPTResponse2JSON(agent_run.get_result(img_links, prompt_tool)) 412 | result = check(result, ["task_id", "reasoning", "used", "tool"]) 413 | log_prompt(log_path, prompt_tool) 414 | log_prompt(log_path, result) 415 | # tool 416 | 417 | out[key] = {} 418 | out[key]['tool_plan'] = result 419 | print(f"{key} over") 420 | break 421 | except Exception as e: 422 | print(f"Error: {key} evaluation failed: {e}") 423 | counter += 1 424 | 425 | write_json(out_path, out) 426 | 427 | 428 | 429 | def test_Multi_Concept_IC_evaluate(in_path, tool_path, out_path, log_path, test=None): 430 | data = read_json(in_path) 431 | data_tool = read_json(tool_path) 432 | out = {} 433 | for key in tqdm(list(data.keys())): 434 | if test: 435 | its = in_path.split("/") 436 | if key not in test[its[-3]][its[-2]]['sample']: 437 | continue 438 | counter = 0 439 | while counter < 3: 440 | try: 441 | text = data[key]["prompt_input"].replace("Text Prompt:", "").strip() 442 | img_L, img_R = split2part(url2path(data[key]["vision_input"][0], Image_Root)) 443 | img_LR_links = [f"data:image/jpeg;base64,{encode_pil_image(img_L)}", f"data:image/jpeg;base64,{encode_pil_image(img_R)}"] 444 | subject_L, subject_R = data[key]["concepts"][0], data[key]["concepts"][1] 445 | # Task 1 446 | ## prompt 447 | if data_tool[key]["tool_plan"][0]["tool"] in ["Highlight", "SceneGraph"]: 448 | tool = 
data_tool[key]["tool_plan"][0]["tool"] 449 | else: 450 | tool = "None" 451 | task_1_eva = Multi_Concept_IC_Task_1_evaluation.replace("{text}", text) 452 | task_1_eva = task_1_eva.replace("{subject}", subject_L) 453 | tool_text = get_tool_text(tool, [img_LR_links[0], data[key]["vision_input"][1]], log_path) 454 | task_1_eva = task_1_eva.replace("{tool_text}", tool_text) 455 | ## prompt 456 | 457 | img_links = [img_LR_links[0], url2path(data[key]["vision_input"][1], Image_Root)] 458 | if tool == "Highlight": 459 | img_1 = img_L 460 | img_2 = load_image(url2path(data[key]["vision_input"][1], Image_Root)) 461 | img_1 = grding(img_1, subject_L, "highlight") 462 | img_2 = grding(img_2, subject_L, "highlight") 463 | img_links = [f"data:image/jpeg;base64,{encode_pil_image(img_1)}", f"data:image/jpeg;base64,{encode_pil_image(img_2)}"] 464 | ## image 465 | 466 | json_1 = GPTResponse2JSON(agent_run.get_result(img_links, task_1_eva)) 467 | json_1['score'] = get_number(json_1['score']) 468 | log_prompt(log_path, task_1_eva) 469 | log_prompt(log_path, json_1) 470 | # Task 1 471 | 472 | # Task 1 473 | ## prompt 474 | if data_tool[key]["tool_plan"][0]["tool"] in ["Highlight", "SceneGraph"]: 475 | tool = data_tool[key]["tool_plan"][0]["tool"] 476 | else: 477 | tool = "None" 478 | task_1_eva_ = Multi_Concept_IC_Task_1_evaluation.replace("{text}", text) 479 | task_1_eva_ = task_1_eva_.replace("{subject}", subject_R) 480 | tool_text = get_tool_text(tool, [img_LR_links[1], data[key]["vision_input"][1]], log_path) 481 | task_1_eva_ = task_1_eva_.replace("{tool_text}", tool_text) 482 | ## prompt 483 | ## image 484 | img_links = [img_LR_links[1], url2path(data[key]["vision_input"][1], Image_Root)] 485 | if tool == "Highlight": 486 | img_1 = img_R 487 | img_2 = load_image(url2path(data[key]["vision_input"][1], Image_Root)) 488 | img_1 = grding(img_1, subject_R, "highlight") 489 | img_2 = grding(img_2, subject_R, "highlight") 490 | img_links = [f"data:image/jpeg;base64,{encode_pil_image(img_1)}", f"data:image/jpeg;base64,{encode_pil_image(img_2)}"] 491 | ## image 492 | 493 | json_2 = GPTResponse2JSON(agent_run.get_result(img_links, task_1_eva_)) 494 | json_2['score'] = get_number(json_2['score']) 495 | log_prompt(log_path, task_1_eva_) 496 | log_prompt(log_path, json_2) 497 | # Task 1 498 | 499 | # Task 2 500 | ## prompt 501 | if data_tool[key]["tool_plan"][1]["tool"] in ["Highlight", "SceneGraph"]: 502 | tool = data_tool[key]["tool_plan"][1]["tool"] 503 | else: 504 | tool = "None" 505 | task_2_eva = Multi_Concept_IC_Task_2_evaluation.replace("{text}", text) 506 | tool_text = get_tool_text(tool, [data[key]["vision_input"][1]], log_path) 507 | task_2_eva = task_2_eva.replace("{tool_text}", tool_text) 508 | ## prompt 509 | ## image 510 | img_links = [url2path(data[key]["vision_input"][1], Image_Root)] 511 | if tool == "Highlight": 512 | img_2 = load_image(url2path(data[key]["vision_input"][1], Image_Root)) 513 | img_2 = grding(img_2, text, "highlight") 514 | img_links = [f"data:image/jpeg;base64,{encode_pil_image(img_2)}"] 515 | ## image 516 | 517 | json_3 = GPTResponse2JSON(agent_run.get_result(img_links, task_2_eva)) 518 | json_3['score'] = get_number(json_3['score']) 519 | log_prompt(log_path, task_2_eva) 520 | log_prompt(log_path, json_3) 521 | # Task 2 522 | 523 | out[key] = {} 524 | out[key]['score'] = [json_1['score'], json_2['score'], json_3['score']] 525 | out[key]['reasoning'] = [json_1['reasoning'], json_2['reasoning'], json_3['reasoning']] 526 | out[key]['prompt_input'] = [task_1_eva, task_1_eva_, 
task_2_eva] 527 | out[key]['vision_input'] = data[key]["vision_input"] 528 | print(f"{key} over") 529 | break 530 | except Exception as e: 531 | print(f"Error: {key} evaluation failed: {e}") 532 | counter += 1 533 | 534 | write_json(out_path, out) 535 | 536 | 537 | 538 | def test_Text_Guided_IG_tool(in_path, out_path, log_path, test=None): 539 | data = read_json(in_path) 540 | out = {} 541 | for key in tqdm(list(data.keys())): 542 | if test: 543 | its = in_path.split("/") 544 | if key not in test[its[-3]][its[-2]]['sample']: 545 | continue 546 | counter = 0 547 | while counter < 3: 548 | try: 549 | text = data[key]["prompt_input"].replace("Text Prompt:", "").strip() 550 | # tool 551 | task_rule = Text_Guided_IG_Rule.replace("{text}", text) 552 | task = task_rule + Text_Guided_IG_Task_1 553 | prompt_tool = Tool_Decide.replace("{task}", task) 554 | img_links = [url2path(data[key]["vision_input"][0], Image_Root)] 555 | 556 | result = GPTResponse2JSON(agent_run.get_result(img_links, prompt_tool)) 557 | result = check(result, ["task_id", "reasoning", "used", "tool"]) 558 | log_prompt(log_path, prompt_tool) 559 | log_prompt(log_path, result) 560 | # tool 561 | 562 | out[key] = {} 563 | out[key]['tool_plan'] = result 564 | print(f"{key} over") 565 | break 566 | except Exception as e: 567 | print(f"Error: {key} evaluation failed: {e}") 568 | counter += 1 569 | 570 | write_json(out_path, out) 571 | 572 | 573 | 574 | def test_Text_Guided_IG_evaluate(in_path, tool_path, out_path, log_path, test=None): 575 | data = read_json(in_path) 576 | data_tool = read_json(tool_path) 577 | out = {} 578 | for key in tqdm(list(data.keys())): 579 | if test: 580 | its = in_path.split("/") 581 | if key not in test[its[-3]][its[-2]]['sample']: 582 | continue 583 | counter = 0 584 | while counter < 3: 585 | try: 586 | text = data[key]["prompt_input"].replace("Text Prompt:", "").strip() 587 | # Task 1 588 | ## prompt 589 | if data_tool[key]["tool_plan"][0]["tool"] in ["Highlight", "SceneGraph"]: 590 | tool = data_tool[key]["tool_plan"][0]["tool"] 591 | else: 592 | tool = "None" 593 | task_1_eva = Text_Guided_IG_Task_1_evaluation.replace("{text}", text) 594 | tool_text = get_tool_text(tool, data[key]["vision_input"], log_path) 595 | task_1_eva = task_1_eva.replace("{tool_text}", tool_text) 596 | ## prompt 597 | ## image 598 | img_links = [url2path(data[key]["vision_input"][0], Image_Root)] 599 | if tool == "Highlight": 600 | img_1 = load_image(url2path(data[key]["vision_input"][0], Image_Root)) 601 | img_1 = grding(img_1, text, "highlight") 602 | img_links = [f"data:image/jpeg;base64,{encode_pil_image(img_1)}"] 603 | 604 | ## image 605 | 606 | json_1 = GPTResponse2JSON(agent_run.get_result(img_links, task_1_eva)) 607 | json_1['score'] = get_number(json_1['score']) 608 | log_prompt(log_path, task_1_eva) 609 | log_prompt(log_path, json_1) 610 | # Task 1 611 | 612 | out[key] = {} 613 | out[key]['score'] = [json_1['score']] 614 | out[key]['reasoning'] = [json_1['reasoning']] 615 | out[key]['prompt_input'] = [task_1_eva] 616 | out[key]['vision_input'] = data[key]["vision_input"] 617 | print(f"{key} over") 618 | break 619 | except Exception as e: 620 | print(f"Error: {key} evaluation failed: {e}") 621 | counter += 1 622 | 623 | write_json(out_path, out) 624 | 625 | 626 | 627 | def test_Control_Guided_IG_tool(in_path, out_path, log_path, test=None): 628 | data = read_json(in_path) 629 | out = {} 630 | for key in tqdm(list(data.keys())): 631 | if test: 632 | its = in_path.split("/") 633 | if key not in 
test[its[-3]][its[-2]]['sample']: 634 | continue 635 | counter = 0 636 | while counter < 3: 637 | try: 638 | text = data[key]["prompt_input"].replace("Text Prompt:", "").strip() 639 | # tool 640 | task_rule = Control_Guided_IG_Rule.replace("{text}", text) 641 | task = task_rule + Control_Guided_IG_Task_1 + Control_Guided_IG_Task_2 642 | prompt_tool = Tool_Decide.replace("{task}", task) 643 | img_links = [url2path(data[key]["vision_input"][0], Image_Root), url2path(data[key]["vision_input"][1], Image_Root)] 644 | 645 | result = GPTResponse2JSON(agent_run.get_result(img_links, prompt_tool)) 646 | result = check(result, ["task_id", "reasoning", "used", "tool"]) 647 | log_prompt(log_path, prompt_tool) 648 | log_prompt(log_path, result) 649 | # tool 650 | 651 | out[key] = {} 652 | out[key]['tool_plan'] = result 653 | print(f"{key} over") 654 | break 655 | except Exception as e: 656 | print(f"Error: {key} evaluation failed: {e}") 657 | counter += 1 658 | 659 | write_json(out_path, out) 660 | 661 | 662 | 663 | def test_Control_Guided_IG_evaluate(in_path, tool_path, out_path, log_path,test=None): 664 | data = read_json(in_path) 665 | data_tool = read_json(tool_path) 666 | out = {} 667 | for key in tqdm(list(data.keys())): 668 | if test: 669 | its = in_path.split("/") 670 | if key not in test[its[-3]][its[-2]]['sample']: 671 | continue 672 | counter = 0 673 | while counter < 3: 674 | try: 675 | text = data[key]["prompt_input"].replace("Text Prompt:", "").strip() 676 | # Task 1 677 | ## prompt 678 | if data_tool[key]["tool_plan"][0]["tool"] in ["Highlight", "SceneGraph"]: 679 | tool = data_tool[key]["tool_plan"][0]["tool"] 680 | else: 681 | tool = "None" 682 | task_1_eva = Control_Guided_IG_Task_1_evaluation 683 | tool_text = get_tool_text(tool, data[key]["vision_input"], log_path) 684 | task_1_eva = task_1_eva.replace("{tool_text}", tool_text) 685 | ## prompt 686 | ## image 687 | img_links = [url2path(data[key]["vision_input"][0], Image_Root), url2path(data[key]["vision_input"][1], Image_Root)] 688 | if tool == "Highlight": 689 | img_1 = load_image(url2path(data[key]["vision_input"][0], Image_Root)) 690 | img_2 = load_image(url2path(data[key]["vision_input"][1], Image_Root)) 691 | img_1 = grding(img_1, text, "highlight") 692 | img_2 = grding(img_2, text, "highlight") 693 | img_links = [f"data:image/jpeg;base64,{encode_pil_image(img_1)}", f"data:image/jpeg;base64,{encode_pil_image(img_2)}"] 694 | ## image 695 | 696 | json_1 = GPTResponse2JSON(agent_run.get_result(img_links, task_1_eva)) 697 | json_1['score'] = get_number(json_1['score']) 698 | log_prompt(log_path, task_1_eva) 699 | log_prompt(log_path, json_1) 700 | # Task 1 701 | 702 | # Task 2 703 | ## prompt 704 | if data_tool[key]["tool_plan"][1]["tool"] in ["Highlight", "SceneGraph"]: 705 | tool = data_tool[key]["tool_plan"][1]["tool"] 706 | else: 707 | tool = "None" 708 | task_2_eva = Control_Guided_IG_Task_2_evaluation.replace("{text}", text) 709 | tool_text = get_tool_text(tool, [data[key]["vision_input"][1]], log_path) 710 | task_2_eva = task_2_eva.replace("{tool_text}", tool_text) 711 | ## prompt 712 | ## image 713 | img_links = [url2path(data[key]["vision_input"][1], Image_Root)] 714 | if tool == "Highlight": 715 | img_2 = load_image(url2path(data[key]["vision_input"][1], Image_Root)) 716 | img_2 = grding(img_2, text, "highlight") 717 | img_links = [f"data:image/jpeg;base64,{encode_pil_image(img_2)}"] 718 | ## image 719 | 720 | json_2 = GPTResponse2JSON(agent_run.get_result(img_links, task_2_eva)) 721 | json_2['score'] = 
get_number(json_2['score']) 722 | log_prompt(log_path, task_2_eva) 723 | log_prompt(log_path, json_2) 724 | # Task 2 725 | 726 | out[key] = {} 727 | out[key]['score'] = [json_1['score'], json_2['score']] 728 | out[key]['reasoning'] = [json_1['reasoning'], json_2['reasoning']] 729 | out[key]['prompt_input'] = [task_1_eva, task_2_eva] 730 | out[key]['vision_input'] = data[key]["vision_input"] 731 | print(f"{key} over") 732 | break 733 | except Exception as e: 734 | print(f"Error: {key} evaluation failed: {e}") 735 | counter += 1 736 | 737 | write_json(out_path, out) 738 | 739 | 740 | 741 | def test_Subject_Driven_IG_tool(in_path, out_path, log_path, test=None): 742 | data = read_json(in_path) 743 | out = {} 744 | for key in tqdm(list(data.keys())): 745 | if test: 746 | its = in_path.split("/") 747 | if key not in test[its[-3]][its[-2]]['sample']: 748 | continue 749 | counter = 0 750 | while counter < 3: 751 | try: 752 | text = data[key]["prompt_input"].replace("Text Prompt:", "").strip() 753 | # tool 754 | task_rule = Subject_Driven_IG_Rule.replace("{text}", text) 755 | task = task_rule + Subject_Driven_IG_Task_1 + Subject_Driven_IG_Task_2 756 | prompt_tool = Tool_Decide.replace("{task}", task) 757 | img_links = [url2path(data[key]["vision_input"][0], Image_Root), url2path(data[key]["vision_input"][1], Image_Root)] 758 | 759 | result = GPTResponse2JSON(agent_run.get_result(img_links, prompt_tool)) 760 | result = check(result, ["task_id", "reasoning", "used", "tool"]) 761 | log_prompt(log_path, prompt_tool) 762 | log_prompt(log_path, result) 763 | # tool 764 | 765 | out[key] = {} 766 | out[key]['tool_plan'] = result 767 | print(f"{key} over") 768 | break 769 | except Exception as e: 770 | print(f"Error: {key} evaluation failed: {e}") 771 | counter += 1 772 | 773 | write_json(out_path, out) 774 | 775 | 776 | 777 | def test_Subject_Driven_IG_evaluate(in_path, tool_path, out_path, log_path, test=None): 778 | data = read_json(in_path) 779 | data_tool = read_json(tool_path) 780 | out = {} 781 | for key in tqdm(list(data.keys())): 782 | if test: 783 | its = in_path.split("/") 784 | if key not in test[its[-3]][its[-2]]['sample']: 785 | continue 786 | counter = 0 787 | while counter < 3: 788 | try: 789 | text = data[key]["prompt_input"].replace("Text Prompt:", "").strip() 790 | subject = data[key]["subject"] 791 | # Task 1 792 | ## prompt 793 | if data_tool[key]["tool_plan"][0]["tool"] in ["Highlight", "SceneGraph"]: 794 | tool = data_tool[key]["tool_plan"][0]["tool"] 795 | else: 796 | tool = "None" 797 | task_1_eva = Subject_Driven_IG_Task_1_evaluation.replace("{subject}", subject) 798 | tool_text = get_tool_text(tool, data[key]["vision_input"], log_path) 799 | task_1_eva = task_1_eva.replace("{tool_text}", tool_text) 800 | ## prompt 801 | ## image 802 | img_links = [url2path(data[key]["vision_input"][0], Image_Root), url2path(data[key]["vision_input"][1], Image_Root)] 803 | if tool == "Highlight": 804 | img_1 = load_image(url2path(data[key]["vision_input"][0], Image_Root)) 805 | img_2 = load_image(url2path(data[key]["vision_input"][1], Image_Root)) 806 | img_1 = grding(img_1, text, "highlight") 807 | img_2 = grding(img_2, text, "highlight") 808 | img_links = [f"data:image/jpeg;base64,{encode_pil_image(img_1)}", f"data:image/jpeg;base64,{encode_pil_image(img_2)}"] 809 | ## image 810 | 811 | json_1 = GPTResponse2JSON(agent_run.get_result(img_links, task_1_eva)) 812 | json_1['score'] = get_number(json_1['score']) 813 | log_prompt(log_path, task_1_eva) 814 | log_prompt(log_path, json_1) 815 | # Task 1 
816 | 817 | # Task 2 818 | ## prompt 819 | if data_tool[key]["tool_plan"][1]["tool"] in ["Highlight", "SceneGraph"]: 820 | tool = data_tool[key]["tool_plan"][1]["tool"] 821 | else: 822 | tool = "None" 823 | task_2_eva = Subject_Driven_IG_Task_2_evaluation.replace("{text}", text) 824 | tool_text = get_tool_text(tool, [data[key]["vision_input"][1]], log_path) 825 | task_2_eva = task_2_eva.replace("{tool_text}", tool_text) 826 | ## prompt 827 | ## image 828 | img_links = [url2path(data[key]["vision_input"][1], Image_Root)] 829 | if tool == "Highlight": 830 | img_2 = load_image(url2path(data[key]["vision_input"][1], Image_Root)) 831 | img_2 = grding(img_2, text, "highlight") 832 | img_links = [f"data:image/jpeg;base64,{encode_pil_image(img_2)}"] 833 | ## image 834 | 835 | json_2 = GPTResponse2JSON(agent_run.get_result(img_links, task_2_eva)) 836 | json_2['score'] = get_number(json_2['score']) 837 | log_prompt(log_path, task_2_eva) 838 | log_prompt(log_path, json_2) 839 | # Task 2 840 | 841 | out[key] = {} 842 | out[key]['score'] = [json_1['score'], json_2['score']] 843 | out[key]['reasoning'] = [json_1['reasoning'], json_2['reasoning']] 844 | out[key]['prompt_input'] = [task_1_eva, task_2_eva] 845 | out[key]['vision_input'] = data[key]["vision_input"] 846 | print(f"{key} over") 847 | break 848 | except Exception as e: 849 | print(f"Error: {key} evaluation failed: {e}") 850 | counter += 1 851 | 852 | write_json(out_path, out) 853 | 854 | 855 | 856 | 857 | 858 | def evaluate(in_path, tool_out_path, tool_log_path, eva_out_path, eva_log_path, test): 859 | if "ImagenHub_Control-Guided_IG" in in_path: 860 | test_Control_Guided_IG_tool(in_path, tool_out_path, tool_log_path, test) 861 | test_Control_Guided_IG_evaluate(in_path, tool_out_path, eva_out_path, eva_log_path, test) 862 | if "ImagenHub_Mask-Guided_IE" in in_path: 863 | test_Mask_Guided_IE_tool(in_path, tool_out_path, tool_log_path, test) 864 | test_Mask_Guided_IE_evaluate(in_path, tool_out_path, eva_out_path, eva_log_path, test) 865 | if "ImagenHub_Multi-Concept_IC" in in_path: 866 | test_Multi_Concept_IC_tool(in_path, tool_out_path, tool_log_path, test) 867 | test_Multi_Concept_IC_evaluate(in_path, tool_out_path, eva_out_path, eva_log_path, test) 868 | if "ImagenHub_Subject-Driven_IE" in in_path: 869 | test_Subject_Driven_IE_tool(in_path, tool_out_path, tool_log_path, test) 870 | test_Subject_Driven_IE_evaluate(in_path, tool_out_path, eva_out_path, eva_log_path, test) 871 | if "ImagenHub_Subject-Driven_IG" in in_path: 872 | test_Subject_Driven_IG_tool(in_path, tool_out_path, tool_log_path, test) 873 | test_Subject_Driven_IG_evaluate(in_path, tool_out_path, eva_out_path, eva_log_path, test) 874 | if "ImagenHub_Text-Guided_IE" in in_path: 875 | test_Text_Guided_IE_tool(in_path, tool_out_path, tool_log_path, test) 876 | test_Text_Guided_IE_evaluate(in_path, tool_out_path, eva_out_path, eva_log_path, test) 877 | if "ImagenHub_Text-Guided_IG" in in_path: 878 | test_Text_Guided_IG_tool(in_path, tool_out_path, tool_log_path, test) 879 | test_Text_Guided_IG_evaluate(in_path, tool_out_path, eva_out_path, eva_log_path, test) 880 | 881 | 882 | 883 | import os 884 | agent_run = LlavaNext() 885 | Image_task_path = "PATH_TO_ImageHub_DATA" 886 | test = read_json("test_40p.json") 887 | # test=None 888 | for task in os.listdir(Image_task_path): 889 | task_path = os.path.join(Image_task_path, task) 890 | models = os.listdir(task_path) 891 | print(models) 892 | for dir in models: 893 | if dir!="input" and dir!="token": 894 | tool_log_path = 
f"{task_path}_{dir}_tool_test_40p.txt" 895 | tool_out_path = f"{task_path}/{dir}/SC_tool_test_40p.json" 896 | eva_log_path = f"{task_path}_{dir}_eva_test_40p.txt" 897 | eva_out_path = f"{task_path}/{dir}/SC_eva_test_40p.json" 898 | in_path = f"{task_path}/{dir}/in.json" 899 | evaluate(in_path, tool_out_path, tool_log_path, eva_out_path, eva_log_path, test) 900 | 901 | --------------------------------------------------------------------------------