├── __init__.py ├── tools ├── __init__.py ├── ocr.py ├── scene_graph.py ├── com.py ├── diff.py └── grding.py ├── assets ├── case.png ├── table-1.png ├── table-2.png ├── framework.png ├── overview.png └── table-40p.png ├── qwen.py ├── spear.py ├── README.md ├── agent ├── openai.py ├── prompt_vlm.py └── prompt_agent.py ├── environment.yml ├── util.py ├── evaluate_vie.py ├── evaluate.py └── run_test_40p.py /__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tools/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /assets/case.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HITsz-TMG/Agentic-CIGEval/HEAD/assets/case.png -------------------------------------------------------------------------------- /assets/table-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HITsz-TMG/Agentic-CIGEval/HEAD/assets/table-1.png -------------------------------------------------------------------------------- /assets/table-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HITsz-TMG/Agentic-CIGEval/HEAD/assets/table-2.png -------------------------------------------------------------------------------- /assets/framework.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HITsz-TMG/Agentic-CIGEval/HEAD/assets/framework.png -------------------------------------------------------------------------------- /assets/overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HITsz-TMG/Agentic-CIGEval/HEAD/assets/overview.png -------------------------------------------------------------------------------- /assets/table-40p.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HITsz-TMG/Agentic-CIGEval/HEAD/assets/table-40p.png -------------------------------------------------------------------------------- /tools/ocr.py: -------------------------------------------------------------------------------- 1 | from paddleocr import PaddleOCR,draw_ocr 2 | ocr = PaddleOCR(lang='en') # need to run only once to download and load model into memory 3 | 4 | def OCR(image): 5 | result = ocr.ocr(image, cls=False) 6 | texts = [] 7 | for idx in range(len(result)): 8 | res = result[idx] 9 | for line in res: 10 | texts.append(line[1][0]) 11 | return texts 12 | 13 | -------------------------------------------------------------------------------- /tools/scene_graph.py: -------------------------------------------------------------------------------- 1 | from agent.openai import GPT4o,QWen25 2 | from util import clean_text 3 | 4 | sgPrompt=''' 5 | For the provided image, generate only a scene graph in JSON format that includes the following: 6 | 1. Objects that are relevant to describing the image content in detail 7 | 2. Object attributes that are relevant to describing the image content in detail 8 | 3. 
Object relationships that are relevant to describing the image content in detail 9 | ''' 10 | 11 | # SG_generator = QWen25() 12 | SG_generator = GPT4o() 13 | 14 | def sg_generate(img_links): 15 | prompt_content = SG_generator.prepare_prompt(img_links, sgPrompt) 16 | sg = SG_generator.get_result(prompt_content) 17 | print(sg) 18 | return clean_text(sg) -------------------------------------------------------------------------------- /tools/com.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from PIL import Image, ImageDraw 3 | 4 | 5 | def mark(image, bbox_list): 6 | if isinstance(image, str): 7 | image = Image.open(image) 8 | image_copy = image.copy() 9 | draw = ImageDraw.Draw(image_copy) 10 | for bbox in bbox_list: 11 | draw.rectangle(bbox, outline='red', width=3) 12 | return image_copy 13 | 14 | 15 | def highlight(image, bbox_list): 16 | if isinstance(image, str): 17 | image = Image.open(image) 18 | np_img = np.array(image) 19 | np_ori = np_img.copy() 20 | if len(bbox_list)>0: 21 | np_img //= 4 22 | for bbox in bbox_list: 23 | np_img[bbox[1]:bbox[3], bbox[0]:bbox[2]] = np_ori[bbox[1]:bbox[3], bbox[0]:bbox[2]] 24 | image_h = Image.fromarray(np_img) 25 | return image_h 26 | 27 | 28 | def segment(image, bbox): 29 | if isinstance(image, str): 30 | image = Image.open(image) 31 | np_img = np.array(image) 32 | bbox_area = np_img[bbox[1]:bbox[3], bbox[0]:bbox[2]] 33 | image_s = Image.fromarray(bbox_area) 34 | return image_s 35 | 36 | 37 | def split2part(image): 38 | if isinstance(image, str): 39 | img = Image.open(image) 40 | else: 41 | img = image 42 | w = img.width 43 | h = img.height 44 | box_L = (0,0,w*0.5,h) 45 | box_R = (w*0.5,0,w,h) 46 | img_L = img.crop(box_L) 47 | img_R = img.crop(box_R) 48 | return img_L,img_R -------------------------------------------------------------------------------- /qwen.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | os.environ["CUDA_VISIBLE_DEVICES"] = "6,5" 4 | from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor 5 | from modelscope import Qwen2VLForConditionalGeneration 6 | from peft import PeftModel 7 | from qwen_vl_utils import process_vision_info 8 | import torch 9 | import flask 10 | from flask import Flask, request, jsonify 11 | 12 | # torch.cuda.empty_cache() 13 | app = Flask(__name__) 14 | model_dir = "path_to/Qwen2-VL-7B-Instruct" 15 | # model_dir = "path_to/Qwen2.5-VL-7B-Instruct" 16 | # model_dir = "path_to/Qwen2.5-VL-7B-Instruct-sft" 17 | # model_dir = "path_to/Qwen2-VL-7B-Instruct-sft" 18 | # lora_path = "" 19 | 20 | 21 | 22 | 23 | model = Qwen2VLForConditionalGeneration.from_pretrained( 24 | model_dir, 25 | torch_dtype=torch.bfloat16, 26 | attn_implementation="flash_attention_2", 27 | device_map="auto", 28 | ) 29 | 30 | 31 | # model = Qwen2_5_VLForConditionalGeneration.from_pretrained( 32 | # model_dir, 33 | # torch_dtype=torch.bfloat16, 34 | # attn_implementation="flash_attention_2", 35 | # device_map="auto", 36 | # ) 37 | # if lora_path: 38 | # model = PeftModel.from_pretrained(model, lora_path, torch_dtype=torch.bfloat16, device_map="auto") 39 | 40 | model.eval() 41 | processor = AutoProcessor.from_pretrained(model_dir) 42 | 43 | 44 | def format(images, text): 45 | content = [] 46 | for img in images: 47 | content.append({"type": "image", "image": img}) 48 | content.append({"type": "text", "text": text}) 49 | messages = [ 50 | { 51 | "role": "user", 52 | "content": content, 53 | } 54 | 
] 55 | return messages 56 | 57 | 58 | @app.route('/generate', methods=['POST']) 59 | def generate(): 60 | msg = flask.request.get_json(force=True) 61 | imgs = msg['imgs'] 62 | text = msg['text'] 63 | messages = format(imgs, text) 64 | ######## 65 | text = processor.apply_chat_template( 66 | messages, tokenize=False, add_generation_prompt=True,add_vision_id=True 67 | ) 68 | image_inputs, video_inputs = process_vision_info(messages) 69 | inputs = processor( 70 | text=[text], 71 | images=image_inputs, 72 | videos=video_inputs, 73 | padding=True, 74 | return_tensors="pt", 75 | ) 76 | inputs = inputs.to(model.device) 77 | 78 | # Inference: Generation of the output 79 | with torch.no_grad(): 80 | generated_ids = model.generate(**inputs, max_new_tokens=1024) 81 | generated_ids_trimmed = [ 82 | out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids) 83 | ] 84 | output_text = processor.batch_decode( 85 | generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False 86 | )[0].strip() 87 | 88 | return jsonify({"response": output_text}) 89 | 90 | # except Exception as e: 91 | # return jsonify({"error": str(e)}) 92 | 93 | 94 | 95 | if __name__ == "__main__": 96 | app.run(host='0.0.0.0', port=8080) -------------------------------------------------------------------------------- /tools/diff.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | from PIL import Image 4 | from com import mark, highlight 5 | 6 | threshold = 0.5 7 | top_n = 1 8 | 9 | def detect_diff(src_img, dst_img): 10 | 11 | height, width = dst_img.shape[:2] 12 | total_area = height * width 13 | # Calculate area threshold (0.5% of total image area) 14 | area_threshold = total_area * (threshold / 100) 15 | 16 | src_img = cv2.GaussianBlur(src_img, [5, 5], 0) 17 | dst_img = cv2.GaussianBlur(dst_img, [5, 5], 0) 18 | diff = cv2.absdiff(src_img, dst_img) 19 | gray = cv2.cvtColor(diff, cv2.COLOR_BGR2GRAY) 20 | # Apply thresholding to get a binary image (45) 21 | _, result = cv2.threshold(gray, 45, 255, cv2.THRESH_BINARY) 22 | 23 | result = cv2.dilate(result, np.ones([3, 3])) 24 | contours, _ = cv2.findContours(result, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) 25 | rect_pos = [] 26 | for c in contours: 27 | x, y, w, h = cv2.boundingRect(c) 28 | area = w * h 29 | if area > area_threshold: 30 | rect_pos.append((x, y, w, h)) 31 | 32 | 33 | # Merge overlapping rectangles 34 | merged_rects = [] 35 | while rect_pos: 36 | rect = rect_pos.pop(0) 37 | x, y, w, h = rect 38 | 39 | merge = False 40 | for other_rect in rect_pos[:]: 41 | ox, oy, ow, oh = other_rect 42 | if x < ox + ow and x + w > ox and y < oy + oh and y + h > oy: 43 | x = min(x, ox) 44 | y = min(y, oy) 45 | w = max(x + w, ox + ow) - x 46 | h = max(y + h, oy + oh) - y 47 | 48 | rect_pos.remove(other_rect) 49 | merge = True 50 | 51 | merged_rects.append((x, y, w, h)) 52 | 53 | if not merge: 54 | break 55 | 56 | areas = [] 57 | for C in merged_rects: 58 | x, y, w, h = C 59 | area = w * h 60 | areas.append((x, y, w, h, area)) 61 | areas.sort(key=lambda x: x[4], reverse=True) 62 | 63 | pos = [] 64 | for i in range(min(top_n, len(areas))): 65 | x, y, w, h, _ = areas[i] 66 | pos.append((x, y, x+w, y+h)) 67 | 68 | return pos 69 | 70 | 71 | def imgs_diff(src_img_path, dst_img_path, function="highlight"): 72 | src_img = cv2.imread(src_img_path) 73 | dst_img = cv2.imread(dst_img_path) 74 | if src_img.shape != dst_img.shape: 75 | dst_img = cv2.resize(dst_img, (src_img.shape[1], 
src_img.shape[0])) 76 | 77 | # Get the rectangular coordinates of the difference area 78 | rects = detect_diff(src_img, dst_img) 79 | 80 | if function == "highlight": 81 | src_highlight_img = highlight(src_img_path, rects) 82 | dst_highlight_img = highlight(dst_img_path, rects) 83 | return src_highlight_img, dst_highlight_img 84 | 85 | else: 86 | # Mark the difference areas on the image 87 | src_mark_img = mark(src_img_path, rects) 88 | dst_mark_img = mark(dst_img_path, rects) 89 | return src_mark_img, dst_mark_img -------------------------------------------------------------------------------- /spear.py: -------------------------------------------------------------------------------- 1 | from util import * 2 | 3 | human_ratings = "path_to_ImagenHub_human_eval_results" 4 | 5 | 6 | def read_human_sc(task, model, sample): 7 | import pandas as pd 8 | import numpy as np 9 | import ast 10 | 11 | mm = task.replace("ImagenHub_","") 12 | 13 | df1 = pd.read_csv(os.path.join(human_ratings,f"{task}/{mm}_rater1.tsv"), sep="\t") 14 | df2 = pd.read_csv(os.path.join(human_ratings,f"{task}/{mm}_rater2.tsv"), sep="\t") 15 | df3 = pd.read_csv(os.path.join(human_ratings,f"{task}/{mm}_rater3.tsv"), sep="\t") 16 | 17 | cell_value_1 = df1.loc[df1['uid'] == sample, model].values 18 | cell_value_2 = df2.loc[df2['uid'] == sample, model].values 19 | cell_value_3 = df3.loc[df3['uid'] == sample, model].values 20 | 21 | sc_1 = ast.literal_eval(cell_value_1[0])[0] 22 | sc_2 = ast.literal_eval(cell_value_2[0])[0] 23 | sc_3 = ast.literal_eval(cell_value_3[0])[0] 24 | 25 | 26 | return np.mean([sc_1,sc_2,sc_3]) 27 | # return [sc_1,sc_2,sc_3] 28 | 29 | 30 | def preprocess(_list): 31 | temp_list = [] 32 | for scores in _list: 33 | if isinstance(scores, (int, float)): 34 | temp_list.append(map_to_nearest_higher(scores/10.0)) 35 | else: 36 | scores = [int(score) for score in scores] 37 | # temp_list.append(map_to_nearest_higher(min(scores))) 38 | temp_list.append(map_to_nearest_higher(min(scores)/10.0)) 39 | return temp_list 40 | 41 | 42 | def sigfig(number, sigfigs=4, digit_mode=True): 43 | if digit_mode: 44 | string_mode = '{:#.{sigfigs}f}' 45 | else: 46 | string_mode = '{:#.{sigfigs}g}' 47 | if isinstance(number, list): 48 | new_numbers = [] 49 | for num in number: 50 | new_num = string_mode.format(num, sigfigs=sigfigs) 51 | new_numbers.append(float(new_num)) 52 | return new_numbers 53 | else: 54 | return float(string_mode.format(number, sigfigs=sigfigs)) 55 | 56 | 57 | def map_to_nearest_higher(number, target_numbers=[0.0, 0.17, 0.33, 0.5, 0.67, 0.83, 1.0], not_mapping=True): 58 | if not_mapping: 59 | if number > 1.0: 60 | return 1.0 61 | if number < 0.0: 62 | return 0.0 63 | return number 64 | 65 | # Find the nearest higher number 66 | for target in target_numbers: 67 | if target >= number: 68 | return target 69 | return target_numbers[-1] # Return the maximum if no higher number is found 70 | 71 | 72 | def average_correlation(z_scores): 73 | import math 74 | # Calculate the average Z score 75 | z_avg = sum(z_scores) / len(z_scores) 76 | 77 | # Convert the average Z score back to a correlation coefficient 78 | r_avg = (math.exp(2 * z_avg) - 1) / (math.exp(2 * z_avg) + 1) 79 | return r_avg 80 | 81 | 82 | 83 | 84 | from scipy.stats import spearmanr 85 | import numpy as np 86 | import ast 87 | task="" 88 | model="" 89 | # Read the identifiers of the evaluation images under each task/model and organize them into a list 90 | keys=[] 91 | # Read the automated evaluation results and organize them into a list 92 | SC_gpt4o = [] 
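# --- Illustrative sketch (editor-added commentary, not part of the original spear.py) --------
# One possible way to populate `keys` and `SC_gpt4o` from the per-model JSON written by
# evaluate.py / evaluate_vie.py (entries shaped like {uid: {"score": ..., ...}}); the file
# path and score parsing below are assumptions -- adapt them to your own output files.
#
#   eva = read_json("path_to_task/model/SC_eva_qwen25_72b_vie.json")
#   keys = list(eva.keys())
#   SC_gpt4o = preprocess([ast.literal_eval(str(eva[k]["score"])) for k in keys])
# ---------------------------------------------------------------------------------------------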
93 | 94 | SC_human = [read_human_sc(task, model, key) for key in keys] 95 | SC_rho, _ = spearmanr(SC_gpt4o, SC_human) 96 | print(task, model, "SC|", sigfig(SC_rho)) 97 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CIGEval 2 |

3 | [📃 Paper](https://arxiv.org/abs/2504.07046) 4 |

5 | In this work, we propose CIGEval, a unified agentic framework for comprehensive evaluation of conditional image generation tasks. CIGEval utilizes large multimodal models (LMMs) as its core, integrating a multi-functional toolbox to enable fine-grained evaluation. Please check out our paper "A Unified Agentic Framework for Evaluating Conditional Image Generation". 6 | 7 | 8 | ## 🌟 Framework 9 | CIGEval adopts a divide-and-conquer scheme for evaluating images generated under multiple conditions. For each sub-question, CIGEval selects the most suitable tool from its toolbox, focusing on the specific aspect of evaluation. Then, the LMM analyzes the tool outputs and assigns scores. 10 |
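As a rough illustration of this loop, the sketch below shows how the pieces could fit together. It is not code from this repository: `cigeval_score` and its control flow are simplified assumptions, and only `prepare_prompt`/`get_result` mirror the actual `GPT4o` interface in `agent/openai.py`; the real pipeline lives in `evaluate.py`, `agent/prompt_agent.py`, and `tools/`.
```
# Minimal sketch, NOT the repository's evaluate.py: hypothetical glue around the GPT4o client.
from agent.openai import GPT4o

def cigeval_score(images, sub_question_prompts, toolbox):
    """images: list of image paths; sub_question_prompts: one evaluation prompt per aspect
    (see agent/prompt_agent.py); toolbox: dict mapping tool names to callables (see tools/)."""
    lmm = GPT4o()
    scores = []
    for sub_task in sub_question_prompts:
        # 1. Ask the LMM whether a tool (Highlight / SceneGraph / MaskFocus) would help here.
        decision = lmm.get_result(lmm.prepare_prompt(images, "Which tool, if any?\n" + sub_task))
        tool = toolbox.get((decision or "").strip())
        # 2. Run the chosen tool to produce focused visual evidence for this sub-question.
        evidence = tool(images) if tool else images
        # 3. The LMM analyzes the evidence and assigns a 0-10 score for this aspect.
        answer = lmm.get_result(lmm.prepare_prompt(evidence, sub_task))
        scores.append(int(answer) if answer and answer.strip().isdigit() else 0)
    return min(scores) / 10.0 if scores else 0.0  # min-aggregation mirrors preprocess() in spear.py
```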

*(Framework figure omitted here; see `assets/framework.png` in the repository.)*
14 | 15 | 16 | 17 | ## ⚡️ Installation 18 | Run the following command to set up the environment. 19 | ``` 20 | conda env create -f environment.yml 21 | ``` 22 | The toolbox contains a grounding tool. Please refer to [GroundingDINO](https://github.com/IDEA-Research/GroundingDINO) to configure its environment and model. In addition, you need to download the evaluation images and human ratings from [ImagenHub](https://tiger-ai-lab.github.io/ImagenHub/). 23 | 24 | 25 | ## ✨ Conditional Image Synthesis Evaluation 26 | Run the following command to use CIGEval to evaluate the 7 conditional image synthesis tasks: 27 | ``` 28 | python evaluate.py 29 | ``` 30 | We also provide the [VIEScore](https://github.com/TIGER-AI-Lab/VIEScore) evaluation method: 31 | ``` 32 | python evaluate_vie.py 33 | ``` 34 | Then you can use `spear.py` to calculate the Spearman correlation between the evaluation results and human ratings. 35 | 36 | ## 📌 SFT 37 |

To empower smaller LMMs as effective evaluators, we perform supervised fine-tuning on 7B models to equip them with agentic capabilities. The ImagenHub data is randomly split into training and test sets at a 6:4 ratio. 38 |

39 |

For the training set, we employ GPT-4o to carry out CIGEval's evaluation process, then keep only the trajectories whose evaluation results differ from human scores by less than 0.3, resulting in 2,274 high-quality trajectories for supervised fine-tuning. Using this structured trajectory data, we fine-tune Qwen2-VL-7B-Instruct and Qwen2.5-VL-7B-Instruct. You can visit trajectories to get the SFT data, and visit Qwen2-VL-7B-Instruct-sft and Qwen2.5-VL-7B-Instruct-sft to get our fine-tuned models. 40 |

41 |

The test set is stored in `test_40p.json`. You can run `run_test_40p.py` to obtain a locally hosted model's results on the test set. 42 |
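When evaluating with a locally hosted model, `qwen.py` exposes it as a small Flask service on port 8080 (the `LlavaNext` client in `agent/openai.py` posts to the same endpoint). Below is a minimal sketch of that round trip; the image path and prompt are placeholders:
```
# Sketch: query the local server started with `python qwen.py` (listens on 0.0.0.0:8080).
import requests

payload = {
    "imgs": ["/path/to/generated_image.png"],  # placeholder path; forwarded to the Qwen processor
    "text": "Describe the edited region.",      # placeholder prompt
}
resp = requests.post("http://127.0.0.1:8080/generate", json=payload, timeout=600)
print(resp.json()["response"])                  # qwen.py returns {"response": <decoded text>}
```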

43 | 44 | ## ⚖️ Paper Results 45 |

*(Result tables omitted here; see the `assets/table-*.png` files in the repository.)*
51 | 52 | 53 | 54 | ## 📚 Citation 55 | 56 | If you found this repository useful, please consider cite our paper: 57 | 58 | ```bibtex 59 | @misc{wang2025cigeval, 60 | title={A Unified Agentic Framework for Evaluating Conditional Image Generation}, 61 | author={Jifang Wang and Xue Yang and Longyue Wang and Zhenran Xu and Yiyu Wang and Yaowei Wang and Weihua Luo and Kaifu Zhang and Baotian Hu and Min Zhang}, 62 | year={2025}, 63 | eprint={2504.07046}, 64 | archivePrefix={arXiv}, 65 | primaryClass={cs.CV}, 66 | url={https://arxiv.org/abs/2504.07046}, 67 | } 68 | ``` 69 | -------------------------------------------------------------------------------- /agent/openai.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import requests 3 | from io import BytesIO 4 | from PIL import Image, ImageOps 5 | import os 6 | import openai 7 | from openai import OpenAI 8 | 9 | 10 | class GPT4o(): 11 | def __init__(self, model_name="gpt-4o-2024-05-13", api_key = "API_KEY", base_url = "BASE_URL"): 12 | self.model_name = model_name 13 | self.client = OpenAI(api_key = api_key, base_url = base_url) 14 | 15 | def prepare_prompt(self, image_links = [], text_prompt = ""): 16 | if not isinstance(image_links, list): 17 | image_links = [image_links] 18 | 19 | prompt_content = [] 20 | text_dict = { 21 | "type": "text", 22 | "text": text_prompt 23 | } 24 | prompt_content.append(text_dict) 25 | 26 | for image_link in image_links: 27 | if "base64" not in image_link: 28 | img = load_image(image_link) 29 | image_link = f"data:image/jpeg;base64,{encode_pil_image(img)}" 30 | visual_dict = { 31 | "type": "image_url", 32 | "image_url": {"url": image_link} 33 | } 34 | prompt_content.append(visual_dict) 35 | return prompt_content 36 | 37 | 38 | def get_result(self, prompt): 39 | try: 40 | response = self.client.chat.completions.create( 41 | model = self.model_name, 42 | messages = [ 43 | { 44 | "role": "user", 45 | "content": prompt 46 | } 47 | ] 48 | ) 49 | 50 | out = response.choices[0].message.content 51 | return out 52 | except Exception as e: 53 | print(f"Error: {e}") 54 | return None 55 | 56 | 57 | 58 | 59 | 60 | class QWen25(GPT4o): 61 | def __init__(self, model_name="qwen2.5-vl-72b-instruct", api_key = "API_KEY", base_url = "BASE_URL"): 62 | super().__init__(model_name, api_key, base_url) 63 | 64 | 65 | 66 | 67 | 68 | class LlavaNext(GPT4o): 69 | def __init__(self): 70 | self.address = "http://127.0.0.1:8080/generate" 71 | 72 | def get_result(self, images, text): 73 | if not isinstance(images, list): 74 | images = [images] 75 | data = {'imgs':images, 'text':text} 76 | try: 77 | response = requests.post(self.address, json = data).json() 78 | out = response['response'] 79 | return out 80 | 81 | except Exception as e: 82 | print(f"Error: {e}") 83 | return None 84 | 85 | 86 | 87 | ############################################################### Image Processing Functions 88 | ############################################################### 89 | def load_image(image, format = "RGB"): 90 | if isinstance(image, str): 91 | if image.startswith("http://") or image.startswith("https://"): 92 | image = Image.open(requests.get(image, stream=True).raw) 93 | elif os.path.isfile(image): 94 | image = Image.open(image) 95 | else: 96 | raise ValueError( 97 | f"{image} is not a valid path or url." 98 | ) 99 | elif isinstance(image, Image.Image): 100 | image = image 101 | else: 102 | raise ValueError( 103 | "Incorrect format used for image. 
Should be an url linking to an image, a local path, or a PIL image." 104 | ) 105 | image = ImageOps.exif_transpose(image) 106 | image = image.convert(format) 107 | return image 108 | 109 | 110 | def encode_pil_image(pil_image, format="JPEG"): 111 | image_stream = BytesIO() 112 | pil_image.save(image_stream, format=format) 113 | image_data = image_stream.getvalue() 114 | base64_image = base64.b64encode(image_data).decode('utf-8') 115 | return base64_image 116 | 117 | ############################################################### Image Processing Functions 118 | ############################################################### -------------------------------------------------------------------------------- /tools/grding.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import re 4 | import torch 5 | from collections import defaultdict 6 | from groundingdino.util.inference import load_model, load_image, predict 7 | from torchvision.ops import box_convert 8 | from PIL import Image, ImageDraw 9 | 10 | 11 | os.environ["CUDA_VISIBLE_DEVICES"] = "6" 12 | 13 | class Grounding_Module(): 14 | def __init__(self, base_dir): 15 | self.model = load_model( 16 | os.path.join(base_dir, "groundingdino/config/GroundingDINO_SwinT_OGC.py"), 17 | os.path.join(base_dir, "weights/groundingdino_swint_ogc.pth") 18 | ) 19 | 20 | def forward(self, img, prompt, bbox_thrd, text_thrd, do_clean=True): 21 | img_source, img = load_image(image_path=img) 22 | w, h = img_source.shape[1], img_source.shape[0] 23 | boxes, logits, phrases = predict( 24 | model=self.model, 25 | image=img, 26 | caption=prompt, 27 | box_threshold=bbox_thrd, 28 | text_threshold=text_thrd, 29 | ) 30 | boxes = boxes * torch.Tensor([w, h, w, h]) 31 | boxes = box_convert(boxes=boxes, in_fmt="cxcywh", out_fmt="xyxy").numpy() 32 | boxes = list(boxes) 33 | logits = logits.numpy() 34 | logits = list(logits) 35 | res = [] 36 | for bbox, logit, phrase in zip(boxes, logits, phrases): 37 | res.append((list([int(xy) for xy in bbox]), logit, phrase)) 38 | if do_clean: 39 | res = self._clean_bbox(res) 40 | return sorted(res, key=lambda x: x[1], reverse=True) 41 | 42 | def _clean_bbox(self, bbox_list): 43 | def get_range(bbox): 44 | return (bbox[2]-bbox[0]) * (bbox[3]-bbox[1]) 45 | def check_recap(bbox1, bbox2): 46 | if bbox2[0]bbox1[2] and bbox2[3]>bbox1[3]: 47 | return True 48 | return False 49 | 50 | bbox_list = sorted(bbox_list, key=lambda x: get_range(x[0])) 51 | cleaned_bbox_list = [] 52 | for bbox in bbox_list: 53 | if len(bbox_list) == 0: 54 | cleaned_bbox_list.append(bbox) 55 | continue 56 | 57 | flag = True 58 | for cleaned_bbox in cleaned_bbox_list: 59 | if check_recap(cleaned_bbox[0], bbox[0]): 60 | flag = False 61 | break 62 | if flag: 63 | cleaned_bbox_list.append(bbox) 64 | return cleaned_bbox_list 65 | 66 | 67 | 68 | 69 | 70 | def filter_bboxes_by_max_logit(a): 71 | phrase_dict = defaultdict(lambda: (None, float('-inf'))) 72 | for bbox, logit, phrase in a: 73 | if logit > phrase_dict[phrase][1]: 74 | phrase_dict[phrase] = (bbox, logit) 75 | filtered_list = [bbox for phrase, (bbox, logit) in phrase_dict.items()] 76 | return filtered_list 77 | 78 | 79 | 80 | def tolist(a): 81 | box_list = [] 82 | for bbox, logit, phrase in a: 83 | box_list.append(bbox) 84 | return box_list 85 | 86 | 87 | grounding_module = Grounding_Module("PATH_TO_GroundingDINO") 88 | 89 | 90 | def grding(img_path, text, function): 91 | res = grounding_module.forward(img_path, text, bbox_thrd=0.2, text_thrd=0.2, 
do_clean=True) # (bbox, logit, phrase) 92 | 93 | if len(res) == 0: 94 | print("Grounding: no result") 95 | if isinstance(img_path, str): 96 | img_path = Image.open(image_path).convert("RGB") 97 | if function=="highlight++": 98 | return(img_path, res) 99 | return img_path 100 | else: 101 | if function=="mark": 102 | res = filter_bboxes_by_max_logit(res) 103 | return(mark(img_path, res)) 104 | if function=="highlight": 105 | res = filter_bboxes_by_max_logit(res) 106 | return(highlight(img_path, res)) 107 | if function=="segment": 108 | return(segment(img_path, res[-1][0])) 109 | if function=="highlight++": 110 | res = filter_bboxes_by_max_logit(res) 111 | return(highlight(img_path, res), res) 112 | 113 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: pige 2 | channels: 3 | - http://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/msys2 4 | - http://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/pro 5 | - http://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/r 6 | - http://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free 7 | - http://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/main 8 | - defaults 9 | dependencies: 10 | - _libgcc_mutex=0.1=main 11 | - _openmp_mutex=5.1=1_gnu 12 | - ca-certificates=2024.11.26=h06a4308_0 13 | - ld_impl_linux-64=2.40=h12ee557_0 14 | - libffi=3.4.4=h6a678d5_1 15 | - libgcc-ng=11.2.0=h1234567_1 16 | - libgomp=11.2.0=h1234567_1 17 | - libstdcxx-ng=11.2.0=h1234567_1 18 | - ncurses=6.4=h6a678d5_0 19 | - openssl=3.0.15=h5eee18b_0 20 | - pip=24.2=py39h06a4308_0 21 | - python=3.9.18=h955ad1f_0 22 | - readline=8.2=h5eee18b_0 23 | - setuptools=75.1.0=py39h06a4308_0 24 | - sqlite=3.45.3=h5eee18b_0 25 | - tk=8.6.14=h39e8969_0 26 | - wheel=0.44.0=py39h06a4308_0 27 | - xz=5.4.6=h5eee18b_1 28 | - zlib=1.2.13=h5eee18b_1 29 | - pip: 30 | - absl-py==2.1.0 31 | - accelerate==1.2.1 32 | - addict==2.4.0 33 | - albucore==0.0.23 34 | - albumentations==2.0.5 35 | - annotated-types==0.7.0 36 | - antlr4-python3-runtime==4.8 37 | - anyio==4.7.0 38 | - anytree==2.12.1 39 | - appdirs==1.4.4 40 | - astor==0.8.1 41 | - asttokens==3.0.0 42 | - av==14.1.0 43 | - beautifulsoup4==4.13.3 44 | - black==21.4b2 45 | - blinker==1.9.0 46 | - boto3==1.35.87 47 | - botocore==1.35.87 48 | - certifi==2024.12.14 49 | - charset-normalizer==3.4.0 50 | - click==8.1.8 51 | - cloudpickle==3.1.0 52 | - cmake==3.31.2 53 | - colorama==0.4.6 54 | - contourpy==1.3.0 55 | - cycler==0.12.1 56 | - cython==3.0.11 57 | - dataclasses==0.6 58 | - decorator==5.2.1 59 | - decord==0.6.0 60 | - deepspeed==0.7.0 61 | - defusedxml==0.7.1 62 | - distro==1.9.0 63 | - einops==0.4.1 64 | - eval-type-backport==0.2.2 65 | - exceptiongroup==1.2.2 66 | - executing==2.1.0 67 | - fasttext==0.9.3 68 | - filelock==3.16.1 69 | - fire==0.7.0 70 | - flash-attn==2.7.3 71 | - flask==3.1.0 72 | - fonttools==4.55.3 73 | - fsspec==2024.10.0 74 | - ftfy==6.3.1 75 | - future==1.0.0 76 | - fvcore==0.1.5.post20221221 77 | - grpcio==1.68.1 78 | - h11==0.14.0 79 | - hjson==3.1.0 80 | - httpcore==1.0.7 81 | - httpx==0.28.1 82 | - huggingface-hub==0.27.0 83 | - hydra-core==1.1.2 84 | - icecream==2.1.3 85 | - idna==3.10 86 | - imageio==2.36.1 87 | - importlib-metadata==8.5.0 88 | - importlib-resources==6.4.5 89 | - inflect==7.4.0 90 | - iopath==0.1.9 91 | - itsdangerous==2.2.0 92 | - jinja2==3.1.4 93 | - jiter==0.8.2 94 | - jmespath==1.0.1 95 | - joblib==1.4.2 96 | - kiwisolver==1.4.7 97 | - lazy-loader==0.4 98 | - levenshtein==0.26.1 
99 | - lit==18.1.8 100 | - lmdb==1.6.2 101 | - lvis==0.5.3 102 | - lxml==5.3.1 103 | - markdown==3.7 104 | - markupsafe==3.0.2 105 | - matplotlib==3.9.4 106 | - modelscope==1.22.3 107 | - more-itertools==10.5.0 108 | - mpmath==1.3.0 109 | - mss==10.0.0 110 | - mypy-extensions==1.0.0 111 | - networkx==3.2.1 112 | - ninja==1.11.1.3 113 | - nltk==3.9.1 114 | - numpy==2.0.2 115 | - nvidia-cublas-cu11==11.10.3.66 116 | - nvidia-cublas-cu12==12.4.5.8 117 | - nvidia-cuda-cupti-cu11==11.7.101 118 | - nvidia-cuda-cupti-cu12==12.4.127 119 | - nvidia-cuda-nvrtc-cu11==11.7.99 120 | - nvidia-cuda-nvrtc-cu12==12.4.127 121 | - nvidia-cuda-runtime-cu11==11.7.99 122 | - nvidia-cuda-runtime-cu12==12.4.127 123 | - nvidia-cudnn-cu11==8.5.0.96 124 | - nvidia-cudnn-cu12==9.1.0.70 125 | - nvidia-cufft-cu11==10.9.0.58 126 | - nvidia-cufft-cu12==11.2.1.3 127 | - nvidia-curand-cu11==10.2.10.91 128 | - nvidia-curand-cu12==10.3.5.147 129 | - nvidia-cusolver-cu11==11.4.0.1 130 | - nvidia-cusolver-cu12==11.6.1.9 131 | - nvidia-cusparse-cu11==11.7.4.91 132 | - nvidia-cusparse-cu12==12.3.1.170 133 | - nvidia-nccl-cu11==2.14.3 134 | - nvidia-nccl-cu12==2.21.5 135 | - nvidia-nvjitlink-cu12==12.4.127 136 | - nvidia-nvtx-cu11==11.7.91 137 | - nvidia-nvtx-cu12==12.4.127 138 | - omegaconf==2.1.2 139 | - openai==1.58.1 140 | - opencv-contrib-python==4.11.0.86 141 | - opencv-python==4.5.5.64 142 | - opencv-python-headless==4.11.0.86 143 | - opt-einsum==3.3.0 144 | - packaging==24.2 145 | - paddleocr==2.10.0 146 | - paddlepaddle==3.0.0rc1 147 | - pandas==2.2.3 148 | - pathspec==0.12.1 149 | - peft==0.14.0 150 | - pillow==11.0.0 151 | - platformdirs==4.3.6 152 | - portalocker==3.0.0 153 | - protobuf==6.30.1 154 | - psutil==6.1.1 155 | - py-cpuinfo==9.0.0 156 | - pyarrow==19.0.0 157 | - pybind11==2.13.6 158 | - pyclipper==1.3.0.post6 159 | - pycocotools==2.0.8 160 | - pydantic==2.10.3 161 | - pydantic-core==2.27.1 162 | - pydot==3.0.3 163 | - pygments==2.18.0 164 | - pyparsing==3.2.0 165 | - python-dateutil==2.9.0.post0 166 | - python-docx==1.1.2 167 | - pytz==2025.1 168 | - pyyaml==6.0.2 169 | - qwen-vl-utils==0.0.8 170 | - rapidfuzz==3.11.0 171 | - regex==2024.11.6 172 | - requests==2.32.3 173 | - s3transfer==0.10.4 174 | - safetensors==0.4.5 175 | - scikit-image==0.24.0 176 | - scikit-learn==1.6.0 177 | - scipy==1.13.1 178 | - shapely==2.0.7 179 | - simsimd==6.2.1 180 | - six==1.17.0 181 | - sniffio==1.3.1 182 | - soupsieve==2.6 183 | - stringzilla==3.12.3 184 | - supervision==0.25.1 185 | - sympy==1.13.1 186 | - tabulate==0.9.0 187 | - tensorboard==2.18.0 188 | - tensorboard-data-server==0.7.2 189 | - termcolor==2.5.0 190 | - threadpoolctl==3.5.0 191 | - tifffile==2024.8.30 192 | - timm==1.0.12 193 | - tokenizers==0.21.0 194 | - toml==0.10.2 195 | - tomli==2.2.1 196 | - torch==2.0.0 197 | - torchvision==0.15.1 198 | - tqdm==4.67.1 199 | - transformers==4.47.1 200 | - triton==2.0.0 201 | - typeguard==4.4.1 202 | - typing-extensions==4.12.2 203 | - tzdata==2025.1 204 | - urllib3==1.26.20 205 | - wcwidth==0.2.13 206 | - werkzeug==3.1.3 207 | - yacs==0.1.8 208 | - yapf==0.43.0 209 | - zipp==3.21.0 210 | 211 | -------------------------------------------------------------------------------- /util.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import re 4 | from agent.openai import GPT4o 5 | from typing import Union 6 | import base64 7 | import requests 8 | import Levenshtein 9 | from io import BytesIO 10 | from PIL import Image, ImageOps 11 | 12 | gpt = GPT4o() 
13 | 14 | 15 | def url2path(url, root): 16 | its = url.split("/") 17 | return os.path.join(root, its[-3], its[-2], its[-1]) 18 | 19 | 20 | def merge_images(images): 21 | if len(images) == 0: 22 | return None 23 | if len(images) == 1: 24 | return images[0] 25 | widths, heights = zip(*(i.size for i in images)) 26 | average_height = sum(heights) // len(heights) 27 | for i, im in enumerate(images): 28 | # scale in proportion 29 | images[i] = im.resize((int(im.size[0] * average_height / im.size[1]), average_height)) 30 | widths, heights = zip(*(i.size for i in images)) 31 | total_width = sum(widths) 32 | max_height = max(heights) 33 | new_im = Image.new("RGB", (total_width + 10 * (len(images) - 1), max_height)) 34 | x_offset = 0 35 | for i, im in enumerate(images): 36 | if i > 0: 37 | # past a column of 1 pixel starting from x_offset width being black, 8 pixels being white, and 1 pixel being black 38 | new_im.paste(Image.new("RGB", (1, max_height), (0, 0, 0)), (x_offset, 0)) 39 | x_offset += 1 40 | new_im.paste(Image.new("RGB", (8, max_height), (255, 255, 255)), (x_offset, 0)) 41 | x_offset += 8 42 | new_im.paste(Image.new("RGB", (1, max_height), (0, 0, 0)), (x_offset, 0)) 43 | x_offset += 1 44 | new_im.paste(im, (x_offset, 0)) 45 | x_offset += im.size[0] 46 | return new_im 47 | 48 | 49 | # Function to encode a PIL image 50 | def encode_pil_image(pil_image, format="JPEG"): 51 | image_stream = BytesIO() 52 | pil_image.save(image_stream, format=format) 53 | image_data = image_stream.getvalue() 54 | base64_image = base64.b64encode(image_data).decode('utf-8') 55 | return base64_image 56 | 57 | 58 | def load_image(image: Union[str, Image.Image], format = "RGB") -> Image.Image: 59 | if isinstance(image, str): 60 | if image.startswith("http://") or image.startswith("https://"): 61 | image = Image.open(requests.get(image, stream=True).raw) 62 | elif os.path.isfile(image): 63 | image = Image.open(image) 64 | else: 65 | raise ValueError( 66 | f"{image} is not a valid path or url." 67 | ) 68 | elif isinstance(image, Image.Image): 69 | image = image 70 | else: 71 | raise ValueError( 72 | "Incorrect format used for image. Should be an url linking to an image, a local path, or a PIL image." 
73 | ) 74 | image = ImageOps.exif_transpose(image) 75 | image = image.convert(format) 76 | return image 77 | 78 | 79 | def read_json(input_path): 80 | with open(input_path, 'r', encoding='utf-8') as f: 81 | return json.load(f) 82 | 83 | 84 | def write_json(output_path, output_data): 85 | with open(output_path, 'w', encoding='utf-8') as f: 86 | json.dump(output_data, f, ensure_ascii=False) 87 | 88 | 89 | def log_prompt(prompt_log_path, input): 90 | if not isinstance(input, str): 91 | input = toString(input) 92 | with open(prompt_log_path, "a", encoding="utf-8") as log_file: 93 | log_file.write(f"{input}\n") 94 | log_file.write("#######################################################\n") 95 | 96 | 97 | def cretae_new_path(path, filetype): 98 | files = os.listdir(path) 99 | if len(files) == 0: 100 | return os.path.join(path, f'0.{filetype}') 101 | else: 102 | max = 0 103 | for file in files: 104 | max = max if max>=int(os.path.splitext(file)[0]) else int(os.path.splitext(file)[0]) 105 | return os.path.join(path, str(max+1) + f'.{filetype}') 106 | 107 | def find_latest_file(path): 108 | files = os.listdir(path) 109 | max = 0 110 | f = "" 111 | for file in files: 112 | if max < int(os.path.splitext(file)[0]): 113 | max = int(os.path.splitext(file)[0]) 114 | f = file 115 | 116 | return os.path.join(path, f) 117 | 118 | def toString(input): 119 | return json.dumps(input, ensure_ascii=False, separators=(",", ":")) 120 | 121 | 122 | def prompt_format(prompt, params): 123 | text = prompt 124 | for key, value in params.items(): 125 | if isinstance(value, (dict, list)): 126 | value = toString(value) 127 | if isinstance(value, (int, float)): 128 | value = str(value) 129 | text = text.replace(key, value) 130 | return text 131 | 132 | 133 | def calculate_similarity(str1, str2): 134 | distance = Levenshtein.distance(str1.lower(), str2.lower()) 135 | max_len = max(len(str1), len(str2)) 136 | similarity = 1 - distance / max_len 137 | return similarity 138 | 139 | 140 | def return_most_similar(string, string_list): 141 | max_similarity = 0 142 | tgt = 0 143 | for id,item in enumerate(string_list): 144 | current_similarity = calculate_similarity(string, item) 145 | if current_similarity > max_similarity: 146 | max_similarity = current_similarity 147 | tgt = id 148 | 149 | return string_list[tgt] 150 | 151 | 152 | def check(src, keys): 153 | if isinstance(src, list): 154 | dst = [] 155 | for item in src: 156 | dst_item = {} 157 | for item_key in item.keys(): 158 | key = return_most_similar(item_key, keys) 159 | dst_item[key] = item[item_key] 160 | for _key in keys: 161 | if _key not in item.keys(): 162 | dst_item[_key] = "None" 163 | dst.append(dst_item) 164 | else: 165 | dst = {} 166 | for _key in src.keys(): 167 | key = return_most_similar(_key, keys) 168 | dst[key] = src[_key] 169 | 170 | return dst 171 | 172 | def GPTResponse2JSON(response): 173 | json_string = clean_text(response) 174 | # json_string = firstjson(clean_text(response)) 175 | prompt = f"Modify the following string so that it can be correctly parsed by the json.loads() method:\n{json_string}\n\nYou should just return the modified string." 
176 | print(json_string) 177 | if "```json" in json_string: 178 | json_string = json_string.replace("```","") 179 | json_string = json_string.replace("json","") 180 | json_string = json_string.strip() 181 | try: 182 | result = json.loads(json_string) 183 | except: 184 | _prompt = gpt.prepare_prompt(text_prompt=prompt) 185 | result = json.loads(clean_text(gpt.get_result(_prompt))) 186 | 187 | return result 188 | 189 | 190 | def clean_text(text): 191 | # Only keep json content 192 | pattern = r"```json(.*?)```" 193 | match = re.search(pattern, text, re.DOTALL) 194 | if match: 195 | text = match.group(1) 196 | return text 197 | 198 | 199 | def get_number(text): 200 | if isinstance(text, str): 201 | pattern = r'[^0-9]' 202 | text = re.sub(pattern, '', text) 203 | return int(text) 204 | else: 205 | return text 206 | 207 | 208 | def firstjson(text): 209 | match = re.search(r'\{([^}]*)\}', text) 210 | if match: 211 | return "{" + match.group(1) + "}" 212 | else: 213 | return text 214 | 215 | 216 | def matchkey(json, key): 217 | for k in json.keys(): 218 | if key in k: 219 | return k 220 | return None 221 | -------------------------------------------------------------------------------- /evaluate_vie.py: -------------------------------------------------------------------------------- 1 | from agent.openai import GPT4o,QWen25 2 | from agent.prompt_mm import * 3 | from util import * 4 | from tqdm import tqdm 5 | import os 6 | 7 | 8 | 9 | def test_Text_Guided_IE_evaluate(in_path, out_path): 10 | data = read_json(in_path) 11 | out = {} 12 | for key in tqdm(list(data.keys())): 13 | try: 14 | instruction = data[key]["prompt_input"].replace("Editing instruction:", "").strip() 15 | task = Text_Guided_IE.replace("{instruction}", instruction) 16 | img_links = [url2path(data[key]["vision_input"][0], Image_Root), url2path(data[key]["vision_input"][1], Image_Root)] 17 | json_1 = GPTResponse2JSON(agent_run.get_result(img_links, task)) 18 | 19 | out[key] = {} 20 | out[key]['score'] = json_1['score'] 21 | out[key]['reasoning'] = json_1['reasoning'] 22 | out[key]['prompt_input'] = task 23 | out[key]['vision_input'] = data[key]["vision_input"] 24 | print(f"{key} over") 25 | except Exception as e: 26 | print(f"Error: {key} evaluation failed: {e}") 27 | 28 | write_json(out_path, out) 29 | 30 | 31 | 32 | def test_Subject_Driven_IE_evaluate(in_path, out_path): 33 | data = read_json(in_path) 34 | out = {} 35 | for key in tqdm(list(data.keys())): 36 | try: 37 | subject = data[key]["prompt_input"].replace("Subject:", "").strip() 38 | task = Subject_Driven_IE.replace("{subject}", subject) 39 | im_1 = load_image(url2path(data[key]["vision_input"][0], Image_Root)) 40 | im_2 = load_image(url2path(data[key]["vision_input"][1], Image_Root)) 41 | im = merge_images([im_1, im_2]) 42 | img_links = [f"data:image/jpeg;base64,{encode_pil_image(im)}", url2path(data[key]["vision_input"][2], Image_Root)] 43 | json_1 = GPTResponse2JSON(agent_run.get_result(img_links, task)) 44 | out[key] = {} 45 | out[key]['score'] = json_1['score'] 46 | out[key]['reasoning'] = json_1['reasoning'] 47 | out[key]['prompt_input'] = task 48 | out[key]['vision_input'] = data[key]["vision_input"] 49 | print(f"{key} over") 50 | except Exception as e: 51 | print(f"Error: {key} evaluation failed: {e}") 52 | 53 | write_json(out_path, out) 54 | 55 | 56 | 57 | def test_Mask_Guided_IE_evaluate(in_path, out_path): 58 | data = read_json(in_path) 59 | out = {} 60 | for key in tqdm(list(data.keys())): 61 | try: 62 | instruction = data[key]["prompt_input"].replace("Editing 
instruction:", "").strip() 63 | task = Mask_Guided_IE.replace("{instruction}", instruction) 64 | img_links = [url2path(data[key]["vision_input"][0], Image_Root), url2path(data[key]["vision_input"][1], Image_Root)] 65 | json_1 = GPTResponse2JSON(agent_run.get_result(img_links, task)) 66 | 67 | out[key] = {} 68 | out[key]['score'] = json_1['score'] 69 | out[key]['reasoning'] = json_1['reasoning'] 70 | out[key]['prompt_input'] = task 71 | out[key]['vision_input'] = data[key]["vision_input"] 72 | print(f"{key} over") 73 | except Exception as e: 74 | print(f"Error: {key} evaluation failed: {e}") 75 | 76 | write_json(out_path, out) 77 | 78 | 79 | 80 | def test_Multi_Concept_IC_evaluate(in_path, out_path): 81 | data = read_json(in_path) 82 | out = {} 83 | for key in tqdm(list(data.keys())): 84 | try: 85 | text = data[key]["prompt_input"].replace("Text Prompt:", "").strip() 86 | task = Multi_Concept_IC.replace("{text}", text) 87 | img_links = [url2path(data[key]["vision_input"][0], Image_Root), url2path(data[key]["vision_input"][1], Image_Root)] 88 | json_1 = GPTResponse2JSON(agent_run.get_result(img_links, task)) 89 | out[key] = {} 90 | out[key]['score'] = json_1['score'] 91 | out[key]['reasoning'] = json_1['reasoning'] 92 | out[key]['prompt_input'] = task 93 | out[key]['vision_input'] = data[key]["vision_input"] 94 | print(f"{key} over") 95 | except Exception as e: 96 | print(f"Error: {key} evaluation failed: {e}") 97 | 98 | write_json(out_path, out) 99 | 100 | 101 | 102 | def test_Text_Guided_IG_evaluate(in_path, out_path): 103 | data = read_json(in_path) 104 | out = {} 105 | for key in tqdm(list(data.keys())): 106 | try: 107 | text = data[key]["prompt_input"].replace("Text Prompt:", "").strip() 108 | task = Text_Guided_IG.replace("{text}", text) 109 | img_links = [url2path(data[key]["vision_input"][0], Image_Root)] 110 | json_1 = GPTResponse2JSON(agent_run.get_result(img_links, task)) 111 | 112 | out[key] = {} 113 | out[key]['score'] = json_1['score'] 114 | out[key]['reasoning'] = json_1['reasoning'] 115 | out[key]['prompt_input'] = task 116 | out[key]['vision_input'] = data[key]["vision_input"] 117 | print(f"{key} over") 118 | except Exception as e: 119 | print(f"Error: {key} evaluation failed: {e}") 120 | 121 | write_json(out_path, out) 122 | 123 | 124 | 125 | def test_Control_Guided_IG_evaluate(in_path, out_path): 126 | data = read_json(in_path) 127 | out = {} 128 | for key in tqdm(list(data.keys())): 129 | try: 130 | text = data[key]["prompt_input"].replace("Text Prompt:", "").strip() 131 | task = Control_Guided_IG.replace("{text}", text) 132 | img_links = [url2path(data[key]["vision_input"][0], Image_Root), url2path(data[key]["vision_input"][1], Image_Root)] 133 | 134 | json_2 = GPTResponse2JSON(agent_run.get_result(img_links, task)) 135 | out[key] = {} 136 | out[key]['score'] = json_2['score'] 137 | out[key]['reasoning'] = json_2['reasoning'] 138 | out[key]['prompt_input'] = task 139 | out[key]['vision_input'] = data[key]["vision_input"] 140 | print(f"{key} over") 141 | except Exception as e: 142 | print(f"Error: {key} evaluation failed: {e}") 143 | 144 | write_json(out_path, out) 145 | 146 | 147 | 148 | def test_Subject_Driven_IG_evaluate(in_path, out_path): 149 | data = read_json(in_path) 150 | out = {} 151 | for key in tqdm(list(data.keys())): 152 | try: 153 | text = data[key]["prompt_input"].replace("Text Prompt:", "").strip() 154 | task = Subject_Driven_IG.replace("{text}", text) 155 | img_links = [url2path(data[key]["vision_input"][0], Image_Root), 
url2path(data[key]["vision_input"][1], Image_Root)] 156 | json_1 = GPTResponse2JSON(agent_run.get_result(img_links, task)) 157 | out[key] = {} 158 | out[key]['score'] = json_1['score'] 159 | out[key]['reasoning'] = json_1['reasoning'] 160 | out[key]['prompt_input'] = task 161 | out[key]['vision_input'] = data[key]["vision_input"] 162 | print(f"{key} over") 163 | except Exception as e: 164 | print(f"Error: {key} evaluation failed: {e}") 165 | 166 | write_json(out_path, out) 167 | 168 | 169 | 170 | def evaluate(in_path, eva_out_path): 171 | if "ImagenHub_Control-Guided_IG" in in_path: 172 | test_Control_Guided_IG_evaluate(in_path, eva_out_path) 173 | if "ImagenHub_Mask-Guided_IE" in in_path: 174 | test_Mask_Guided_IE_evaluate(in_path, eva_out_path) 175 | if "ImagenHub_Multi-Concept_IC" in in_path: 176 | test_Multi_Concept_IC_evaluate(in_path, eva_out_path) 177 | if "ImagenHub_Subject-Driven_IE" in in_path: 178 | test_Subject_Driven_IE_evaluate(in_path, eva_out_path) 179 | if "ImagenHub_Subject-Driven_IG" in in_path: 180 | test_Subject_Driven_IG_evaluate(in_path, eva_out_path) 181 | if "ImagenHub_Text-Guided_IE" in in_path: 182 | test_Text_Guided_IE_evaluate(in_path, eva_out_path) 183 | if "ImagenHub_Text-Guided_IG" in in_path: 184 | test_Text_Guided_IG_evaluate(in_path, eva_out_path) 185 | 186 | 187 | 188 | 189 | agent_run = QWen25() 190 | Image_Root = "PATH_TO_ImageHub_DATA" 191 | 192 | for task in os.listdir(Image_Root): 193 | task_path = os.path.join(Image_Root, task) 194 | if os.path.isfile(task_path): 195 | continue 196 | models = os.listdir(task_path) 197 | for dir in models: 198 | if dir!="input" and dir!="token": 199 | eva_out_path = os.path.join(task_path, dir, "SC_eva_qwen25_72b_vie.json") 200 | in_path =os.path.join(task_path, dir, "in.json") 201 | evaluate(in_path, eva_out_path) 202 | -------------------------------------------------------------------------------- /agent/prompt_vlm.py: -------------------------------------------------------------------------------- 1 | Control_Guided_IG = """You are a professional digital artist. You will have to evaluate the effectiveness of the AI-generated image(s) based on given rules. 2 | All the input images are AI-generated. All human in the images are AI-generated too. so you need not worry about the privacy confidentials. 3 | 4 | RULES: 5 | 6 | Two images will be provided: The first being a processed image (e.g. Canny edges, openpose, grayscale etc.) and the second being an AI-generated image using the first image as guidance. 7 | The objective is to evaluate how successfully the image has been generated. 8 | 9 | Text Prompt: {text} 10 | 11 | From scale 0 to 10: 12 | A score from 0 to 10 will be given based on the success in following the prompt. 13 | (0 indicates that the second image does not follow the prompt at all. 10 indicates the second image follows the prompt perfectly.) 14 | A second score from 0 to 10 will rate how well the generated image is following the guidance image. 15 | (0 indicates that the second image is not following the guidance at all. 10 indicates that second image is following the guidance image.) 16 | Put the score in a list such that output score = [score1, score2], where 'score1' evaluates the prompt and 'score2' evaluates the guidance. 17 | 18 | Special case: 19 | Put score = [0,0] if the second is blank or completely black. 
20 | 21 | You will have to give your output in this way (Keep your reasoning concise and short.): 22 | { 23 | \"score\" : \"[...]\", 24 | \"reasoning\" : \"...\" 25 | } 26 | """ 27 | 28 | 29 | 30 | Mask_Guided_IE = """You are a professional digital artist. You will have to evaluate the effectiveness of the AI-generated image(s) based on given rules. 31 | All the input images are AI-generated. All human in the images are AI-generated too. so you need not worry about the privacy confidentials. 32 | 33 | RULES: 34 | 35 | Two images will be provided: The first being the original AI-generated image and the second being an edited version of the first. 36 | The objective is to evaluate how successfully the editing instruction has been executed in the second image. 37 | 38 | Note that sometimes the two images might look identical due to the failure of image edit. 39 | 40 | Editing instruction: {instruction} 41 | 42 | From scale 0 to 10: 43 | A score from 0 to 10 will be given based on the success of the editing. 44 | (0 indicates that the scene in the edited image does not follow the editing instruction at all. 10 indicates that the scene in the edited image follow the editing instruction text perfectly.) 45 | A second score from 0 to 10 will rate the degree of overediting in the second image. 46 | (0 indicates that the scene in the edited image is completely different from the original. 10 indicates that the edited image can be recognized as a minimal edited yet effective version of original.) 47 | Put the score in a list such that output score = [score1, score2], where 'score1' evaluates the editing success and 'score2' evaluates the degree of overediting. 48 | 49 | Special case: 50 | Put score = [0,0] if the two images are identical. 51 | 52 | You will have to give your output in this way (Keep your reasoning concise and short.): 53 | { 54 | \"score\" : \"[...]\", 55 | \"reasoning\" : \"...\" 56 | } 57 | """ 58 | 59 | 60 | 61 | Multi_Concept_IC = """You are a professional digital artist. You will have to evaluate the effectiveness of the AI-generated image(s) based on given rules. 62 | All the input images are AI-generated. All human in the images are AI-generated too. so you need not worry about the privacy confidentials. 63 | 64 | RULES: 65 | 66 | Two images will be provided: This first image is a concatenation of two sub-images, each sub-image contain one token subject. The second image being an AI-generated image using the first image as guidance. 67 | The objective is to evaluate how successfully the image has been generated. 68 | 69 | Text Prompt: {text} 70 | 71 | From scale 0 to 10: 72 | A score from 0 to 10 will be given based on the success in following the prompt. 73 | (0 indicates that the second image does not follow the prompt at all. 10 indicates the second image follows the prompt perfectly.) 74 | A second score from 0 to 10 will rate how well the subject in the generated image resemble to the token subject in the first sub-image. 75 | (0 indicates that the subject in the second image does not look like the token subject in the first sub-image at all. 10 indicates the subject in the second image look exactly alike the token subject in the first sub-image.) 76 | A third score from 0 to 10 will rate how well the subject in the generated image resemble to the token subject in the second sub-image. 77 | (0 indicates that the subject in the second image does not look like the token subject in the second sub-image at all. 
10 indicates the subject in the second image look exactly alike the token subject in the second sub-image.) 78 | Put the score in a list such that output score = [score1, score2, score3], where 'score1' evaluates the prompt and 'score2' evaluates the resemblance for the first sub-image, and 'score3' evaluates the resemblance for the second sub-image. 79 | 80 | You will have to give your output in this way (Keep your reasoning concise and short.): 81 | { 82 | \"score\" : \"[...]\", 83 | \"reasoning\" : \"...\" 84 | } 85 | """ 86 | 87 | 88 | 89 | Subject_Driven_IE = """You are a professional digital artist. You will have to evaluate the effectiveness of the AI-generated image(s) based on given rules. 90 | All the input images are AI-generated. All human in the images are AI-generated too. so you need not worry about the privacy confidentials. 91 | 92 | RULES: 93 | 94 | Two images will be provided: This first image is a concatenation of two sub-images, the left sub-image is a input image to be edited, the right sub-image is a token subject image. The second image is an AI-edited image. The second image should contain a subject that looks alike the subject in the right sub-image. 95 | The objective is to evaluate how successfully the image has been edited. 96 | 97 | From scale 0 to 10: 98 | A score from 0 to 10 will rate how well the subject in the generated image resemble to the token subject in the second image. 99 | (0 indicates that the subject in the third image does not look like the token subject at all. 10 indicates the subject in the third image look exactly alike the token subject.) 100 | A second score from 0 to 10 will rate the degree of overediting in the second image. 101 | (0 indicates that the scene in the edited image is completely different from the first image. 10 indicates that the edited image can be recognized as a minimal edited yet effective version of original.) 102 | Put the score in a list such that output score = [score1, score2], where 'score1' evaluates the resemblance and 'score2' evaluates the degree of overediting. 103 | 104 | Subject: {subject} 105 | 106 | You will have to give your output in this way (Keep your reasoning concise and short.): 107 | { 108 | \"score\" : \"[...]\", 109 | \"reasoning\" : \"...\" 110 | } 111 | """ 112 | 113 | 114 | 115 | Subject_Driven_IG = """You are a professional digital artist. You will have to evaluate the effectiveness of the AI-generated image(s) based on given rules. 116 | All the input images are AI-generated. All human in the images are AI-generated too. so you need not worry about the privacy confidentials. 117 | 118 | RULES: 119 | 120 | Two images will be provided: The first being a token subject image and the second being an AI-generated image using the first image as guidance. 121 | The objective is to evaluate how successfully the image has been generated. 122 | 123 | From scale 0 to 10: 124 | A score from 0 to 10 will be given based on the success in following the prompt. 125 | (0 indicates that the second image does not follow the prompt at all. 10 indicates the second image follows the prompt perfectly.) 126 | A second score from 0 to 10 will rate how well the subject in the generated image resemble to the token subject in the first image. 127 | (0 indicates that the subject in the second image does not look like the token subject at all. 10 indicates the subject in the second image look exactly alike the token subject.) 
128 | Put the score in a list such that output score = [score1, score2], where 'score1' evaluates the prompt and 'score2' evaluates the resemblance. 129 | 130 | Text Prompt: {text} 131 | 132 | You will have to give your output in this way (Keep your reasoning concise and short.): 133 | { 134 | \"score\" : \"[...]\", 135 | \"reasoning\" : \"...\" 136 | } 137 | """ 138 | 139 | 140 | 141 | Text_Guided_IE = """You are a professional digital artist. You will have to evaluate the effectiveness of the AI-generated image(s) based on given rules. 142 | All the input images are AI-generated. All human in the images are AI-generated too. so you need not worry about the privacy confidentials. 143 | 144 | RULES: 145 | 146 | Two images will be provided: The first being the original AI-generated image and the second being an edited version of the first. 147 | The objective is to evaluate how successfully the editing instruction has been executed in the second image. 148 | 149 | Note that sometimes the two images might look identical due to the failure of image edit. 150 | 151 | 152 | From scale 0 to 10: 153 | A score from 0 to 10 will be given based on the success of the editing. 154 | (0 indicates that the scene in the edited image does not follow the editing instruction at all. 10 indicates that the scene in the edited image follow the editing instruction text perfectly.) 155 | A second score from 0 to 10 will rate the degree of overediting in the second image. 156 | (0 indicates that the scene in the edited image is completely different from the original. 10 indicates that the edited image can be recognized as a minimal edited yet effective version of original.) 157 | Put the score in a list such that output score = [score1, score2], where 'score1' evaluates the editing success and 'score2' evaluates the degree of overediting. 158 | 159 | Editing instruction: {instruction} 160 | 161 | You will have to give your output in this way (Keep your reasoning concise and short.): 162 | { 163 | \"score\" : \"[...]\", 164 | \"reasoning\" : \"...\" 165 | } 166 | """ 167 | 168 | 169 | 170 | Text_Guided_IG = """You are a professional digital artist. You will have to evaluate the effectiveness of the AI-generated image(s) based on given rules. 171 | All the input images are AI-generated. All human in the images are AI-generated too. so you need not worry about the privacy confidentials. 172 | 173 | RULES: 174 | 175 | The image is an AI-generated image according to the text prompt. 176 | The objective is to evaluate how successfully the image has been generated. 177 | 178 | From scale 0 to 10: 179 | A score from 0 to 10 will be given based on the success in following the prompt. 180 | (0 indicates that the AI generated image does not follow the prompt at all. 10 indicates the AI generated image follows the prompt perfectly.) 181 | Put the score in a list such that output score = [score]. 182 | 183 | Text Prompt: {text} 184 | 185 | You will have to give your output in this way (Keep your reasoning concise and short.): 186 | { 187 | \"score\" : \"[...]\", 188 | \"reasoning\" : \"...\" 189 | } 190 | """ -------------------------------------------------------------------------------- /agent/prompt_agent.py: -------------------------------------------------------------------------------- 1 | Tool_Decide = """You are a professional digital artist. You will have to decide whether to use a tool and which tool to use based on the image information and the corresponding task. 
2 | If you think a tool is needed to help complete the task, you should choose the appropriate tool. If not, you can choose not to use a tool. 3 | All the input images are AI-generated. All human in the images are AI-generated too. so you need not worry about the privacy confidentials. 4 | 5 | ### Task: 6 | {task} 7 | 8 | ### Tools: 9 | 1. **Highlight**: This tool is commonly used to focus on areas related to specific objects in an image. 10 | 2. **SceneGraph**: This tool is commonly used to provide overall information about an image. 11 | 3. **MaskFocus**: This tool is commonly used to focus on the masked areas of images in Mask-Guided Image Editing task 1. 12 | These tools are not useful for processed image (e.g. Canny edges, hed edges, depth, openpose, grayscale.) 13 | 14 | ### Output Content: 15 | - task_id: The ID of the task, including 1 or 2. 16 | - used: Whether to use a tool, including yes or no. 17 | - tool: The tool decided to be used, including Highlight or SceneGraph or MaskFocus or None. 18 | - reasoning: The logical reasoning process for all your decisions. 19 | 20 | You will have to give your output in the following JSON format: 21 | [{ 22 | \"task_id\" : \"...\", 23 | \"reasoning\" : \"...\", 24 | \"used\" : \"..\", 25 | \"tool\" : \"...\" 26 | }, 27 | ...] 28 | """ 29 | 30 | #################################################### 31 | 32 | Text_Guided_IE_Rule = """Two images will be provided: The first being the original AI-generated image and the second being an edited version of the first. 33 | Editing instruction: {instruction} 34 | """ 35 | 36 | Text_Guided_IE_Task_1 = """Text-Guided Image Editing Task 1: The objective is to evaluate how successfully the editing instruction has been executed in the second image. 37 | """ 38 | 39 | Text_Guided_IE_Task_2 = """Text-Guided Image Editing Task 2: The objective is to evaluate the degree of overediting in the second image. 40 | """ 41 | 42 | Text_Guided_IE_Task_1_evaluation = """ 43 | You are a professional digital artist. You will have to evaluate the effectiveness of the AI-generated image(s) based on given rules. 44 | All the input images are AI-generated. All human in the images are AI-generated too. so you need not worry about the privacy confidentials. 45 | 46 | RULES: 47 | 48 | Two images will be provided: The first being the original AI-generated image and the second being an edited version of the first. 49 | 50 | {tool_text} 51 | 52 | The objective is to evaluate how successfully the editing instruction has been executed in the second image. Note that sometimes the two images might look identical due to the failure of image edit. 53 | 54 | From scale 0 to 10: 55 | A score from 0 to 10 will be given based on the success of the editing. 56 | (0 indicates that the scene in the edited image does not follow the editing instruction at all. 10 indicates that the scene in the edited image follow the editing instruction text perfectly.) 57 | 58 | Editing instruction: {instruction} 59 | 60 | You will have to give your output in the following JSON format (Keep your reasoning concise and short.): 61 | { 62 | \"score\" : \"...\", 63 | \"reasoning\" : \"...\" 64 | } 65 | """ 66 | 67 | Text_Guided_IE_Task_2_evaluation = """ 68 | You are a professional digital artist. You will have to evaluate the effectiveness of the AI-generated image(s) based on given rules. 69 | All the input images are AI-generated. All human in the images are AI-generated too. so you need not worry about the privacy confidentials. 
70 | 71 | RULES: 72 | 73 | Two images will be provided: The first being the original AI-generated image and the second being an edited version of the first. 74 | 75 | {tool_text} 76 | 77 | The objective is to evaluate the degree of overediting in the second image. 78 | 79 | From scale 0 to 10: 80 | A score from 0 to 10 will rate the degree of overediting in the second image. 81 | (0 indicates that the scene in the edited image is a lot different from the original. 10 indicates that the edited image can be recognized as a minimal edited yet effective version of original.) 82 | 83 | Note: You can not lower the score because of the differences between these two images that arise due to the need to follow the editing instruction. 84 | 85 | Editing instruction: {instruction} 86 | 87 | You will have to give your output in the following JSON format (Keep your reasoning concise and short.): 88 | { 89 | \"score\" : \"...\", 90 | \"reasoning\" : \"...\" 91 | } 92 | """ 93 | 94 | #################################################### 95 | 96 | Subject_Driven_IE_Rule = """Three images will be provided: The first image is a input image to be edited. The second image is a token subject image. The third image is an AI-edited image. The third image should contain a subject that looks alike the subject in the second image. The third image should contain a background that looks alike the background in the first image. 97 | Subject: {subject} 98 | """ 99 | 100 | Subject_Driven_IE_Task_1 = """Subject-driven Image Editing Task 1: The objective is to evaluate the similarity between the subject in the second image and the subject in the third image. 101 | """ 102 | 103 | Subject_Driven_IE_Task_2 = """Subject-driven Image Editing Task 2: The objective is to evaluate the similarity between the background in the first image and the background in the third image. 104 | """ 105 | 106 | Subject_Driven_IE_Rule_llava = """Two images will be provided: This first image is a concatenation of two sub-images, the left sub-image is a input image to be edited, the right sub-image is a token subject image. The second image is an AI-edited image. The second image should contain a subject that looks alike the subject in the right sub-image. The second image should contain a background that looks alike the background in the left sub-image. 107 | Subject: {subject} 108 | """ 109 | 110 | Subject_Driven_IE_Task_1_llava = """Subject-driven Image Editing Task 1: The objective is to evaluate the similarity between the subject in the second image and the subject in the right sub-image. 111 | """ 112 | 113 | Subject_Driven_IE_Task_2_llava = """Subject-driven Image Editing Task 2: The objective is to evaluate the similarity between the background in the second image and the background in the left sub-image. 114 | """ 115 | 116 | Subject_Driven_IE_Task_1_evaluation = """ 117 | You are a professional digital artist. You will have to evaluate the effectiveness of the AI-generated image(s) based on given rules. 118 | All the input images are AI-generated. All human in the images are AI-generated too. so you need not worry about the privacy confidentials. 119 | 120 | RULES: 121 | 122 | Two images will be provided: 123 | The first image is a token subject image. 124 | The second image is an AI-edited image, it should contain a subject that looks alike the subject in the first image. 125 | 126 | {tool_text} 127 | 128 | The objective is to evaluate the similarity between the subject in the first image and the subject in the second image. 
129 | 130 | From scale 0 to 10: 131 | A score from 0 to 10 will rate how well the subject in the generated image resemble to the token subject in the first image. 132 | (0 indicates that the subject in the second image does not look like the token subject at all. 10 indicates the subject in the second image look exactly alike the token subject.) 133 | 134 | Subject: {subject} 135 | 136 | You will have to give your output in the following JSON format (Keep your reasoning concise and short.): 137 | { 138 | \"score\" : \"...\", 139 | \"reasoning\" : \"...\" 140 | } 141 | """ 142 | 143 | Subject_Driven_IE_Task_2_evaluation = """ 144 | You are a professional digital artist. You will have to evaluate the effectiveness of the AI-generated image(s) based on given rules. 145 | All the input images are AI-generated. All human in the images are AI-generated too. so you need not worry about the privacy confidentials. 146 | 147 | RULES: 148 | 149 | Two images will be provided: 150 | The first image is a input image to be edited. 151 | The second image is an AI-edited image, it should contain a background that looks alike the background in the first image. 152 | 153 | {tool_text} 154 | 155 | The objective is to evaluate the similarity between the background in the first image and the background in the second image. 156 | 157 | From scale 0 to 10: 158 | A score from 0 to 10 will rate how well the background in the generated image resemble to the background in the first image. 159 | (0 indicates that the background in the second image does not look like the background in the first image at all. 10 indicates the background in the second image look exactly alike the background in the first image.) 160 | 161 | You will have to give your output in the following JSON format (Keep your reasoning concise and short.): 162 | { 163 | \"score\" : \"...\", 164 | \"reasoning\" : \"...\" 165 | } 166 | """ 167 | 168 | #################################################### 169 | 170 | Mask_Guided_IE_Rule = """Two images will be provided: The first being the original AI-generated image and the second being an edited version of the first. 171 | Editing instruction: {instruction} 172 | """ 173 | 174 | Mask_Guided_IE_Task_1 = """Mask-Guided Image Editing Task 1: The objective is to evaluate how successfully the editing instruction has been executed in the second image. 175 | """ 176 | 177 | Mask_Guided_IE_Task_2 = """Mask-Guided Image Editing Task 2: The objective is to evaluate the degree of overediting in the second image. 178 | """ 179 | 180 | Mask_Guided_IE_Task_1_evaluation = """ 181 | You are a professional digital artist. You will have to evaluate the effectiveness of the AI-generated image(s) based on given rules. 182 | All the input images are AI-generated. All human in the images are AI-generated too. so you need not worry about the privacy confidentials. 183 | 184 | RULES: 185 | 186 | Two images will be provided: The first being the original AI-generated image and the second being an edited version of the first. 187 | 188 | {tool_text} 189 | 190 | The objective is to evaluate how successfully the editing instruction has been executed in the second image. Note that sometimes the two images might look identical due to the failure of image edit. 191 | 192 | From scale 0 to 10: 193 | A score from 0 to 10 will be given based on the success of the editing. (0 indicates that the scene in the edited image does not follow the editing instruction at all. 
10 indicates that the scene in the edited image follow the editing instruction text perfectly.) 194 | 195 | Editing instruction: {instruction} 196 | 197 | You will have to give your output in the following JSON format (Keep your reasoning concise and short.): 198 | { 199 | \"score\" : \"...\", 200 | \"reasoning\" : \"...\" 201 | } 202 | """ 203 | 204 | Mask_Guided_IE_Task_2_evaluation = """ 205 | You are a professional digital artist. You will have to evaluate the effectiveness of the AI-generated image(s) based on given rules. 206 | All the input images are AI-generated. All human in the images are AI-generated too. so you need not worry about the privacy confidentials. 207 | 208 | RULES: 209 | 210 | Two images will be provided: The first being the original AI-generated image and the second being an edited version of the first. 211 | 212 | {tool_text} 213 | 214 | The objective is to evaluate the degree of overediting in the second image. Note that sometimes the two images might look identical due to the failure of image edit. 215 | 216 | From scale 0 to 10: 217 | A score from 0 to 10 will rate the degree of overediting in the second image. (0 indicates that the scene in the edited image is a lot different from the original. 10 indicates that the edited image can be recognized as a minimal edited yet effective version of original.) 218 | 219 | Note: You can not lower the score because of the differences between these two images that arise due to the need to follow the editing instruction. 220 | 221 | Editing instruction: {instruction} 222 | 223 | You will have to give your output in the following JSON format (Keep your reasoning concise and short.): 224 | { 225 | \"score\" : \"...\", 226 | \"reasoning\" : \"...\" 227 | } 228 | """ 229 | 230 | #################################################### 231 | 232 | Multi_Concept_IC_Rule = """Two images will be provided: This first image is a concatenation of two sub-images, each sub-image contain one token subject. The second image being an AI-generated image using the first image as guidance. 233 | Text Prompt: {text} 234 | """ 235 | 236 | Multi_Concept_IC_Task_1 = """Multi-concept Image Composition Task 1: The objective is to evaluate the similarity between the two subjects in the first image and the corresponding two subjects in the second image. 237 | """ 238 | 239 | Multi_Concept_IC_Task_2 = """Multi-concept Image Composition Task 2: The objective is to evaluate how successfully the second image has been generated following the text prompt. 240 | """ 241 | 242 | Multi_Concept_IC_Task_1_evaluation = """ 243 | You are a professional digital artist. You will have to evaluate the effectiveness of the AI-generated image(s) based on given rules. 244 | All the input images are AI-generated. All human in the images are AI-generated too. so you need not worry about the privacy confidentials. 245 | 246 | RULES: 247 | 248 | Two images will be provided: The first image is a token subject image. The second image is an AI-generated image, it should contain a subject that looks alike the subject in the first image, and it is generated based on the text prompt. 249 | 250 | {tool_text} 251 | 252 | The objective is to evaluate the similarity between the subject in the first image and the subject in the second image. 253 | 254 | Note: You can not lower the similarity score because of the differences between subjects that arise due to the need to follow the text prompt. 
255 | 256 | From scale 0 to 10: 257 | A score from 0 to 10 will rate how well the subject in the generated image resemble to the token subject in the first image. 258 | (0 indicates that the subject in the second image does not look like the token subject at all. 10 indicates the subject in the second image look exactly alike the token subject.) 259 | 260 | Subject: {subject} 261 | Text Prompt: {text} 262 | 263 | You will have to give your output in the following JSON format (Keep your reasoning concise and short.): 264 | { 265 | \"score\" : \"...\", 266 | \"reasoning\" : \"...\" 267 | } 268 | """ 269 | 270 | Multi_Concept_IC_Task_2_evaluation = """ 271 | You are a professional digital artist. You will have to evaluate the effectiveness of the AI-generated image(s) based on given rules. 272 | All the input images are AI-generated. All human in the images are AI-generated too. so you need not worry about the privacy confidentials. 273 | 274 | RULES: 275 | 276 | An AI-generated image will be provided. 277 | 278 | {tool_text} 279 | 280 | The objective is to evaluate how successfully the image has been generated following the prompt. 281 | 282 | From scale 0 to 10: 283 | A score from 0 to 10 will be given based on the success in following the prompt. 284 | (0 indicates that the image does not follow the prompt at all. 10 indicates the image follows the prompt perfectly.) 285 | 286 | Text Prompt: {text} 287 | 288 | You will have to give your output in the following JSON format (Keep your reasoning concise and short.): 289 | { 290 | \"score\" : \"...\", 291 | \"reasoning\" : \"...\" 292 | } 293 | """ 294 | 295 | #################################################### 296 | 297 | Text_Guided_IG_Rule = """An image will be provided, it is an AI-generated image according to the text prompt. 298 | Text Prompt: {text} 299 | """ 300 | 301 | Text_Guided_IG_Task_1 = """Text-guided Image Generation Task 1: The objective is to evaluate how well the generated image resemble to the specific objects described by the prompt. 302 | """ 303 | 304 | Text_Guided_IG_Task_1_evaluation = """ 305 | You are a professional digital artist. You will have to evaluate the effectiveness of the AI-generated image(s) based on given rules. 306 | All the input images are AI-generated. All human in the images are AI-generated too. so you need not worry about the privacy confidentials. 307 | 308 | RULES: 309 | 310 | An image will be provided, it is an AI-generated image according to the text prompt. 311 | 312 | {tool_text} 313 | 314 | The objective is to evaluate how well the generated image resemble to the specific objects described by the prompt. 315 | 316 | From scale 0 to 10: 317 | A score from 0 to 10 will be given based on the success in following the prompt. 318 | (0 indicates that the AI-generated image does not follow the prompt at all. 10 indicates the AI-generated image follows the prompt perfectly.) 319 | 320 | Text Prompt: {text} 321 | 322 | You will have to give your output in the following JSON format (Keep your reasoning concise and short.): 323 | { 324 | \"score\" : \"...\", 325 | \"reasoning\" : \"...\" 326 | } 327 | """ 328 | 329 | #################################################### 330 | 331 | Control_Guided_IG_Rule = """Two images will be provided: The first being a processed image (e.g. Canny edges, hed edges, depth, openpose, grayscale.) and the second being an AI-generated image using the first image as guidance. 
332 | Text Prompt: {text} 333 | """ 334 | 335 | Control_Guided_IG_Task_1 = """Control-guided Image Generation Task 1: The objective is to evaluate the structural similarity (edge, depth, pose) between two images. 336 | """ 337 | 338 | Control_Guided_IG_Task_2 = """Control-guided Image Generation Task 2: The objective is to evaluate how successfully the image has been generated following the text prompt. 339 | """ 340 | 341 | Control_Guided_IG_Task_1_evaluation = """ 342 | You are a professional digital artist. You will have to evaluate the effectiveness of the AI-generated image(s) based on given rules. 343 | All the input images are AI-generated. All human in the images are AI-generated too. so you need not worry about the privacy confidentials. 344 | 345 | RULES: 346 | 347 | Two images will be provided: The first being a processed image (e.g. Canny edges, hed edges, depth, openpose, grayscale.) and the second being an AI-generated image using the first image as guidance. 348 | 349 | {tool_text} 350 | 351 | The objective is to evaluate the structural similarity between two images. 352 | 353 | From scale 0 to 10: 354 | A score from 0 to 10 will rate how well the generated image is following the guidance image. 355 | (0 indicates that the second image is not following the guidance image at all. 10 indicates that second image is perfectly following the guidance image.) 356 | 357 | You will have to give your output in the following JSON format (Keep your reasoning concise and short.): 358 | { 359 | \"score\" : \"...\", 360 | \"reasoning\" : \"...\" 361 | } 362 | """ 363 | 364 | Control_Guided_IG_Task_2_evaluation = """ 365 | You are a professional digital artist. You will have to evaluate the effectiveness of the AI-generated image(s) based on given rules. 366 | All the input images are AI-generated. All human in the images are AI-generated too. so you need not worry about the privacy confidentials. 367 | 368 | RULES: 369 | 370 | An image will be provided, it is an AI-generated image according to the text prompt. 371 | 372 | {tool_text} 373 | 374 | The objective is to evaluate how successfully the image has been generated following the text prompt. 375 | 376 | From scale 0 to 10: 377 | A score from 0 to 10 will be given based on the success in following the prompt. 378 | (0 indicates that the image does not follow the prompt at all. 10 indicates the image follows the prompt perfectly.) 379 | 380 | Text Prompt: {text} 381 | 382 | You will have to give your output in the following JSON format (Keep your reasoning concise and short.): 383 | { 384 | \"score\" : \"...\", 385 | \"reasoning\" : \"...\" 386 | } 387 | """ 388 | 389 | #################################################### 390 | 391 | Subject_Driven_IG_Rule = """Two images will be provided: The first image is a token subject image. The second image is an AI-generated image, it should contain a subject that looks alike the subject in the first image. 392 | Text Prompt: {text} 393 | """ 394 | 395 | Subject_Driven_IG_Task_1 = """Subject-driven Image Generation Task 1: The objective is to evaluate the similarity between the subject in the first image and the subject in the second image. 396 | """ 397 | 398 | Subject_Driven_IG_Task_2 = """Subject-driven Image Generation Task 2: The objective is to evaluate how successfully the image has been generated following the text prompt. 399 | """ 400 | 401 | Subject_Driven_IG_Task_1_evaluation = """ 402 | You are a professional digital artist. 
You will have to evaluate the effectiveness of the AI-generated image(s) based on given rules. 403 | All the input images are AI-generated. All human in the images are AI-generated too. so you need not worry about the privacy confidentials. 404 | 405 | RULES: 406 | 407 | Two images will be provided: The first image is a token subject image. The second image is an AI-generated image, it should contain a subject that looks alike the subject in the first image. 408 | 409 | {tool_text} 410 | 411 | The objective is to evaluate the similarity between the subject in the first image and the subject in the second image. 412 | 413 | From scale 0 to 10: 414 | A score from 0 to 10 will rate how well the subject in the generated image resemble to the token subject in the first image. 415 | (0 indicates that the subject in the second image does not look like the token subject at all. 10 indicates the subject in the second image look exactly alike the token subject.) 416 | 417 | Subject: {subject} 418 | 419 | You will have to give your output in the following JSON format (Keep your reasoning concise and short.): 420 | { 421 | \"score\" : \"...\", 422 | \"reasoning\" : \"...\" 423 | } 424 | """ 425 | 426 | Subject_Driven_IG_Task_2_evaluation = """ 427 | You are a professional digital artist. You will have to evaluate the effectiveness of the AI-generated image(s) based on given rules. 428 | All the input images are AI-generated. All human in the images are AI-generated too. so you need not worry about the privacy confidentials. 429 | 430 | RULES: 431 | 432 | An image will be provided, it is an AI-generated image according to the text prompt. 433 | 434 | {tool_text} 435 | 436 | The objective is to evaluate how successfully the image has been generated following the text prompt. 437 | 438 | From scale 0 to 10: 439 | A score from 0 to 10 will be given based on the success in following the prompt. 440 | (0 indicates that the image does not follow the prompt at all. 10 indicates the image follows the prompt perfectly.) 441 | 442 | Text Prompt: {text} 443 | 444 | You will have to give your output in the following JSON format (Keep your reasoning concise and short.): 445 | { 446 | \"score\" : \"...\", 447 | \"reasoning\" : \"...\" 448 | } 449 | """ -------------------------------------------------------------------------------- /evaluate.py: -------------------------------------------------------------------------------- 1 | from agent.openai import GPT4o,QWen25 2 | from agent.prompt_agent import * 3 | from util import * 4 | from tools.scene_graph import sg_generate 5 | from tools.grding import grding 6 | from tools.diff import imgs_diff 7 | from tools.com import split2part 8 | from tqdm import tqdm 9 | import os 10 | 11 | 12 | def get_tool_text(tool, img_links, log_path): 13 | if tool=="None": 14 | return "" 15 | if tool=="Highlight" or tool=="MaskFocus": 16 | return "Focus on the highlighted parts of the image." 
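# Illustrative sketch, not the repository's actual implementation:
# `prompt_format` is imported from util.py via `from util import *` and its source is not
# shown in this dump. Judging from the SceneGraph branch just below, it presumably performs
# a plain substring substitution of placeholder keys; a minimal, hypothetical equivalent is
# sketched here for reference only.
def prompt_format_sketch(template, params):
    """Replace every placeholder key (e.g. "{scene_graph_1}") in `template` with its value."""
    for placeholder, value in params.items():
        template = template.replace(placeholder, value)
    return template
# Example: prompt_format_sketch("SG:\n{scene_graph}", {"{scene_graph}": '{"objects": []}'})
# returns 'SG:\n{"objects": []}'.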
17 | if tool=="SceneGraph": 18 | text = "" 19 | if len(img_links) == 2: 20 | text = """Two scene graphs in JSON format generated from two images will be provided:\nThe first scene graph:\n{scene_graph_1}\nThe second scene graph:\n{scene_graph_2}""" 21 | sg_1 = sg_generate(img_links[0]) 22 | sg_2 = sg_generate(img_links[1]) 23 | params = {"{scene_graph_1}": sg_1, "{scene_graph_2}": sg_2} 24 | text = prompt_format(text, params) 25 | if len(img_links) == 1: 26 | text = """The scene graph in JSON format generated from this image is as follows:\n{scene_graph}""" 27 | sg_1 = sg_generate(img_links[0]) 28 | params = {"{scene_graph}": sg_1} 29 | text = prompt_format(text, params) 30 | return text 31 | 32 | 33 | 34 | def test_Text_Guided_IE_tool(in_path, out_path, log_path): 35 | data = read_json(in_path) 36 | out = {} 37 | for key in tqdm(list(data.keys())): 38 | counter = 0 39 | while counter < 3: 40 | try: 41 | instruction = data[key]["prompt_input"].replace("Editing instruction:", "").strip() 42 | # tool 43 | task_rule = Text_Guided_IE_Rule.replace("{instruction}", instruction) 44 | task = task_rule + Text_Guided_IE_Task_1 + Text_Guided_IE_Task_2 45 | prompt_tool = Tool_Decide.replace("{task}", task) 46 | img_links = [data[key]["vision_input"][0], data[key]["vision_input"][1]] 47 | prompt = agent_run.prepare_prompt(img_links, prompt_tool) 48 | result = GPTResponse2JSON(agent_run.get_result(prompt)) 49 | result = check(result, ["task_id", "reasoning", "used", "tool"]) 50 | log_prompt(log_path, prompt_tool) 51 | log_prompt(log_path, result) 52 | # tool 53 | 54 | out[key] = {} 55 | out[key]['tool_plan'] = result 56 | print(f"{key} over") 57 | break 58 | except Exception as e: 59 | print(f"Error: {key} evaluation failed: {e}") 60 | counter += 1 61 | 62 | write_json(out_path, out) 63 | 64 | 65 | 66 | def test_Text_Guided_IE_evaluate(in_path, tool_path, out_path, log_path): 67 | data = read_json(in_path) 68 | data_tool = read_json(tool_path) 69 | out = {} 70 | for key in tqdm(list(data.keys())): 71 | counter = 0 72 | while counter < 3: 73 | try: 74 | instruction = data[key]["prompt_input"].replace("Editing instruction:", "").strip() 75 | # Task 1 76 | ## prompt 77 | if data_tool[key]["tool_plan"][0]["tool"] in ["Highlight", "SceneGraph"]: 78 | tool = data_tool[key]["tool_plan"][0]["tool"] 79 | else: 80 | tool = "None" 81 | task_1_eva = Text_Guided_IE_Task_1_evaluation.replace("{instruction}", instruction) 82 | tool_text = get_tool_text(tool, data[key]["vision_input"], log_path) 83 | task_1_eva = task_1_eva.replace("{tool_text}", tool_text) 84 | ## prompt 85 | ## image 86 | img_links = data[key]["vision_input"] 87 | if tool == "Highlight": 88 | img_1 = load_image(url2path(data[key]["vision_input"][0], Image_task_path)) 89 | img_2 = load_image(url2path(data[key]["vision_input"][1], Image_task_path)) 90 | img_1 = grding(img_1, instruction, "highlight") 91 | img_2 = grding(img_2, instruction, "highlight") 92 | img_links = [f"data:image/jpeg;base64,{encode_pil_image(img_1)}", f"data:image/jpeg;base64,{encode_pil_image(img_2)}"] 93 | ## image 94 | prompt = agent_run.prepare_prompt(img_links, task_1_eva) 95 | json_1 = GPTResponse2JSON(agent_run.get_result(prompt)) 96 | json_1['score'] = get_number(json_1['score']) 97 | log_prompt(log_path, task_1_eva) 98 | log_prompt(log_path, json_1) 99 | # Task 1 100 | 101 | # Task 2 102 | ## prompt 103 | if data_tool[key]["tool_plan"][1]["tool"] in ["Highlight", "SceneGraph"]: 104 | tool = data_tool[key]["tool_plan"][1]["tool"] 105 | else: 106 | tool = "None" 107 | 
task_2_eva = Text_Guided_IE_Task_2_evaluation.replace("{instruction}", instruction) 108 | tool_text = get_tool_text(tool, data[key]["vision_input"], log_path) 109 | task_2_eva = task_2_eva.replace("{tool_text}", tool_text) 110 | ## prompt 111 | ## image 112 | img_links = data[key]["vision_input"] 113 | if tool == "Highlight": 114 | img_1 = load_image(url2path(data[key]["vision_input"][0], Image_task_path)) 115 | img_2 = load_image(url2path(data[key]["vision_input"][1], Image_task_path)) 116 | img_1 = grding(img_1, instruction, "highlight") 117 | img_2 = grding(img_2, instruction, "highlight") 118 | img_links = [f"data:image/jpeg;base64,{encode_pil_image(img_1)}", f"data:image/jpeg;base64,{encode_pil_image(img_2)}"] 119 | 120 | ## image 121 | prompt = agent_run.prepare_prompt(img_links, task_2_eva) 122 | json_2 = GPTResponse2JSON(agent_run.get_result(prompt)) 123 | json_2['score'] = get_number(json_2['score']) 124 | log_prompt(log_path, task_2_eva) 125 | log_prompt(log_path, json_2) 126 | # Task 2 127 | 128 | out[key] = {} 129 | out[key]['score'] = [json_1['score'], json_2['score']] 130 | out[key]['reasoning'] = [json_1['reasoning'], json_2['reasoning']] 131 | out[key]['prompt_input'] = [task_1_eva, task_2_eva] 132 | out[key]['vision_input'] = data[key]["vision_input"] 133 | print(f"{key} over") 134 | break 135 | except Exception as e: 136 | print(f"Error: {key} evaluation failed: {e}") 137 | counter += 1 138 | 139 | write_json(out_path, out) 140 | 141 | 142 | 143 | def test_Subject_Driven_IE_tool(in_path, out_path, log_path): 144 | data = read_json(in_path) 145 | out = {} 146 | for key in tqdm(list(data.keys())): 147 | 148 | counter = 0 149 | while counter < 3: 150 | try: 151 | subject = data[key]["prompt_input"].replace("Subject:", "").strip() 152 | # tool 153 | task_rule = Subject_Driven_IE_Rule.replace("{subject}", subject) 154 | task = task_rule + Subject_Driven_IE_Task_1 + Subject_Driven_IE_Task_2 155 | prompt_tool = Tool_Decide.replace("{task}", task) 156 | img_links = [data[key]["vision_input"][0], data[key]["vision_input"][1], data[key]["vision_input"][2]] 157 | prompt = agent_run.prepare_prompt(img_links, prompt_tool) 158 | result = GPTResponse2JSON(agent_run.get_result(prompt)) 159 | result = check(result, ["task_id", "reasoning", "used", "tool"]) 160 | log_prompt(log_path, prompt_tool) 161 | log_prompt(log_path, result) 162 | # tool 163 | 164 | out[key] = {} 165 | out[key]['tool_plan'] = result 166 | print(f"{key} over") 167 | break 168 | except Exception as e: 169 | print(f"Error: {key} evaluation failed: {e}") 170 | counter += 1 171 | 172 | write_json(out_path, out) 173 | 174 | 175 | 176 | def test_Subject_Driven_IE_evaluate(in_path, tool_path, out_path, log_path): 177 | data = read_json(in_path) 178 | data_tool = read_json(tool_path) 179 | out = {} 180 | for key in tqdm(list(data.keys())): 181 | 182 | counter = 0 183 | while counter < 3: 184 | try: 185 | subject = data[key]["prompt_input"].replace("Subject:", "").strip() 186 | # Task 1 187 | ## prompt 188 | if data_tool[key]["tool_plan"][0]["tool"] in ["Highlight", "SceneGraph"]: 189 | tool = data_tool[key]["tool_plan"][0]["tool"] 190 | else: 191 | tool = "None" 192 | task_1_eva = Subject_Driven_IE_Task_1_evaluation.replace("{subject}", subject) 193 | tool_text = get_tool_text(tool, [data[key]["vision_input"][1], data[key]["vision_input"][2]], log_path) 194 | task_1_eva = task_1_eva.replace("{tool_text}", tool_text) 195 | ## prompt 196 | ## image 197 | img_links = [data[key]["vision_input"][1], data[key]["vision_input"][2]] 
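# Illustrative sketch, not the repository's actual implementation:
# `encode_pil_image` (from util.py, not shown in this dump) is what turns the highlighted
# PIL images below into the base64 payloads used in the "data:image/jpeg;base64,..." links.
# A minimal, hypothetical equivalent, assuming JPEG serialization plus base64 encoding:
import base64
from io import BytesIO

def encode_pil_image_sketch(pil_image):
    """Serialize a PIL image to JPEG bytes and return them base64-encoded as a str."""
    buffer = BytesIO()
    pil_image.convert("RGB").save(buffer, format="JPEG")
    return base64.b64encode(buffer.getvalue()).decode("utf-8")
# Usage mirrors the surrounding code:
#   f"data:image/jpeg;base64,{encode_pil_image_sketch(img_1)}"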
198 | if tool == "Highlight": 199 | img_1 = load_image(url2path(data[key]["vision_input"][1], Image_task_path)) 200 | img_2 = load_image(url2path(data[key]["vision_input"][2], Image_task_path)) 201 | img_1 = grding(img_1, subject, "highlight") 202 | img_2 = grding(img_2, subject, "highlight") 203 | img_links = [f"data:image/jpeg;base64,{encode_pil_image(img_1)}", f"data:image/jpeg;base64,{encode_pil_image(img_2)}"] 204 | ## image 205 | prompt = agent_run.prepare_prompt(img_links, task_1_eva) 206 | json_1 = GPTResponse2JSON(agent_run.get_result(prompt)) 207 | json_1['score'] = get_number(json_1['score']) 208 | log_prompt(log_path, task_1_eva) 209 | log_prompt(log_path, json_1) 210 | # Task 1 211 | 212 | # Task 2 213 | ## prompt 214 | if data_tool[key]["tool_plan"][1]["tool"] in ["Highlight", "SceneGraph"]: 215 | tool = data_tool[key]["tool_plan"][1]["tool"] 216 | else: 217 | tool = "None" 218 | tool_text = get_tool_text(tool, [data[key]["vision_input"][0], data[key]["vision_input"][2]], log_path) 219 | task_2_eva = Subject_Driven_IE_Task_2_evaluation.replace("{tool_text}", tool_text) 220 | ## prompt 221 | ## image 222 | img_links = [data[key]["vision_input"][0], data[key]["vision_input"][2]] 223 | if tool == "Highlight": 224 | img_1 = load_image(url2path(data[key]["vision_input"][0], Image_task_path)) 225 | img_2 = load_image(url2path(data[key]["vision_input"][2], Image_task_path)) 226 | img_1 = grding(img_1, "background", "highlight") 227 | img_2 = grding(img_2, "background", "highlight") 228 | img_links = [f"data:image/jpeg;base64,{encode_pil_image(img_1)}", f"data:image/jpeg;base64,{encode_pil_image(img_2)}"] 229 | 230 | ## image 231 | prompt = agent_run.prepare_prompt(img_links, task_2_eva) 232 | json_2 = GPTResponse2JSON(agent_run.get_result(prompt)) 233 | json_2['score'] = get_number(json_2['score']) 234 | log_prompt(log_path, task_2_eva) 235 | log_prompt(log_path, json_2) 236 | # Task 2 237 | 238 | out[key] = {} 239 | out[key]['score'] = [json_1['score'], json_2['score']] 240 | out[key]['reasoning'] = [json_1['reasoning'], json_2['reasoning']] 241 | out[key]['prompt_input'] = [task_1_eva, task_2_eva] 242 | out[key]['vision_input'] = data[key]["vision_input"] 243 | print(f"{key} over") 244 | break 245 | except Exception as e: 246 | print(f"Error: {key} evaluation failed: {e}") 247 | counter += 1 248 | 249 | write_json(out_path, out) 250 | 251 | 252 | 253 | def test_Mask_Guided_IE_tool(in_path, out_path, log_path): 254 | data = read_json(in_path) 255 | out = {} 256 | for key in tqdm(list(data.keys())): 257 | 258 | counter = 0 259 | while counter < 3: 260 | try: 261 | instruction = data[key]["prompt_input"].replace("Editing instruction:", "").strip() 262 | # tool 263 | task_rule = Mask_Guided_IE_Rule.replace("{instruction}", instruction) 264 | task = task_rule + Mask_Guided_IE_Task_1 + Mask_Guided_IE_Task_2 265 | prompt_tool = Tool_Decide.replace("{task}", task) 266 | img_links = [data[key]["vision_input"][0], data[key]["vision_input"][1]] 267 | prompt = agent_run.prepare_prompt(img_links, prompt_tool) 268 | result = GPTResponse2JSON(agent_run.get_result(prompt)) 269 | result = check(result, ["task_id", "reasoning", "used", "tool"]) 270 | log_prompt(log_path, prompt_tool) 271 | log_prompt(log_path, result) 272 | # tool 273 | 274 | out[key] = {} 275 | out[key]['tool_plan'] = result 276 | print(f"{key} over") 277 | break 278 | except Exception as e: 279 | print(f"Error: {key} evaluation failed: {e}") 280 | counter += 1 281 | 282 | write_json(out_path, out) 283 | 284 | 285 | 286 | def 
test_Mask_Guided_IE_evaluate(in_path, tool_path, out_path, log_path): 287 | data = read_json(in_path) 288 | data_tool = read_json(tool_path) 289 | out = {} 290 | for key in tqdm(list(data.keys())): 291 | 292 | counter = 0 293 | while counter < 3: 294 | try: 295 | instruction = data[key]["prompt_input"].replace("Editing instruction:", "").strip() 296 | # Task 1 297 | ## prompt 298 | if data_tool[key]["tool_plan"][0]["tool"] in ["Highlight", "SceneGraph", "MaskFocus"]: 299 | tool = data_tool[key]["tool_plan"][0]["tool"] 300 | else: 301 | tool = "None" 302 | task_1_eva = Mask_Guided_IE_Task_1_evaluation.replace("{instruction}", instruction) 303 | tool_text = get_tool_text(tool, data[key]["vision_input"], log_path) 304 | task_1_eva = task_1_eva.replace("{tool_text}", tool_text) 305 | ## prompt 306 | ## image 307 | img_links = data[key]["vision_input"] 308 | if tool == "Highlight": 309 | img_1 = load_image(url2path(data[key]["vision_input"][0], Image_task_path)) 310 | img_2 = load_image(url2path(data[key]["vision_input"][1], Image_task_path)) 311 | img_1 = grding(img_1, instruction, "highlight") 312 | img_2 = grding(img_2, instruction, "highlight") 313 | img_links = [f"data:image/jpeg;base64,{encode_pil_image(img_1)}", f"data:image/jpeg;base64,{encode_pil_image(img_2)}"] 314 | if tool == "MaskFocus": 315 | img_1, img_2 = imgs_diff(url2path(data[key]["vision_input"][0], Image_task_path), url2path(data[key]["vision_input"][1], Image_task_path), "highlight") 316 | img_links = [f"data:image/jpeg;base64,{encode_pil_image(img_1)}", f"data:image/jpeg;base64,{encode_pil_image(img_2)}"] 317 | ## image 318 | prompt = agent_run.prepare_prompt(img_links, task_1_eva) 319 | json_1 = GPTResponse2JSON(agent_run.get_result(prompt)) 320 | json_1['score'] = get_number(json_1['score']) 321 | log_prompt(log_path, task_1_eva) 322 | log_prompt(log_path, json_1) 323 | # Task 1 324 | 325 | # Task 2 326 | ## prompt 327 | if data_tool[key]["tool_plan"][1]["tool"] in ["Highlight", "SceneGraph", "MaskFocus"]: 328 | tool = data_tool[key]["tool_plan"][1]["tool"] 329 | else: 330 | tool = "None" 331 | task_2_eva = Mask_Guided_IE_Task_2_evaluation.replace("{instruction}", instruction) 332 | tool_text = get_tool_text(tool, data[key]["vision_input"], log_path) 333 | task_2_eva = task_2_eva.replace("{tool_text}", tool_text) 334 | ## prompt 335 | ## image 336 | img_links = data[key]["vision_input"] 337 | if tool == "Highlight": 338 | img_1 = load_image(url2path(data[key]["vision_input"][0], Image_task_path)) 339 | img_2 = load_image(url2path(data[key]["vision_input"][1], Image_task_path)) 340 | img_1 = grding(img_1, instruction, "highlight") 341 | img_2 = grding(img_2, instruction, "highlight") 342 | img_links = [f"data:image/jpeg;base64,{encode_pil_image(img_1)}", f"data:image/jpeg;base64,{encode_pil_image(img_2)}"] 343 | if tool == "MaskFocus": 344 | img_1, img_2 = imgs_diff(url2path(data[key]["vision_input"][0], Image_task_path), url2path(data[key]["vision_input"][1], Image_task_path), "highlight") 345 | img_links = [f"data:image/jpeg;base64,{encode_pil_image(img_1)}", f"data:image/jpeg;base64,{encode_pil_image(img_2)}"] 346 | ## image 347 | prompt = agent_run.prepare_prompt(img_links, task_2_eva) 348 | json_2 = GPTResponse2JSON(agent_run.get_result(prompt)) 349 | json_2['score'] = get_number(json_2['score']) 350 | log_prompt(log_path, task_2_eva) 351 | log_prompt(log_path, json_2) 352 | # Task 2 353 | 354 | out[key] = {} 355 | out[key]['score'] = [json_1['score'], json_2['score']] 356 | out[key]['reasoning'] = 
[json_1['reasoning'], json_2['reasoning']] 357 | out[key]['prompt_input'] = [task_1_eva, task_2_eva] 358 | out[key]['vision_input'] = data[key]["vision_input"] 359 | print(f"{key} over") 360 | break 361 | except Exception as e: 362 | print(f"Error: {key} evaluation failed: {e}") 363 | counter += 1 364 | 365 | write_json(out_path, out) 366 | 367 | 368 | 369 | def test_Multi_Concept_IC_tool(in_path, out_path, log_path): 370 | data = read_json(in_path) 371 | out = {} 372 | for key in tqdm(list(data.keys())): 373 | 374 | counter = 0 375 | while counter < 3: 376 | try: 377 | text = data[key]["prompt_input"].replace("Text Prompt:", "").strip() 378 | # tool 379 | task_rule = Multi_Concept_IC_Rule.replace("{text}", text) 380 | task = task_rule + Multi_Concept_IC_Task_1 + Multi_Concept_IC_Task_2 381 | prompt_tool = Tool_Decide.replace("{task}", task) 382 | img_links = [data[key]["vision_input"][0], data[key]["vision_input"][1]] 383 | prompt = agent_run.prepare_prompt(img_links, prompt_tool) 384 | result = GPTResponse2JSON(agent_run.get_result(prompt)) 385 | result = check(result, ["task_id", "reasoning", "used", "tool"]) 386 | log_prompt(log_path, prompt_tool) 387 | log_prompt(log_path, result) 388 | # tool 389 | 390 | out[key] = {} 391 | out[key]['tool_plan'] = result 392 | print(f"{key} over") 393 | break 394 | except Exception as e: 395 | print(f"Error: {key} evaluation failed: {e}") 396 | counter += 1 397 | 398 | write_json(out_path, out) 399 | 400 | 401 | 402 | def test_Multi_Concept_IC_evaluate(in_path, tool_path, out_path, log_path): 403 | data = read_json(in_path) 404 | data_tool = read_json(tool_path) 405 | out = {} 406 | for key in tqdm(list(data.keys())): 407 | 408 | counter = 0 409 | while counter < 3: 410 | try: 411 | text = data[key]["prompt_input"].replace("Text Prompt:", "").strip() 412 | img_L, img_R = split2part(url2path(data[key]["vision_input"][0], Image_task_path)) 413 | img_LR_links = [f"data:image/jpeg;base64,{encode_pil_image(img_L)}", f"data:image/jpeg;base64,{encode_pil_image(img_R)}"] 414 | subject_L, subject_R = data[key]["concepts"][0], data[key]["concepts"][1] 415 | # Task 1 416 | ## prompt 417 | if data_tool[key]["tool_plan"][0]["tool"] in ["Highlight", "SceneGraph"]: 418 | tool = data_tool[key]["tool_plan"][0]["tool"] 419 | else: 420 | tool = "None" 421 | task_1_eva = Multi_Concept_IC_Task_1_evaluation.replace("{text}", text) 422 | task_1_eva = task_1_eva.replace("{subject}", subject_L) 423 | tool_text = get_tool_text(tool, [img_LR_links[0], data[key]["vision_input"][1]], log_path) 424 | task_1_eva = task_1_eva.replace("{tool_text}", tool_text) 425 | ## prompt 426 | ## image 427 | img_links = [img_LR_links[0], data[key]["vision_input"][1]] 428 | if tool == "Highlight": 429 | img_1 = img_L 430 | img_2 = load_image(url2path(data[key]["vision_input"][1], Image_task_path)) 431 | img_1 = grding(img_1, subject_L, "highlight") 432 | img_2 = grding(img_2, subject_L, "highlight") 433 | img_links = [f"data:image/jpeg;base64,{encode_pil_image(img_1)}", f"data:image/jpeg;base64,{encode_pil_image(img_2)}"] 434 | ## image 435 | prompt = agent_run.prepare_prompt(img_links, task_1_eva) 436 | json_1 = GPTResponse2JSON(agent_run.get_result(prompt)) 437 | json_1['score'] = get_number(json_1['score']) 438 | log_prompt(log_path, task_1_eva) 439 | log_prompt(log_path, json_1) 440 | # Task 1 441 | 442 | # Task 1 443 | ## prompt 444 | if data_tool[key]["tool_plan"][0]["tool"] in ["Highlight", "SceneGraph"]: 445 | tool = data_tool[key]["tool_plan"][0]["tool"] 446 | else: 447 | tool = "None" 
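# Illustrative sketch, not the repository's actual implementation:
# `GPTResponse2JSON` and `get_number` come from util.py and are not shown in this dump.
# Judging from how they are used throughout this file, they presumably (1) parse the
# model's reply into a dict or list even when it is wrapped in markdown code fences and
# (2) coerce the "score" field to a number. Hypothetical, simplified equivalents:
import json
import re

def gpt_response_to_json_sketch(response_text):
    """Strip optional markdown code fences and parse the remainder as JSON."""
    cleaned = response_text.strip()
    if cleaned.startswith("```"):
        cleaned = cleaned.strip("`").strip()
        if cleaned.lower().startswith("json"):
            cleaned = cleaned[4:]
    return json.loads(cleaned.strip())

def get_number_sketch(score_field):
    """Return the first numeric value found in a score string such as "[7]" or "7.5"."""
    match = re.search(r"-?\d+(?:\.\d+)?", str(score_field))
    return float(match.group(0)) if match else None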
448 | task_1_eva_ = Multi_Concept_IC_Task_1_evaluation.replace("{text}", text) 449 | task_1_eva_ = task_1_eva_.replace("{subject}", subject_R) 450 | tool_text = get_tool_text(tool, [img_LR_links[1], data[key]["vision_input"][1]], log_path) 451 | task_1_eva_ = task_1_eva_.replace("{tool_text}", tool_text) 452 | ## prompt 453 | ## image 454 | img_links = [img_LR_links[1], data[key]["vision_input"][1]] 455 | if tool == "Highlight": 456 | img_1 = img_R 457 | img_2 = load_image(url2path(data[key]["vision_input"][1], Image_task_path)) 458 | img_1 = grding(img_1, subject_R, "highlight") 459 | img_2 = grding(img_2, subject_R, "highlight") 460 | img_links = [f"data:image/jpeg;base64,{encode_pil_image(img_1)}", f"data:image/jpeg;base64,{encode_pil_image(img_2)}"] 461 | ## image 462 | prompt = agent_run.prepare_prompt(img_links, task_1_eva_) 463 | json_2 = GPTResponse2JSON(agent_run.get_result(prompt)) 464 | json_2['score'] = get_number(json_2['score']) 465 | log_prompt(log_path, task_1_eva_) 466 | log_prompt(log_path, json_2) 467 | # Task 1 468 | 469 | # Task 2 470 | ## prompt 471 | if data_tool[key]["tool_plan"][1]["tool"] in ["Highlight", "SceneGraph"]: 472 | tool = data_tool[key]["tool_plan"][1]["tool"] 473 | else: 474 | tool = "None" 475 | task_2_eva = Multi_Concept_IC_Task_2_evaluation.replace("{text}", text) 476 | tool_text = get_tool_text(tool, [data[key]["vision_input"][1]], log_path) 477 | task_2_eva = task_2_eva.replace("{tool_text}", tool_text) 478 | ## prompt 479 | ## image 480 | img_links = [data[key]["vision_input"][1]] 481 | if tool == "Highlight": 482 | img_2 = load_image(url2path(data[key]["vision_input"][1], Image_task_path)) 483 | img_2 = grding(img_2, text, "highlight") 484 | img_links = [f"data:image/jpeg;base64,{encode_pil_image(img_2)}"] 485 | ## image 486 | prompt = agent_run.prepare_prompt(img_links, task_2_eva) 487 | json_3 = GPTResponse2JSON(agent_run.get_result(prompt)) 488 | json_3['score'] = get_number(json_3['score']) 489 | log_prompt(log_path, task_2_eva) 490 | log_prompt(log_path, json_3) 491 | # Task 2 492 | 493 | out[key] = {} 494 | out[key]['score'] = [json_1['score'], json_2['score'], json_3['score']] 495 | out[key]['reasoning'] = [json_1['reasoning'], json_2['reasoning'], json_3['reasoning']] 496 | out[key]['prompt_input'] = [task_1_eva, task_1_eva_, task_2_eva] 497 | out[key]['vision_input'] = data[key]["vision_input"] 498 | print(f"{key} over") 499 | break 500 | except Exception as e: 501 | print(f"Error: {key} evaluation failed: {e}") 502 | counter += 1 503 | 504 | write_json(out_path, out) 505 | 506 | 507 | 508 | def test_Text_Guided_IG_tool(in_path, out_path, log_path): 509 | data = read_json(in_path) 510 | out = {} 511 | for key in tqdm(list(data.keys())): 512 | 513 | counter = 0 514 | while counter < 3: 515 | try: 516 | text = data[key]["prompt_input"].replace("Text Prompt:", "").strip() 517 | # tool 518 | task_rule = Text_Guided_IG_Rule.replace("{text}", text) 519 | task = task_rule + Text_Guided_IG_Task_1 520 | prompt_tool = Tool_Decide.replace("{task}", task) 521 | img_links = data[key]["vision_input"] 522 | prompt = agent_run.prepare_prompt(img_links, prompt_tool) 523 | result = GPTResponse2JSON(agent_run.get_result(prompt)) 524 | result = check(result, ["task_id", "reasoning", "used", "tool"]) 525 | log_prompt(log_path, prompt_tool) 526 | log_prompt(log_path, result) 527 | # tool 528 | 529 | out[key] = {} 530 | out[key]['tool_plan'] = result 531 | print(f"{key} over") 532 | break 533 | except Exception as e: 534 | print(f"Error: {key} evaluation 
failed: {e}") 535 | counter += 1 536 | 537 | write_json(out_path, out) 538 | 539 | 540 | 541 | def test_Text_Guided_IG_evaluate(in_path, tool_path, out_path, log_path): 542 | data = read_json(in_path) 543 | data_tool = read_json(tool_path) 544 | out = {} 545 | for key in tqdm(list(data.keys())): 546 | 547 | counter = 0 548 | while counter < 3: 549 | try: 550 | text = data[key]["prompt_input"].replace("Text Prompt:", "").strip() 551 | # Task 1 552 | ## prompt 553 | if data_tool[key]["tool_plan"][0]["tool"] in ["Highlight", "SceneGraph"]: 554 | tool = data_tool[key]["tool_plan"][0]["tool"] 555 | else: 556 | tool = "None" 557 | task_1_eva = Text_Guided_IG_Task_1_evaluation.replace("{text}", text) 558 | tool_text = get_tool_text(tool, data[key]["vision_input"], log_path) 559 | task_1_eva = task_1_eva.replace("{tool_text}", tool_text) 560 | ## prompt 561 | ## image 562 | img_links = data[key]["vision_input"] 563 | if tool == "Highlight": 564 | img_1 = load_image(url2path(data[key]["vision_input"][0], Image_task_path)) 565 | img_1 = grding(img_1, text, "highlight") 566 | img_links = [f"data:image/jpeg;base64,{encode_pil_image(img_1)}"] 567 | 568 | ## image 569 | prompt = agent_run.prepare_prompt(img_links, task_1_eva) 570 | json_1 = GPTResponse2JSON(agent_run.get_result(prompt)) 571 | json_1['score'] = get_number(json_1['score']) 572 | log_prompt(log_path, task_1_eva) 573 | log_prompt(log_path, json_1) 574 | # Task 1 575 | 576 | out[key] = {} 577 | out[key]['score'] = [json_1['score']] 578 | out[key]['reasoning'] = [json_1['reasoning']] 579 | out[key]['prompt_input'] = [task_1_eva] 580 | out[key]['vision_input'] = data[key]["vision_input"] 581 | print(f"{key} over") 582 | break 583 | except Exception as e: 584 | print(f"Error: {key} evaluation failed: {e}") 585 | counter += 1 586 | 587 | write_json(out_path, out) 588 | 589 | 590 | 591 | def test_Control_Guided_IG_tool(in_path, out_path, log_path): 592 | data = read_json(in_path) 593 | out = {} 594 | for key in tqdm(list(data.keys())): 595 | 596 | counter = 0 597 | while counter < 3: 598 | try: 599 | text = data[key]["prompt_input"].replace("Text Prompt:", "").strip() 600 | # tool 601 | task_rule = Control_Guided_IG_Rule.replace("{text}", text) 602 | task = task_rule + Control_Guided_IG_Task_1 + Control_Guided_IG_Task_2 603 | prompt_tool = Tool_Decide.replace("{task}", task) 604 | img_links = data[key]["vision_input"] 605 | prompt = agent_run.prepare_prompt(img_links, prompt_tool) 606 | result = GPTResponse2JSON(agent_run.get_result(prompt)) 607 | result = check(result, ["task_id", "reasoning", "used", "tool"]) 608 | log_prompt(log_path, prompt_tool) 609 | log_prompt(log_path, result) 610 | # tool 611 | 612 | out[key] = {} 613 | out[key]['tool_plan'] = result 614 | print(f"{key} over") 615 | break 616 | except Exception as e: 617 | print(f"Error: {key} evaluation failed: {e}") 618 | counter += 1 619 | 620 | write_json(out_path, out) 621 | 622 | 623 | 624 | def test_Control_Guided_IG_evaluate(in_path, tool_path, out_path, log_path): 625 | data = read_json(in_path) 626 | data_tool = read_json(tool_path) 627 | out = {} 628 | for key in tqdm(list(data.keys())): 629 | 630 | counter = 0 631 | while counter < 3: 632 | try: 633 | text = data[key]["prompt_input"].replace("Text Prompt:", "").strip() 634 | # Task 1 635 | ## prompt 636 | if data_tool[key]["tool_plan"][0]["tool"] in ["Highlight", "SceneGraph"]: 637 | tool = data_tool[key]["tool_plan"][0]["tool"] 638 | else: 639 | tool = "None" 640 | task_1_eva = Control_Guided_IG_Task_1_evaluation 641 | 
tool_text = get_tool_text(tool, data[key]["vision_input"], log_path) 642 | task_1_eva = task_1_eva.replace("{tool_text}", tool_text) 643 | ## prompt 644 | ## image 645 | img_links = data[key]["vision_input"] 646 | if tool == "Highlight": 647 | img_1 = load_image(url2path(data[key]["vision_input"][0], Image_task_path)) 648 | img_2 = load_image(url2path(data[key]["vision_input"][1], Image_task_path)) 649 | img_1 = grding(img_1, text, "highlight") 650 | img_2 = grding(img_2, text, "highlight") 651 | img_links = [f"data:image/jpeg;base64,{encode_pil_image(img_1)}", f"data:image/jpeg;base64,{encode_pil_image(img_2)}"] 652 | ## image 653 | prompt = agent_run.prepare_prompt(img_links, task_1_eva) 654 | json_1 = GPTResponse2JSON(agent_run.get_result(prompt)) 655 | json_1['score'] = get_number(json_1['score']) 656 | log_prompt(log_path, task_1_eva) 657 | log_prompt(log_path, json_1) 658 | # Task 1 659 | 660 | # Task 2 661 | ## prompt 662 | if data_tool[key]["tool_plan"][1]["tool"] in ["Highlight", "SceneGraph"]: 663 | tool = data_tool[key]["tool_plan"][1]["tool"] 664 | else: 665 | tool = "None" 666 | task_2_eva = Control_Guided_IG_Task_2_evaluation.replace("{text}", text) 667 | tool_text = get_tool_text(tool, [data[key]["vision_input"][1]], log_path) 668 | task_2_eva = task_2_eva.replace("{tool_text}", tool_text) 669 | ## prompt 670 | ## image 671 | img_links = [data[key]["vision_input"][1]] 672 | if tool == "Highlight": 673 | img_2 = load_image(url2path(data[key]["vision_input"][1], Image_task_path)) 674 | img_2 = grding(img_2, text, "highlight") 675 | img_links = [f"data:image/jpeg;base64,{encode_pil_image(img_2)}"] 676 | ## image 677 | prompt = agent_run.prepare_prompt(img_links, task_2_eva) 678 | json_2 = GPTResponse2JSON(agent_run.get_result(prompt)) 679 | json_2['score'] = get_number(json_2['score']) 680 | log_prompt(log_path, task_2_eva) 681 | log_prompt(log_path, json_2) 682 | # Task 2 683 | 684 | out[key] = {} 685 | out[key]['score'] = [json_1['score'], json_2['score']] 686 | out[key]['reasoning'] = [json_1['reasoning'], json_2['reasoning']] 687 | out[key]['prompt_input'] = [task_1_eva, task_2_eva] 688 | out[key]['vision_input'] = data[key]["vision_input"] 689 | print(f"{key} over") 690 | break 691 | except Exception as e: 692 | print(f"Error: {key} evaluation failed: {e}") 693 | counter += 1 694 | 695 | write_json(out_path, out) 696 | 697 | 698 | 699 | def test_Subject_Driven_IG_tool(in_path, out_path, log_path): 700 | data = read_json(in_path) 701 | out = {} 702 | for key in tqdm(list(data.keys())): 703 | 704 | counter = 0 705 | while counter < 3: 706 | try: 707 | text = data[key]["prompt_input"].replace("Text Prompt:", "").strip() 708 | # tool 709 | task_rule = Subject_Driven_IG_Rule.replace("{text}", text) 710 | task = task_rule + Subject_Driven_IG_Task_1 + Subject_Driven_IG_Task_2 711 | prompt_tool = Tool_Decide.replace("{task}", task) 712 | img_links = data[key]["vision_input"] 713 | prompt = agent_run.prepare_prompt(img_links, prompt_tool) 714 | result = GPTResponse2JSON(agent_run.get_result(prompt)) 715 | result = check(result, ["task_id", "reasoning", "used", "tool"]) 716 | log_prompt(log_path, prompt_tool) 717 | log_prompt(log_path, result) 718 | # tool 719 | 720 | out[key] = {} 721 | out[key]['tool_plan'] = result 722 | print(f"{key} over") 723 | break 724 | except Exception as e: 725 | print(f"Error: {key} evaluation failed: {e}") 726 | counter += 1 727 | 728 | write_json(out_path, out) 729 | 730 | 731 | 732 | def test_Subject_Driven_IG_evaluate(in_path, tool_path, out_path, 
log_path): 733 | data = read_json(in_path) 734 | data_tool = read_json(tool_path) 735 | out = {} 736 | for key in tqdm(list(data.keys())): 737 | 738 | counter = 0 739 | while counter < 3: 740 | try: 741 | text = data[key]["prompt_input"].replace("Text Prompt:", "").strip() 742 | subject = data[key]["subject"] 743 | # Task 1 744 | ## prompt 745 | if data_tool[key]["tool_plan"][0]["tool"] in ["Highlight", "SceneGraph"]: 746 | tool = data_tool[key]["tool_plan"][0]["tool"] 747 | else: 748 | tool = "None" 749 | task_1_eva = Subject_Driven_IG_Task_1_evaluation.replace("{subject}", subject) 750 | tool_text = get_tool_text(tool, data[key]["vision_input"], log_path) 751 | task_1_eva = task_1_eva.replace("{tool_text}", tool_text) 752 | ## prompt 753 | ## image 754 | img_links = data[key]["vision_input"] 755 | if tool == "Highlight": 756 | img_1 = load_image(url2path(data[key]["vision_input"][0], Image_task_path)) 757 | img_2 = load_image(url2path(data[key]["vision_input"][1], Image_task_path)) 758 | img_1 = grding(img_1, text, "highlight") 759 | img_2 = grding(img_2, text, "highlight") 760 | img_links = [f"data:image/jpeg;base64,{encode_pil_image(img_1)}", f"data:image/jpeg;base64,{encode_pil_image(img_2)}"] 761 | ## image 762 | prompt = agent_run.prepare_prompt(img_links, task_1_eva) 763 | json_1 = GPTResponse2JSON(agent_run.get_result(prompt)) 764 | json_1['score'] = get_number(json_1['score']) 765 | log_prompt(log_path, task_1_eva) 766 | log_prompt(log_path, json_1) 767 | # Task 1 768 | 769 | # Task 2 770 | ## prompt 771 | if data_tool[key]["tool_plan"][1]["tool"] in ["Highlight", "SceneGraph"]: 772 | tool = data_tool[key]["tool_plan"][1]["tool"] 773 | else: 774 | tool = "None" 775 | task_2_eva = Subject_Driven_IG_Task_2_evaluation.replace("{text}", text) 776 | tool_text = get_tool_text(tool, [data[key]["vision_input"][1]], log_path) 777 | task_2_eva = task_2_eva.replace("{tool_text}", tool_text) 778 | ## prompt 779 | ## image 780 | img_links = [data[key]["vision_input"][1]] 781 | if tool == "Highlight": 782 | img_2 = load_image(url2path(data[key]["vision_input"][1], Image_task_path)) 783 | img_2 = grding(img_2, text, "highlight") 784 | img_links = [f"data:image/jpeg;base64,{encode_pil_image(img_2)}"] 785 | ## image 786 | prompt = agent_run.prepare_prompt(img_links, task_2_eva) 787 | json_2 = GPTResponse2JSON(agent_run.get_result(prompt)) 788 | json_2['score'] = get_number(json_2['score']) 789 | log_prompt(log_path, task_2_eva) 790 | log_prompt(log_path, json_2) 791 | # Task 2 792 | 793 | out[key] = {} 794 | out[key]['score'] = [json_1['score'], json_2['score']] 795 | out[key]['reasoning'] = [json_1['reasoning'], json_2['reasoning']] 796 | out[key]['prompt_input'] = [task_1_eva, task_2_eva] 797 | out[key]['vision_input'] = data[key]["vision_input"] 798 | print(f"{key} over") 799 | break 800 | except Exception as e: 801 | print(f"Error: {key} evaluation failed: {e}") 802 | counter += 1 803 | 804 | write_json(out_path, out) 805 | 806 | 807 | 808 | def evaluate(in_path, tool_out_path, tool_log_path, eva_out_path, eva_log_path): 809 | if "ImagenHub_Control-Guided_IG" in in_path: 810 | test_Control_Guided_IG_tool(in_path, tool_out_path, tool_log_path) 811 | test_Control_Guided_IG_evaluate(in_path, tool_out_path, eva_out_path, eva_log_path) 812 | if "ImagenHub_Mask-Guided_IE" in in_path: 813 | test_Mask_Guided_IE_tool(in_path, tool_out_path, tool_log_path) 814 | test_Mask_Guided_IE_evaluate(in_path, tool_out_path, eva_out_path, eva_log_path) 815 | if "ImagenHub_Multi-Concept_IC" in in_path: 816 | 
test_Multi_Concept_IC_tool(in_path, tool_out_path, tool_log_path) 817 | test_Multi_Concept_IC_evaluate(in_path, tool_out_path, eva_out_path, eva_log_path) 818 | if "ImagenHub_Subject-Driven_IE" in in_path: 819 | test_Subject_Driven_IE_tool(in_path, tool_out_path, tool_log_path) 820 | test_Subject_Driven_IE_evaluate(in_path, tool_out_path, eva_out_path, eva_log_path) 821 | if "ImagenHub_Subject-Driven_IG" in in_path: 822 | test_Subject_Driven_IG_tool(in_path, tool_out_path, tool_log_path) 823 | test_Subject_Driven_IG_evaluate(in_path, tool_out_path, eva_out_path, eva_log_path) 824 | if "ImagenHub_Text-Guided_IE" in in_path: 825 | test_Text_Guided_IE_tool(in_path, tool_out_path, tool_log_path) 826 | test_Text_Guided_IE_evaluate(in_path, tool_out_path, eva_out_path, eva_log_path) 827 | if "ImagenHub_Text-Guided_IG" in in_path: 828 | test_Text_Guided_IG_tool(in_path, tool_out_path, tool_log_path) 829 | test_Text_Guided_IG_evaluate(in_path, tool_out_path, eva_out_path, eva_log_path) 830 | 831 | 832 | 833 | 834 | agent_run = QWen25() 835 | Image_task_path = "PATH_TO_ImageHub_DATA" 836 | for task in os.listdir(Image_task_path): 837 | task_path = os.path.join(Image_task_path, task) 838 | models = os.listdir(task_path) 839 | print(models) 840 | for dir in models: 841 | if dir!="input" and dir!="token": 842 | tool_log_path = f"{task_path}_{dir}_tool_qwen25_72b.txt" 843 | tool_out_path = f"{task_path}/{dir}/SC_tool_qwen25_72b.json" 844 | eva_log_path = f"{task_path}_{dir}_eva_qwen25_72b.txt" 845 | eva_out_path = f"{task_path}/{dir}/SC_eva_qwen25_72b.json" 846 | in_path = f"{task_path}/{dir}/in.json" 847 | evaluate(in_path, tool_out_path, tool_log_path, eva_out_path, eva_log_path) 848 | 849 | -------------------------------------------------------------------------------- /run_test_40p.py: -------------------------------------------------------------------------------- 1 | from agent.llava_next import LlavaNext 2 | from agent.prompt_agent import * 3 | from util import * 4 | from tools.scene_graph import sg_generate 5 | from tools.grding import grding, highlight 6 | from tools.diff import imgs_diff 7 | from tools.split import split2part 8 | from tqdm import tqdm 9 | 10 | 11 | def get_tool_text(tool, img_links, log_path): 12 | if tool=="None": 13 | return "" 14 | if tool=="Highlight" or tool=="MaskFocus": 15 | return "Focus on the highlighted parts of the image." 
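# Illustrative sketch, not the repository's actual implementation:
# `merge_images` (from util.py, not shown in this dump) is used further down in
# test_Subject_Driven_IE_tool to build the llava-style input described by
# Subject_Driven_IE_Rule_llava: the image to edit and the token-subject image pasted
# side by side into a single picture. A hypothetical, simplified equivalent:
from PIL import Image

def merge_images_sketch(images):
    """Paste the given PIL images left-to-right on a single white canvas."""
    total_width = sum(im.width for im in images)
    max_height = max(im.height for im in images)
    canvas = Image.new("RGB", (total_width, max_height), "white")
    x_offset = 0
    for im in images:
        canvas.paste(im, (x_offset, 0))
        x_offset += im.width
    return canvas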
16 | if tool=="SceneGraph": 17 | text = "" 18 | if len(img_links) == 2: 19 | text = """Two scene graphs in JSON format generated from two images will be provided:\nThe first scene graph:\n{scene_graph_1}\nThe second scene graph:\n{scene_graph_2}""" 20 | sg_1 = sg_generate(img_links[0], 0, log_path) 21 | sg_2 = sg_generate(img_links[1], 0, log_path) 22 | params = {"{scene_graph_1}": sg_1, "{scene_graph_2}": sg_2} 23 | text = prompt_format(text, params) 24 | if len(img_links) == 1: 25 | text = """The scene graph in JSON format generated from this image is as follows:\n{scene_graph}""" 26 | sg_1 = sg_generate(img_links[0], 0, log_path) 27 | params = {"{scene_graph}": sg_1} 28 | text = prompt_format(text, params) 29 | return text 30 | 31 | 32 | 33 | def test_Text_Guided_IE_tool(in_path, out_path, log_path, test=None): 34 | data = read_json(in_path) 35 | out = {} 36 | for key in tqdm(list(data.keys())): 37 | if test: 38 | its = in_path.split("/") 39 | if key not in test[its[-3]][its[-2]]['sample']: 40 | continue 41 | counter = 0 42 | while counter < 3: 43 | try: 44 | instruction = data[key]["prompt_input"].replace("Editing instruction:", "").strip() 45 | # tool 46 | task_rule = Text_Guided_IE_Rule.replace("{instruction}", instruction) 47 | task = task_rule + Text_Guided_IE_Task_1 + Text_Guided_IE_Task_2 48 | prompt_tool = Tool_Decide.replace("{task}", task) 49 | img_links = [url2path(data[key]["vision_input"][0], Image_Root), url2path(data[key]["vision_input"][1], Image_Root)] 50 | 51 | result = GPTResponse2JSON(agent_run.get_result(img_links, prompt_tool)) 52 | result = check(result, ["task_id", "reasoning", "used", "tool"]) 53 | log_prompt(log_path, prompt_tool) 54 | log_prompt(log_path, result) 55 | # tool 56 | 57 | out[key] = {} 58 | out[key]['tool_plan'] = result 59 | print(f"{key} over") 60 | break 61 | except Exception as e: 62 | print(f"Error: {key} evaluation failed: {e}") 63 | counter += 1 64 | 65 | write_json(out_path, out) 66 | 67 | 68 | 69 | def test_Text_Guided_IE_evaluate(in_path, tool_path, out_path, log_path, test=None): 70 | data = read_json(in_path) 71 | data_tool = read_json(tool_path) 72 | out = {} 73 | for key in tqdm(list(data.keys())): 74 | if test: 75 | its = in_path.split("/") 76 | if key not in test[its[-3]][its[-2]]['sample']: 77 | continue 78 | counter = 0 79 | while counter < 3: 80 | try: 81 | instruction = data[key]["prompt_input"].replace("Editing instruction:", "").strip() 82 | # Task 1 83 | ## prompt 84 | if data_tool[key]["tool_plan"][0]["tool"] in ["Highlight", "SceneGraph"]: 85 | tool = data_tool[key]["tool_plan"][0]["tool"] 86 | else: 87 | tool = "None" 88 | task_1_eva = Text_Guided_IE_Task_1_evaluation.replace("{instruction}", instruction) 89 | tool_text = get_tool_text(tool, data[key]["vision_input"], log_path) 90 | task_1_eva = task_1_eva.replace("{tool_text}", tool_text) 91 | ## prompt 92 | ## image 93 | img_links = [url2path(data[key]["vision_input"][0], Image_Root), url2path(data[key]["vision_input"][1], Image_Root)] 94 | if tool == "Highlight": 95 | img_1 = load_image(url2path(data[key]["vision_input"][0], Image_Root)) 96 | img_2 = load_image(url2path(data[key]["vision_input"][1], Image_Root)) 97 | img_1 = grding(img_1, instruction, "highlight") 98 | img_2 = grding(img_2, instruction, "highlight") 99 | img_links = [f"data:image/jpeg;base64,{encode_pil_image(img_1)}", f"data:image/jpeg;base64,{encode_pil_image(img_2)}"] 100 | ## image 101 | 102 | json_1 = GPTResponse2JSON(agent_run.get_result(img_links, task_1_eva)) 103 | json_1['score'] = 
get_number(json_1['score']) 104 | log_prompt(log_path, task_1_eva) 105 | log_prompt(log_path, json_1) 106 | # Task 1 107 | 108 | # Task 2 109 | ## prompt 110 | if data_tool[key]["tool_plan"][1]["tool"] in ["Highlight", "SceneGraph"]: 111 | tool = data_tool[key]["tool_plan"][1]["tool"] 112 | else: 113 | tool = "None" 114 | task_2_eva = Text_Guided_IE_Task_2_evaluation.replace("{instruction}", instruction) 115 | tool_text = get_tool_text(tool, data[key]["vision_input"], log_path) 116 | task_2_eva = task_2_eva.replace("{tool_text}", tool_text) 117 | ## prompt 118 | ## image 119 | img_links = [url2path(data[key]["vision_input"][0], Image_Root), url2path(data[key]["vision_input"][1], Image_Root)] 120 | if tool == "Highlight": 121 | img_1 = load_image(url2path(data[key]["vision_input"][0], Image_Root)) 122 | img_2 = load_image(url2path(data[key]["vision_input"][1], Image_Root)) 123 | img_1 = grding(img_1, instruction, "highlight") 124 | img_2 = grding(img_2, instruction, "highlight") 125 | img_links = [f"data:image/jpeg;base64,{encode_pil_image(img_1)}", f"data:image/jpeg;base64,{encode_pil_image(img_2)}"] 126 | 127 | ## image 128 | 129 | json_2 = GPTResponse2JSON(agent_run.get_result(img_links, task_2_eva)) 130 | json_2['score'] = get_number(json_2['score']) 131 | log_prompt(log_path, task_2_eva) 132 | log_prompt(log_path, json_2) 133 | # Task 2 134 | 135 | out[key] = {} 136 | out[key]['score'] = [json_1['score'], json_2['score']] 137 | out[key]['reasoning'] = [json_1['reasoning'], json_2['reasoning']] 138 | out[key]['prompt_input'] = [task_1_eva, task_2_eva] 139 | out[key]['vision_input'] = data[key]["vision_input"] 140 | print(f"{key} over") 141 | break 142 | except Exception as e: 143 | print(f"Error: {key} evaluation failed: {e}") 144 | counter += 1 145 | 146 | write_json(out_path, out) 147 | 148 | 149 | 150 | def test_Subject_Driven_IE_tool(in_path, out_path, log_path, test=None): 151 | data = read_json(in_path) 152 | out = {} 153 | for key in tqdm(list(data.keys())): 154 | if test: 155 | its = in_path.split("/") 156 | if key not in test[its[-3]][its[-2]]['sample']: 157 | continue 158 | counter = 0 159 | while counter < 3: 160 | try: 161 | subject = data[key]["prompt_input"].replace("Subject:", "").strip() 162 | # tool 163 | task_rule = Subject_Driven_IE_Rule_llava.replace("{subject}", subject) 164 | task = task_rule + Subject_Driven_IE_Task_1_llava + Subject_Driven_IE_Task_2_llava 165 | prompt_tool = Tool_Decide.replace("{task}", task) 166 | im_1 = load_image(url2path(data[key]["vision_input"][0], Image_Root)) 167 | im_2 = load_image(url2path(data[key]["vision_input"][1], Image_Root)) 168 | im = merge_images([im_1, im_2]) 169 | 170 | img_links = [f"data:image/jpeg;base64,{encode_pil_image(im)}", url2path(data[key]["vision_input"][2], Image_Root)] 171 | 172 | result = GPTResponse2JSON(agent_run.get_result(img_links, prompt_tool)) 173 | result = check(result, ["task_id", "reasoning", "used", "tool"]) 174 | log_prompt(log_path, prompt_tool) 175 | log_prompt(log_path, result) 176 | # tool 177 | 178 | out[key] = {} 179 | out[key]['tool_plan'] = result 180 | print(f"{key} over") 181 | break 182 | except Exception as e: 183 | print(f"Error: {key} evaluation failed: {e}") 184 | counter += 1 185 | 186 | write_json(out_path, out) 187 | 188 | 189 | 190 | def test_Subject_Driven_IE_evaluate(in_path, tool_path, out_path, log_path, test=None): 191 | data = read_json(in_path) 192 | data_tool = read_json(tool_path) 193 | out = {} 194 | for key in tqdm(list(data.keys())): 195 | if test: 196 | its = 
in_path.split("/") 197 | if key not in test[its[-3]][its[-2]]['sample']: 198 | continue 199 | counter = 0 200 | while counter < 3: 201 | try: 202 | subject = data[key]["prompt_input"].replace("Subject:", "").strip() 203 | # Task 1 204 | ## prompt 205 | if data_tool[key]["tool_plan"][0]["tool"] in ["Highlight", "SceneGraph"]: 206 | tool = data_tool[key]["tool_plan"][0]["tool"] 207 | else: 208 | tool = "None" 209 | task_1_eva = Subject_Driven_IE_Task_1_evaluation.replace("Subject: {subject}", "") 210 | tool_text = get_tool_text(tool, [data[key]["vision_input"][1], data[key]["vision_input"][2]], log_path) 211 | task_1_eva = task_1_eva.replace("{tool_text}", tool_text) 212 | ## prompt 213 | ## image 214 | img_links = [url2path(data[key]["vision_input"][1], Image_Root), url2path(data[key]["vision_input"][2], Image_Root)] 215 | if tool == "Highlight": 216 | img_1 = load_image(url2path(data[key]["vision_input"][1], Image_Root)) 217 | img_2 = load_image(url2path(data[key]["vision_input"][2], Image_Root)) 218 | img_1 = grding(img_1, subject, "highlight") 219 | img_2 = grding(img_2, subject, "highlight") 220 | img_links = [f"data:image/jpeg;base64,{encode_pil_image(img_1)}", f"data:image/jpeg;base64,{encode_pil_image(img_2)}"] 221 | ## image 222 | 223 | json_1 = GPTResponse2JSON(agent_run.get_result(img_links, task_1_eva)) 224 | json_1['score'] = get_number(json_1['score']) 225 | log_prompt(log_path, task_1_eva) 226 | log_prompt(log_path, json_1) 227 | # Task 1 228 | 229 | # Task 2 230 | ## prompt 231 | if data_tool[key]["tool_plan"][1]["tool"] in ["Highlight", "SceneGraph"]: 232 | tool = data_tool[key]["tool_plan"][1]["tool"] 233 | else: 234 | tool = "None" 235 | tool_text = get_tool_text(tool, [data[key]["vision_input"][0], data[key]["vision_input"][2]], log_path) 236 | task_2_eva = Subject_Driven_IE_Task_2_evaluation.replace("{tool_text}", tool_text) 237 | ## prompt 238 | ## image 239 | img_links = [url2path(data[key]["vision_input"][0], Image_Root), url2path(data[key]["vision_input"][2], Image_Root)] 240 | if tool == "Highlight": 241 | img_1 = load_image(url2path(data[key]["vision_input"][0], Image_Root)) 242 | img_2 = load_image(url2path(data[key]["vision_input"][2], Image_Root)) 243 | img_1 = grding(img_1, "background", "highlight") 244 | img_2 = grding(img_2, "background", "highlight") 245 | img_links = [f"data:image/jpeg;base64,{encode_pil_image(img_1)}", f"data:image/jpeg;base64,{encode_pil_image(img_2)}"] 246 | 247 | ## image 248 | 249 | json_2 = GPTResponse2JSON(agent_run.get_result(img_links, task_2_eva)) 250 | json_2['score'] = get_number(json_2['score']) 251 | log_prompt(log_path, task_2_eva) 252 | log_prompt(log_path, json_2) 253 | # Task 2 254 | 255 | out[key] = {} 256 | out[key]['score'] = [json_1['score'], json_2['score']] 257 | out[key]['reasoning'] = [json_1['reasoning'], json_2['reasoning']] 258 | out[key]['prompt_input'] = [task_1_eva, task_2_eva] 259 | out[key]['vision_input'] = data[key]["vision_input"] 260 | print(f"{key} over") 261 | break 262 | except Exception as e: 263 | print(f"Error: {key} evaluation failed: {e}") 264 | counter += 1 265 | 266 | write_json(out_path, out) 267 | 268 | 269 | 270 | def test_Mask_Guided_IE_tool(in_path, out_path, log_path, test=None): 271 | data = read_json(in_path) 272 | out = {} 273 | st = False 274 | for key in tqdm(list(data.keys())): 275 | if test: 276 | its = in_path.split("/") 277 | if key not in test[its[-3]][its[-2]]['sample']: 278 | continue 279 | counter = 0 280 | while counter < 3: 281 | try: 282 | instruction = 
data[key]["prompt_input"].replace("Editing instruction:", "").strip() 283 | # tool 284 | task_rule = Mask_Guided_IE_Rule.replace("{instruction}", instruction) 285 | task = task_rule + Mask_Guided_IE_Task_1 + Mask_Guided_IE_Task_2 286 | prompt_tool = Tool_Decide.replace("{task}", task) 287 | img_links = [url2path(data[key]["vision_input"][0], Image_Root), url2path(data[key]["vision_input"][1], Image_Root)] 288 | 289 | result = GPTResponse2JSON(agent_run.get_result(img_links, prompt_tool)) 290 | result = check(result, ["task_id", "reasoning", "used", "tool"]) 291 | log_prompt(log_path, prompt_tool) 292 | log_prompt(log_path, result) 293 | # tool 294 | 295 | out[key] = {} 296 | out[key]['tool_plan'] = result 297 | print(f"{key} over") 298 | break 299 | except Exception as e: 300 | print(f"Error: {key} evaluation failed: {e}") 301 | counter += 1 302 | 303 | write_json(out_path, out) 304 | 305 | 306 | 307 | def test_Mask_Guided_IE_evaluate(in_path, tool_path, out_path, log_path, test=None): 308 | data = read_json(in_path) 309 | data_tool = read_json(tool_path) 310 | out = {} 311 | for key in tqdm(list(data.keys())): 312 | if test: 313 | its = in_path.split("/") 314 | if key not in test[its[-3]][its[-2]]['sample']: 315 | continue 316 | counter = 0 317 | while counter < 3: 318 | try: 319 | instruction = data[key]["prompt_input"].replace("Editing instruction:", "").strip() 320 | # Task 1 321 | ## prompt 322 | if data_tool[key]["tool_plan"][0]["tool"] in ["Highlight", "SceneGraph", "MaskFocus"]: 323 | tool = data_tool[key]["tool_plan"][0]["tool"] 324 | else: 325 | tool = "None" 326 | task_1_eva = Mask_Guided_IE_Task_1_evaluation.replace("{instruction}", instruction) 327 | tool_text = get_tool_text(tool, data[key]["vision_input"], log_path) 328 | task_1_eva = task_1_eva.replace("{tool_text}", tool_text) 329 | ## prompt 330 | ## image 331 | img_links = [url2path(data[key]["vision_input"][0], Image_Root), url2path(data[key]["vision_input"][1], Image_Root)] 332 | if tool == "Highlight": 333 | img_1 = load_image(url2path(data[key]["vision_input"][0], Image_Root)) 334 | img_2 = load_image(url2path(data[key]["vision_input"][1], Image_Root)) 335 | img_1 = grding(img_1, instruction, "highlight") 336 | img_2 = grding(img_2, instruction, "highlight") 337 | img_links = [f"data:image/jpeg;base64,{encode_pil_image(img_1)}", f"data:image/jpeg;base64,{encode_pil_image(img_2)}"] 338 | if tool == "MaskFocus": 339 | img_1, img_2 = imgs_diff(url2path(data[key]["vision_input"][0], Image_Root), url2path(data[key]["vision_input"][1], Image_Root), "highlight") 340 | img_links = [f"data:image/jpeg;base64,{encode_pil_image(img_1)}", f"data:image/jpeg;base64,{encode_pil_image(img_2)}"] 341 | ## image 342 | 343 | json_1 = GPTResponse2JSON(agent_run.get_result(img_links, task_1_eva)) 344 | json_1['score'] = get_number(json_1['score']) 345 | log_prompt(log_path, task_1_eva) 346 | log_prompt(log_path, json_1) 347 | # Task 1 348 | 349 | # Task 2 350 | ## prompt 351 | if data_tool[key]["tool_plan"][1]["tool"] in ["Highlight", "SceneGraph", "MaskFocus"]: 352 | tool = data_tool[key]["tool_plan"][1]["tool"] 353 | else: 354 | tool = "None" 355 | task_2_eva = Mask_Guided_IE_Task_2_evaluation.replace("{instruction}", instruction) 356 | tool_text = get_tool_text(tool, data[key]["vision_input"], log_path) 357 | task_2_eva = task_2_eva.replace("{tool_text}", tool_text) 358 | ## prompt 359 | ## image 360 | img_links = [url2path(data[key]["vision_input"][0], Image_Root), url2path(data[key]["vision_input"][1], Image_Root)] 361 | if tool == 
"Highlight": 362 | img_1 = load_image(url2path(data[key]["vision_input"][0], Image_Root)) 363 | img_2 = load_image(url2path(data[key]["vision_input"][1], Image_Root)) 364 | img_1 = grding(img_1, instruction, "highlight") 365 | img_2 = grding(img_2, instruction, "highlight") 366 | img_links = [f"data:image/jpeg;base64,{encode_pil_image(img_1)}", f"data:image/jpeg;base64,{encode_pil_image(img_2)}"] 367 | if tool == "MaskFocus": 368 | img_1, img_2 = imgs_diff(url2path(data[key]["vision_input"][0], Image_Root), url2path(data[key]["vision_input"][1], Image_Root), "highlight") 369 | img_links = [f"data:image/jpeg;base64,{encode_pil_image(img_1)}", f"data:image/jpeg;base64,{encode_pil_image(img_2)}"] 370 | ## image 371 | 372 | json_2 = GPTResponse2JSON(agent_run.get_result(img_links, task_2_eva)) 373 | json_2['score'] = get_number(json_2['score']) 374 | log_prompt(log_path, task_2_eva) 375 | log_prompt(log_path, json_2) 376 | # Task 2 377 | 378 | out[key] = {} 379 | out[key]['score'] = [json_1['score'], json_2['score']] 380 | out[key]['reasoning'] = [json_1['reasoning'], json_2['reasoning']] 381 | out[key]['prompt_input'] = [task_1_eva, task_2_eva] 382 | out[key]['vision_input'] = data[key]["vision_input"] 383 | print(f"{key} over") 384 | break 385 | except Exception as e: 386 | print(f"Error: {key} evaluation failed: {e}") 387 | counter += 1 388 | 389 | write_json(out_path, out) 390 | 391 | 392 | 393 | def test_Multi_Concept_IC_tool(in_path, out_path, log_path, test=None): 394 | data = read_json(in_path) 395 | out = {} 396 | for key in tqdm(list(data.keys())): 397 | if test: 398 | its = in_path.split("/") 399 | if key not in test[its[-3]][its[-2]]['sample']: 400 | continue 401 | counter = 0 402 | while counter < 3: 403 | try: 404 | text = data[key]["prompt_input"].replace("Text Prompt:", "").strip() 405 | # tool 406 | task_rule = Multi_Concept_IC_Rule.replace("{text}", text) 407 | task = task_rule + Multi_Concept_IC_Task_1 + Multi_Concept_IC_Task_2 408 | prompt_tool = Tool_Decide.replace("{task}", task) 409 | img_links = [url2path(data[key]["vision_input"][0], Image_Root), url2path(data[key]["vision_input"][1], Image_Root)] 410 | 411 | result = GPTResponse2JSON(agent_run.get_result(img_links, prompt_tool)) 412 | result = check(result, ["task_id", "reasoning", "used", "tool"]) 413 | log_prompt(log_path, prompt_tool) 414 | log_prompt(log_path, result) 415 | # tool 416 | 417 | out[key] = {} 418 | out[key]['tool_plan'] = result 419 | print(f"{key} over") 420 | break 421 | except Exception as e: 422 | print(f"Error: {key} evaluation failed: {e}") 423 | counter += 1 424 | 425 | write_json(out_path, out) 426 | 427 | 428 | 429 | def test_Multi_Concept_IC_evaluate(in_path, tool_path, out_path, log_path, test=None): 430 | data = read_json(in_path) 431 | data_tool = read_json(tool_path) 432 | out = {} 433 | for key in tqdm(list(data.keys())): 434 | if test: 435 | its = in_path.split("/") 436 | if key not in test[its[-3]][its[-2]]['sample']: 437 | continue 438 | counter = 0 439 | while counter < 3: 440 | try: 441 | text = data[key]["prompt_input"].replace("Text Prompt:", "").strip() 442 | img_L, img_R = split2part(url2path(data[key]["vision_input"][0], Image_Root)) 443 | img_LR_links = [f"data:image/jpeg;base64,{encode_pil_image(img_L)}", f"data:image/jpeg;base64,{encode_pil_image(img_R)}"] 444 | subject_L, subject_R = data[key]["concepts"][0], data[key]["concepts"][1] 445 | # Task 1 446 | ## prompt 447 | if data_tool[key]["tool_plan"][0]["tool"] in ["Highlight", "SceneGraph"]: 448 | tool = 
data_tool[key]["tool_plan"][0]["tool"] 449 | else: 450 | tool = "None" 451 | task_1_eva = Multi_Concept_IC_Task_1_evaluation.replace("{text}", text) 452 | task_1_eva = task_1_eva.replace("{subject}", subject_L) 453 | tool_text = get_tool_text(tool, [img_LR_links[0], data[key]["vision_input"][1]], log_path) 454 | task_1_eva = task_1_eva.replace("{tool_text}", tool_text) 455 | ## prompt 456 | 457 | img_links = [img_LR_links[0], url2path(data[key]["vision_input"][1], Image_Root)] 458 | if tool == "Highlight": 459 | img_1 = img_L 460 | img_2 = load_image(url2path(data[key]["vision_input"][1], Image_Root)) 461 | img_1 = grding(img_1, subject_L, "highlight") 462 | img_2 = grding(img_2, subject_L, "highlight") 463 | img_links = [f"data:image/jpeg;base64,{encode_pil_image(img_1)}", f"data:image/jpeg;base64,{encode_pil_image(img_2)}"] 464 | ## image 465 | 466 | json_1 = GPTResponse2JSON(agent_run.get_result(img_links, task_1_eva)) 467 | json_1['score'] = get_number(json_1['score']) 468 | log_prompt(log_path, task_1_eva) 469 | log_prompt(log_path, json_1) 470 | # Task 1 471 | 472 | # Task 1 473 | ## prompt 474 | if data_tool[key]["tool_plan"][0]["tool"] in ["Highlight", "SceneGraph"]: 475 | tool = data_tool[key]["tool_plan"][0]["tool"] 476 | else: 477 | tool = "None" 478 | task_1_eva_ = Multi_Concept_IC_Task_1_evaluation.replace("{text}", text) 479 | task_1_eva_ = task_1_eva_.replace("{subject}", subject_R) 480 | tool_text = get_tool_text(tool, [img_LR_links[1], data[key]["vision_input"][1]], log_path) 481 | task_1_eva_ = task_1_eva_.replace("{tool_text}", tool_text) 482 | ## prompt 483 | ## image 484 | img_links = [img_LR_links[1], url2path(data[key]["vision_input"][1], Image_Root)] 485 | if tool == "Highlight": 486 | img_1 = img_R 487 | img_2 = load_image(url2path(data[key]["vision_input"][1], Image_Root)) 488 | img_1 = grding(img_1, subject_R, "highlight") 489 | img_2 = grding(img_2, subject_R, "highlight") 490 | img_links = [f"data:image/jpeg;base64,{encode_pil_image(img_1)}", f"data:image/jpeg;base64,{encode_pil_image(img_2)}"] 491 | ## image 492 | 493 | json_2 = GPTResponse2JSON(agent_run.get_result(img_links, task_1_eva_)) 494 | json_2['score'] = get_number(json_2['score']) 495 | log_prompt(log_path, task_1_eva_) 496 | log_prompt(log_path, json_2) 497 | # Task 1 498 | 499 | # Task 2 500 | ## prompt 501 | if data_tool[key]["tool_plan"][1]["tool"] in ["Highlight", "SceneGraph"]: 502 | tool = data_tool[key]["tool_plan"][1]["tool"] 503 | else: 504 | tool = "None" 505 | task_2_eva = Multi_Concept_IC_Task_2_evaluation.replace("{text}", text) 506 | tool_text = get_tool_text(tool, [data[key]["vision_input"][1]], log_path) 507 | task_2_eva = task_2_eva.replace("{tool_text}", tool_text) 508 | ## prompt 509 | ## image 510 | img_links = [url2path(data[key]["vision_input"][1], Image_Root)] 511 | if tool == "Highlight": 512 | img_2 = load_image(url2path(data[key]["vision_input"][1], Image_Root)) 513 | img_2 = grding(img_2, text, "highlight") 514 | img_links = [f"data:image/jpeg;base64,{encode_pil_image(img_2)}"] 515 | ## image 516 | 517 | json_3 = GPTResponse2JSON(agent_run.get_result(img_links, task_2_eva)) 518 | json_3['score'] = get_number(json_3['score']) 519 | log_prompt(log_path, task_2_eva) 520 | log_prompt(log_path, json_3) 521 | # Task 2 522 | 523 | out[key] = {} 524 | out[key]['score'] = [json_1['score'], json_2['score'], json_3['score']] 525 | out[key]['reasoning'] = [json_1['reasoning'], json_2['reasoning'], json_3['reasoning']] 526 | out[key]['prompt_input'] = [task_1_eva, task_1_eva_, 
task_2_eva] 527 | out[key]['vision_input'] = data[key]["vision_input"] 528 | print(f"{key} over") 529 | break 530 | except Exception as e: 531 | print(f"Error: {key} evaluation failed: {e}") 532 | counter += 1 533 | 534 | write_json(out_path, out) 535 | 536 | 537 | 538 | def test_Text_Guided_IG_tool(in_path, out_path, log_path, test=None): 539 | data = read_json(in_path) 540 | out = {} 541 | for key in tqdm(list(data.keys())): 542 | if test: 543 | its = in_path.split("/") 544 | if key not in test[its[-3]][its[-2]]['sample']: 545 | continue 546 | counter = 0 547 | while counter < 3: 548 | try: 549 | text = data[key]["prompt_input"].replace("Text Prompt:", "").strip() 550 | # tool 551 | task_rule = Text_Guided_IG_Rule.replace("{text}", text) 552 | task = task_rule + Text_Guided_IG_Task_1 553 | prompt_tool = Tool_Decide.replace("{task}", task) 554 | img_links = [url2path(data[key]["vision_input"][0], Image_Root)] 555 | 556 | result = GPTResponse2JSON(agent_run.get_result(img_links, prompt_tool)) 557 | result = check(result, ["task_id", "reasoning", "used", "tool"]) 558 | log_prompt(log_path, prompt_tool) 559 | log_prompt(log_path, result) 560 | # tool 561 | 562 | out[key] = {} 563 | out[key]['tool_plan'] = result 564 | print(f"{key} over") 565 | break 566 | except Exception as e: 567 | print(f"Error: {key} evaluation failed: {e}") 568 | counter += 1 569 | 570 | write_json(out_path, out) 571 | 572 | 573 | 574 | def test_Text_Guided_IG_evaluate(in_path, tool_path, out_path, log_path, test=None): 575 | data = read_json(in_path) 576 | data_tool = read_json(tool_path) 577 | out = {} 578 | for key in tqdm(list(data.keys())): 579 | if test: 580 | its = in_path.split("/") 581 | if key not in test[its[-3]][its[-2]]['sample']: 582 | continue 583 | counter = 0 584 | while counter < 3: 585 | try: 586 | text = data[key]["prompt_input"].replace("Text Prompt:", "").strip() 587 | # Task 1 588 | ## prompt 589 | if data_tool[key]["tool_plan"][0]["tool"] in ["Highlight", "SceneGraph"]: 590 | tool = data_tool[key]["tool_plan"][0]["tool"] 591 | else: 592 | tool = "None" 593 | task_1_eva = Text_Guided_IG_Task_1_evaluation.replace("{text}", text) 594 | tool_text = get_tool_text(tool, data[key]["vision_input"], log_path) 595 | task_1_eva = task_1_eva.replace("{tool_text}", tool_text) 596 | ## prompt 597 | ## image 598 | img_links = [url2path(data[key]["vision_input"][0], Image_Root)] 599 | if tool == "Highlight": 600 | img_1 = load_image(url2path(data[key]["vision_input"][0], Image_Root)) 601 | img_1 = grding(img_1, text, "highlight") 602 | img_links = [f"data:image/jpeg;base64,{encode_pil_image(img_1)}"] 603 | 604 | ## image 605 | 606 | json_1 = GPTResponse2JSON(agent_run.get_result(img_links, task_1_eva)) 607 | json_1['score'] = get_number(json_1['score']) 608 | log_prompt(log_path, task_1_eva) 609 | log_prompt(log_path, json_1) 610 | # Task 1 611 | 612 | out[key] = {} 613 | out[key]['score'] = [json_1['score']] 614 | out[key]['reasoning'] = [json_1['reasoning']] 615 | out[key]['prompt_input'] = [task_1_eva] 616 | out[key]['vision_input'] = data[key]["vision_input"] 617 | print(f"{key} over") 618 | break 619 | except Exception as e: 620 | print(f"Error: {key} evaluation failed: {e}") 621 | counter += 1 622 | 623 | write_json(out_path, out) 624 | 625 | 626 | 627 | def test_Control_Guided_IG_tool(in_path, out_path, log_path, test=None): 628 | data = read_json(in_path) 629 | out = {} 630 | for key in tqdm(list(data.keys())): 631 | if test: 632 | its = in_path.split("/") 633 | if key not in 
test[its[-3]][its[-2]]['sample']: 634 | continue 635 | counter = 0 636 | while counter < 3: 637 | try: 638 | text = data[key]["prompt_input"].replace("Text Prompt:", "").strip() 639 | # tool 640 | task_rule = Control_Guided_IG_Rule.replace("{text}", text) 641 | task = task_rule + Control_Guided_IG_Task_1 + Control_Guided_IG_Task_2 642 | prompt_tool = Tool_Decide.replace("{task}", task) 643 | img_links = [url2path(data[key]["vision_input"][0], Image_Root), url2path(data[key]["vision_input"][1], Image_Root)] 644 | 645 | result = GPTResponse2JSON(agent_run.get_result(img_links, prompt_tool)) 646 | result = check(result, ["task_id", "reasoning", "used", "tool"]) 647 | log_prompt(log_path, prompt_tool) 648 | log_prompt(log_path, result) 649 | # tool 650 | 651 | out[key] = {} 652 | out[key]['tool_plan'] = result 653 | print(f"{key} over") 654 | break 655 | except Exception as e: 656 | print(f"Error: {key} evaluation failed: {e}") 657 | counter += 1 658 | 659 | write_json(out_path, out) 660 | 661 | 662 | 663 | def test_Control_Guided_IG_evaluate(in_path, tool_path, out_path, log_path,test=None): 664 | data = read_json(in_path) 665 | data_tool = read_json(tool_path) 666 | out = {} 667 | for key in tqdm(list(data.keys())): 668 | if test: 669 | its = in_path.split("/") 670 | if key not in test[its[-3]][its[-2]]['sample']: 671 | continue 672 | counter = 0 673 | while counter < 3: 674 | try: 675 | text = data[key]["prompt_input"].replace("Text Prompt:", "").strip() 676 | # Task 1 677 | ## prompt 678 | if data_tool[key]["tool_plan"][0]["tool"] in ["Highlight", "SceneGraph"]: 679 | tool = data_tool[key]["tool_plan"][0]["tool"] 680 | else: 681 | tool = "None" 682 | task_1_eva = Control_Guided_IG_Task_1_evaluation 683 | tool_text = get_tool_text(tool, data[key]["vision_input"], log_path) 684 | task_1_eva = task_1_eva.replace("{tool_text}", tool_text) 685 | ## prompt 686 | ## image 687 | img_links = [url2path(data[key]["vision_input"][0], Image_Root), url2path(data[key]["vision_input"][1], Image_Root)] 688 | if tool == "Highlight": 689 | img_1 = load_image(url2path(data[key]["vision_input"][0], Image_Root)) 690 | img_2 = load_image(url2path(data[key]["vision_input"][1], Image_Root)) 691 | img_1 = grding(img_1, text, "highlight") 692 | img_2 = grding(img_2, text, "highlight") 693 | img_links = [f"data:image/jpeg;base64,{encode_pil_image(img_1)}", f"data:image/jpeg;base64,{encode_pil_image(img_2)}"] 694 | ## image 695 | 696 | json_1 = GPTResponse2JSON(agent_run.get_result(img_links, task_1_eva)) 697 | json_1['score'] = get_number(json_1['score']) 698 | log_prompt(log_path, task_1_eva) 699 | log_prompt(log_path, json_1) 700 | # Task 1 701 | 702 | # Task 2 703 | ## prompt 704 | if data_tool[key]["tool_plan"][1]["tool"] in ["Highlight", "SceneGraph"]: 705 | tool = data_tool[key]["tool_plan"][1]["tool"] 706 | else: 707 | tool = "None" 708 | task_2_eva = Control_Guided_IG_Task_2_evaluation.replace("{text}", text) 709 | tool_text = get_tool_text(tool, [data[key]["vision_input"][1]], log_path) 710 | task_2_eva = task_2_eva.replace("{tool_text}", tool_text) 711 | ## prompt 712 | ## image 713 | img_links = [url2path(data[key]["vision_input"][1], Image_Root)] 714 | if tool == "Highlight": 715 | img_2 = load_image(url2path(data[key]["vision_input"][1], Image_Root)) 716 | img_2 = grding(img_2, text, "highlight") 717 | img_links = [f"data:image/jpeg;base64,{encode_pil_image(img_2)}"] 718 | ## image 719 | 720 | json_2 = GPTResponse2JSON(agent_run.get_result(img_links, task_2_eva)) 721 | json_2['score'] = 
get_number(json_2['score']) 722 | log_prompt(log_path, task_2_eva) 723 | log_prompt(log_path, json_2) 724 | # Task 2 725 | 726 | out[key] = {} 727 | out[key]['score'] = [json_1['score'], json_2['score']] 728 | out[key]['reasoning'] = [json_1['reasoning'], json_2['reasoning']] 729 | out[key]['prompt_input'] = [task_1_eva, task_2_eva] 730 | out[key]['vision_input'] = data[key]["vision_input"] 731 | print(f"{key} over") 732 | break 733 | except Exception as e: 734 | print(f"Error: {key} evaluation failed: {e}") 735 | counter += 1 736 | 737 | write_json(out_path, out) 738 | 739 | 740 | 741 | def test_Subject_Driven_IG_tool(in_path, out_path, log_path, test=None): 742 | data = read_json(in_path) 743 | out = {} 744 | for key in tqdm(list(data.keys())): 745 | if test: 746 | its = in_path.split("/") 747 | if key not in test[its[-3]][its[-2]]['sample']: 748 | continue 749 | counter = 0 750 | while counter < 3: 751 | try: 752 | text = data[key]["prompt_input"].replace("Text Prompt:", "").strip() 753 | # tool 754 | task_rule = Subject_Driven_IG_Rule.replace("{text}", text) 755 | task = task_rule + Subject_Driven_IG_Task_1 + Subject_Driven_IG_Task_2 756 | prompt_tool = Tool_Decide.replace("{task}", task) 757 | img_links = [url2path(data[key]["vision_input"][0], Image_Root), url2path(data[key]["vision_input"][1], Image_Root)] 758 | 759 | result = GPTResponse2JSON(agent_run.get_result(img_links, prompt_tool)) 760 | result = check(result, ["task_id", "reasoning", "used", "tool"]) 761 | log_prompt(log_path, prompt_tool) 762 | log_prompt(log_path, result) 763 | # tool 764 | 765 | out[key] = {} 766 | out[key]['tool_plan'] = result 767 | print(f"{key} over") 768 | break 769 | except Exception as e: 770 | print(f"Error: {key} evaluation failed: {e}") 771 | counter += 1 772 | 773 | write_json(out_path, out) 774 | 775 | 776 | 777 | def test_Subject_Driven_IG_evaluate(in_path, tool_path, out_path, log_path, test=None): 778 | data = read_json(in_path) 779 | data_tool = read_json(tool_path) 780 | out = {} 781 | for key in tqdm(list(data.keys())): 782 | if test: 783 | its = in_path.split("/") 784 | if key not in test[its[-3]][its[-2]]['sample']: 785 | continue 786 | counter = 0 787 | while counter < 3: 788 | try: 789 | text = data[key]["prompt_input"].replace("Text Prompt:", "").strip() 790 | subject = data[key]["subject"] 791 | # Task 1 792 | ## prompt 793 | if data_tool[key]["tool_plan"][0]["tool"] in ["Highlight", "SceneGraph"]: 794 | tool = data_tool[key]["tool_plan"][0]["tool"] 795 | else: 796 | tool = "None" 797 | task_1_eva = Subject_Driven_IG_Task_1_evaluation.replace("{subject}", subject) 798 | tool_text = get_tool_text(tool, data[key]["vision_input"], log_path) 799 | task_1_eva = task_1_eva.replace("{tool_text}", tool_text) 800 | ## prompt 801 | ## image 802 | img_links = [url2path(data[key]["vision_input"][0], Image_Root), url2path(data[key]["vision_input"][1], Image_Root)] 803 | if tool == "Highlight": 804 | img_1 = load_image(url2path(data[key]["vision_input"][0], Image_Root)) 805 | img_2 = load_image(url2path(data[key]["vision_input"][1], Image_Root)) 806 | img_1 = grding(img_1, text, "highlight") 807 | img_2 = grding(img_2, text, "highlight") 808 | img_links = [f"data:image/jpeg;base64,{encode_pil_image(img_1)}", f"data:image/jpeg;base64,{encode_pil_image(img_2)}"] 809 | ## image 810 | 811 | json_1 = GPTResponse2JSON(agent_run.get_result(img_links, task_1_eva)) 812 | json_1['score'] = get_number(json_1['score']) 813 | log_prompt(log_path, task_1_eva) 814 | log_prompt(log_path, json_1) 815 | # Task 1 
816 | 817 | # Task 2 818 | ## prompt 819 | if data_tool[key]["tool_plan"][1]["tool"] in ["Highlight", "SceneGraph"]: 820 | tool = data_tool[key]["tool_plan"][1]["tool"] 821 | else: 822 | tool = "None" 823 | task_2_eva = Subject_Driven_IG_Task_2_evaluation.replace("{text}", text) 824 | tool_text = get_tool_text(tool, [data[key]["vision_input"][1]], log_path) 825 | task_2_eva = task_2_eva.replace("{tool_text}", tool_text) 826 | ## prompt 827 | ## image 828 | img_links = [url2path(data[key]["vision_input"][1], Image_Root)] 829 | if tool == "Highlight": 830 | img_2 = load_image(url2path(data[key]["vision_input"][1], Image_Root)) 831 | img_2 = grding(img_2, text, "highlight") 832 | img_links = [f"data:image/jpeg;base64,{encode_pil_image(img_2)}"] 833 | ## image 834 | 835 | json_2 = GPTResponse2JSON(agent_run.get_result(img_links, task_2_eva)) 836 | json_2['score'] = get_number(json_2['score']) 837 | log_prompt(log_path, task_2_eva) 838 | log_prompt(log_path, json_2) 839 | # Task 2 840 | 841 | out[key] = {} 842 | out[key]['score'] = [json_1['score'], json_2['score']] 843 | out[key]['reasoning'] = [json_1['reasoning'], json_2['reasoning']] 844 | out[key]['prompt_input'] = [task_1_eva, task_2_eva] 845 | out[key]['vision_input'] = data[key]["vision_input"] 846 | print(f"{key} over") 847 | break 848 | except Exception as e: 849 | print(f"Error: {key} evaluation failed: {e}") 850 | counter += 1 851 | 852 | write_json(out_path, out) 853 | 854 | 855 | 856 | 857 | 858 | def evaluate(in_path, tool_out_path, tool_log_path, eva_out_path, eva_log_path, test): 859 | if "ImagenHub_Control-Guided_IG" in in_path: 860 | test_Control_Guided_IG_tool(in_path, tool_out_path, tool_log_path, test) 861 | test_Control_Guided_IG_evaluate(in_path, tool_out_path, eva_out_path, eva_log_path, test) 862 | if "ImagenHub_Mask-Guided_IE" in in_path: 863 | test_Mask_Guided_IE_tool(in_path, tool_out_path, tool_log_path, test) 864 | test_Mask_Guided_IE_evaluate(in_path, tool_out_path, eva_out_path, eva_log_path, test) 865 | if "ImagenHub_Multi-Concept_IC" in in_path: 866 | test_Multi_Concept_IC_tool(in_path, tool_out_path, tool_log_path, test) 867 | test_Multi_Concept_IC_evaluate(in_path, tool_out_path, eva_out_path, eva_log_path, test) 868 | if "ImagenHub_Subject-Driven_IE" in in_path: 869 | test_Subject_Driven_IE_tool(in_path, tool_out_path, tool_log_path, test) 870 | test_Subject_Driven_IE_evaluate(in_path, tool_out_path, eva_out_path, eva_log_path, test) 871 | if "ImagenHub_Subject-Driven_IG" in in_path: 872 | test_Subject_Driven_IG_tool(in_path, tool_out_path, tool_log_path, test) 873 | test_Subject_Driven_IG_evaluate(in_path, tool_out_path, eva_out_path, eva_log_path, test) 874 | if "ImagenHub_Text-Guided_IE" in in_path: 875 | test_Text_Guided_IE_tool(in_path, tool_out_path, tool_log_path, test) 876 | test_Text_Guided_IE_evaluate(in_path, tool_out_path, eva_out_path, eva_log_path, test) 877 | if "ImagenHub_Text-Guided_IG" in in_path: 878 | test_Text_Guided_IG_tool(in_path, tool_out_path, tool_log_path, test) 879 | test_Text_Guided_IG_evaluate(in_path, tool_out_path, eva_out_path, eva_log_path, test) 880 | 881 | 882 | 883 | import os 884 | agent_run = LlavaNext() 885 | Image_task_path = "PATH_TO_ImageHub_DATA" 886 | test = read_json("test_40p.json") 887 | # test=None 888 | for task in os.listdir(Image_task_path): 889 | task_path = os.path.join(Image_task_path, task) 890 | models = os.listdir(task_path) 891 | print(models) 892 | for dir in models: 893 | if dir!="input" and dir!="token": 894 | tool_log_path = 
f"{task_path}_{dir}_tool_test_40p.txt" 895 | tool_out_path = f"{task_path}/{dir}/SC_tool_test_40p.json" 896 | eva_log_path = f"{task_path}_{dir}_eva_test_40p.txt" 897 | eva_out_path = f"{task_path}/{dir}/SC_eva_test_40p.json" 898 | in_path = f"{task_path}/{dir}/in.json" 899 | evaluate(in_path, tool_out_path, tool_log_path, eva_out_path, eva_log_path, test) 900 | 901 | --------------------------------------------------------------------------------