├── Toxicity AI Prototype
│   ├── .env
│   ├── 4.3. 핵심 기술 구현 결과서 (Toxicity AI 프로토타입 개발).pdf
│   ├── no_tool_chat_template_qwen3.jinja
│   ├── mmlu_toxic.py
│   ├── utils.py
│   └── mobile_eval_e.py
├── Generalized ADMET Inference Baseline
│   ├── 4.2. 핵심 기술 구현 결과서 (Generalizaed ADMET Inference 베이스라인 구축).pdf
│   ├── chem_cot.py
│   └── utils.py
├── run_vllm.sh
├── utils.py
└── README.md

/Toxicity AI Prototype/.env:
--------------------------------------------------------------------------------
BASE_URL=http://localhost:30002/v1/
GPT_API_KEY=
--------------------------------------------------------------------------------
/Toxicity AI Prototype/4.3. 핵심 기술 구현 결과서 (Toxicity AI 프로토타입 개발).pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KwangSun-Ryu/ADMET-AGI-Toxicity-AI-Prototype-and-Baseline--/HEAD/Toxicity AI Prototype/4.3. 핵심 기술 구현 결과서 (Toxicity AI 프로토타입 개발).pdf
--------------------------------------------------------------------------------
/Generalized ADMET Inference Baseline/4.2. 핵심 기술 구현 결과서 (Generalizaed ADMET Inference 베이스라인 구축).pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KwangSun-Ryu/ADMET-AGI-Toxicity-AI-Prototype-and-Baseline--/HEAD/Generalized ADMET Inference Baseline/4.2. 핵심 기술 구현 결과서 (Generalizaed ADMET Inference 베이스라인 구축).pdf
--------------------------------------------------------------------------------
/run_vllm.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

# Example vLLM container launch (adjust the volume path, port, and GPUs to your environment)
docker run -it --rm \
    --gpus all \
    -p 30002:8000 \
    -v "/mnt/e/Google Drive/External SSD/HealthCare/ADMET/코드/ADMET-AGI/Toxicity AI":/workspace:rw \
    -e CUDA_VISIBLE_DEVICES=0 \
    -e TP_SIZE=1 \
    -e MODEL_PATH=/workspace/25TOXMC_Blowfish_v1.0.9-AWQ \
    -e CHAT_TEMPLATE_PATH=/workspace/no_tool_chat_template_qwen3.jinja \
    -e GPU_MEMORY_UTILIZATION=0.9 \
    -e DTYPE=bfloat16 \
    vllm-25admet-vllm \
    --host=0.0.0.0 \
    --model=/workspace/25TOXMC_Blowfish_v1.0.9-AWQ \
    --dtype=bfloat16 \
    --chat-template=/workspace/no_tool_chat_template_qwen3.jinja \
    --gpu-memory-utilization=0.9 \
    --tensor-parallel-size=1 \
    --max-model-len=16384

# Example: set the .env variables inside the container, then run the evaluation scripts
# export BASE_URL=http://<host>:30002/v1/
# export GPT_API_KEY=
# python3 mobile_eval_e.py
# python3 mmlu_toxic.py
# python3 chem_cot.py
--------------------------------------------------------------------------------
/Toxicity AI Prototype/no_tool_chat_template_qwen3.jinja:
--------------------------------------------------------------------------------
{%- if messages[0].role == 'system' %}
<|im_start|>system
{{ messages[0].content }}
<|im_end|>

{%- endif %}
{%- for message in messages %}
{#- The leading system message was already emitted above; skip only that one. #}
{%- if loop.first and message.role == 'system' %}{% continue %}{% endif %}
{%- if message.content is string %}
{%- set content = message.content %}
{%- else %}
{%- set content = '' %}
{%- endif %}

{%- if message.role == "user" or (message.role == "system" and not loop.first) %}
<|im_start|>{{ message.role }}
{{ content }}
<|im_end|>

{%- elif message.role == "assistant" %}
{%- set reasoning_content = '' %}
{%- if message.reasoning_content is string %}
{%- set reasoning_content = message.reasoning_content %}
{%- elif '</think>' in content %}
{%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
{%- set content = content.split('</think>')[-1].lstrip('\n') %}
{%- endif %}

<|im_start|>{{ message.role }}
{%- if reasoning_content %}
<think>
{{ reasoning_content.strip('\n') }}
</think>

{%- endif %}
{{ content.lstrip('\n') }}
<|im_end|>
{%- endif %}
{%- endfor %}
{%- if add_generation_prompt %}
<|im_start|>assistant
{%- endif %}
--------------------------------------------------------------------------------
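For reference, a minimal Python sketch (not part of the repo; the sample string is hypothetical) that mirrors the template's `<think>` split on an assistant message:

```python
# Sketch of the template's reasoning/answer split, in plain Python.
content = "<think>\nThe molecule has an ester group...\n</think>\nFinal answer: toxic."

# Take everything between <think> and </think> as reasoning ...
reasoning = content.split("</think>")[0].rstrip("\n").split("<think>")[-1].lstrip("\n")
# ... and everything after </think> as the visible answer.
answer = content.split("</think>")[-1].lstrip("\n")

print(reasoning)  # The molecule has an ester group...
print(answer)     # Final answer: toxic.
```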
/Toxicity AI Prototype/mmlu_toxic.py:
--------------------------------------------------------------------------------
# mmlu_toxic.py
import asyncio
import json
import os

import openai
from dotenv import load_dotenv

from utils import run_concurrent_worker, save_jsonl, compute_em_score_mmlu, summarize_scores

load_dotenv()

BASE_URL = os.getenv("BASE_URL")

client = openai.AsyncOpenAI(api_key="dummy", base_url=BASE_URL)

def build_messages(item):
    system = item.get("system", "")
    prompt = item.get("prompt", "")
    return [
        {"role": "system", "content": system},
        {"role": "user", "content": prompt},
    ]

def main():
    with open("./mmlu_toxic.json", "r", encoding="utf-8") as f:
        data = json.load(f)

    outputs = asyncio.run(run_concurrent_worker(data, build_messages, client, concurrency=16))

    results = []
    for i, item in enumerate(data):
        pred = outputs[i]
        em = compute_em_score_mmlu(pred, item.get("answer", []))
        results.append({
            "id": item.get("id", i),
            "prompt": item.get("prompt"),
            "model_output": pred,
            "reference": item.get("answer"),
            "score": em,
        })

    save_jsonl(results, "./MMLU_toxic_results.jsonl")
    print("SUMMARY:", summarize_scores(results))

if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/Generalized ADMET Inference Baseline/chem_cot.py:
--------------------------------------------------------------------------------
# chem_cot.py
import asyncio
import json
import os

import datasets
import openai
from dotenv import load_dotenv

from utils import run_concurrent_worker, save_jsonl, compute_em_score, summarize_scores

load_dotenv()

BASE_URL = os.getenv("BASE_URL")
print(BASE_URL)
client = openai.AsyncOpenAI(api_key="dummy", base_url=BASE_URL)

def build_messages(item):
    system = '''You are a chemical assistant. Given the SMILES structural formula of a molecule, help me add a specified functional group and output the improved SMILES sequence of the molecule.
Your response must be in a directly parsable JSON format:
{
    "output": "Modified Molecule SMILES"
}'''
    prompt = item.get("prompt") or item.get("query", "")
    return [
        {"role": "system", "content": system},
        {"role": "user", "content": prompt},
    ]

def main():
    ds = datasets.load_from_disk('./ChemCoTBench')
    outputs = asyncio.run(run_concurrent_worker(ds, build_messages, client, concurrency=16))

    results = []
    for i, item in enumerate(ds):
        pred = outputs[i]
        gold = json.loads(item["meta"]).get("reference")
        em = compute_em_score(pred, gold)
        results.append({
            "id": item.get("id", i),
            "prompt": item.get("prompt") or item.get("query"),
            "model_output": pred,
            "reference": gold,
            "score": em,
        })

    save_jsonl(results, "./ChemCoT_results.jsonl")
    print("SUMMARY:", summarize_scores(results))

if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
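Note that `compute_em_score` is raw string equality, so a chemically identical SMILES written with a different atom ordering scores 0. A hedged sketch of a stricter alternative using canonicalization (RDKit is not a dependency of this repo; `canonical_em` is hypothetical):

```python
# Sketch only: canonical-form SMILES comparison. Assumes RDKit is installed.
from rdkit import Chem

def canonical_em(pred_smiles: str, gold_smiles: str) -> int:
    pred = Chem.MolFromSmiles(pred_smiles or "")
    gold = Chem.MolFromSmiles(gold_smiles or "")
    if pred is None or gold is None:
        return 0  # an unparseable SMILES never matches
    return int(Chem.MolToSmiles(pred) == Chem.MolToSmiles(gold))

print(canonical_em("OCC", "CCO"))  # 1: both are ethanol, just written differently
```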
/Toxicity AI Prototype/utils.py:
--------------------------------------------------------------------------------
# utils.py (Toxicity AI Prototype)
import asyncio
import json
import os
import re

from tqdm import tqdm

# ==========================
# File output
# ==========================
def save_jsonl(data_list, out_path):
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as f:
        for item in data_list:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")
    print(f"Saved results to {out_path}")

# ==========================
# EM scoring
# ==========================
def compute_em_score_mmlu(pred, reference):
    # MMLU answers are given as a collection of acceptable strings; membership counts.
    return 1 if pred in reference else 0

# ==========================
# Summary
# ==========================
def summarize_scores(results):
    total = len(results)
    em_total = sum(r.get("score", 0) for r in results)
    return {
        "n_samples": total,
        "em_score": em_total / total if total > 0 else None,
    }

# ==========================
# Async model call
# ==========================
async def call_model_async(messages, client, retries=3, initial_delay=1.0):
    # Exponential backoff (1s, 2s, 4s, ...); the final failure is re-raised.
    delay = initial_delay
    for attempt in range(retries):
        try:
            resp = await client.chat.completions.create(
                model="25TOXMC_Blowfish_v1.0.9-AWQ",
                messages=messages,
                temperature=0.0,
                top_p=0.95,
                stream=False,
            )
            return resp.choices[0].message.content
        except Exception:
            if attempt == retries - 1:
                raise
            await asyncio.sleep(delay)
            delay *= 2

# ==========================
# Shared async worker
# ==========================
async def run_concurrent_worker(data, build_messages_func, client, concurrency=16):
    sem = asyncio.Semaphore(concurrency)
    results = [None] * len(data)

    async def worker(i):
        async with sem:
            messages = build_messages_func(data[i])
            out = await call_model_async(messages, client)
            # Strip the <think> block, then try to parse the remainder as JSON.
            out_clean = re.sub(r"<think>.*?</think>", "", out, flags=re.DOTALL).strip()
            try:
                out_json = json.loads(out_clean)
                results[i] = out_json.get("output")
            except Exception:
                # Fall back to the raw (de-<think>ed) text when it is not valid JSON.
                results[i] = out_clean

    tasks = [asyncio.create_task(worker(i)) for i in range(len(data))]
    for f in tqdm(asyncio.as_completed(tasks), total=len(data), desc="Running inference"):
        await f

    return results
--------------------------------------------------------------------------------
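A minimal usage sketch for `run_concurrent_worker` (the data item is illustrative; assumes the vLLM server from `run_vllm.sh` is up on localhost:30002):

```python
# Sketch: driving the shared worker with one hypothetical item.
import asyncio

import openai
from utils import run_concurrent_worker

client = openai.AsyncOpenAI(api_key="dummy", base_url="http://localhost:30002/v1/")
data = [{"system": "You are a toxicologist.", "prompt": "Is benzene hepatotoxic? Answer yes or no."}]

def build_messages(item):
    return [
        {"role": "system", "content": item["system"]},
        {"role": "user", "content": item["prompt"]},
    ]

outputs = asyncio.run(run_concurrent_worker(data, build_messages, client, concurrency=1))
print(outputs[0])
```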
/Generalized ADMET Inference Baseline/utils.py:
--------------------------------------------------------------------------------
# utils.py (Generalized ADMET Inference Baseline)
import asyncio
import json
import os
import re

from tqdm import tqdm

# ==========================
# File output
# ==========================
def save_jsonl(data_list, out_path):
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as f:
        for item in data_list:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")
    print(f"Saved results to {out_path}")

# ==========================
# EM scoring
# ==========================
def compute_em_score(pred, reference):
    return 1 if pred == reference else 0

# ==========================
# Summary
# ==========================
def summarize_scores(results):
    total = len(results)
    em_total = sum(r.get("score", 0) for r in results)
    return {
        "n_samples": total,
        "em_score": em_total / total if total > 0 else None,
    }

# ==========================
# Async model call
# ==========================
async def call_model_async(messages, client, retries=3, initial_delay=1.0):
    # Exponential backoff (1s, 2s, 4s, ...); the final failure is re-raised.
    delay = initial_delay
    for attempt in range(retries):
        try:
            resp = await client.chat.completions.create(
                model="25TOXMC_Blowfish_v1.0.9-AWQ",
                messages=messages,
                temperature=0.0,
                top_p=0.95,
                stream=False,
            )
            return resp.choices[0].message.content
        except Exception:
            if attempt == retries - 1:
                raise
            await asyncio.sleep(delay)
            delay *= 2

# ==========================
# Shared async worker
# ==========================
async def run_concurrent_worker(data, build_messages_func, client, concurrency=16):
    sem = asyncio.Semaphore(concurrency)
    results = [None] * len(data)

    async def worker(i):
        async with sem:
            messages = build_messages_func(data[i])
            out = await call_model_async(messages, client)
            # Strip the <think> block, then try to parse the remainder as JSON.
            out_clean = re.sub(r"<think>.*?</think>", "", out, flags=re.DOTALL).strip()
            try:
                out_json = json.loads(out_clean)
                results[i] = out_json.get("output")
            except Exception:
                results[i] = out_clean

    tasks = [asyncio.create_task(worker(i)) for i in range(len(data))]
    for f in tqdm(asyncio.as_completed(tasks), total=len(data), desc="Running inference"):
        await f

    return results
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
# utils.py
import asyncio
import json
import os
import re

from tqdm import tqdm

# ==========================
# File output
# ==========================
def save_jsonl(data_list, out_path):
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as f:
        for item in data_list:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")
    print(f"Saved results to {out_path}")

# ==========================
# EM scoring
# ==========================
def compute_em_score(pred, reference):
    return 1 if pred == reference else 0

def compute_em_score_mmlu(pred, reference):
    return 1 if pred in reference else 0

# ==========================
# Summary
# ==========================
def summarize_scores(results):
    total = len(results)
    em_total = sum(r.get("score", 0) for r in results)
    return {
        "n_samples": total,
        "em_score": em_total / total if total > 0 else None,
    }

# ==========================
# Async model call
# ==========================
async def call_model_async(messages, client, retries=3, initial_delay=1.0):
    delay = initial_delay
    for attempt in range(retries):
        try:
            resp = await client.chat.completions.create(
                model="25TOXMC_Blowfish_v1.0.9-AWQ",
                messages=messages,
                temperature=0.0,
                top_p=0.95,
                stream=False,
            )
            return resp.choices[0].message.content
        except Exception:
            if attempt == retries - 1:
                raise
            await asyncio.sleep(delay)
            delay *= 2

# ==========================
# Shared async worker
# ==========================
async def run_concurrent_worker(data, build_messages_func, client, concurrency=16):
    sem = asyncio.Semaphore(concurrency)
    results = [None] * len(data)

    async def worker(i):
        async with sem:
            messages = build_messages_func(data[i])
            out = await call_model_async(messages, client)
            # Strip <think> + parse JSON
            out_clean = re.sub(r"<think>.*?</think>", "", out, flags=re.DOTALL).strip()
            try:
                out_json = json.loads(out_clean)
                results[i] = out_json.get("output")
            except Exception:
                results[i] = out_clean

    tasks = [asyncio.create_task(worker(i)) for i in range(len(data))]
    for f in tqdm(asyncio.as_completed(tasks), total=len(data), desc="Running inference"):
        await f

    return results
--------------------------------------------------------------------------------
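All three `utils.py` variants share `save_jsonl`/`summarize_scores`, so any saved results file can be re-summarized offline. A small sketch (the path is illustrative):

```python
# Sketch: reading back a results file written by save_jsonl and re-computing the summary.
import json

from utils import summarize_scores

with open("./ChemCoT_results.jsonl", encoding="utf-8") as f:
    results = [json.loads(line) for line in f]

print(summarize_scores(results))  # {'n_samples': ..., 'em_score': ...}
```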
/README.md:
--------------------------------------------------------------------------------
# UNIVA Project Guide

## Project Overview

- **Purpose**: Build a cognition- and reasoning-based AGI agent platform that automates the ADMET (absorption, distribution, metabolism, excretion, toxicity) profiling required in the early stages of drug discovery (basic research through preclinical studies). The platform integrates large-scale toxicity/pharmacokinetics data with ontology-based knowledge, aiming at a next-generation AI system that can autonomously infer and interpret molecule-level ADMET profiles.
- **Scope of this deliverable**: As the year-1, phase-1 goal, build the **Generalized ADMET Inference baseline** and the **Toxicity AI prototype**. Together they address the fragmentation between manual toxicity prediction and end-to-end ADMET analysis in current drug-discovery workflows, and lay the groundwork for a self-evolving ADMET AI agent capable of active decision-making.

## Models / Runtime

- Generalized ADMET Inference Baseline: runs the ChemCoTBench evaluation against a vLLM (OpenAI-compatible) endpoint serving 25TOXMC_Blowfish_v1.0.9-AWQ
- Toxicity AI Prototype: runs the toxicity MMLU and Mobile-Eval-E evaluations against the same vLLM endpoint
- `Toxicity AI Prototype/.env` holds `BASE_URL` (vLLM) and `GPT_API_KEY` (GPT judging)

## Directory Tree & File Descriptions

```
Generalized ADMET Inference Baseline/
├─ chem_cot.py   # ChemCoTBench evaluation script
└─ utils.py      # JSONL output, EM scoring, async calls, <think> stripping, etc.

Toxicity AI Prototype/
├─ mmlu_toxic.py                      # toxicity MMLU evaluation script
├─ mmlu_toxic.json                    # 200 toxicity MMLU questions
├─ mobile_eval_e.py                   # Mobile-Eval-E plan generation (local vLLM) + GPT judging
├─ utils.py                           # JSONL output, EM scoring (MMLU), async calls, <think> stripping, etc.
├─ no_tool_chat_template_qwen3.jinja  # Qwen chat template (<think> handling)
└─ .env                               # BASE_URL (vLLM) and GPT_API_KEY (GPT judging) environment variables

run_vllm.sh  # example vLLM launch for 25TOXMC_Blowfish_v1.0.9-AWQ (adjust volume/port/GPU)
```

## Run & Evaluation Flow

1) Start the vLLM server
   - Edit the mount path (`-v`), port (`-p`), and GPU (`CUDA_VISIBLE_DEVICES`) in the root `run_vllm.sh` to match your environment, then run it (`chmod +x run_vllm.sh && ./run_vllm.sh`).
   - Fill in `BASE_URL` (e.g. `http://<host>:30002/v1/`) and `GPT_API_KEY` in `Toxicity AI Prototype/.env`.

2) Toxicity MMLU evaluation (`Toxicity AI Prototype/mmlu_toxic.py`)
   - Run with the working directory set to `Toxicity AI Prototype/`.
   - Sends each `system`/`prompt` pair from `mmlu_toxic.json` to the model, scores EM against `answer`, and writes `MMLU_toxic_results.jsonl`.

3) ChemCoTBench evaluation (`Generalized ADMET Inference Baseline/chem_cot.py`)
   - Place `ChemCoTBench/` under `Generalized ADMET Inference Baseline/` so that `datasets.load_from_disk` can read it.
   - Running it compares each response's `"output"` against `meta.reference`, computes EM, and writes `ChemCoT_results.jsonl`.

4) Mobile-Eval-E evaluation (`Toxicity AI Prototype/mobile_eval_e.py`)
   - Fill in `GPT_API_KEY` in `.env`, then run.
   - **Actor**: `process_request_vl` generates the `plan`/`operations` JSON from the local vLLM endpoint (adjust the URL if needed).
   - **Judging**: `judge_with_gpt` scores the output against the rubrics and reference operations with GPT, then writes `MobileEvalE_results.jsonl`.

5) Check the results
   - Inspect each `*_results.jsonl` plus the console `SUMMARY` / average scores.

## Environment / Operations Notes

- `.env` lives in `Toxicity AI Prototype/`; either run the scripts from that directory or pass the path explicitly via `load_dotenv("<path>")`.
- Tune the vLLM options (`gpu-memory-utilization`, `tensor-parallel-size`, `max-model-len`) to your GPU resources.
- If the default URL in `mobile_eval_e.py`'s `process_request_vl` (`http://192.168.0.202:25321/v1/`) does not match your setup, change it to your actual vLLM address.
- Responses may include a `<think>` block; `run_concurrent_worker` strips it before attempting the JSON parse, so the chat template can be kept as-is.
--------------------------------------------------------------------------------
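A hedged sanity check for the endpoint configured in the README's step 1 (assumes the server from `run_vllm.sh` is listening on localhost:30002; the expected model id matches the launch script):

```python
# Sketch: verify the OpenAI-compatible vLLM endpoint is reachable before evaluating.
import openai

client = openai.OpenAI(api_key="dummy", base_url="http://localhost:30002/v1/")
print([m.id for m in client.models.list().data])  # expect ['25TOXMC_Blowfish_v1.0.9-AWQ']
```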
/Toxicity AI Prototype/mobile_eval_e.py:
--------------------------------------------------------------------------------
# mobile_eval_e.py
import json
import os

from datasets import load_dataset
from dotenv import load_dotenv
from openai import OpenAI
from tqdm import tqdm

from utils import save_jsonl

load_dotenv()

# -------------------------
# GPT client (judge)
# -------------------------
API_KEY = os.getenv("GPT_API_KEY")

client = OpenAI(api_key=API_KEY)

# -------------------------
# System prompts
# -------------------------
SYSTEM_PROMPT = """You are a mobile task planner that controls an Android phone via high-level actions.
Given a user instruction and a list of available apps, your goal is to output a step-by-step action sequence
to complete the task on the phone.

You MUST output a single JSON object with the following structure:

{
  "plan": ["high-level step 1", "high-level step 2", ...],
  "operations": [
    "action 1",
    "action 2",
    ...
  ]
}

Each action should be a short imperative phrase describing a concrete phone operation
(e.g., "open Maps", "tap on the search bar", "type 'korean restaurant'", "press enter").
Do not include any explanations or extra text outside the JSON object.
"""

JUDGE_SYSTEM_PROMPT = """
You are an expert evaluator for a mobile phone agent benchmark.

Your job is to evaluate how well a model-generated action sequence (operations)
solves a given mobile task, based on:

1) The natural language instruction.
2) The list of available apps and scenario.
3) A list of rubrics describing what a good solution should do.
4) A human reference action sequence (operations).
5) The model-generated action sequence.

You must output a single JSON object with the following fields:

{
  "rubric_score": float,        // between 0.0 and 1.0
  "action_match_score": float,  // between 0.0 and 1.0
  "overall_score": float,       // between 0.0 and 1.0
  "reason": "short explanation"
}

- rubric_score: how well the model operations satisfy the rubrics.
- action_match_score: how similar the model operations are to the human reference operations.
- overall_score: your overall judgement, not necessarily the average.
Return only the JSON object, with no additional text.
"""

# -------------------------
# JSON parsing
# -------------------------
def extract_json(text: str):
    start = text.find("{")
    end = text.rfind("}")
    if start == -1 or end == -1 or end <= start:
        raise ValueError(f"JSON block not found in model output: {text[:200]}...")
    json_str = text[start:end + 1]
    return json.loads(json_str)
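# Illustrative example (hypothetical text) of the first-'{' .. last-'}' heuristic:
#
#     >>> extract_json('Sure! Here is the plan: {"plan": ["open Maps"], "operations": ["open Maps"]}')
#     {'plan': ['open Maps'], 'operations': ['open Maps']}
#
# Prose around the JSON object is tolerated; only the outermost brace span is taken,
# and json.loads then validates that the span is actually well-formed JSON.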
# -------------------------
# Judge prompt builder
# -------------------------
def build_judge_prompt(example, model_ops):
    instruction = example["instruction"]
    apps = example.get("apps", [])
    scenario = example.get("scenario", "")
    rubrics = example.get("rubrics", [])
    human_ops = example.get("human_reference_operations", [])
    return f"""
[Instruction]
{instruction}

[Apps]
{apps}

[Scenario]
{scenario}

[Rubrics]
{json.dumps(rubrics, ensure_ascii=False, indent=2)}

[Human Reference Operations]
{json.dumps(human_ops, ensure_ascii=False, indent=2)}

[Model Operations to Evaluate]
{json.dumps(model_ops, ensure_ascii=False, indent=2)}
"""

# -------------------------
# GPT judge call
# -------------------------
def judge_with_gpt(example, model_ops):
    user_prompt = build_judge_prompt(example, model_ops)
    messages = [
        {"role": "system", "content": JUDGE_SYSTEM_PROMPT},
        {"role": "user", "content": user_prompt},
    ]
    resp = client.chat.completions.create(
        model="gpt-5-mini",
        messages=messages,
    )
    text = resp.choices[0].message.content
    data = extract_json(text)
    return {
        "rubric_score": float(data.get("rubric_score", 0.0)),
        "action_match_score": float(data.get("action_match_score", 0.0)),
        "overall_score": float(data.get("overall_score", 0.0)),
        "reason": data.get("reason", ""),
    }

# -------------------------
# Actor call (local vLLM)
# -------------------------
def process_request_vl(messages):
    # NOTE: point this at your vLLM server if the default does not match (see README).
    actor_client = OpenAI(
        api_key="sk-None-1234",
        base_url="http://192.168.0.202:25321/v1/",
    )
    return actor_client.chat.completions.create(
        model="25TOXMC_Blowfish_v1.0.9-AWQ",
        messages=messages,
        temperature=0.0,
        top_p=0.95,
        stream=False,
    )

def build_actor_prompt(example):
    instruction = example["instruction"]
    apps = example.get("apps", [])
    scenario = example.get("scenario", "")
    apps_str = ", ".join(apps) if apps else "no specific apps"
    return f"""User instruction:
{instruction}

You may use the following apps: {apps_str}
Scenario: {scenario}

Return ONLY a JSON object with the fields "plan" and "operations".
"""

def call_actor(example):
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": build_actor_prompt(example)},
    ]
    resp = process_request_vl(messages)
    data = extract_json(resp.choices[0].message.content)
    ops = [str(o).strip() for o in data.get("operations", []) if str(o).strip()]
    return ops
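# Single-example walkthrough (illustrative field values; real items come from the
# mikewang/mobile_eval_e dataset loaded in main()):
#
#     example = {
#         "instruction": "Find a korean restaurant nearby and share it with a friend",
#         "apps": ["Maps", "Messages"],
#         "scenario": "...",
#         "rubrics": ["Searches for a korean restaurant", "Shares the result"],
#         "human_reference_operations": ["open Maps", "tap on the search bar", "..."],
#     }
#     ops = call_actor(example)            # plan/operations from the local vLLM actor
#     print(judge_with_gpt(example, ops))  # rubric/action-match/overall scores from GPT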
# -------------------------
# Main loop
# -------------------------
def main():
    ds = load_dataset("mikewang/mobile_eval_e", split="test")
    scores = []

    for ex in tqdm(ds, desc="Evaluating with GPT judge"):
        try:
            model_ops = call_actor(ex)
        except Exception as e:
            print("Actor model failed:", e)
            model_ops = []

        try:
            judge_result = judge_with_gpt(ex, model_ops)
        except Exception as e:
            print("Judge model failed:", e)
            judge_result = {
                "rubric_score": 0.0,
                "action_match_score": 0.0,
                "overall_score": 0.0,
                "reason": f"Judge error: {e}",
            }

        scores.append(judge_result)

    avg_rubric = sum(s["rubric_score"] for s in scores) / len(scores)
    avg_action = sum(s["action_match_score"] for s in scores) / len(scores)
    avg_overall = sum(s["overall_score"] for s in scores) / len(scores)

    print("\n===== GPT Judge Overall Results =====")
    print(f"#examples         : {len(scores)}")
    print(f"Avg rubric_score  : {avg_rubric:.4f}")
    print(f"Avg action_match  : {avg_action:.4f}")
    print(f"Avg overall_score : {avg_overall:.4f}")

    # Save JSONL (same output convention as the other benchmarks)
    save_jsonl(scores, "./MobileEvalE_results.jsonl")

if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------