├── figs └── long_context_figure.JPG ├── requirements.txt ├── api_config.py ├── run_write_score.py ├── summeval_prompts ├── rel_detailed.txt ├── faith_detailed.txt └── con_detailed.txt ├── run.sh ├── README.md ├── inference.py └── utils.py /figs/long_context_figure.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dmis-lab/ETHIC/HEAD/figs/long_context_figure.JPG -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | vllm==0.6.1 2 | transformers==4.44.2 3 | datasets 4 | google-generativeai==0.8.3 5 | openai==1.40.2 -------------------------------------------------------------------------------- /api_config.py: -------------------------------------------------------------------------------- 1 | CONFIG = { 2 | "openai": [ 3 | "", 4 | ], 5 | "google": [ 6 | "" 7 | ], 8 | } -------------------------------------------------------------------------------- /run_write_score.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from utils import write_score_file 3 | 4 | def main(): 5 | if len(sys.argv) < 3: 6 | print("Usage: python run_write_score.py ") 7 | sys.exit(1) 8 | 9 | task = sys.argv[1] 10 | save_path = sys.argv[2] 11 | 12 | write_score_file(task, save_path) 13 | 14 | if __name__ == "__main__": 15 | main() 16 | -------------------------------------------------------------------------------- /summeval_prompts/rel_detailed.txt: -------------------------------------------------------------------------------- 1 | # Instruction: 2 | Below is an instruction for evaluating the relevance of the generated summary to the source document. Relevance measures 3 | whether a summary contains the main ideas of the source. The goal is to score relevance on a scale of 1-5, with 1 being 4 | not relevant at all, and 5 being highly relevant. 5 | 6 | # Evaluation Criteria: 7 | 1. Not Relevant: The summary doesn’t capture any of the main ideas of the source. 8 | 2. Barely Relevant: The summary captures very few of the main ideas of the source. 9 | 3. Somewhat Relevant: The summary captures some, but not all, of the main ideas of the source. 10 | 4. Mostly Relevant: The summary captures most of the main ideas of the source. 11 | 5. Highly Relevant: The summary captures all the main ideas of the source perfectly. 12 | 13 | # Evaluation Steps: 14 | 1. Thoroughly read the source document. 15 | 2. Carefully read the generated summary and compare it with the source document. 16 | 3. Compare the main ideas captured in the summary to the main ideas from the source document. 17 | 4. Rate the relevance of the summary based on how well it captures the main ideas from the source document using the 1-5 18 | scale mentioned in Evaluation Criteria. 
19 | 20 | # Source Document: 21 | 22 | {{Document}} 23 | 24 | # Generated Summary: 25 | 26 | {{Summary}} 27 | 28 | # Evaluation Form (scores ONLY): -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=0,1,2,3 2 | export VLLM_WORKER_MULTIPROC_METHOD=spawn 3 | 4 | task=Attributing 5 | model_name_or_path=meta-llama/Meta-Llama-3.1-70B-Instruct 6 | # model_name_or_path=meta-llama/Meta-Llama-3.1-8B-Instruct 7 | # model_name_or_path=gemini-1.5-pro 8 | # model_name_or_path=THUDM/glm-4-9b-chat 9 | # model_name_or_path=microsoft/Phi-3.5-mini-instruct 10 | # model_name_or_path=Qwen/Qwen2.5-7B-Instruct 11 | 12 | # domain=Medicine 13 | # use_yarn=True 14 | # under_32k_only=True 15 | # over_32k_only=True 16 | 17 | use_yarn=${use_yarn:-False} 18 | under_32k_only=${under_32k_only:-False} 19 | over_32k_only=${over_32k_only:-False} 20 | 21 | if [ "$under_32k_only" = "True" ] && [ "$over_32k_only" = "True" ]; then 22 | echo "Error: Both under_32k_only and over_32k_only cannot be True simultaneously." 23 | exit 1 24 | fi 25 | 26 | cmd="CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES python inference.py \ 27 | --task $task \ 28 | --model_name_or_path $model_name_or_path" 29 | 30 | if [ "$use_yarn" = "True" ]; then 31 | cmd="$cmd --use_yarn" 32 | fi 33 | if [ "$under_32k_only" = "True" ]; then 34 | cmd="$cmd --under_32k_only" 35 | fi 36 | if [ "$over_32k_only" = "True" ]; then 37 | cmd="$cmd --over_32k_only" 38 | fi 39 | if [ -n "$domain" ]; then 40 | cmd="$cmd --domain $domain" 41 | fi 42 | if [ -n "$cache_dir" ]; then 43 | cmd="$cmd --cache_dir $cache_dir" 44 | fi 45 | 46 | cmd="$cmd --command \"$cmd\"" 47 | eval $cmd -------------------------------------------------------------------------------- /summeval_prompts/faith_detailed.txt: -------------------------------------------------------------------------------- 1 | # Instruction: 2 | Below is an instruction for evaluating the faithfulness of the generated summary to the source document. Faithfulness is 3 | the absence of factual errors in the summary, where a factual error is a statement that contradicts the source document or 4 | is not directly stated, heavily implied, or logically entailed by the source document. The goal is to score faithfulness 5 | on a scale of 1-5, with 1 being unfaithful (all information is wrong) and 5 being extremely faithful (no factual errors, 6 | directly correlate to the document). 7 | 8 | # Evaluation Criteria: 9 | 1. Unfaithful: The summary contains no factual information from the document. 10 | 2. Somewhat Unfaithful: The summary contains some factual information but several are wrong or misleading. 11 | 3. Neutral: The summary is half correct and half incorrect in terms of factual information. 12 | 4. Somewhat Faithful: The summary contains more factual information than errors but still has noticeable mistakes. 13 | 5. Extremely Faithful: The summary contains all factual information from the document with no errors. 14 | 15 | # Evaluation Steps: 16 | 1. Thoroughly read the source document. 17 | 2. Carefully read the generated summary and compare it with the source document. 18 | 3. Carefully read the summary and compare the facts presented with the facts in the source document. 19 | 4. Rate the faithfulness of the generated summary based on how faithfully the summary reflects the information in the 20 | source document using the 1-5 scale mentioned in Evaluation Criteria. 
21 | 22 | # Source Document: 23 | 24 | {{Document}} 25 | 26 | # Generated Summary: 27 | 28 | {{Summary}} 29 | 30 | # Evaluation Form (scores ONLY): -------------------------------------------------------------------------------- /summeval_prompts/con_detailed.txt: -------------------------------------------------------------------------------- 1 | # Instruction: 2 | Below is an instruction for evaluating the consistency of the generated summary to the source document. Consistency measures 3 | whether a candidate summary is factually consistent with the source. The goal is to score consistency on a scale of 1-5, 4 | with 1 being completely inconsistent and 5 being completely consistent. 5 | Please consider the following seven types of errors while performing the evaluation: i) predicate in summary inconsistent 6 | with source, ii) primary arguments or its attributes are wrong, iii) predicate’s circumstantial information is wrong, iv) 7 | co-reference error, v) multiple sentences linked incorrectly, vi) out of document error and vii) unreadable sentence(s) due 8 | to grammatical errors. 9 | 10 | # Evaluation Criteria: 11 | 1. Completely Inconsistent - The summary contains multiple factual errors or inaccuracies in relation to the source 12 | document. 13 | 2. Mostly Inconsistent - The summary contains several factual errors but retains some accurate information from the 14 | source. 15 | 3. Somewhat Consistent - The summary contains a mix of accurate and inaccurate information. Factual errors are present 16 | but not overwhelming. 17 | 4. Mostly Consistent - The summary is largely accurate, with few factual errors or inaccuracies. 18 | 5. Completely Consistent - The summary accurately represents all the information presented in the source document without 19 | any factual error. 20 | 21 | # Evaluation Steps: 22 | 1. Thoroughly read the source document. 23 | 2. Carefully read the generated summary and compare it with the source document. 24 | 3. Rate the consistency of the generated summary based on the provided types of errors using the 1-5 scale mentioned in 25 | Evaluation Criteria. 26 | 27 | # Source Document: 28 | 29 | {{Document}} 30 | 31 | # Generated Summary: 32 | 33 | {{Summary}} 34 | 35 | # Evaluation Form (scores ONLY): -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ETHIC: Evaluating Large Language Models on Long-Context Tasks with High Information Coverage 2 | 3 |
<div align="center">
4 | 📃 Paper | 🤗 Dataset 5 |
</div>
6 | 7 | ## 📋 Introduction 8 | **ETHIC** is a long-context benchmark designed to assess whether LLMs can fully utilize the provided information. ETHIC comprises tasks with high **Information Coverage (IC)** scores (~91%), i.e. the proportion of input context necessary for answering queries. 9 | 10 | ![](figs/long_context_figure.JPG) 11 | 12 | ## ⚒️ Setup 13 | We recommend using the following versions for compatibility. 14 | * PyTorch 2.4.0 15 | * Cuda 12.1 16 | ```shell 17 | # create a new environment 18 | conda create -n ethic python==3.9.19 19 | conda activate ethic 20 | 21 | # install required packages 22 | pip install -r requirements.txt 23 | ``` 24 | ## ⏩ Quickstart 25 | To use our dataset directly, simply download it using 🤗 Datasets: 26 | 27 | ```python 28 | from datasets import load_dataset 29 | 30 | task = "Recalling" # Choose from "Recalling", "Summarizing", "Organizing", "Attributing" 31 | dataset = load_dataset("dmis-lab/ETHIC", task)["test"] 32 | ``` 33 | 34 | For model inference and evaluation, prepare your OpenAI API key (or other keys for authorization) in _api_config.py_, as we utilize `gpt-4o` in the _Summarizing_ task. Also, Qwen2.5 recommends utilizing YaRN for inputs exceeding 32,768 tokens. Make sure to run inference twice: {_use_yarn_=True, _over_32k_only_=True} and {_use_yarn_=False, _under_32k_only_=True}. 35 | ```shell 36 | # run.sh 37 | 38 | CUDA_VISIBLE_DEVICES=1 39 | export VLLM_WORKER_MULTIPROC_METHOD=spawn 40 | 41 | task=Attributing # Recalling, Summarizing, Organizing, Attributing 42 | model_name_or_path=meta-llama/Meta-Llama-3.1-8B-Instruct 43 | 44 | # use_yarn=True 45 | # under_32k_only=True 46 | # over_32k_only=True 47 | # domain=Medicine 48 | 49 | cmd="CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES python inference.py \ 50 | --task $task \ 51 | --model_name_or_path $model_name_or_path" 52 | 53 | if [ "$use_yarn" = "True" ]; then 54 | cmd="$cmd --use_yarn" 55 | fi 56 | if [ "$under_32k_only" = "True" ]; then 57 | cmd="$cmd --under_32k_only" 58 | fi 59 | if [ "$over_32k_only" = "True" ]; then 60 | cmd="$cmd --over_32k_only" 61 | fi 62 | if [ -n "$domain" ]; then 63 | cmd="$cmd --domain $domain" 64 | fi 65 | if [ -n "$cache_dir" ]; then 66 | cmd="$cmd --cache_dir $cache_dir" 67 | fi 68 | 69 | eval $cmd 70 | ``` 71 | 72 | 73 | ## Citation 74 | ``` 75 | @article{lee2024ethic, 76 | title={ETHIC: Evaluating Large Language Models on Long-Context Tasks with High Information Coverage}, 77 | author={Lee, Taewhoo and Yoon, Chanwoong and Jang, Kyochul and Lee, Donghyeon and Song, Minju and Kim, Hyunjae and Kang, Jaewoo}, 78 | journal={arXiv preprint arXiv:2410.16848}, 79 | year={2024} 80 | } 81 | ``` 82 | -------------------------------------------------------------------------------- /inference.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import re 4 | import sys 5 | from pathlib import Path 6 | from transformers import AutoTokenizer 7 | from datasets import load_dataset 8 | from tqdm import tqdm 9 | import argparse 10 | from vllm import LLM, SamplingParams 11 | import google.generativeai as genai 12 | from google.generativeai.types import HarmCategory, HarmBlockThreshold 13 | from openai import OpenAI 14 | import tiktoken 15 | 16 | from utils import get_logger, get_model_prompts, calculate_score, create_batch_for_summarizing, run_batch_for_summarizing, parse_score_for_summarizing, write_score_file, count_tokens_for_gpt 17 | from api_config import CONFIG 18 | 19 | def main(args): 20 | 21 | # set 
logger 22 | model_name = os.path.basename(args.model_name_or_path) 23 | 24 | path_to_logdir = os.path.join(args.log_path, model_name, args.task) 25 | os.makedirs(path_to_logdir, exist_ok=True) 26 | logger = get_logger(logger_name=__name__, path_to_logdir=path_to_logdir) 27 | 28 | logger.info(f"Running command: {args.command}") 29 | used_gpus = next((part.split("=")[1] for part in args.command.split() if part.startswith("CUDA_VISIBLE_DEVICES=")), "") 30 | gpu_count = len(used_gpus.split(",")) if used_gpus else 0 31 | 32 | dataset = load_dataset("dmis-lab/ETHIC", args.task, cache_dir=args.cache_dir)["test"] 33 | 34 | logger.info(f"Loaded dataset for task {args.task}") 35 | 36 | save_path = os.path.join(args.save_path, model_name, args.task) 37 | os.makedirs(save_path, exist_ok=True) 38 | 39 | if args.domain: 40 | os.makedirs(os.path.join(save_path, args.domain), exist_ok=True) 41 | else: 42 | os.makedirs(os.path.join(save_path, "Books"), exist_ok=True) 43 | os.makedirs(os.path.join(save_path, "Debates"), exist_ok=True) 44 | os.makedirs(os.path.join(save_path, "Medicine"), exist_ok=True) 45 | os.makedirs(os.path.join(save_path, "Law"), exist_ok=True) 46 | 47 | prompt = get_model_prompts(args.model_name_or_path) 48 | 49 | # load model 50 | if "gpt" in args.model_name_or_path: 51 | client = OpenAI(api_key=CONFIG["openai"][0]) 52 | elif "gemini" in args.model_name_or_path: 53 | genai.configure(api_key=CONFIG["google"][0]) 54 | model = genai.GenerativeModel(args.model_name_or_path) 55 | elif args.use_yarn: 56 | 57 | logger.info(f"Loading model with yarn.") 58 | 59 | model = LLM(model=args.model_name_or_path, download_dir=args.cache_dir, rope_scaling={"factor":4.0, "original_max_position_embeddings": 32768, "type": "yarn"}, trust_remote_code=True, tensor_parallel_size=gpu_count) 60 | sampling_params = SamplingParams(temperature=0, top_p=1.0, max_tokens=4096) 61 | else: # vllm 62 | model = LLM(model=args.model_name_or_path, download_dir=args.cache_dir, trust_remote_code=True, tensor_parallel_size=gpu_count) 63 | sampling_params = SamplingParams(temperature=0, top_p=1.0, max_tokens=4096) 64 | 65 | logger.info(f"Loaded model, saving model predictions to {save_path}") 66 | 67 | if args.under_32k_only: 68 | logger.info(f"Predictions for samples less than 32768 tokens only") 69 | if args.over_32k_only: 70 | logger.info(f"Predictions for samples more than 32768 tokens only") 71 | if args.domain: 72 | logger.info(f"Predictions for domain {args.domain} only") 73 | 74 | dataset_tqdm = tqdm(dataset, file=open(os.devnull, "w")) 75 | for sample in dataset_tqdm: 76 | 77 | id_ = sample["ID"] 78 | answer = sample["Answer"] 79 | system_msg = sample["System_msg"] 80 | user_msg = sample["User_msg"] 81 | domain = sample["Domain"] 82 | 83 | if args.domain and args.domain != domain: 84 | logger.info(f"skipping domain {domain}") 85 | continue 86 | 87 | logger.info(f"{str(dataset_tqdm)} Domain: {domain}, ID: {id_}") 88 | 89 | if "gemini" in args.model_name_or_path: 90 | full_prompt = prompt.format(system_msg=system_msg, user_msg=user_msg) 91 | full_prompt_length = model.count_tokens(full_prompt).total_tokens 92 | if args.under_32k_only and full_prompt_length > 32768: 93 | logger.info(f"skipping: {full_prompt_length} > 32768") 94 | continue 95 | if args.over_32k_only and full_prompt_length <= 32768: 96 | logger.info(f"skipping: {full_prompt_length} <= 32768") 97 | continue 98 | response = model.generate_content( 99 | full_prompt, 100 | generation_config=genai.types.GenerationConfig( 101 | candidate_count=1, 102 | 
max_output_tokens=4096, 103 | temperature=0.0 104 | ), 105 | safety_settings={ 106 | HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE, 107 | HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE, 108 | HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE, 109 | HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE, 110 | } 111 | ) 112 | try: 113 | prediction = response.text 114 | except ValueError: # gemini models occasionally refuse to answer 115 | logger.warning("Prediction FAILED") 116 | prediction = "FAILED" 117 | elif "gpt" in args.model_name_or_path: 118 | messages = [ 119 | {"role" : "system", "content": system_msg}, 120 | {"role":"user", "content":user_msg} 121 | ] 122 | full_prompt_length = count_tokens_for_gpt(messages, args.model_name_or_path) 123 | if args.under_32k_only and full_prompt_length > 32768: 124 | logger.info(f"skipping: {full_prompt_length} > 32768") 125 | continue 126 | if args.over_32k_only and full_prompt_length <= 32768: 127 | logger.info(f"skipping: {full_prompt_length} <= 32768") 128 | continue 129 | completion = client.chat.completions.create( 130 | model=args.model_name_or_path, 131 | messages=messages, 132 | temperature=0, 133 | top_p=1.0, 134 | max_tokens=4096, 135 | ) 136 | prediction = completion.choices[0].message 137 | elif "Qwen" in args.model_name_or_path or "glm" in args.model_name_or_path: 138 | tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, trust_remote_code=True) 139 | messages=[ 140 | {"role" : "system", "content": system_msg}, 141 | {"role":"user", "content":user_msg} 142 | ] 143 | full_prompt = tokenizer.apply_chat_template( 144 | messages, 145 | tokenize=False, 146 | add_generation_prompt=True 147 | ) 148 | 149 | full_prompt_length = len(tokenizer.encode(full_prompt)) 150 | if args.under_32k_only and full_prompt_length > 32768: 151 | logger.info(f"skipping: {full_prompt_length} > 32768") 152 | continue 153 | if args.over_32k_only and full_prompt_length <= 32768: 154 | logger.info(f"skipping: {full_prompt_length} <= 32768") 155 | continue 156 | outputs = model.generate([full_prompt], sampling_params) 157 | for output in outputs: 158 | prediction = output.outputs[0].text 159 | else: 160 | full_prompt = prompt.format(system_msg=system_msg, user_msg=user_msg) 161 | outputs = model.generate([full_prompt], sampling_params) 162 | for output in outputs: 163 | prediction = output.outputs[0].text 164 | 165 | result_dict, score = calculate_score(args.task, user_msg, prediction, answer) 166 | 167 | with open(os.path.join(save_path, domain, f"{id_}.json"), "w") as wf: 168 | json.dump(result_dict, wf) 169 | 170 | # for Summarizing task, score using batch inference 171 | if args.task == "Summarizing": 172 | 173 | logger.info("Preparing for summary scoring (batch inference)") 174 | 175 | path_list = [str(f) for f in Path(save_path).rglob("*.json") if f.parent.name in ["Books", "Debates", "Law", "Medicine"]] 176 | batch_for_summarizing = create_batch_for_summarizing(path_list) 177 | 178 | batch_input_path = os.path.join(os.path.dirname(save_path), "batch_inference", "summarizing_input.jsonl") 179 | if os.path.exists(batch_input_path): 180 | logger.error(f"Batch file for {model_name} already exists!") 181 | raise ValueError() 182 | 183 | os.makedirs(os.path.dirname(batch_input_path), exist_ok=True) 184 | 185 | with open(batch_input_path, "a") as wf: 186 | for line in batch_for_summarizing: 187 | wf.write(json.dumps(line) + "\n") 188 | 189 | logger.info("Running 
batch inference") 190 | 191 | try: 192 | batch_output_path = run_batch_for_summarizing(batch_input_path) 193 | except ValueError: 194 | logger.error("Batch inference FAILED") 195 | sys.exit(1) 196 | 197 | logger.info("Batch inference COMPLETE") 198 | 199 | score_dict = parse_score_for_summarizing(batch_output_path) 200 | for domain_filename in score_dict: 201 | domain = domain_filename[:domain_filename.find("_")] 202 | filename = domain_filename[domain_filename.find("_")+1:] 203 | 204 | filepath = os.path.join(save_path, domain, f"{filename}.json") 205 | with open(filepath) as rf: 206 | orig_dict = json.load(rf) 207 | 208 | prediction = orig_dict["prediction"] 209 | input_sections = orig_dict["input_sections"] 210 | score = score_dict[domain_filename]["weighted"] 211 | 212 | with open(filepath, "w") as wf: 213 | json.dump({ 214 | "prediction": prediction, 215 | "input_sections": input_sections, 216 | "score": score 217 | }, wf) 218 | 219 | # write score files ONLY WHEN the entire test set is completed 220 | if not args.under_32k_only and not args.over_32k_only and not args.domain: 221 | write_score_file(args.task, save_path) 222 | 223 | logger.info("All done!") 224 | 225 | if __name__ == "__main__": 226 | parser = argparse.ArgumentParser() 227 | parser.add_argument("--task", type=str, required=True, help="Choose from [\"Recalling\", \"Summarizing\", \"Organizing\", \"Attributing\"]") 228 | parser.add_argument("--model_name_or_path", type=str, required=True) 229 | parser.add_argument("--cache_dir", type=str, required=False) 230 | parser.add_argument("--domain", type=str, required=False) 231 | parser.add_argument("--use_yarn", action="store_true") 232 | parser.add_argument("--under_32k_only", action="store_true") 233 | parser.add_argument("--over_32k_only", action="store_true") 234 | parser.add_argument("--save_path", type=str, default=os.path.join(os.path.abspath(os.path.dirname(__file__)), "results")) 235 | parser.add_argument("--log_path", type=str, default=os.path.join(os.path.abspath(os.path.dirname(__file__)), "logs")) 236 | parser.add_argument("--command", type=str, help="The command that was run") 237 | 238 | args = parser.parse_args() 239 | main(args) -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | import json 3 | import os 4 | import glob 5 | import time 6 | from openai import OpenAI 7 | import numpy as np 8 | import logging 9 | from datetime import datetime 10 | from api_config import CONFIG 11 | import tiktoken 12 | 13 | def get_logger(logger_name, path_to_logdir): 14 | 15 | logger = logging.getLogger(logger_name) 16 | logger.setLevel(logging.INFO) 17 | 18 | current_datetime = datetime.now() 19 | formatted_datetime = current_datetime.strftime("%y%m%d-%H%M") 20 | path_to_logfile = os.path.join(path_to_logdir, f"{formatted_datetime}.log") 21 | 22 | if not logger.hasHandlers(): 23 | file_handler = logging.FileHandler( 24 | path_to_logfile, 25 | mode="a", 26 | encoding="utf-8" 27 | ) 28 | formatter = logging.Formatter( 29 | "[%(asctime)s] %(levelname)s: %(message)s", 30 | datefmt="%Y-%m-%d %H:%M", 31 | ) 32 | 33 | file_handler.setFormatter(formatter) 34 | logger.addHandler(file_handler) 35 | 36 | return logger 37 | 38 | def count_tokens_for_gpt(messages, model): 39 | 40 | """Return the number of tokens used by a list of messages.""" 41 | try: 42 | encoding = tiktoken.encoding_for_model(model) 43 | except KeyError: 44 | print("Warning: model 
not found. Using o200k_base encoding.") 45 | encoding = tiktoken.get_encoding("o200k_base") 46 | if model in { 47 | "gpt-3.5-turbo-0125", 48 | "gpt-4-0314", 49 | "gpt-4-32k-0314", 50 | "gpt-4-0613", 51 | "gpt-4-32k-0613", 52 | "gpt-4o-mini-2024-07-18", 53 | "gpt-4o-2024-08-06" 54 | }: 55 | tokens_per_message = 3 56 | tokens_per_name = 1 57 | elif "gpt-3.5-turbo" in model: 58 | print("Warning: gpt-3.5-turbo may update over time. Returning num tokens assuming gpt-3.5-turbo-0125.") 59 | return count_tokens_for_gpt(messages, model="gpt-3.5-turbo-0125") 60 | elif "gpt-4o-mini" in model: 61 | print("Warning: gpt-4o-mini may update over time. Returning num tokens assuming gpt-4o-mini-2024-07-18.") 62 | return count_tokens_for_gpt(messages, model="gpt-4o-mini-2024-07-18") 63 | elif "gpt-4o" in model: 64 | print("Warning: gpt-4o and gpt-4o-mini may update over time. Returning num tokens assuming gpt-4o-2024-08-06.") 65 | return count_tokens_for_gpt(messages, model="gpt-4o-2024-08-06") 66 | elif "gpt-4" in model: 67 | print("Warning: gpt-4 may update over time. Returning num tokens assuming gpt-4-0613.") 68 | return count_tokens_for_gpt(messages, model="gpt-4-0613") 69 | else: 70 | raise NotImplementedError( 71 | f"""count_tokens_for_gpt() is not implemented for model {model}.""" 72 | ) 73 | num_tokens = 0 74 | for message in messages: 75 | num_tokens += tokens_per_message 76 | for key, value in message.items(): 77 | num_tokens += len(encoding.encode(value)) 78 | if key == "name": 79 | num_tokens += tokens_per_name 80 | num_tokens += 3 # every reply is primed with <|start|>assistant<|message|> 81 | return num_tokens 82 | 83 | def calculate_f1_score(model_answer, label_list): 84 | 85 | model_list = re.split(r"[;,]\s*", model_answer) 86 | 87 | model_list = sorted(set([pred.lower().strip(".").strip() for pred in model_list])) 88 | label_list = sorted([label.lower().strip(".").strip() for label in label_list]) 89 | 90 | num_labels = len(label_list) 91 | tp = 0 92 | for pred in model_list: 93 | for label in label_list[:]: 94 | if pred == label: 95 | tp += 1 96 | break 97 | 98 | fp = len(model_list) - tp 99 | fn = num_labels - tp 100 | 101 | precision = tp / (tp + fp) if (tp + fp) > 0 else 0 102 | recall = tp / (tp + fn) if (tp + fn) > 0 else 0 103 | f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0 104 | 105 | # return f1_score 106 | return precision, recall, f1_score 107 | 108 | def calculate_lcs(prediction, answer): 109 | 110 | m = len(prediction) 111 | n = len(answer) 112 | 113 | L = [[0] * (n + 1) for _ in range(m + 1)] 114 | 115 | for i in range(m + 1): 116 | for j in range(n + 1): 117 | if i == 0 or j == 0: 118 | L[i][j] = 0 119 | elif prediction[i-1] == answer[j-1]: 120 | L[i][j] = L[i-1][j-1] + 1 121 | else: 122 | L[i][j] = max(L[i-1][j], L[i][j-1]) 123 | 124 | index = L[m][n] 125 | 126 | lcs_sequence = [""] * index 127 | 128 | i, j = m, n 129 | while i > 0 and j > 0: 130 | if prediction[i-1] == answer[j-1]: 131 | lcs_sequence[index-1] = prediction[i-1] 132 | i -= 1 133 | j -= 1 134 | index -= 1 135 | elif L[i-1][j] > L[i][j-1]: 136 | i -= 1 137 | else: 138 | j -= 1 139 | 140 | lcs_length = len(lcs_sequence) 141 | lcs_score = lcs_length / len(answer) 142 | 143 | return lcs_sequence, lcs_score 144 | 145 | def create_batch_for_summarizing(path_list): 146 | 147 | with open("./summeval_prompts/con_detailed.txt") as rf: 148 | prompt_con = rf.read() 149 | with open("./summeval_prompts/faith_detailed.txt") as rf: 150 | prompt_faith = rf.read() 151 | with 
open("./summeval_prompts/rel_detailed.txt") as rf: 152 | prompt_rel = rf.read() 153 | criteria_dict = {prompt_con: 'con', prompt_faith: 'faith', prompt_rel: 'rel'} 154 | 155 | batch_list = [] 156 | for path in path_list: 157 | with open(path) as rf: 158 | pred_dict = json.load(rf) 159 | 160 | # 1. prepare section-wise context / prediction 161 | domain = os.path.basename(os.path.dirname(path)) 162 | filename = os.path.basename(path).replace(".json", "") 163 | 164 | if domain == "Law": 165 | section_pattern = re.compile("()") 166 | else: 167 | section_pattern = re.compile("(
)") 168 | 169 | section_context_dict = dict() 170 | section_pred_dict = dict() 171 | orig_sections = section_pattern.split(pred_dict["input_sections"]) 172 | summ_sections = section_pattern.split(pred_dict["prediction"]) 173 | 174 | for i in range(1, len(orig_sections), 2): 175 | section_context_dict[orig_sections[i]] = orig_sections[i+1].strip() 176 | 177 | for i in range(1, len(summ_sections), 2): 178 | section_pred_dict[summ_sections[i]] = summ_sections[i+1].strip() 179 | 180 | # 2. create batch using 3 different criteria per section 181 | for prompt in [prompt_con, prompt_faith, prompt_rel]: 182 | for section in section_context_dict: 183 | if section not in section_pred_dict: # model did not create summary for the section 184 | continue 185 | prompt_with_content = prompt.replace('{{Document}}', section_context_dict[section]).replace('{{Summary}}', section_pred_dict[section]) 186 | batch = { 187 | 'custom_id': f"{domain}_{filename}_{section}_{criteria_dict[prompt]}", 188 | 'method': 'POST', 189 | 'url': "/v1/chat/completions", 190 | 'body': { 191 | 'model': 'gpt-4o-2024-08-06', 192 | 'messages': [{"role": "system", "content": prompt_with_content}], 193 | 'temperature': 0, 194 | 'max_tokens': 5, 195 | 'top_p': 1, 196 | 'frequency_penalty': 0, 197 | 'presence_penalty': 0, 198 | 'stop': None, 199 | 'logprobs': True, 200 | 'top_logprobs': 10, 201 | 'n': 1 202 | } 203 | } 204 | 205 | batch_list.append(batch) 206 | 207 | return batch_list 208 | 209 | def run_batch_for_summarizing(batch_input_path): 210 | 211 | batch_output_path = os.path.join(os.path.dirname(batch_input_path), "summarizing_output.jsonl") 212 | 213 | client = OpenAI(api_key=CONFIG["openai"][0]) 214 | batch_input_file = client.files.create( 215 | file=open(batch_input_path, "rb"), 216 | purpose="batch" 217 | ) 218 | 219 | batch_job = client.batches.create( 220 | input_file_id=batch_input_file.id, 221 | endpoint="/v1/chat/completions", 222 | completion_window="24h" 223 | ) 224 | time.sleep(10) 225 | 226 | # retrieve batch information 227 | retrieved_batch_job = client.batches.retrieve(batch_job.id) 228 | 229 | while True: 230 | time.sleep(30) # wait for 30 seconds for another status request 231 | retrieved_batch_job = client.batches.retrieve(batch_job.id) 232 | if retrieved_batch_job.status == 'completed' or retrieved_batch_job.status == 'failed': 233 | break 234 | 235 | if retrieved_batch_job.status == 'failed': 236 | raise ValueError() 237 | 238 | result_file_id = retrieved_batch_job.output_file_id 239 | result = client.files.content(result_file_id).text 240 | 241 | time.sleep(10) 242 | 243 | with open(batch_output_path, "w") as wf: 244 | wf.write(result) 245 | 246 | return batch_output_path 247 | 248 | def parse_score_for_summarizing(batch_output_path): 249 | 250 | batch_outputs = [] 251 | with open(batch_output_path) as rf: 252 | for line in rf: 253 | batch_outputs.append(json.loads(line)) 254 | 255 | samples = dict() 256 | for batch_output in batch_outputs: 257 | custom_id = batch_output["custom_id"] # {domain}_{filename}_{section}_{criteria} 258 | domain = custom_id.split("_")[0] 259 | section_format_text = "_ 5: 287 | continue 288 | 289 | logprob = tokens.get('logprob', float('-inf')) 290 | prob = np.exp(logprob) 291 | scores_dict[score] += prob 292 | 293 | for score, prob in scores_dict.items(): 294 | samples[sample_id][f'weighted_{criteria}'] += score * prob 295 | 296 | samples[sample_id]['count'] += 1 297 | 298 | # Average scores 299 | for sample in samples: 300 | samples[sample]['count'] /= 3 301 | 302 | for score in 
samples[sample]: 303 | samples[sample][score] /= samples[sample]['count'] 304 | 305 | samples[sample]['weighted'] = sum(samples[sample][feature] for feature in samples[sample] if 'weighted' in feature) / 3 306 | samples[sample]['top'] = sum(samples[sample][feature] for feature in samples[sample] if 'top' in feature) / 3 307 | 308 | return samples 309 | 310 | def calculate_score(task, user_msg, prediction, answer): 311 | 312 | result_dict = dict() 313 | result_dict["prediction"] = prediction 314 | result_dict["answer"] = answer 315 | 316 | if task == "Recalling": 317 | if prediction == "FAILED": 318 | result_dict["precision"], result_dict["recall"], result_dict["f1_score"] = 0, 0, 0 319 | else: 320 | result_dict["precision"], result_dict["recall"], result_dict["f1_score"] = calculate_f1_score(prediction, answer) 321 | score = result_dict["f1_score"] 322 | elif task == "Summarizing": 323 | input_sections_or_segments = re.search("### Context:\n(.+?)\n\nNow, respond to the instruction", user_msg, re.DOTALL).group(1) 324 | result_dict["input_sections"] = input_sections_or_segments 325 | score = 0 # score will be calculated separately 326 | elif task == "Organizing": 327 | if prediction == "FAILED": 328 | result_dict["lcs"], result_dict["lcs_score"] = 0 329 | else: 330 | pred_in_list = re.findall("\d+", prediction) 331 | answer_in_list = re.findall("\d+", answer) 332 | result_dict["lcs"], result_dict["lcs_score"] = calculate_lcs(pred_in_list, answer_in_list) 333 | score = result_dict["lcs_score"] 334 | elif task == "Attributing": 335 | if prediction == "FAILED": 336 | result_dict["precision"], result_dict["recall"], result_dict["f1_score"] = 0, 0, 0 337 | else: 338 | match = re.search(r"(Related Segments|Core IDs):\s*(.+)", prediction) 339 | if match: # model has followed format instruction 340 | target_span = match.group(2) 341 | else: 342 | target_span = prediction 343 | 344 | pred_numbers = ", ".join(set(re.findall("\d+", target_span))) 345 | answer_numbers = [re.search("\d+", ans).group() if re.search("\d+", ans) else "None" for ans in answer] 346 | 347 | if pred_numbers == []: 348 | pred_numbers = "None" 349 | 350 | result_dict["precision"], result_dict["recall"], result_dict["f1_score"] = calculate_f1_score(pred_numbers, answer_numbers) 351 | score = result_dict["f1_score"] 352 | 353 | return result_dict, score 354 | 355 | def write_score_file(task, save_path): 356 | 357 | if task == "Recalling": 358 | score_per_domain = { 359 | "Books":[], 360 | "Debates":[], 361 | "Medicine":[], 362 | "Law":[] 363 | } 364 | metric = "f1_score" 365 | elif task == "Summarizing": 366 | score_per_domain = { 367 | "Books":[], 368 | "Debates":[], 369 | "Medicine":[], 370 | "Law":[] 371 | } 372 | metric = "score" 373 | elif task == "Organizing": 374 | score_per_domain = { 375 | "Books":[], 376 | "Debates":[], 377 | } 378 | metric = "lcs_score" 379 | elif task == "Attributing": 380 | score_per_domain = { 381 | "Medicine":[], 382 | "Law":[] 383 | } 384 | metric = "f1_score" 385 | 386 | scores = [] 387 | for domain in score_per_domain.keys(): 388 | pred_paths = glob.glob(os.path.join(save_path, domain, "*.json")) 389 | for pred_path in pred_paths: 390 | with open(pred_path) as rf: 391 | pred_dict = json.load(rf) 392 | score = pred_dict[metric] 393 | scores.append(score) 394 | score_per_domain[domain].append(score) 395 | 396 | # write score file (overall / per domain) 397 | avg_score = sum(scores) / len(scores) 398 | avg_score_per_domain = {key: sum(value) / len(value) for key, value in score_per_domain.items()} 
399 | 400 | with open(os.path.join(save_path, "final_score.txt"), "w") as wf: 401 | wf.write(str(avg_score)) 402 | with open(os.path.join(save_path, "domain_score.json"), "w") as wf: 403 | json.dump(avg_score_per_domain, wf) 404 | 405 | def get_model_prompts(model_name_or_path): 406 | 407 | if "gemini" in model_name_or_path: 408 | prompt = "{system_msg}\n\n{user_msg}" 409 | elif "Llama-3.1" in model_name_or_path: 410 | prompt = "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{system_msg}<|eot_id|>\n<|start_header_id|>user<|end_header_id|>\n\n{user_msg}<|eot_id|>\n<|start_header_id|>assistant<|end_header_id|>" 411 | elif "Phi" in model_name_or_path: 412 | prompt = "<|system|>\n{system_msg}<|end|>\n<|user|>\n{user_msg}<|end|>\n<|assistant|>" 413 | else: # gpt, qwen, glm receives "messages" list as input 414 | prompt = "" 415 | return prompt --------------------------------------------------------------------------------
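Note on split runs: when inference is restricted with `under_32k_only`, `over_32k_only`, or `domain`, inference.py intentionally skips writing the score files, so the two Qwen2.5 runs described in the README leave only per-sample JSONs under `results/`. The sketch below shows one way to aggregate them afterwards with run_write_score.py; it is only an illustration and assumes run.sh has been edited to select `Qwen/Qwen2.5-7B-Instruct` and the `Recalling` task, and that inference.py's default `--save_path` (`./results`) is used.

```shell
# Two partial runs for Qwen2.5 (task and model_name_or_path are set inside run.sh;
# Qwen/Qwen2.5-7B-Instruct and Recalling are assumed here for illustration):
use_yarn=False under_32k_only=True bash run.sh   # samples of at most 32,768 tokens, without YaRN
use_yarn=True over_32k_only=True bash run.sh     # samples above 32,768 tokens, with YaRN

# Aggregate the per-sample JSONs into final_score.txt and domain_score.json.
# The path follows inference.py's default --save_path layout: results/<model_name>/<task>.
python run_write_score.py Recalling results/Qwen2.5-7B-Instruct/Recalling
```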