├── README.md ├── logging_results ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-311.pyc │ └── logging.cpython-311.pyc └── logging.py ├── models ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-311.pyc │ ├── api_based_inference.cpython-311.pyc │ ├── load_gemini.cpython-311.pyc │ ├── load_model.cpython-311.pyc │ ├── load_opensource_model.cpython-311.pyc │ └── open_source_model_inference.cpython-311.pyc ├── api_based_inference.py ├── load_gemini.py ├── load_model.py ├── load_opensource_model.py └── open_source_model_inference.py ├── post_processing ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-311.pyc │ └── process_answer.cpython-311.pyc └── process_answer.py ├── prompt ├── RAG_qa_prompt_multi_choice_structured.txt ├── RAG_qa_prompt_multi_choice_unstructured.txt ├── RAG_qa_prompt_open_ended.txt ├── chatgpt_summarize_prompt.txt ├── gpt_as_a_judge_system_prompt.txt ├── gpt_as_a_judge_user_prompt.txt ├── naive_llm_inference_multi_choice_structured.txt ├── naive_llm_inference_multi_choice_unstructured.txt └── naive_llm_inference_open_ended.txt ├── pseudo_simulator.py ├── requirements.txt ├── results ├── entire_log-friends-model_gpt-4o-mini-debug_True-quantization_4bit-time_limit_6.0-history_type_utts-openai-emb_original-version_0.json ├── entire_log-friends-model_gpt-4o-mini-debug_True-quantization_4bit-time_limit_600.0-history_type_session-entire-bm25_original-version_2.json ├── entire_log-friends-model_mistral-7b-it-debug_True-quantization_4bit-time_limit_600.0-history_type_session-entire-bm25_original-version_1.json ├── results-friends-model_gpt-4o-mini-debug_True-quantization_4bit-time_limit_6.0-history_type_utts-openai-emb_original-version_0.json ├── results-friends-model_gpt-4o-mini-debug_True-quantization_4bit-time_limit_600.0-history_type_session-entire-bm25_original-version_2.json └── results-friends-model_mistral-7b-it-debug_True-quantization_4bit-time_limit_600.0-history_type_session-entire-bm25_original-version_1.json ├── scripts └── script_1.sh ├── simulator.py └── utils ├── __init__.py ├── __pycache__ ├── __init__.cpython-311.pyc └── utils.cpython-311.pyc └── utils.py /README.md: -------------------------------------------------------------------------------- 1 | # DialSim 2 | 3 | We introduce DialSim, a dialogue simulator. In this simulator, an agent is assigned the role of a character from popular TV shows, requiring it to respond to spontaneous questions using past dialogue information and to distinguish between known and unknown information. Key features of DialSim include assessing the agent’s ability to understand long-term multi-party dialogues and evaluating performance under randomized questioning with LongDialQA, a novel, high-quality question-answering dataset. 4 | 5 | ## Dataset 6 | You can download the dataset [here](https://drive.google.com/drive/folders/1MhPlUFWuchVZ5E1NQDWfbT7_RW7ozbuk?usp=drive_link). 7 | 8 | v1.0: This version includes the dataset as described in the paper. 9 | 10 | v1.1: To incorporate more diverse and challenging data, this version has been updated to include unanswerable multi-hop questions. 11 | 12 | ## Experimental Setup 13 | 14 | After downloading appropriate version of ```torch```, do: 15 | 16 | 1. ```pip install -r requirements.txt``` 17 | 18 | 2. ```mkdir data``` 19 | 20 | 3. ```mv dialsim_v1.1.zip ./data/``` 21 | 22 | 4. ```cd data``` 23 | 24 | 5. ```unzip dialsim_v1.1.zip``` 25 | 26 | ## Simulation 27 | Command Example: 28 | ```CUDA_VISIBLE_DEVICES=0 python simulator.py --model_name "llama3.1-70b-it" --quantization "4bit" --script_name "friends" --history_type "session-entire" --ret_method "bm25" --trial_version 0 --sh_number 0 --num_cores 10 --answer_format "multi_choice_structured" --openai_api_key "<>(not required in this line)"``` 29 | 30 | ```CUDA_VISIBLE_DEVICES=0 python simulator.py --model_name "gpt-4o" --quantization "4bit" --script_name "friends" --history_type "session-summary" --ret_method "bm25" --trial_version 0 --sh_number 0 --num_cores 10 --answer_format "open_ended" --openai_api_key "<>(required in this line)"``` 31 | 32 | ```CUDA_VISIBLE_DEVICES=0 python simulator.py --model_name "gpt-4o-mini" --quantization "4bit" --script_name "friends" --history_type "utts" --ret_method "openai-emb" --trial_version 0 --sh_number 0 --num_cores 10 --answer_format "multi_choice_unstructured" --openai_api_key "<>(required in this line)"``` 33 | 34 | ```CUDA_VISIBLE_DEVICES=0 python simulator.py --model_name "llama3.1-8b-it" --quantization "4bit" --script_name "friends" --ret_method "no_ret" --trial_version 0 --sh_number 0 --num_cores 10 --answer_format "multi_choice_unstructured" --openai_api_key "<>(required in this line)"``` 35 | 36 | #### Arguments 37 | - `model_name`: Specifies the model to use, default is "gpt-3.5-turbo". Options include "llama3.1-8b-it", "llama3.1-70b-it", "tulu2-7b-dpo", "tulu2-70b-dpo", "gemma-2b-it", "gemma-7b-it", "mistral-7b-it", "mixtral-it", "claude-3", "claude-2.1", and model names for openai models and gemini models. 38 | - `quantization`: Model quantization level, default is "no". Options include "no", "16bit", "8bit", and "4bit". 39 | - `script_name`: TV show script for the simulation, default is "friends". Options include "friends", "bigbang", and "theoffice". 40 | - `history_type`: Method for saving history, default is "session-entire". Options include "utts", "session-entire", and "session-summary". 41 | - `num_ret_history`: Number of retrieved histories to use. Modify lines 180-198, 222-236 in `simulator.py` to change this number. 42 | - `ret_method`: Retrieval method, default is "bm25". Options include "openai-emb", "bm25", "no_ret", and "oracle". 43 | - `name_shuffle`: Type of adversarial test, default is "original". Options include "original", "shuffle", and "new_name". 44 | - `answer_format`: The format of the answer the model to generate, default is "multi_choice_structured". Options include "multi_choice_structured", "multi_choice_unstructured", "open_ended". 45 | - `trial_version`: Experiment version number, default: 0 46 | - `sh_number`: Shell script number, default: 0 47 | - `num_cores`: Maximum number of CPU cores to use, default: 10 48 | - `openai_api_key`: Required if using openai models, or `ret_method="openai-emb"`, or using "multi_choice_unstructured" or "open_ended" for `answer_format`(we use gpt-4o-mini as a judge when `answer_format` is "multi_choice_unstructured" or "open_ended", and rule-based evaluation when `answer_format` is "multi_choice_structured"). 49 | - `gemini_api_key`: Required if using "gemini" in the model name. 50 | - `anthropic_api_key`: Required if using "claude-3" or "claude-2.1" in the model name. 51 | -------------------------------------------------------------------------------- /logging_results/__init__.py: -------------------------------------------------------------------------------- 1 | from .logging import log_answers, log_results, log_ret_history_question, log_times -------------------------------------------------------------------------------- /logging_results/__pycache__/__init__.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiho283/Simulator/d36924acbb65008e07dff83be43de596a4afef49/logging_results/__pycache__/__init__.cpython-311.pyc -------------------------------------------------------------------------------- /logging_results/__pycache__/logging.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiho283/Simulator/d36924acbb65008e07dff83be43de596a4afef49/logging_results/__pycache__/logging.cpython-311.pyc -------------------------------------------------------------------------------- /logging_results/logging.py: -------------------------------------------------------------------------------- 1 | import json 2 | def log_results(log_info, log_file_path=""): 3 | if log_file_path == "": 4 | raise AssertionError("Need to specify log directory.") 5 | info = { 6 | "score" : log_info["score"], 7 | "calibrated_score" : log_info["calibrated_score"], 8 | "calibrated_result_list" : log_info["calibrated_result_list"], 9 | "avg_answer_time" : log_info["result_time_mean"], 10 | "result_list" : log_info["result_list"], 11 | "result_time_list" : log_info["result_time_list"], 12 | "ambiguous_idx_list" : log_info["ambiguous_idx_list"], 13 | "ambiguous_answer_list" : log_info["ambiguous_answer_list"], 14 | "ambiguous_gold_answer_list" : log_info["ambiguous_gold_answer_list"] 15 | } 16 | with open(log_file_path, "w") as f: 17 | json.dump(info, f, indent=2) 18 | 19 | def log_answers(answer_list, gold_answer_list, log_file_path=""): 20 | if log_file_path == "": 21 | raise AssertionError("Need to specify log directory.") 22 | info = { 23 | "answer_list" : answer_list, 24 | "gold_answer_list" : gold_answer_list 25 | } 26 | with open(log_file_path, "w") as f: 27 | json.dump(info, f, indent=2) 28 | 29 | def log_ret_history_question(ret_history_question_answer_list, target_level_list, log_file_path=""): 30 | if log_file_path == "": 31 | raise AssertionError("Need to specify log directory.") 32 | info = [] 33 | for (ret_history, question, gold_answer, distilled_answer) in ret_history_question_answer_list: 34 | info.append({ 35 | "ret_history" : ret_history, 36 | "question" : question, 37 | "gold_answer" : gold_answer, 38 | "model_answer" : distilled_answer, 39 | "target_level_list" : target_level_list 40 | }) 41 | with open(log_file_path, "w") as f: 42 | json.dump(info, f, indent=2) 43 | 44 | def log_times(save_time_list, retrieve_search_time_list, ans_time_list, log_file_path=""): 45 | if log_file_path == "": 46 | raise AssertionError("Need to specify log directory.") 47 | info = [] 48 | for (save_time, retrieve_search_time, ans_time) in zip(save_time_list,retrieve_search_time_list, ans_time_list): 49 | info.append({ 50 | "save_time" : save_time, 51 | "retrieve_search_time" : retrieve_search_time, 52 | "ans_time" : ans_time 53 | }) 54 | with open(log_file_path, "w") as f: 55 | json.dump(info, f, indent=2) 56 | 57 | def log_calibration(log_info, log_file_path=""): 58 | if log_file_path == "": 59 | raise AssertionError("Need to specify log directory.") 60 | info = [] 61 | for (calibrated_result, calibrated_distilled_answer) in zip(log_info["calibrated_result_list"], log_info["calibrated_distilled_answer_list"]): 62 | info.append({ 63 | "calibrated_result" : calibrated_result, 64 | "calibrated_distilled_answer" : calibrated_distilled_answer 65 | }) 66 | with open(log_file_path, "w") as f: 67 | json.dump(info, f, indent=2) 68 | 69 | def log_everything(log_info, log_file_path=""): 70 | if log_file_path == "": 71 | raise AssertionError("Need to specify log directory.") 72 | with open(log_file_path, "w") as f: 73 | json.dump(log_info, f, indent=2) -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- 1 | from .api_based_inference import gpt35_inference, gpt4_inference, claude_inference, gemini_inference 2 | from .open_source_model_inference import open_source_model_inference 3 | from .load_model import load_model -------------------------------------------------------------------------------- /models/__pycache__/__init__.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiho283/Simulator/d36924acbb65008e07dff83be43de596a4afef49/models/__pycache__/__init__.cpython-311.pyc -------------------------------------------------------------------------------- /models/__pycache__/api_based_inference.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiho283/Simulator/d36924acbb65008e07dff83be43de596a4afef49/models/__pycache__/api_based_inference.cpython-311.pyc -------------------------------------------------------------------------------- /models/__pycache__/load_gemini.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiho283/Simulator/d36924acbb65008e07dff83be43de596a4afef49/models/__pycache__/load_gemini.cpython-311.pyc -------------------------------------------------------------------------------- /models/__pycache__/load_model.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiho283/Simulator/d36924acbb65008e07dff83be43de596a4afef49/models/__pycache__/load_model.cpython-311.pyc -------------------------------------------------------------------------------- /models/__pycache__/load_opensource_model.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiho283/Simulator/d36924acbb65008e07dff83be43de596a4afef49/models/__pycache__/load_opensource_model.cpython-311.pyc -------------------------------------------------------------------------------- /models/__pycache__/open_source_model_inference.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiho283/Simulator/d36924acbb65008e07dff83be43de596a4afef49/models/__pycache__/open_source_model_inference.cpython-311.pyc -------------------------------------------------------------------------------- /models/api_based_inference.py: -------------------------------------------------------------------------------- 1 | def gpt_inference(message, model_name, client, temperature=0.2, top_p=0.1): 2 | completion = client.chat.completions.create( 3 | model=model_name, 4 | messages=[ 5 | {"role": "system", "content": "You are a helpful assistant."}, 6 | {"role": "user", "content": message} 7 | ], 8 | temperature=temperature, 9 | top_p = top_p 10 | ) 11 | 12 | return completion.choices[0].message.content 13 | 14 | 15 | def gpt35_inference(message, client, temperature=0.2, top_p=0.1): 16 | # deprecated. Use `gpt_inference` instead. 17 | completion = client.chat.completions.create( 18 | model="gpt-3.5-turbo", 19 | messages=[ 20 | {"role": "system", "content": "You are a helpful assistant."}, 21 | {"role": "user", "content": message} 22 | ], 23 | temperature=temperature, 24 | top_p = top_p 25 | ) 26 | 27 | return completion.choices[0].message.content 28 | 29 | def gpt4_inference(message, client): 30 | # deprecated. Use `gpt_inference` instead. 31 | completion = client.chat.completions.create( 32 | model="gpt-4o", 33 | messages=[ 34 | {"role": "system", "content": "You are a helpful assistant."}, 35 | {"role": "user", "content": message} 36 | ], 37 | temperature=0.2, 38 | top_p = 0.1 39 | ) 40 | 41 | return completion.choices[0].message.content 42 | 43 | def claude_inference(message, model_name, client): 44 | model_dir = "" 45 | if model_name == "claude-3": 46 | model_dir = "claude-3-opus-20240229" 47 | elif model_name == "claude-2.1": 48 | model_dir = "claude-2.1" 49 | completion = client.messages.create( 50 | model=model_dir, 51 | max_tokens=4096, 52 | messages=[ 53 | {"role": "user", "content": message} 54 | ], 55 | temperature=0.2, 56 | top_p=0.1 57 | ) 58 | return completion.content[0].text 59 | 60 | def gemini_inference(message, model): 61 | response = model.generate_content(message) 62 | """ 63 | Somehow `response.text` doesn't work (As of April 2024). Using the alternative. 64 | 65 | The docs of Gemini says in 66 | `generativeai/types/generation_types.py` 67 | that `response.text` is a quick accessor of 68 | `self.candidates[0].parts[0].text`. 69 | Which I believe is a typo of 70 | `self.candidates[0].content.parts[0].text` 71 | 72 | """ 73 | result = "" 74 | try: 75 | result = response.text 76 | except: 77 | try: 78 | result = response.candidates[0].content.parts[0].text 79 | except: 80 | result = "" # Gemini can't generate some responses due to safety reasons. We will consider it as a wrong answer 81 | return result -------------------------------------------------------------------------------- /models/load_gemini.py: -------------------------------------------------------------------------------- 1 | import google.generativeai as genai 2 | 3 | def load_gemini( 4 | candidate_count:int = 1, 5 | max_output_tokens:int = 256, 6 | temperature:float = 0.2, 7 | top_p:float = 0.1, 8 | gemini_api_key:str = "", 9 | model_name:str="gemini-pro" 10 | ): 11 | if not gemini_api_key: 12 | raise AssertionError("You must specify your gemini API key") 13 | genai.configure(api_key=gemini_api_key) 14 | generation_config = { 15 | "candidate_count": candidate_count, 16 | "max_output_tokens": max_output_tokens, 17 | "temperature": temperature, 18 | "top_p": top_p, 19 | } 20 | safety_settings=[ 21 | { 22 | "category": "HARM_CATEGORY_DANGEROUS", 23 | "threshold": "BLOCK_NONE", 24 | }, 25 | { 26 | "category": "HARM_CATEGORY_HARASSMENT", 27 | "threshold": "BLOCK_NONE", 28 | }, 29 | { 30 | "category": "HARM_CATEGORY_HATE_SPEECH", 31 | "threshold": "BLOCK_NONE", 32 | }, 33 | { 34 | "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", 35 | "threshold": "BLOCK_NONE", 36 | }, 37 | { 38 | "category": "HARM_CATEGORY_DANGEROUS_CONTENT", 39 | "threshold": "BLOCK_NONE", 40 | }, 41 | ] 42 | if model_name == "gemini-1.5-pro": 43 | model = genai.GenerativeModel('gemini-1.5-pro', generation_config=generation_config, safety_settings=safety_settings) 44 | elif model_name == "gemini-pro": 45 | model = genai.GenerativeModel('gemini-pro', generation_config=generation_config, safety_settings=safety_settings) 46 | else: 47 | model = genai.GenerativeModel(model_name, generation_config=generation_config, safety_settings=safety_settings) 48 | return model -------------------------------------------------------------------------------- /models/load_model.py: -------------------------------------------------------------------------------- 1 | from models.load_opensource_model import load_opensource_model 2 | from models.load_gemini import load_gemini 3 | def load_model(model_name, quantization, gemini_api_key=""): 4 | model = None 5 | tokenizer = None 6 | config = None 7 | if "gpt" in model_name.lower(): 8 | pass 9 | elif model_name == "claude-3" or model_name == "claude-2.1": 10 | pass 11 | elif "gemini" in model_name: 12 | if not gemini_api_key: 13 | raise AssertionError("You must specify your gemini API key") 14 | model = load_gemini(gemini_api_key=gemini_api_key, model_name=model_name) 15 | else: 16 | model, tokenizer, config = load_opensource_model(model_name, quantization) 17 | 18 | return model, tokenizer, config -------------------------------------------------------------------------------- /models/load_opensource_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, AutoConfig 3 | 4 | def get_model_dir(model_name:str): 5 | model_dir = "" 6 | if model_name == "llama2-7b-chat": 7 | model_dir = "meta-llama/Llama-2-7b-chat-hf" 8 | elif model_name == "llama2-70b-chat": 9 | model_dir = "meta-llama/Llama-2-70b-chat-hf" 10 | elif model_name == "tulu2-7b-dpo": 11 | model_dir = "allenai/tulu-2-dpo-7b" 12 | elif model_name == "tulu2-70b-dpo": 13 | model_dir = "allenai/tulu-2-dpo-70b" 14 | elif model_name == "gemma-2b-it": 15 | model_dir = "google/gemma-2b-it" 16 | elif model_name == "gemma-7b-it": 17 | model_dir = "google/gemma-7b-it" 18 | elif model_name == "mistral-7b-it": 19 | model_dir = "mistralai/Mistral-7B-Instruct-v0.2" 20 | elif model_name == "mixtral-it": 21 | model_dir = "mistralai/Mixtral-8x7B-Instruct-v0.1" 22 | else: 23 | return AssertionError("Incorrect model name.") 24 | return model_dir 25 | 26 | def load_opensource_model(model_name:str, quantization:str="no"): 27 | model_dir = get_model_dir(model_name) 28 | if quantization == "no": 29 | model = AutoModelForCausalLM.from_pretrained(model_dir, device_map="auto", trust_remote_code=True) 30 | elif quantization == "4bit": 31 | bnb_config_4bit = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16) 32 | model = AutoModelForCausalLM.from_pretrained(model_dir, device_map="auto", quantization_config=bnb_config_4bit, trust_remote_code=True) 33 | elif quantization == "8bit": 34 | bnb_config_8bit = BitsAndBytesConfig(load_in_8bit=True, bnb_8bit_compute_dtype=torch.bfloat16) 35 | model = AutoModelForCausalLM.from_pretrained(model_dir, device_map="auto", quantization_config=bnb_config_8bit, trust_remote_code=True) 36 | elif quantization == "16bit": 37 | bnb_config_16bit = BitsAndBytesConfig(load_in_16bit=True, bnb_16bit_compute_dtype=torch.bfloat16) 38 | model = AutoModelForCausalLM.from_pretrained(model_dir, device_map="auto", quantization_config=bnb_config_16bit, trust_remote_code=True) 39 | else: 40 | raise AssertionError("quantization should be in ['no', '4bit', '8bit', '16bit']") 41 | if 'tulu' in model_name: 42 | tokenizer = AutoTokenizer.from_pretrained(model_dir, legacy=False, trust_remote_code=True) 43 | else: 44 | tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True) 45 | config = AutoConfig.from_pretrained(model_dir, trust_remote_code=True) 46 | return model, tokenizer, config 47 | 48 | def load_opensource_tokenizer(model_name:str): 49 | model_dir = get_model_dir(model_name) 50 | tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True) 51 | return tokenizer 52 | -------------------------------------------------------------------------------- /models/open_source_model_inference.py: -------------------------------------------------------------------------------- 1 | def open_source_model_inference(message, model_name, model, tokenizer, config, output_max_length=128, temperautre=0.2, top_p=0.1): 2 | """ 3 | Inference for open-source models. 4 | Tantamount to `gpt35_inference`. 5 | 6 | Parameters: 7 | message (str): prompt for the model 8 | model_name (str) : name of the model 9 | model: model 10 | tokenizer: tokenizer 11 | config: configuration 12 | 13 | Returns: 14 | generated_text: generated text of the model. We disregard the input prompt from the output of the model. 15 | """ 16 | input_ids = None 17 | # TODO: generalize this. 18 | if "mistral" in model_name or "mixtral" in model_name or "gemma" in model_name: 19 | messages = [ 20 | {"role": "user", "content": message}, 21 | ] 22 | 23 | encodeds = tokenizer.apply_chat_template(messages, return_tensors="pt") 24 | model_inputs = encodeds.to("cuda") 25 | model_inputs_len = model_inputs.shape[1] 26 | 27 | generated_ids = model.generate(model_inputs, max_new_tokens=output_max_length, do_sample=True, pad_token_id=tokenizer.eos_token_id)[:, model_inputs_len:] 28 | decoded = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) 29 | decoded = decoded[0] 30 | return decoded 31 | 32 | elif "tulu" in model_name: 33 | message = f"<|user|>\n{message}\n<|assistant|>" # <|assistant|>\n 34 | input_ids = tokenizer(message, return_tensors="pt", max_length=config.max_position_embeddings, truncation=True).to("cuda").input_ids 35 | 36 | elif "llama2" in model_name: 37 | message = f"[INST] {message} [/INST]" 38 | input_ids = tokenizer(message, return_tensors="pt", max_length=config.max_position_embeddings, truncation=True).to("cuda").input_ids 39 | 40 | 41 | input_len = len(input_ids[0]) 42 | generated_ids = None 43 | try: 44 | generated_ids = model.generate(input_ids, max_length=input_len+output_max_length, temperature=temperautre, top_p=top_p, pad_token_id=tokenizer.eos_token_id) 45 | except: 46 | generated_ids = model.generate(input_ids, max_length=input_len+output_max_length, temperature=temperautre, top_p=top_p) 47 | 48 | generated_text = tokenizer.decode(generated_ids[0][input_len:], skip_special_tokens=True) 49 | 50 | return generated_text -------------------------------------------------------------------------------- /post_processing/__init__.py: -------------------------------------------------------------------------------- 1 | from .process_answer import judge_eq, distill_answer -------------------------------------------------------------------------------- /post_processing/__pycache__/__init__.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiho283/Simulator/d36924acbb65008e07dff83be43de596a4afef49/post_processing/__pycache__/__init__.cpython-311.pyc -------------------------------------------------------------------------------- /post_processing/__pycache__/process_answer.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiho283/Simulator/d36924acbb65008e07dff83be43de596a4afef49/post_processing/__pycache__/process_answer.cpython-311.pyc -------------------------------------------------------------------------------- /post_processing/process_answer.py: -------------------------------------------------------------------------------- 1 | import json 2 | from pydantic import BaseModel 3 | from utils import open_file 4 | 5 | class ResultFormat(BaseModel): 6 | correct: bool 7 | wrong: bool 8 | 9 | def gpt_judge(question, true_answer, answer, client, temperature=0.2, top_p=0.1, patience=3): 10 | patience = patience 11 | user_message = open_file("./prompt/gpt_as_a_judge_user_prompt.txt").replace("<<>>", question).replace("<<>>", true_answer).replace("<<>>", answer) 12 | result = "Ambiguous" 13 | while patience > 0: 14 | completion = client.chat.completions.create( 15 | model="gpt-4o-mini", 16 | response_format={"type": "json_object"}, 17 | messages=[ 18 | {"role": "system", "content": open_file("./prompt/gpt_as_a_judge_system_prompt.txt")}, 19 | {"role": "user", "content": user_message} 20 | ], 21 | temperature=temperature, 22 | top_p = top_p 23 | ) 24 | #response = completion.choices[0].message 25 | result_dict = json.loads(completion.choices[0].message.content) 26 | if result_dict["correct"]: 27 | result = "Correct" 28 | break 29 | elif result_dict["wrong"]: 30 | result = "Wrong" 31 | break 32 | else: 33 | patience -= 1 34 | 35 | return result 36 | 37 | def judge_eq(true_answer_op, answer, question, client=None, answer_format="multi_choice_structured"): 38 | """ 39 | A function that judges whether the model's output is correct or not. 40 | Handles ambiguous cases. If ambiguous, we return the flag of ambiguity with `True`. 41 | 42 | Parameters: 43 | true_answer_op: one of (A), (B), (C), (D), (E) 44 | answer: generated text from the model 45 | 46 | Returns: 47 | result (str): 'Correct' or 'Wrong' 48 | is_ambiguous (bool): flag of cases whether the model is outputting ambiguous answer. 49 | """ 50 | if answer_format not in ["multi_choice_structured", "multi_choice_unstructured", "open_ended"]: 51 | raise ValueError("answer_format should be one of ['multi_choice_structured', 'multi_choice_unstructured', 'open_ended']") 52 | is_ambiguous = False 53 | # is_ambiguous: count of cases when the model is outputting multiple options 54 | ## this is ambiguous because: 55 | ### (1) the model is just enumerating multiple options without actually answering the question 56 | ### (2) the model is reasoning with explicitly mentioning other options and gives the correct answer 57 | ### (3) the model is reasoning with explicitly mentioning other options and gives the wrong answer 58 | result = 'Wrong' 59 | if answer_format == "multi_choice_structured": 60 | if answer == "": 61 | is_ambiguous = False 62 | result = 'Wrong' 63 | return result, is_ambiguous 64 | elif true_answer_op in answer: # if the true answer is in the generated text 65 | op_list = ['(A)', '(B)', '(C)', '(D)', '(E)', '(a)', '(b)', '(c)', '(d)', '(e)'] 66 | try: 67 | op_list.remove(true_answer_op) 68 | op_list.remove(true_answer_op.lower()) 69 | except: 70 | print(f"UNEXPECTED ERROR: true_answer_op is set to {true_answer_op}.") 71 | for op in op_list: 72 | if op in answer: # if the other options are in the generated text 73 | result = 'Ambiguous' 74 | is_ambiguous = True 75 | return result, is_ambiguous 76 | result = 'Correct' 77 | return result, is_ambiguous 78 | elif answer_format in ["multi_choice_unstructured", "open_ended"]: 79 | if answer == true_answer_op: 80 | result = "Correct" 81 | is_ambiguous = False 82 | return result, is_ambiguous 83 | elif answer == f"{true_answer_op}.": 84 | result = "Correct" 85 | is_ambiguous = False 86 | return result, is_ambiguous 87 | else: 88 | result = gpt_judge(question, true_answer_op, answer, client) 89 | if result == "Ambiguous": 90 | is_ambiguous = True 91 | return result, is_ambiguous 92 | 93 | 94 | def distill_answer(answer): 95 | # answer: model output 96 | # returns: distilled answer from model output 97 | # should be called after measuring answer time 98 | candidate_list = ["(A)", "(B)", "(C)", "(D)", "(E)"] 99 | for candidate in candidate_list: 100 | if candidate in answer or candidate.lower() in answer: 101 | op_list = ['(A)', '(B)', '(C)', '(D)', '(E)', '(a)', '(b)', '(c)', '(d)', '(e)'] 102 | op_list.remove(candidate) 103 | op_list.remove(candidate.lower()) 104 | for op in op_list: 105 | if op in answer: 106 | return "([/AMB/])" # when output has multiple options 107 | return candidate 108 | return "([N/A])" 109 | 110 | def calibrate(original_result, is_ambiguous, true_answer_op, answer, question, distilled_answer, answer_format="multi_choice_structured", client=None, lenient=True): 111 | """Disambiguates ambiguous answers and evaluate model's answsers with a more lenient criteria. 112 | By using this function, you consider the followings to also be a correct answer from a model. 113 | - (lenient) When the model outputs only the description of the option. e.g. "Chandler" instead of "(C) Chandler" 114 | - (lenient) Various absentions when the true answer is "(E)". e.g., No options in the text, but the model answers "None of the options are correct" 115 | - (disambiguation) When it answers it correctly, but also has other options in the answer. 116 | 117 | Paramters: 118 | - true_answer_op (str): True answer. one of (A), (B), (C), (D), (E) if multiple choice question. Else it is a text. 119 | - answer (str): Model's output 120 | - question (str): question asked along with MC options 121 | - distilled answer (str): output of `distill_answer` 122 | - lenient (bool): Whether to use a more lenient evaluating system. 123 | Returns: 124 | - result (str): Wrong, Correct, Ambiguous 125 | - is_ambiguous (bool): whether the result('Wrong') is still ambigous or not 126 | - calibrated_distilled_answer (str): calibrated version of the output of `distill_answer` 127 | """ 128 | if not is_ambiguous: 129 | return original_result, is_ambiguous, distilled_answer 130 | if answer_format not in ["multi_choice_structured", "multi_choice_unstructured", "open_ended"]: 131 | raise ValueError("answer_format should be one of ['multi_choice_structured', 'multi_choice_unstructured', 'open_ended']") 132 | 133 | if answer_format in ["multi_choice_unstructured", "open_ended"]: 134 | return original_result, is_ambiguous, distilled_answer 135 | 136 | else: 137 | op2idx = { 138 | "(A)" : 1, 139 | "(B)" : 2, 140 | "(C)" : 3, 141 | "(D)" : 4, 142 | "(E)" : 5 143 | } 144 | true_text = question.split("\t")[op2idx[true_answer_op]].replace(true_answer_op, "").strip() 145 | if distilled_answer == "([/AMB/])": 146 | if "none of the options" in answer: 147 | calibrated_distilled_answer = "(E)" 148 | if true_answer_op == "(E)": 149 | result = "Correct" 150 | else: 151 | result = "Wrong" 152 | is_ambiguous = False 153 | return result, is_ambiguous, calibrated_distilled_answer 154 | for option in ['(A)', '(B)', '(C)', '(D)', '(E)']: 155 | if f"answer is {option}" in answer: 156 | calibrated_distilled_answer = option 157 | if true_answer_op == option: 158 | result = "Correct" 159 | else: 160 | result = "Wrong" 161 | is_ambiguous = False 162 | return result, is_ambiguous, calibrated_distilled_answer 163 | if "[EXPLANATION]" in answer.upper(): 164 | explicit_answer = answer.upper().split("[EXPLANATION]")[0] 165 | if true_answer_op in explicit_answer: 166 | op_list = ['(A)', '(B)', '(C)', '(D)', '(E)'] 167 | try: 168 | op_list.remove(true_answer_op) 169 | except: 170 | print(f"UNEXPECTED ERROR: true_answer_op is set to {true_answer_op}.") 171 | for op in op_list: 172 | if op in answer: # if the other options are in the generated text 173 | result = 'Ambiguous' 174 | is_ambiguous = True 175 | return result, is_ambiguous, "([/AMB/])" 176 | result = 'Correct' 177 | is_ambiguous = False 178 | calibrated_distilled_answer = true_answer_op 179 | return result, is_ambiguous, calibrated_distilled_answer 180 | elif true_text in explicit_answer and lenient: # lenient because we are allowing violation of the format 181 | text_A = question.split("\t")[1].replace("(A)","").strip() 182 | text_B = question.split("\t")[2].replace("(B)","").strip() 183 | text_C = question.split("\t")[3].replace("(C)","").strip() 184 | text_D = question.split("\t")[4].replace("(D)","").strip() 185 | text_E = question.split("\t")[5].replace("(E)","").strip() 186 | text_list = [text_A, text_B, text_C, text_D, text_E] 187 | try: 188 | text_list.remove(true_text) 189 | except: 190 | raise AssertionError(f"UNEXPECTED ERROR: true_text is set to {true_text}.") 191 | for text in text_list: 192 | if text in explicit_answer: 193 | result = 'Ambiguous' 194 | is_ambiguous = True 195 | print(f"AMBIGUOUS. true text: {true_text} answer: {answer}") 196 | return result, is_ambiguous, "([/AMB/])" 197 | result = "Correct" 198 | is_ambiguous = False 199 | calibrated_distilled_answer = true_answer_op 200 | return result, is_ambiguous, calibrated_distilled_answer 201 | else: 202 | return "Wrong", True, "([/AMB/])" 203 | 204 | if distilled_answer == "([N/A])" and lenient: 205 | if "none of the options" in answer or "I cannot answer this question" in answer or "context does not" in answer or "cannot answer" in answer or "does not specify" in answer or "does not provide" in answer: 206 | calibrated_distilled_answer = "(E)" 207 | if true_answer_op == "(E)": 208 | result = "Correct" 209 | else: 210 | result = "Wrong" 211 | is_ambiguous = False 212 | return result, is_ambiguous, calibrated_distilled_answer 213 | if answer.replace("\n","").replace(" ","").replace("\t","") == "": 214 | is_ambiguous = False 215 | result = 'Wrong' 216 | return result, is_ambiguous, "([N/A])" 217 | if true_text in answer: 218 | text_A = question.split("\t")[1].replace("(A)","").strip() 219 | text_B = question.split("\t")[2].replace("(B)","").strip() 220 | text_C = question.split("\t")[3].replace("(C)","").strip() 221 | text_D = question.split("\t")[4].replace("(D)","").strip() 222 | text_E = question.split("\t")[5].replace("(E)","").strip() 223 | text_list = [text_A, text_B, text_C, text_D, text_E] 224 | try: 225 | text_list.remove(true_text) 226 | except: 227 | raise AssertionError(f"UNEXPECTED ERROR: true_text is set to {true_text}.") 228 | for text in text_list: 229 | if text in answer: 230 | result = 'Ambiguous' 231 | is_ambiguous = True 232 | print(f"AMBIGUOUS. true text: {true_text} answer: {answer}") 233 | return result, is_ambiguous, "([/AMB/])" 234 | result = "Correct" 235 | is_ambiguous = False 236 | calibrated_distilled_answer = true_answer_op 237 | return result, is_ambiguous, calibrated_distilled_answer 238 | else: 239 | return "Wrong", True, "([/AMB/])" 240 | if "<<>>" 244 | else: 245 | if distilled_answer == true_answer_op: 246 | result = "Correct" 247 | else: 248 | result = "Wrong" 249 | is_ambiguous = False 250 | return result, is_ambiguous, distilled_answer -------------------------------------------------------------------------------- /prompt/RAG_qa_prompt_multi_choice_structured.txt: -------------------------------------------------------------------------------- 1 | You are <<>>, a long-term conversation agent capable of interacting with multiple users. 2 | Based on the [Retrieved Dialog History] provided, please answer the given [Question]. 3 | Note the following points: 4 | 1. Your answer must exclusively be one of the options: (A), (B), (C), (D), (E). 5 | 2. Your responses should solely rely on the retrieved dialog history. If the information in the dialog history is insufficient to answer the question, you must choose (E). 6 | 3. This question is being asked in the context of <<>>. 7 | 8 | [Retrieved Dialog History] 9 | <<>> 10 | [Question] <<>> 11 | [Answer] -------------------------------------------------------------------------------- /prompt/RAG_qa_prompt_multi_choice_unstructured.txt: -------------------------------------------------------------------------------- 1 | You are <<>>, a long-term conversation agent capable of interacting with multiple users. 2 | Based on the [Retrieved Dialog History] provided, please answer the given [Question]. 3 | Note the following points: 4 | 1. Your answer must be one of the options from the question. 5 | 2. Your responses should solely rely on the retrieved dialog history. If the information in the dialog history is insufficient to answer the question, you must reply with "I don't know". 6 | 3. This question is being asked in the context of <<>>. 7 | 8 | [Retrieved Dialog History] 9 | <<>> 10 | [Question] <<>> -------------------------------------------------------------------------------- /prompt/RAG_qa_prompt_open_ended.txt: -------------------------------------------------------------------------------- 1 | You are <<>>, a long-term conversation agent capable of interacting with multiple users. 2 | Based on the [Retrieved Dialog History] provided, please answer the given [Question]. 3 | Note the following points: 4 | 1. Your responses should solely rely on the retrieved dialog history. If the information in the dialog history is insufficient to answer the question, you must admit that you don't know the answer. 5 | 2. This question is being asked in the context of <<>>. 6 | 7 | [Retrieved Dialog History] 8 | <<>> 9 | [Question] <<>> -------------------------------------------------------------------------------- /prompt/chatgpt_summarize_prompt.txt: -------------------------------------------------------------------------------- 1 | Given the following dialog, identify and summarize the key points, including any specific details, conclusions, relationships between speakers or action items that are discussed. Ensure the summary is concise yet captures the essence and any decisions made or opinions expressed. Summarize the following dialog and do not generate any explanation. 2 | 3 | [Dialog] 4 | <<>> 5 | 6 | [Summary] -------------------------------------------------------------------------------- /prompt/gpt_as_a_judge_system_prompt.txt: -------------------------------------------------------------------------------- 1 | You have to judge the correctness of a to a corresponding , based on . 2 | 3 | If the basically says the same thing as the , you should say that the is correct. 4 | Otherwise, you should say that the is wrong. 5 | 6 | Your answer should follow the json format below: 7 | 8 | ```json 9 | { 10 | "correct": True or False in Boolean format, 11 | "wrong": True or False in Boolean format, 12 | } 13 | ``` 14 | 15 | Only one of the "correct" and "wrong" should be True, and the other should be False. 16 | "correct" and "wrong" can't be False at the same time. -------------------------------------------------------------------------------- /prompt/gpt_as_a_judge_user_prompt.txt: -------------------------------------------------------------------------------- 1 | 2 | <<>> 3 | 4 | 5 | <<>> 6 | 7 | 8 | <<>> -------------------------------------------------------------------------------- /prompt/naive_llm_inference_multi_choice_structured.txt: -------------------------------------------------------------------------------- 1 | You are <<>>, a long-term conversation agent capable of interacting with multiple users. 2 | Based on the [Dialog History] provided, please answer the given [Question]. 3 | Note the following points: 4 | 1. Your answer must exclusively be one of the options: (A), (B), (C), (D), (E). 5 | 2. Your responses should solely rely on the dialog history. If the information in the dialog history is insufficient to answer the question, you must choose (E). 6 | 3. This question is being asked in the context of <<>>. 7 | 8 | [Dialog History] 9 | <<>> 10 | [Question] <<>> 11 | [Answer] -------------------------------------------------------------------------------- /prompt/naive_llm_inference_multi_choice_unstructured.txt: -------------------------------------------------------------------------------- 1 | You are <<>>, a long-term conversation agent capable of interacting with multiple users. 2 | Based on the [Dialog History] provided, please answer the given [Question]. 3 | Note the following points: 4 | 1. Your answer must be one of the options from the question. 5 | 2. Your responses should solely rely on the retrieved dialog history. If the information in the dialog history is insufficient to answer the question, you must reply with "I don't know". 6 | 3. This question is being asked in the context of <<>>. 7 | 8 | [Dialog History] 9 | <<>> 10 | [Question] <<>> 11 | [Answer] -------------------------------------------------------------------------------- /prompt/naive_llm_inference_open_ended.txt: -------------------------------------------------------------------------------- 1 | You are <<>>, a long-term conversation agent capable of interacting with multiple users. 2 | Based on the [Dialog History] provided, please answer the given [Question]. 3 | Note the following points: 4 | 1. Your responses should solely rely on the retrieved dialog history. If the information in the dialog history is insufficient to answer the question, you must admit that you don't know the answer. 5 | 2. This question is being asked in the context of <<>>. 6 | 7 | [Dialog History] 8 | <<>> 9 | [Question] <<>> 10 | [Answer] -------------------------------------------------------------------------------- /pseudo_simulator.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | from copy import deepcopy 4 | import time 5 | import random 6 | import pandas as pd 7 | from openai import OpenAI 8 | from anthropic import Anthropic 9 | import time 10 | import signal 11 | import warnings 12 | import argparse 13 | from rank_bm25 import BM25Okapi 14 | from nltk.tokenize import word_tokenize 15 | import nltk 16 | 17 | from logging_results.logging import log_results, log_everything 18 | from post_processing.process_answer import judge_eq, distill_answer, calibrate 19 | from models.api_based_inference import gpt_inference, claude_inference, gemini_inference 20 | from models.open_source_model_inference import open_source_model_inference 21 | from models.load_opensource_model import load_opensource_tokenizer 22 | from models.load_model import load_model 23 | from utils.utils import get_embedding, search_history, open_file, name_change, extract_gt_sessions_bm25_date 24 | warnings.filterwarnings('ignore') 25 | #from func_timeout import func_set_timeout, FunctionTimedOut, func_timeout 26 | 27 | 28 | def parse_args(): 29 | parser = argparse.ArgumentParser() 30 | parser.add_argument("--model_name", type=str, default="gpt-3.5-turbo", help="name of the model. Default: 'gpt-3.5-turbo'.") 31 | parser.add_argument("--debug", action=argparse.BooleanOptionalAction, help="if set, use truncated dataset for debugging.") 32 | parser.add_argument("--debug_n_episodes", type=int, default=5, help="number of episodes to evalutate on debug mode.") 33 | parser.add_argument("--quantization", type=str, default="no", help="either to quantize the model or not. Default: False") 34 | parser.add_argument("--script_name", type=str, default='friends', help="name of the script to evaluate. Should be one of ('friends', 'bigbang', 'theoffice'). Default: 'friends'") 35 | parser.add_argument("--sleep_time", type=float, default=5, help="time limit in seconds for model response. Default: 5") 36 | parser.add_argument('--history_type', type=str, default='session-entire', help="How to store conversation history.") 37 | parser.add_argument('--num_ret_history', type=int, default=10, help="Number of histories we are going to retrieve. Default: 10.") 38 | parser.add_argument('--ret_method', type=str, default='bm25', help=" Default: openai-emb. Should be one of ('openai-emb', 'bm25', 'no_ret')") 39 | parser.add_argument('--name_shuffle', type=str, default='original', help=" Default: original. Should be one of ('original', 'shuffle', 'new_name')") 40 | parser.add_argument('--trial_version', type=int, default=0, help= "version number of the experiment.") 41 | parser.add_argument('--sh_number', type=int, default=0, help='shell script number') 42 | parser.add_argument('--num_cores', type=int, default=10, help='upper bound of number of cpu cores') 43 | parser.add_argument('--openai_api_key', type=str, default="", help="OpenAI API Key") 44 | parser.add_argument('--gemini_api_key', type=str, default="", help="Gemini API key") 45 | parser.add_argument('--antrhopic_api_key', type=str, default="", help="Anthropic API key") 46 | parser.add_argument('--fast_eval', type=str, default="yes", help="When set to 'yes', the simulator proceeds to the next utterance without waiting for the time interval if the history has already been updated. Should be one of ('yes', 'no')") 47 | parser.add_argument('--answer_format', type=str, default='multi_choice_structured', help="the format of the answer of the agent.") 48 | return parser.parse_args() 49 | 50 | def answer_question(model_name, client, model, tokenizer, config, prompt): 51 | answer = "" 52 | try: 53 | if "gpt" in model_name.lower(): 54 | answer = gpt_inference(prompt, model_name, client) 55 | elif model_name == "claude-3" or model_name == "claude-2.1": 56 | answer = claude_inference(prompt, model_name, client) 57 | elif model_name == "gemini": 58 | answer = gemini_inference(prompt, model) 59 | else: 60 | answer = open_source_model_inference(prompt, model_name, model, tokenizer, config) 61 | except: 62 | pass 63 | return answer 64 | 65 | def retrieve_history(ret_method, num_ret_history, openai_client, max_token_len, save_result, char_ask_sh, real_question_sh, data_dict, gt_sessions): 66 | ret_histories = '' 67 | if ret_method == 'openai-emb': 68 | if len(data_dict['history']) == 0: 69 | ret_histories = "No history.\n" 70 | else: 71 | res = search_history(save_result, f'{char_ask_sh}: {real_question_sh}', client=openai_client, n=num_ret_history) 72 | for ret_history in list(res['history']): 73 | ret_histories = ret_histories + ret_history + '\n' 74 | elif ret_method == 'bm25': 75 | if len(data_dict['history']) == 0: 76 | ret_histories = "No history.\n" 77 | else: 78 | tokenized_query = word_tokenize(f'{char_ask_sh}: {real_question_sh}'.lower()) 79 | doc_scores = save_result.get_scores(tokenized_query) 80 | top_doc_indices = sorted(range(len(doc_scores)), key=lambda i: doc_scores[i], reverse=True)[:num_ret_history] 81 | top_docs = [data_dict['history'][i] for i in top_doc_indices] 82 | for ret_history in top_docs: 83 | ret_histories = ret_histories + ret_history + '\n' 84 | elif ret_method == 'no_ret': 85 | total_token_len = 0 86 | ret_his_inds = [] 87 | if len(data_dict['history']) == 0: 88 | ret_histories = "No history.\n" 89 | else: 90 | for h_ind in range(len(data_dict['ada_embedding'])): 91 | total_token_len += data_dict['ada_embedding'][-1-h_ind] 92 | if total_token_len > max_token_len - 500: 93 | break 94 | ret_his_inds.append(-1-h_ind) 95 | ret_histories = data_dict['history'][-1-h_ind] + '\n' + ret_histories 96 | elif ret_method == 'oracle': 97 | ret_histories = gt_sessions 98 | return ret_histories 99 | 100 | 101 | def save_history(history_num, history, history_type, date, cur_conv_num, un, post_utterances, utter_post, model_name, client, model, tokenizer, config, data_dict, ret_method, llama_tokenizer): 102 | if history_type == "utts": 103 | processed_history = f"[Date: {date}, Session #{cur_conv_num}, Utterance #{history_num+1}] {history}" 104 | history_num += 1 105 | elif history_type == "session-entire": 106 | processed_history = f"[Date: {date}, Session #{cur_conv_num}]\n{history}" 107 | elif history_type == "session-summary": 108 | history_sum = "" 109 | if un == len(post_utterances)-1: 110 | sum_prompt = open_file('./prompt/chatgpt_summarize_prompt.txt').replace('<<>>', history) 111 | try: 112 | if "gpt" in model_name.lower(): 113 | history_sum = gpt_inference(sum_prompt, model_name, client) 114 | elif model_name == "claude-3" or model_name == "claude-2.1": 115 | history_sum = claude_inference(sum_prompt, model_name, client) 116 | elif model_name == "gemini": 117 | history_sum = gemini_inference(sum_prompt, model) 118 | else: 119 | history_sum = open_source_model_inference(sum_prompt, model_name, model, tokenizer, config) 120 | except: 121 | pass 122 | else: 123 | history_sum = history 124 | processed_history = f"[Date: {date}, Session #{cur_conv_num}]\n{history_sum}\n" 125 | 126 | #save_to_data_dict_signal_handler = signal_handler 127 | 128 | if ret_method == 'openai-emb': 129 | embedding_vec = get_embedding(processed_history, client=client, model="text-embedding-3-small") 130 | data_dict['history'].append(processed_history) 131 | data_dict['ada_embedding'].append(embedding_vec) 132 | data_df = pd.DataFrame(data_dict) 133 | return data_df, history_num#, data_dict 134 | elif ret_method == 'bm25': 135 | 136 | data_dict['history'].append(processed_history) 137 | tokenized_docs = [word_tokenize(doc.lower()) for doc in data_dict['history']] 138 | bm25 = BM25Okapi(tokenized_docs) 139 | return bm25, history_num#, data_dict 140 | elif ret_method == 'no_ret': 141 | token_len = llama_tokenizer(processed_history, return_tensors="pt", truncation=True).input_ids.shape[1] 142 | data_dict['history'].append(processed_history) 143 | data_dict['ada_embedding'].append(token_len) 144 | return None, history_num#, data_dict 145 | elif ret_method == "oracle": 146 | return None, history_num#, data_dict 147 | else: 148 | return AssertionError("Incorrect `ret_method`.") 149 | 150 | 151 | def simulator( 152 | script_name, 153 | sleep_time=5, 154 | tkg_ratio=0.7, 155 | num_ret_history = 5, 156 | model_name:str="gpt-3.5-turbo", 157 | debug:bool=False, 158 | debug_n_episodes:int=5, 159 | quantization:str="no", 160 | history_type:str="session-entire", 161 | ret_method:str='bm25', 162 | name_shuffle:str='original', 163 | openai_api_key:str="", 164 | gemini_api_key:str="", 165 | antrhopic_api_key:str="", 166 | fast_eval:str="yes", 167 | answer_format:str="multi_choice_structured" 168 | ): 169 | """ 170 | script_name: script name ('friends', 'bigbang', 'theoffice') 171 | sleep_time: time for one utterance in the simulator. we do not use this parameter in unlimited simulator. (e.g. 3) 172 | tkg_ratio: the ratio of KG based question among the whole question set (0.0-1.0) 173 | mode: question type ('select' or 'free text') 174 | num_ret_history: the number of the retrieved utterances 175 | ret_method: retrieval method. openai embedding based: 'openai-emb', BM25 based: 'bm25', Naive LLM inference: 'no_ret'. 176 | """ 177 | 178 | model, tokenizer, config = load_model(model_name, quantization, gemini_api_key=gemini_api_key) 179 | simulator_start_time = time.time() 180 | if ret_method in ['no_ret', 'oracle']: 181 | history_type = 'session-entire' 182 | 183 | ####For hyperparameter 184 | 185 | if history_type == "utts": 186 | num_ret_history = 20 187 | elif history_type == "session-entire": 188 | if 'llama2' in model_name.lower(): 189 | num_ret_history = 3 190 | if ret_method == 'bm25': 191 | num_ret_history = 1 192 | elif 'tulu' in model_name.lower() or 'gemma' in model_name.lower(): 193 | if ret_method == 'bm25': 194 | num_ret_history = 2 195 | else: 196 | num_ret_history = 5 197 | elif "llama3.1-70b" in model_name.lower(): 198 | 199 | num_ret_history = 8 200 | else: 201 | num_ret_history = 10 202 | elif history_type == "session-summary": 203 | if 'llama2' in model_name.lower(): 204 | num_ret_history = 8 205 | else: 206 | num_ret_history = 15 207 | 208 | if ret_method == 'no_ret': 209 | llama_tokenizer = load_opensource_tokenizer("llama2-7b-chat") 210 | else: 211 | llama_tokenizer = "" 212 | 213 | max_token_len = 0 214 | if model_name == "gpt-3.5-turbo": 215 | max_token_len = 16000 216 | elif "gpt-4" in model_name.lower(): 217 | max_token_len = 128000 218 | elif model_name == "claude-3" or model_name == "claude-2.1": 219 | max_token_len = 200000 220 | elif model_name == "gemini": 221 | max_token_len = 32000 222 | elif 'tulu' in model_name.lower(): 223 | max_token_len = 6000 224 | elif 'llama3.1' in model_name.lower(): 225 | max_token_len = 8196 226 | else: 227 | try: 228 | max_token_len = config.max_position_embeddings 229 | except: 230 | max_token_len = 4000 231 | 232 | if ret_method == "oracle": 233 | if "gpt" in model_name.lower(): 234 | num_ret_history = 20 235 | elif model_name == "claude-3" or model_name == "claude-2.1": 236 | num_ret_history = 20 237 | elif "gemini" in model_name.lower(): 238 | num_ret_history = 20 239 | elif 'tulu' in model_name.lower(): 240 | num_ret_history = 4 241 | elif 'llama2' in model_name.lower(): 242 | num_ret_history = 2 243 | elif 'gemma' in model_name.lower(): 244 | num_ret_history = 10 245 | else: 246 | num_ret_history = 20 247 | 248 | 249 | openai_client = None 250 | if 'gpt' in model_name.lower() or 'openai' in ret_method: 251 | openai_client = OpenAI(api_key=openai_api_key) 252 | if answer_format in ['multi_choice_unstructured', 'open_ended']: 253 | openai_client = OpenAI(api_key=openai_api_key) 254 | anthropic_client = None 255 | if "claude" in model_name.lower(): 256 | anthropic_client = Anthropic(api_key=antrhopic_api_key) 257 | 258 | if "gpt" in model_name.lower(): 259 | client = openai_client 260 | if answer_format in ['multi_choice_unstructured', 'open_ended']: 261 | client = OpenAI(api_key=openai_api_key) 262 | elif "claude" in model_name.lower(): 263 | client = anthropic_client 264 | else: 265 | client = None 266 | 267 | with open(f'./data/{script_name}_dialsim.pickle', 'rb') as f: 268 | data = pickle.load(f) 269 | with open(f'./data/{script_name}_oracle_tkg.pickle', 'rb') as f_h: 270 | oracle_tkg = pickle.load(f_h) 271 | with open(f'./data/{script_name}_oracle_fan.pickle', 'rb') as f_e: 272 | oracle_fan = pickle.load(f_e) 273 | 274 | if script_name == 'friends': 275 | chatbot = 'Ross' 276 | elif script_name == 'bigbang': 277 | chatbot = 'Sheldon' 278 | elif script_name == 'theoffice': 279 | chatbot = 'Michael' 280 | else: 281 | assert 0 282 | 283 | data_dict = { 284 | 'ada_embedding': [], ### openai-emb -> embedding vector, no_ret -> token length 285 | 'history': [] 286 | } 287 | 288 | episodes = list(data) 289 | if debug: 290 | episodes = episodes[:debug_n_episodes] 291 | before_date = '' 292 | cur_conv_num = 1 293 | 294 | result_list = [] 295 | result_time_list = [] 296 | ambiguous_idx_list = [] # list of indices of the data (episode, session, question_prompt) where the model's output is ambiguous. 297 | ambiguous_answer_list = [] # list of answers(model output) that are ambiguous. 298 | ambiguous_gold_answer_list = [] # list of ground truth answers for the ambiguous answers. 299 | answer_list = [] # list of answers generated by the models. TODO: implement logging answers too. 300 | gold_answer_list = [] # list of ground truth (gold) answers 301 | ret_histories_question_answer_list = [] # list of (ret_histories, question) 302 | save_time_list = [] # list of saving time 303 | retrieve_search_time_list = [] # list of time spent in `search_history` 304 | ans_time_list = [] # list of time spent in answering 305 | calibrated_result_list = [] # list of calibrated answers 306 | calibrated_distilled_answer_list = [] # list of calibrated distilled answers 307 | epi_session_date_to_sessions = {} 308 | date_to_sessions = {} 309 | target_level_list = [] 310 | 311 | for epi in episodes: 312 | epi_session_date_to_sessions[epi] = {} 313 | epi_data = data[epi] 314 | session_nums = list(epi_data) 315 | 316 | for sc_num in session_nums: 317 | already_asked = 0 318 | script = epi_data[sc_num]['script'] 319 | date = epi_data[sc_num]['date'] 320 | date_splitted = date.replace(',', '').split() 321 | cannot_tkg = 0 322 | cannot_fan = 0 323 | temp_script = name_change(script_name, script, name_shuffle) 324 | epi_session_date_to_sessions[epi][sc_num] = {date: temp_script} 325 | 326 | try: 327 | date_to_sessions[date].append(temp_script) 328 | except: 329 | date_to_sessions[date] = [temp_script] 330 | 331 | ###Whether it is possible to ask tkg-based questions 332 | try: 333 | question_dict = epi_data[sc_num]['hard_q'] 334 | final_tkg_list = [] 335 | tkg_list = list(question_dict) 336 | for tkg in tkg_list: 337 | if len(question_dict[tkg]) > 0: 338 | final_tkg_list.append(tkg) 339 | tkg_target_type = random.choice(final_tkg_list) 340 | 341 | tkg_q_list = question_dict[tkg_target_type] 342 | target_question = random.choice(tkg_q_list) 343 | except: 344 | cannot_tkg=1 345 | pass 346 | 347 | ###Whether it is possible to ask fan quiz-based questions 348 | try: 349 | question_dict = epi_data[sc_num]['easy_q'] 350 | final_fan_list = [] 351 | fan_list = list(question_dict) 352 | for fan in fan_list: 353 | if len(list(question_dict[fan])) > 0: 354 | final_fan_list.append(fan) 355 | fan_target_type = random.choice(final_fan_list) 356 | 357 | fan_q_list = list(question_dict[fan_target_type]) 358 | fan_q_target_num = random.choice(fan_q_list) 359 | target_question = question_dict[fan_target_type][fan_q_target_num] 360 | except: 361 | cannot_fan = 1 362 | pass 363 | 364 | target_question_list = [] 365 | current_type = '' 366 | gt_sessions = "" 367 | target_dates_list = [] 368 | 369 | #### Question Selection (tkg or fan) 370 | rand_val = random.random() 371 | if cannot_fan == 1 and cannot_tkg == 1: 372 | target_question_list = ['cannot ask' for _ in range(20)] 373 | elif (cannot_fan == 1 and cannot_tkg == 0) or rand_val < tkg_ratio: 374 | question_dict = epi_data[sc_num]['hard_q'] 375 | final_tkg_list = [] 376 | fu_num = 0 377 | not_fu_list = [] 378 | tkg_list = list(question_dict) 379 | for tkg in tkg_list: 380 | if len(question_dict[tkg]) > 0: 381 | final_tkg_list.append(tkg) 382 | if 'fu' in tkg: 383 | fu_num += 1 384 | else: 385 | not_fu_list.append(tkg) 386 | if len(not_fu_list) > 0: 387 | random.shuffle(not_fu_list) 388 | while True: 389 | should_stop = 0 390 | for not_fu in not_fu_list: 391 | if fu_num/len(final_tkg_list) < 0.215: 392 | should_stop = 1 393 | break 394 | final_tkg_list.append(not_fu) 395 | if should_stop == 1: 396 | break 397 | tkg_target_type = random.choice(final_tkg_list) 398 | tkg_q_list = question_dict[tkg_target_type] 399 | 400 | current_type = tkg_target_type 401 | for _ in range(20): 402 | target_question = random.choice(tkg_q_list) 403 | ran_q = target_question['questions'][list(target_question['questions'])[0]] 404 | if 'n '+ date_splitted[2] in ran_q or date_splitted[0] + ' ' + date_splitted[2] in ran_q: 405 | continue 406 | final_target_question = deepcopy(target_question) 407 | target_question_list.append(final_target_question) 408 | 409 | try: 410 | target_dates_list.append(oracle_tkg[epi][sc_num][current_type][tkg_q_list.index(target_question)]) 411 | except: 412 | try: 413 | target_dates_list.append(oracle_tkg[epi][sc_num][current_type][target_question['questions'][list(target_question['questions'])[0]]]) 414 | except: 415 | target_dates_list.append([]) 416 | 417 | elif (cannot_fan == 0 and cannot_tkg == 1) or rand_val >= tkg_ratio: 418 | question_dict = epi_data[sc_num]['easy_q'] 419 | final_fan_list = [] 420 | unans_num = 0 421 | ans_list = [] 422 | fan_list = list(question_dict) 423 | for fan in fan_list: 424 | if len(list(question_dict[fan])) > 0: 425 | final_fan_list.append(fan) 426 | if 'unans' in fan: 427 | unans_num += 1 428 | else: 429 | ans_list.append(fan) 430 | 431 | if len(ans_list) > 0: 432 | random.shuffle(ans_list) 433 | while True: 434 | should_stop = 0 435 | for ans_ele in ans_list: 436 | if unans_num/len(final_fan_list) < 0.27: 437 | should_stop = 1 438 | break 439 | final_fan_list.append(ans_ele) 440 | if should_stop == 1: 441 | break 442 | 443 | fan_target_type = random.choice(final_fan_list) 444 | fan_q_list = list(question_dict[fan_target_type]) 445 | current_type = fan_target_type 446 | 447 | for _ in range(20): 448 | fan_q_target_num = random.choice(fan_q_list) 449 | target_question = deepcopy(question_dict[fan_target_type][fan_q_target_num]) 450 | target_question_list.append(target_question) 451 | if current_type in ['ans_w_time', 'dont_know_unans_time']: 452 | try: 453 | target_dates_list.append(oracle_fan[epi][sc_num][current_type][fan_q_target_num]) 454 | except: 455 | target_dates_list.append([]) 456 | else: 457 | target_dates_list.append([]) 458 | 459 | if before_date != date: 460 | cur_conv_num = 1 461 | before_date = date 462 | 463 | utterances = script.split('\n') 464 | post_utterances = [] 465 | temp_utter = '' 466 | 467 | chatbot_utters = [] 468 | characters = [] 469 | 470 | for utter in utterances: 471 | if len(utter.strip()) == 0: 472 | continue 473 | if 'Teleplay: ' in utter or 'Story: ' in utter: 474 | continue 475 | if ':' in utter: 476 | characters.append(utter.split(':')[0].strip()) 477 | if chatbot+':' in utter: 478 | chatbot_utters.append(utter.strip()) 479 | if ':' in utter: 480 | post_utterances.append(utter.strip()) 481 | temp_utter = deepcopy(utter.strip()) 482 | else: 483 | post_utterances.pop() 484 | temp_utter += '\n'+utter.strip() 485 | post_utterances.append(temp_utter) 486 | 487 | if sc_num != session_nums[0]: 488 | print() 489 | 490 | print('###########################################') 491 | print(f'Date: {date}, Conversation #{cur_conv_num}') 492 | print('###########################################\n') 493 | 494 | try: 495 | if len(chatbot_utters) > 1: 496 | chatbot_utters = chatbot_utters[1:] 497 | random_chatbot_utter = random.choice(chatbot_utters) 498 | bot_indices = [i for i, s in enumerate(post_utterances) if random_chatbot_utter in s] 499 | range_indices = [i for i in range(max(0, bot_indices[0]-3), min(len(post_utterances), bot_indices[0]+3))] 500 | close_chars = [] 501 | for idx in range_indices: 502 | if ':' in post_utterances[idx]: 503 | close_chars.append(post_utterances[idx].split(':')[0]) 504 | characters = list(set(close_chars)) 505 | close_chars = list(set(close_chars)) 506 | 507 | for char_ in close_chars: 508 | if chatbot.lower() in char_.lower() or 'all' == char_.lower(): 509 | try: 510 | characters.remove(char_) 511 | except: 512 | pass 513 | except: 514 | pass 515 | 516 | if len(characters) > 0: 517 | char_ask = random.choice(characters) 518 | else: 519 | char_ask = "" 520 | 521 | history_num = 0 522 | script_history = "" 523 | 524 | for un, utter_post in enumerate(post_utterances): 525 | print(name_change(script_name, utter_post, name_shuffle)) 526 | history = "" 527 | if history_type == "utts": 528 | history = name_change(script_name, utter_post, name_shuffle) 529 | elif history_type == "session-entire": 530 | if not utter_post.endswith("\n"): 531 | utter_post += "\n" 532 | script_history += name_change(script_name, utter_post, name_shuffle) 533 | history = script_history 534 | elif history_type == "session-summary": 535 | if not utter_post.endswith("\n"): 536 | utter_post += "\n" 537 | script_history += name_change(script_name, utter_post, name_shuffle) 538 | history = script_history 539 | else: 540 | return AssertionError("Incorrect `history_type`.") 541 | 542 | embedding_vec = None 543 | 544 | save_timeout_flag = False 545 | search_timeout_flag = False 546 | ans_timeout_flag = False 547 | save_start_time = None 548 | save_end_time = None 549 | save_time = None 550 | 551 | # below are what we are actually going to log 552 | time_in_saving = None 553 | time_in_retrieval_searching = None 554 | time_in_answering = None 555 | result_time = None 556 | ans_time = None 557 | answer = "" 558 | 559 | already_pop = False 560 | history_before_save_len = len(data_dict['history']) 561 | embedding_before_save_len = len(data_dict['ada_embedding']) 562 | save_start_time = time.time() 563 | save_result = None 564 | 565 | 566 | #save_result, history_num = func_timeout(sleep_time, save_history, args=(history_num, history, history_type, date, cur_conv_num, un, post_utterances, utter_post, model_name, openai_client, model, tokenizer, config, data_dict, ret_method, llama_tokenizer)) 567 | save_result, history_num = save_history(history_num, history, history_type, date, cur_conv_num, un, post_utterances, utter_post, model_name, openai_client, model, tokenizer, config, data_dict, ret_method, llama_tokenizer) 568 | save_end_time = time.time() 569 | save_time = save_end_time - save_start_time 570 | 571 | if save_time > sleep_time: 572 | history_after_save_len = len(data_dict['history']) 573 | embedding_after_save_len = len(data_dict['ada_embedding']) 574 | save_timeout_flag = True 575 | print("\nTimeout (saving history)!!!\n") 576 | print("Corresponding history couldn't be saved.\n") 577 | if len(data_dict['history']) > 0 and history_after_save_len > history_before_save_len: 578 | data_dict['history'].pop() 579 | if ret_method in ["openai-emb", "no_ret"]: 580 | if len(data_dict['ada_embedding']) > 0 and embedding_after_save_len > embedding_before_save_len: 581 | data_dict['ada_embedding'].pop() 582 | if ret_method == "openai-emb": 583 | save_result = pd.DataFrame(data_dict) 584 | if ret_method == "bm25": 585 | if len(data_dict['history']) > 0: 586 | tokenized_docs = [word_tokenize(doc.lower()) for doc in data_dict['history']] 587 | save_result = BM25Okapi(tokenized_docs) 588 | ret_histories = "No history.\n" 589 | already_pop = True 590 | result = "Wrong (Timeout in saving history)" 591 | is_ambiguous = False 592 | answer = "<<>>" 593 | time_in_saving = "<<>>" 594 | time_in_retrieval_searching = "<<>>" 595 | time_in_ans = "<<>>" 596 | result_time = "<<>>" 597 | 598 | #### Question 599 | if random_chatbot_utter.lower() in utter_post.lower() and len(characters) > 0 and target_question_list[0] != 'cannot ask': 600 | if already_asked == 1: 601 | continue 602 | real_question = '' 603 | real_tar_id = -1 604 | for tar_id in range(len(target_question_list)): 605 | if char_ask in list(target_question_list[tar_id]['questions']): 606 | real_question = target_question_list[tar_id]['questions'][char_ask] 607 | elif 'default' in list(target_question_list[tar_id]['questions']): 608 | real_question = target_question_list[tar_id]['questions']['default'] 609 | else: 610 | continue 611 | 612 | try: 613 | true_answer = target_question_list[tar_id]['answer'] 614 | real_tar_id = tar_id 615 | assert(len(target_dates_list)==len(target_question_list)) 616 | gt_sessions = extract_gt_sessions_bm25_date(date_to_sessions, epi_session_date_to_sessions, current_type, target_dates_list[tar_id], epi, sc_num, num_ret_history, real_question) 617 | break 618 | except: 619 | continue 620 | 621 | if real_question == '' or real_tar_id == -1 or gt_sessions == "": 622 | continue 623 | 624 | true_answer_op = '' 625 | 626 | for oi, op in enumerate(['(A)', '(B)', '(C)', '(D)', '(E)']): 627 | if true_answer.lower() == target_question_list[real_tar_id]['options'][oi].lower(): 628 | true_answer_op = op 629 | break 630 | 631 | if true_answer_op == '': 632 | continue 633 | 634 | if answer_format in ['multi_choice_unstructured', 'open_ended']: 635 | if true_answer_op == "(E)": 636 | true_answer_op = "I don't know." 637 | else: 638 | true_answer_op = true_answer 639 | 640 | 641 | question_part_prompt = '' 642 | 643 | question_part_prompt += f'{char_ask}: {real_question}' 644 | options = target_question_list[real_tar_id]['options'] 645 | if answer_format == 'multi_choice_structured': 646 | question_part_prompt += '\n' 647 | question_part_prompt += f'\t(A) {options[0]}\n' 648 | question_part_prompt += f'\t(B) {options[1]}\n' 649 | question_part_prompt += f'\t(C) {options[2]}\n' 650 | question_part_prompt += f'\t(D) {options[3]}\n' 651 | question_part_prompt += f'\t(E) {options[4]}' 652 | elif answer_format == 'open_ended': 653 | pass 654 | elif answer_format == 'multi_choice_unstructured': 655 | question_part_prompt += ' ' 656 | question_part_prompt += f'{options[0]}? or ' 657 | question_part_prompt += f'{options[1]}? or ' 658 | question_part_prompt += f'{options[2]}? or ' 659 | question_part_prompt += f'{options[3]}? or ' 660 | question_part_prompt += f"you don't know?" 661 | else: 662 | raise ValueError("Invalid answer format. Should be one of ('multi_choice_structured', 'multi_choice_unstructured', 'open_ended')") 663 | question_part_prompt_sh = name_change(script_name, question_part_prompt, name_shuffle) 664 | """Start of Answering. Time measure starts HERE""" 665 | # time measure START 666 | ans_timeout_flag = False 667 | retrieve_save_start_time = None 668 | ans_start_time = None 669 | 670 | char_ask_sh = name_change(script_name, char_ask, name_shuffle) 671 | real_question_sh = name_change(script_name, real_question, name_shuffle) 672 | 673 | if not save_timeout_flag: 674 | ret_search_start_time = time.time() 675 | #ret_histories = func_timeout(sleep_time-save_time, retrieve_history, args=(ret_method, num_ret_history, openai_client, max_token_len, save_result, char_ask_sh, real_question_sh, data_dict, gt_sessions)) 676 | ret_histories = retrieve_history(ret_method, num_ret_history, openai_client, max_token_len, save_result, char_ask_sh, real_question_sh, data_dict, gt_sessions) 677 | retrieve_search_time = time.time()-ret_search_start_time 678 | if retrieve_search_time > sleep_time-save_time: # timeout during searching history. Note that saving history was done correctly though. 679 | print("\nTimeout (searching history)!!!\n") 680 | search_timeout_flag = True 681 | result = "Wrong (Timeout in searching history)" 682 | is_ambiguous = False 683 | answer = "<<>>" 684 | time_in_saving = save_time # record actual time taken in saving 685 | time_in_retrieval_searching = "<<>>" 686 | time_in_ans = "<<>>" 687 | result_time = "<<>>" 688 | if not search_timeout_flag: 689 | # Model inference 690 | chatbot_sh = name_change(script_name, chatbot, name_shuffle) 691 | if answer_format not in ['multi_choice_structured', 'multi_choice_unstructured', 'open_ended']: 692 | raise ValueError("Invalid answer format. Should be one of ('multi_choice_structured', 'multi_choice_unstructured', 'open_ended')") 693 | if ret_method == 'no_ret': 694 | prompt = open_file(f'./prompt/naive_llm_inference_{answer_format}.txt').replace('<<>>', date).replace('<<>>', ret_histories).replace('<<>>', question_part_prompt_sh).replace('<<>>', chatbot_sh) 695 | else: 696 | prompt = open_file(f'./prompt/RAG_qa_prompt_{answer_format}.txt').replace('<<>>', date).replace('<<>>', ret_histories).replace('<<>>', question_part_prompt_sh).replace('<<>>', chatbot_sh) 697 | 698 | ans_start_time = time.time() 699 | 700 | #answer = func_timeout(sleep_time-save_time-retrieve_search_time, answer_question, args=(model_name, client, model, tokenizer, config, prompt)) 701 | answer = answer_question(model_name, client, model, tokenizer, config, prompt) 702 | ans_time = time.time() - ans_start_time 703 | time_in_saving = save_time 704 | time_in_retrieval_searching = retrieve_search_time 705 | time_in_answering = ans_time 706 | result_time = save_time + retrieve_search_time + ans_time 707 | if result_time > sleep_time: 708 | print("\nTimeout (answering)!!!\n") 709 | ans_timeout_flag = True 710 | result = "Wrong (Timeout in answering)" 711 | is_ambiguous = False 712 | answer = "<<>>" 713 | time_in_saving = save_time 714 | time_in_retrieval_searching = retrieve_search_time 715 | time_in_answering = "<<>>" 716 | result_time = "<<>>" 717 | """Measuring time for timeout stops HERE""" 718 | 719 | is_ambiguous = False 720 | if not ans_timeout_flag and not save_timeout_flag and not search_timeout_flag: 721 | result, is_ambiguous = judge_eq(true_answer_op, answer, question_part_prompt_sh, client, answer_format=answer_format) 722 | if result_time >= sleep_time: 723 | result = "Wrong (Timeout)" 724 | else: 725 | if fast_eval == "no": 726 | time.sleep(sleep_time-result_time) 727 | 728 | already_asked = 1 729 | # log results 730 | answer_list.append(answer) 731 | gold_answer_list.append(true_answer_op) 732 | result_list.append(result) 733 | result_time_list.append(result_time) 734 | save_time_list.append(time_in_saving) 735 | retrieve_search_time_list.append(time_in_retrieval_searching) 736 | ans_time_list.append(time_in_answering) 737 | target_level_list.append({"current_type" : current_type}) 738 | print(question_part_prompt_sh) 739 | print(f'------------------------------- Q&A result -------------------------------') 740 | print(f'result: {result}, ambiguous answer: {is_ambiguous}') 741 | print(f'true answer: {true_answer_op}\t model answer: {answer}') 742 | print(f'time spent in saving: {time_in_saving}') 743 | print(f'time spent in searching history: {time_in_retrieval_searching}') 744 | print(f'time spent in answering: {time_in_answering}') 745 | print(f'time spent overall: {result_time}') 746 | print(f'time limit: {sleep_time}') 747 | print(f'model name: {model_name}') 748 | print(f'--------------------------------------------------------------------------') 749 | 750 | if is_ambiguous: 751 | ambiguous_idx_list.append((epi, sc_num, question_part_prompt_sh)) 752 | ambiguous_answer_list.append(answer) 753 | ambiguous_gold_answer_list.append(true_answer_op) 754 | 755 | distilled_answer = distill_answer(answer) 756 | ret_histories_question_answer_list.append((ret_histories, question_part_prompt_sh, true_answer_op, distilled_answer)) 757 | 758 | calibration = calibrate(result, is_ambiguous, true_answer_op, answer, question_part_prompt_sh, distilled_answer, answer_format=answer_format, lenient=True) # (result, is_ambiguous, calibrated_distilled_answer) 759 | if isinstance(result_time, float) and result_time >= sleep_time: 760 | calibrated_result_list.append("Wrong (Timeout)") 761 | calibrated_distilled_answer_list.append("Wrong (Timeout)") 762 | else: 763 | calibrated_result_list.append(calibration[0]) 764 | calibrated_distilled_answer_list.append(calibration[2]) 765 | 766 | else: 767 | if fast_eval == "no": 768 | if save_time == None: 769 | pass 770 | else: 771 | time.sleep(sleep_time-save_time) 772 | 773 | if not already_pop and "session" in history_type and un < len(post_utterances) - 1: 774 | if ret_method == 'openai-emb' or ret_method == 'no_ret': 775 | try: 776 | data_dict["history"].pop() 777 | data_dict["ada_embedding"].pop() 778 | except: 779 | AssertionError("Unexpected error(probable cause: couldn't save even one embedding using openai-emb in time). Please run the program again.") 780 | else: 781 | try: 782 | data_dict["history"].pop() 783 | except: 784 | pass 785 | cur_conv_num += 1 786 | 787 | simulator_running_time = time.time() - simulator_start_time 788 | 789 | if "Correct" in result_list: 790 | score_total = result_list.count('Correct') / len(result_list) 791 | else: 792 | score_total = 0 793 | 794 | valid_result_time_list = [] 795 | for result_time in result_time_list: 796 | if isinstance(result_time, float): 797 | valid_result_time_list.append(result_time) 798 | 799 | if len(valid_result_time_list) == 0: 800 | result_time_mean = 0 801 | else: 802 | result_time_mean = sum(valid_result_time_list) / len(valid_result_time_list) 803 | 804 | if "Correct" in calibrated_result_list: 805 | calibrated_score = calibrated_result_list.count('Correct') / len(calibrated_result_list) 806 | else: 807 | calibrated_score = 0 808 | 809 | log_info = { 810 | "score" : score_total, 811 | "calibrated_score" : calibrated_score, 812 | "result_time_mean" : result_time_mean, 813 | "simulator_running_time" : simulator_running_time, 814 | "result_list" : result_list, 815 | "result_time_list" : result_time_list, 816 | "ambiguous_idx_list" : ambiguous_idx_list, 817 | "ambiguous_answer_list" : ambiguous_answer_list, 818 | "ambiguous_gold_answer_list" : ambiguous_gold_answer_list, 819 | "answer_list" : answer_list, 820 | "gold_answer_list" : gold_answer_list, 821 | "ret_histories_question_answer_list" : ret_histories_question_answer_list, 822 | "save_time_list" : save_time_list, 823 | "retrieve_search_time_list": retrieve_search_time_list, 824 | "ans_time_list" : ans_time_list, 825 | "calibrated_result_list" : calibrated_result_list, 826 | "calibrated_distilled_answer_list" : calibrated_distilled_answer_list, 827 | "target_level_list" : target_level_list 828 | } 829 | 830 | return log_info 831 | 832 | 833 | 834 | if __name__ == "__main__": 835 | args = parse_args() 836 | print(args) 837 | def set_affinity(num_cores, sh_number): 838 | cpu_list = range(num_cores*sh_number, num_cores*(sh_number+1)) 839 | os.sched_setaffinity(os.getpid(), set(cpu_list)) 840 | 841 | set_affinity(args.num_cores, args.sh_number) 842 | cpu_count = os.sched_getaffinity(os.getpid()) 843 | print(f"Available CPUs: {cpu_count}") 844 | 845 | log_info = simulator(script_name=args.script_name, history_type=args.history_type, sleep_time=args.sleep_time, num_ret_history=args.num_ret_history, model_name=args.model_name, \ 846 | debug=args.debug, debug_n_episodes=args.debug_n_episodes, quantization=args.quantization, ret_method=args.ret_method, name_shuffle=args.name_shuffle, openai_api_key=args.openai_api_key, gemini_api_key=args.gemini_api_key, antrhopic_api_key=args.antrhopic_api_key, fast_eval=args.fast_eval, answer_format=args.answer_format) 847 | 848 | print() 849 | print('SCORE: ', log_info["score"]) 850 | print(f'SCORE(calibrated): {log_info["calibrated_score"]}') 851 | print('Answer Time Mean: ', log_info["result_time_mean"]) 852 | 853 | log_results_path = \ 854 | f"./results/results-{args.script_name}-model_{args.model_name}-debug_{args.debug}-quantization_{args.quantization}-time_limit_{args.sleep_time}-history_type_{args.history_type}-{args.ret_method}_{args.name_shuffle}-version_{args.trial_version}.json" 855 | log_total_path = \ 856 | f"./results/entire_log-{args.script_name}-model_{args.model_name}-debug_{args.debug}-quantization_{args.quantization}-time_limit_{args.sleep_time}-history_type_{args.history_type}-{args.ret_method}_{args.name_shuffle}-version_{args.trial_version}.json" 857 | 858 | log_results(log_info, log_file_path=log_results_path) 859 | log_everything(log_info, log_file_path=log_total_path) -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate==0.27.2 2 | anthropic==0.19.1 3 | bitsandbytes==0.43.0 4 | google-generativeai==0.4.0 5 | huggingface-hub==0.21.4 6 | openai==1.40.3 7 | transformers==4.38.2 8 | numpy==1.26.4 9 | pandas==2.1.4 10 | scikit-learn==1.2.2 11 | sentencepiece==0.2.0 12 | nltk==3.8.1 13 | rank-bm25==0.2.2 14 | func_timeout==4.3.5 -------------------------------------------------------------------------------- /results/entire_log-friends-model_gpt-4o-mini-debug_True-quantization_4bit-time_limit_6.0-history_type_utts-openai-emb_original-version_0.json: -------------------------------------------------------------------------------- 1 | { 2 | "score": 0.47058823529411764, 3 | "calibrated_score": 0.47058823529411764, 4 | "result_time_mean": 1.5760387112112606, 5 | "simulator_running_time": 209.41859030723572, 6 | "result_list": [ 7 | "Correct", 8 | "Correct", 9 | "Correct", 10 | "Correct", 11 | "Wrong", 12 | "Correct", 13 | "Wrong", 14 | "Wrong", 15 | "Correct", 16 | "Wrong", 17 | "Wrong", 18 | "Wrong", 19 | "Wrong", 20 | "Wrong", 21 | "Wrong", 22 | "Correct", 23 | "Correct" 24 | ], 25 | "result_time_list": [ 26 | 1.3372583389282227, 27 | 1.367516040802002, 28 | 1.2561695575714111, 29 | 1.4960777759552002, 30 | 1.7268848419189453, 31 | 1.5129830837249756, 32 | 1.15739107131958, 33 | 2.0546560287475586, 34 | 1.523338794708252, 35 | 2.4329042434692383, 36 | 1.306854009628296, 37 | 1.2766575813293457, 38 | 1.6645827293395996, 39 | 1.7893786430358887, 40 | 1.3747978210449219, 41 | 2.147660970687866, 42 | 1.367546558380127 43 | ], 44 | "ambiguous_idx_list": [], 45 | "ambiguous_answer_list": [], 46 | "ambiguous_gold_answer_list": [], 47 | "answer_list": [ 48 | "I don't know the answer to that. The retrieved dialog history does not provide any information about Rachel or her boots.", 49 | "I don't know the answer.", 50 | "I don't know the answer to that question.", 51 | "Chandler was referring to Paul the Wine Guy, who was going out with Monica on September 22, 1994.", 52 | "Based on the retrieved dialog history, the only person mentioned as a Lincoln High survivor is Rachel. Monica introduces her to the group, saying, \"this is Rachel, another Lincoln High survivor.\" There is no further information provided about other alumni from Lincoln High. Therefore, I don't know who else counts themselves among the illustrious alumni of Lincoln High.", 53 | "I'm sorry, but the retrieved dialog history does not contain any information about Franny or her vacation destination. Therefore, I don't know the answer to your question.", 54 | "I don't know the answer.", 55 | "Based on the retrieved dialog history, Joey mentioned that he had a date with someone named Andrea (or Angela, as he seemed unsure). However, there is no specific mention of who was going out with Joey on September 22, 1994. Therefore, I don't know the answer to that question.", 56 | "I don't know the answer.", 57 | "I don't know the answer to that. The retrieved dialog history does not provide information about who Andrea's date was on the night of September 22, 1994.", 58 | "I don't know the answer.", 59 | "I don't know the answer to that question.", 60 | "I don't know the answer to that question. The retrieved dialog history does not provide information about the identity of the person Paul went on a date with or who their \"mama bear\" is.", 61 | "I don't know the answer to that question. The retrieved dialog history does not provide information about who Mr. Geller's daughter was sharing her apartment with on September 22, 1994.", 62 | "I don't know the answer to that.", 63 | "Rachel was freaking out about losing her engagement ring, but it seems she eventually found it.", 64 | "I don't know the answer to that question." 65 | ], 66 | "gold_answer_list": [ 67 | "I don't know.", 68 | "I don't know.", 69 | "I don't know.", 70 | "Paul", 71 | "Monica", 72 | "I don't know.", 73 | "Paul", 74 | "Andrea", 75 | "I don't know.", 76 | "Joey", 77 | "Carol", 78 | "Ross", 79 | "Mrs. geller", 80 | "Rachel", 81 | "Rachel", 82 | "An engagement ring", 83 | "I don't know." 84 | ], 85 | "ret_histories_question_answer_list": [ 86 | [ 87 | "[Date: September 22, 1994, Session #1, Utterance #3] Chandler: All right Joey, be nice. So does he have a hump? A hump and a hairpiece?\n[Date: September 22, 1994, Session #1, Utterance #20] Monica: Are you okay, sweetie?\n[Date: September 22, 1994, Session #1, Utterance #16] Monica: And they weren't looking at you before?!\n[Date: September 22, 1994, Session #1, Utterance #34] Ross: No!! Okay?! Why does everyone keep fixating on that? She didn't know, how should I know?\n[Date: September 22, 1994, Session #1, Utterance #23] Monica: Carol moved her stuff out today.\n[Date: September 22, 1994, Session #1, Utterance #25] Monica: Let me get you some coffee.\n[Date: September 22, 1994, Session #1, Utterance #1] Monica: There's nothing to tell! He's just some guy I work with!\n[Date: September 22, 1994, Session #1, Utterance #4] Phoebe: Wait, does he eat chalk?\n[Date: September 22, 1994, Session #1, Utterance #22] Chandler: Cookie?\n[Date: September 22, 1994, Session #1, Utterance #27] Phoebe: Ooh! Oh!\n[Date: September 22, 1994, Session #1, Utterance #31] Monica: No you don't.\n[Date: September 22, 1994, Session #1, Utterance #33] Joey: And you never knew she was a lesbian...\n[Date: September 22, 1994, Session #1, Utterance #5] Phoebe: Just, 'cause, I don't want her to go through what I went through with Carl- oh!\n[Date: September 22, 1994, Session #1, Utterance #32] Ross: No I don't, to hell with her, she left me!\n[Date: September 22, 1994, Session #1, Utterance #7] Chandler: Sounds like a date to me.\n[Date: September 22, 1994, Session #1, Utterance #15] Chandler: All of a sudden, the phone starts to ring. Now I don't know what to do, everybody starts looking at me.\n[Date: September 22, 1994, Session #1, Utterance #10] Chandler: Then I look down, and I realize there's a phone... there.\n[Date: September 22, 1994, Session #1, Utterance #14] Phoebe: No.\n[Date: September 22, 1994, Session #1, Utterance #29] Phoebe: Fine! Be murky!\n[Date: September 22, 1994, Session #1, Utterance #18] Ross: Hi.\n", 88 | "Monica: Hey, do you know what Rachel used to snag those super cute new boots she's been rocking?", 89 | "I don't know.", 90 | "([N/A])" 91 | ], 92 | [ 93 | "[Date: September 22, 1994, Session #1, Utterance #3] Chandler: All right Joey, be nice. So does he have a hump? A hump and a hairpiece?\n[Date: September 22, 1994, Session #1, Utterance #46] Monica: De-caff. Okay, everybody, this is Rachel, another Lincoln High survivor. This is everybody, this is Chandler, and Phoebe, and Joey, and- you remember my brother Ross?\n[Date: September 22, 1994, Session #1, Utterance #4] Phoebe: Wait, does he eat chalk?\n[Date: September 22, 1994, Session #2, Utterance #8] Phoebe, Ross, Chandler, and Joey: Push her down the stairs! Push her down the stairs! Push her down the stairs!\n[Date: September 22, 1994, Session #1, Utterance #22] Chandler: Cookie?\n[Date: September 22, 1994, Session #2, Utterance #19] Joey: And hey, you need anything, you can always come to Joey. Me and Chandler live across the hall. And he's away a lot.\n[Date: September 22, 1994, Session #1, Utterance #27] Phoebe: Ooh! Oh!\n[Date: September 22, 1994, Session #1, Utterance #35] Chandler: Sometimes I wish I was a lesbian... Did I say that out loud?\n[Date: September 22, 1994, Session #2, Utterance #15] Phoebe: Raindrops on roses and rabbits and kittens, bluebells and sleighbells and- something with mittens... La la la la...something and noodles with string. These are a few...\n[Date: September 22, 1994, Session #1, Utterance #14] Phoebe: No.\n[Date: September 22, 1994, Session #2, Utterance #17] Phoebe: I helped!\n[Date: September 22, 1994, Session #1, Utterance #43] Monica: Rachel?!\n[Date: September 22, 1994, Session #1, Utterance #10] Chandler: Then I look down, and I realize there's a phone... there.\n[Date: September 22, 1994, Session #2, Utterance #30] Chandler: Ooh, this is a Dear Diary moment.\n[Date: September 22, 1994, Session #1, Utterance #29] Phoebe: Fine! Be murky!\n[Date: September 22, 1994, Session #2, Utterance #6] Chandler: Ooh, she should not be wearing those pants.\n[Date: September 22, 1994, Session #1, Utterance #15] Chandler: All of a sudden, the phone starts to ring. Now I don't know what to do, everybody starts looking at me.\n[Date: September 22, 1994, Session #1, Utterance #12] Chandler: That's right.\n[Date: September 22, 1994, Session #1, Utterance #5] Phoebe: Just, 'cause, I don't want her to go through what I went through with Carl- oh!\n[Date: September 22, 1994, Session #1, Utterance #7] Chandler: Sounds like a date to me.\n", 94 | "Phoebe: Oh my god, like, who's the furry little creature that Chandler calls his pet?", 95 | "I don't know.", 96 | "([N/A])" 97 | ], 98 | [ 99 | "[Date: September 22, 1994, Session #3, Utterance #13] Joey: Ross, let me ask you a question. She got the furniture, the stereo, the good TV- what did you get?\n[Date: September 22, 1994, Session #1, Utterance #46] Monica: De-caff. Okay, everybody, this is Rachel, another Lincoln High survivor. This is everybody, this is Chandler, and Phoebe, and Joey, and- you remember my brother Ross?\n[Date: September 22, 1994, Session #2, Utterance #49] Ross: So Rachel, what're you, uh... what're you up to tonight?\n[Date: September 22, 1994, Session #1, Utterance #3] Chandler: All right Joey, be nice. So does he have a hump? A hump and a hairpiece?\n[Date: September 22, 1994, Session #2, Utterance #8] Phoebe, Ross, Chandler, and Joey: Push her down the stairs! Push her down the stairs! Push her down the stairs!\n[Date: September 22, 1994, Session #1, Utterance #43] Monica: Rachel?!\n[Date: September 22, 1994, Session #2, Utterance #19] Joey: And hey, you need anything, you can always come to Joey. Me and Chandler live across the hall. And he's away a lot.\n[Date: September 22, 1994, Session #2, Utterance #20] Monica: Joey, stop hitting on her! It's her wedding day!\n[Date: September 22, 1994, Session #2, Utterance #51] Ross: Right, you're not even getting your honeymoon, God.. No, no, although, Aruba, this time of year... talk about your- -big lizards... Anyway, if you don't feel like being alone tonight, Joey and Chandler are coming over to help me put together my new furniture.\n[Date: September 22, 1994, Session #1, Utterance #44] Rachel: Oh God Monica hi! Thank God! I just went to your building and you weren't there and then this guy with a big hammer said you might be here and you are, you are!\n[Date: September 22, 1994, Session #1, Utterance #50] Rachel: Oh God... well, it started about a half hour before the wedding. I was in the room where we were keeping all the presents, and I was looking at this gravy boat. This really gorgeous Lamauge gravy boat. When all of a sudden- Sweet 'n' Lo?- I realized that I was more turned on by this gravy boat than by Barry! And then I got really freaked out, and that's when it hit me: how much Barry looks like Mr. Potato Head. Y'know, I mean, I always knew looked familiar, but... Anyway, I just had to get out of there, and I started wondering 'Why am I doing this, and who am I doing this for?'. So anyway I just didn't know where to go, and I know that you and I have kinda drifted apart, but you're the only person I knew who lived here in the city.\n[Date: September 22, 1994, Session #1, Utterance #2] Joey: C'mon, you're going out with the guy! There's gotta be something wrong with him!\n[Date: September 22, 1994, Session #1, Utterance #33] Joey: And you never knew she was a lesbian...\n[Date: September 22, 1994, Session #2, Utterance #9] Rachel: C'mon Daddy, listen to me! It's like, it's like, all of my life, everyone has always told me, 'You're a shoe! You're a shoe, you're a shoe, you're a shoe!'. And today I just stopped and I said, 'What if I don't wanna be a shoe? What if I wanna be a- a purse, y'know? Or a- or a hat! No, I'm not saying I want you to buy me a hat, I'm saying I am a ha- It's a metaphor, Daddy!\n[Date: September 22, 1994, Session #2, Utterance #7] Joey: I say push her down the stairs.\n[Date: September 22, 1994, Session #2, Utterance #6] Chandler: Ooh, she should not be wearing those pants.\n[Date: September 22, 1994, Session #2, Utterance #48] Monica: Shut up, Joey!\n[Date: September 22, 1994, Session #2, Utterance #11] Rachel: Look Daddy, it's my life. Well maybe I'll just stay here with Monica.\n[Date: September 22, 1994, Session #2, Utterance #55] Joey: Hey Pheebs, you wanna help?\n[Date: September 22, 1994, Session #1, Utterance #39] Joey: Alright Ross, look. You're feeling a lot of pain right now. You're angry. You're hurting. Can I tell you what the answer is?\n", 100 | "Joey: Hey, how did Rachel manage to snag those killer boots, huh?", 101 | "I don't know.", 102 | "([N/A])" 103 | ], 104 | [ 105 | "[Date: September 22, 1994, Session #1, Utterance #46] Monica: De-caff. Okay, everybody, this is Rachel, another Lincoln High survivor. This is everybody, this is Chandler, and Phoebe, and Joey, and- you remember my brother Ross?\n[Date: September 22, 1994, Session #1, Utterance #43] Monica: Rachel?!\n[Date: September 22, 1994, Session #1, Utterance #7] Chandler: Sounds like a date to me.\n[Date: September 22, 1994, Session #1, Utterance #35] Chandler: Sometimes I wish I was a lesbian... Did I say that out loud?\n[Date: September 22, 1994, Session #4, Utterance #5] Chandler: Look, Ross, you gotta understand, between us we haven't had a relationship that has lasted longer than a Mento. You, however have had the love of a woman for four years. Four years of closeness and sharing at the end of which she ripped your heart out, and that is why we don't do it! I don't think that was my point!\n[Date: September 22, 1994, Session #2, Utterance #27] Monica: Maybe. Joey: Wait. Your 'not a real date' tonight is with Paul the Wine Guy?\n[Date: September 22, 1994, Session #3, Utterance #7] Chandler: I have no idea.\n[Date: September 22, 1994, Session #2, Utterance #12] Monica: Well, I guess we've established who's staying here with Monica...\n[Date: September 22, 1994, Session #2, Utterance #6] Chandler: Ooh, she should not be wearing those pants.\n[Date: September 22, 1994, Session #1, Utterance #22] Chandler: Cookie?\n[Date: September 22, 1994, Session #3, Utterance #15] Chandler: Oh, God.\n[Date: September 22, 1994, Session #1, Utterance #12] Chandler: That's right.\n[Date: September 22, 1994, Session #2, Utterance #19] Joey: And hey, you need anything, you can always come to Joey. Me and Chandler live across the hall. And he's away a lot.\n[Date: September 22, 1994, Session #2, Utterance #35] Monica: Really?\n[Date: September 22, 1994, Session #1, Utterance #6] Monica: Okay, everybody relax. This is not even a date. It's just two people going out to dinner and- not having sex.\n[Date: September 22, 1994, Session #1, Utterance #3] Chandler: All right Joey, be nice. So does he have a hump? A hump and a hairpiece?\n[Date: September 22, 1994, Session #3, Utterance #17] Chandler: Oh my God!\n[Date: September 22, 1994, Session #1, Utterance #51] Monica: Who wasn't invited to the wedding.\n[Date: September 22, 1994, Session #2, Utterance #40] Chandler: I'm sorry, I didn't catch your name. Paul, was it?\n[Date: September 22, 1994, Session #1, Utterance #23] Monica: Carol moved her stuff out today.\n", 106 | "Chandler: Oh, could you BE any more specific about who was going out with Monica on September 22, 1994?", 107 | "Paul", 108 | "([N/A])" 109 | ], 110 | [ 111 | "[Date: September 22, 1994, Session #1, Utterance #46] Monica: De-caff. Okay, everybody, this is Rachel, another Lincoln High survivor. This is everybody, this is Chandler, and Phoebe, and Joey, and- you remember my brother Ross?\n[Date: September 22, 1994, Session #1, Utterance #8] Chandler: Alright, so I'm back in high school, I'm standing in the middle of the cafeteria, and I realize I am totally naked.\n[Date: September 22, 1994, Session #1, Utterance #35] Chandler: Sometimes I wish I was a lesbian... Did I say that out loud?\n[Date: September 22, 1994, Session #1, Utterance #15] Chandler: All of a sudden, the phone starts to ring. Now I don't know what to do, everybody starts looking at me.\n[Date: September 22, 1994, Session #3, Utterance #17] Chandler: Oh my God!\n[Date: September 22, 1994, Session #1, Utterance #22] Chandler: Cookie?\n[Date: September 22, 1994, Session #2, Utterance #30] Chandler: Ooh, this is a Dear Diary moment.\n[Date: September 22, 1994, Session #1, Utterance #3] Chandler: All right Joey, be nice. So does he have a hump? A hump and a hairpiece?\n[Date: September 22, 1994, Session #3, Utterance #15] Chandler: Oh, God.\n[Date: September 22, 1994, Session #2, Utterance #40] Chandler: I'm sorry, I didn't catch your name. Paul, was it?\n[Date: September 22, 1994, Session #3, Utterance #7] Chandler: I have no idea.\n[Date: September 22, 1994, Session #2, Utterance #19] Joey: And hey, you need anything, you can always come to Joey. Me and Chandler live across the hall. And he's away a lot.\n[Date: September 22, 1994, Session #1, Utterance #12] Chandler: That's right.\n[Date: September 22, 1994, Session #1, Utterance #10] Chandler: Then I look down, and I realize there's a phone... there.\n[Date: September 22, 1994, Session #1, Utterance #42] Chandler: And I just want a million dollars!\n[Date: September 22, 1994, Session #2, Utterance #52] Chandler: Yes, and we're very excited about it.\n[Date: September 22, 1994, Session #1, Utterance #17] Chandler: Finally, I figure I'd better answer it, and it turns out it's my mother, which is very-very weird, because- she never calls me!\n[Date: September 22, 1994, Session #2, Utterance #6] Chandler: Ooh, she should not be wearing those pants.\n[Date: September 22, 1994, Session #1, Utterance #7] Chandler: Sounds like a date to me.\n[Date: September 22, 1994, Session #3, Utterance #5] Chandler: I would have to say that is an 'L'-shaped bracket.\n", 112 | "Chandler: Oh, could we BE any more inquisitive? Who exactly counts themselves among the illustrious alumni of Lincoln high?", 113 | "Monica", 114 | "([N/A])" 115 | ], 116 | [ 117 | "[Date: September 22, 1994, Session #2, Utterance #12] Monica: Well, I guess we've established who's staying here with Monica...\n[Date: September 22, 1994, Session #1, Utterance #43] Monica: Rachel?!\n[Date: September 22, 1994, Session #2, Utterance #35] Monica: Really?\n[Date: September 22, 1994, Session #1, Utterance #23] Monica: Carol moved her stuff out today.\n[Date: September 22, 1994, Session #2, Utterance #41] Monica: Okay, umm-umm, I'll just--I'll be right back, I just gotta go ah, go ah...\n[Date: September 22, 1994, Session #1, Utterance #20] Monica: Are you okay, sweetie?\n[Date: September 22, 1994, Session #2, Utterance #29] Monica: Yes!\n[Date: September 22, 1994, Session #1, Utterance #44] Rachel: Oh God Monica hi! Thank God! I just went to your building and you weren't there and then this guy with a big hammer said you might be here and you are, you are!\n[Date: September 23, 1994, Session #1, Utterance #2] Monica: Why?! Why? Why, why would anybody do something like that?\n[Date: September 22, 1994, Session #1, Utterance #25] Monica: Let me get you some coffee.\n[Date: September 22, 1994, Session #2, Utterance #24] Monica: Oh God, is it 6:30? Buzz him in!\n[Date: September 22, 1994, Session #1, Utterance #46] Monica: De-caff. Okay, everybody, this is Rachel, another Lincoln High survivor. This is everybody, this is Chandler, and Phoebe, and Joey, and- you remember my brother Ross?\n[Date: September 22, 1994, Session #2, Utterance #27] Monica: Maybe. Joey: Wait. Your 'not a real date' tonight is with Paul the Wine Guy?\n[Date: September 22, 1994, Session #1, Utterance #37] Monica: Oh really, so that hysterical phone call I got from a woman at sobbing 3:00 A.M., \"I'll never have grandchildren, I'll never have grandchildren.\" was what? A wrong number?\n[Date: September 22, 1994, Session #2, Utterance #33] Monica: Are, are you okay? I mean, do you want me to stay?\n[Date: September 22, 1994, Session #2, Utterance #11] Rachel: Look Daddy, it's my life. Well maybe I'll just stay here with Monica.\n[Date: September 23, 1994, Session #1, Utterance #8] Monica: I just thought he was nice, y'know?\n[Date: September 22, 1994, Session #2, Utterance #18] Monica: Okay, look, this is probably for the best, y'know? Independence. Taking control of your life. The whole, 'hat' thing.\n[Date: September 22, 1994, Session #2, Utterance #43] Monica: Change! Okay, sit down. Two seconds.\n[Date: September 22, 1994, Session #2, Utterance #50] Rachel: Well, I was kinda supposed to be headed for Aruba on my honeymoon, so nothing!\n", 118 | "Monica: Oh my God, you remember that chat we had back on September 23, 1994, right? So, where was it again that Franny jetted off to for her vacay?", 119 | "I don't know.", 120 | "([N/A])" 121 | ], 122 | [ 123 | "[Date: September 22, 1994, Session #1, Utterance #43] Monica: Rachel?!\n[Date: September 22, 1994, Session #1, Utterance #46] Monica: De-caff. Okay, everybody, this is Rachel, another Lincoln High survivor. This is everybody, this is Chandler, and Phoebe, and Joey, and- you remember my brother Ross?\n[Date: September 22, 1994, Session #1, Utterance #44] Rachel: Oh God Monica hi! Thank God! I just went to your building and you weren't there and then this guy with a big hammer said you might be here and you are, you are!\n[Date: September 23, 1994, Session #2, Utterance #16] Monica: Rachel! That was a library card!\n[Date: September 22, 1994, Session #2, Utterance #11] Rachel: Look Daddy, it's my life. Well maybe I'll just stay here with Monica.\n[Date: September 22, 1994, Session #2, Utterance #49] Ross: So Rachel, what're you, uh... what're you up to tonight?\n[Date: September 23, 1994, Session #1, Utterance #10] Rachel: Guess what?\n[Date: September 23, 1994, Session #2, Utterance #24] Rachel: Hey Mon, look what I just found on the floor. What?\n[Date: September 22, 1994, Session #2, Utterance #27] Monica: Maybe. Joey: Wait. Your 'not a real date' tonight is with Paul the Wine Guy?\n[Date: September 22, 1994, Session #2, Utterance #12] Monica: Well, I guess we've established who's staying here with Monica...\n[Date: September 22, 1994, Session #2, Utterance #35] Monica: Really?\n[Date: September 23, 1994, Session #2, Utterance #3] Rachel: I know that. That's why I was getting married.\n[Date: September 22, 1994, Session #1, Utterance #6] Monica: Okay, everybody relax. This is not even a date. It's just two people going out to dinner and- not having sex.\n[Date: September 22, 1994, Session #1, Utterance #47] Rachel: Hi, sure!\n[Date: September 22, 1994, Session #2, Utterance #31] Monica: Rach, wait, I can cancel...\n[Date: September 22, 1994, Session #1, Utterance #23] Monica: Carol moved her stuff out today.\n[Date: September 22, 1994, Session #1, Utterance #52] Rachel: Ooh, I was kinda hoping that wouldn't be an issue...\n[Date: September 22, 1994, Session #2, Utterance #29] Monica: Yes!\n[Date: September 23, 1994, Session #1, Utterance #2] Monica: Why?! Why? Why, why would anybody do something like that?\n[Date: September 22, 1994, Session #1, Utterance #51] Monica: Who wasn't invited to the wedding.\n", 124 | "Rachel: Oh my god, so like, who was going out with Monica on September 22, 1994?", 125 | "Paul", 126 | "([N/A])" 127 | ], 128 | [ 129 | "[Date: September 22, 1994, Session #1, Utterance #2] Joey: C'mon, you're going out with the guy! There's gotta be something wrong with him!\n[Date: September 22, 1994, Session #1, Utterance #33] Joey: And you never knew she was a lesbian...\n[Date: September 22, 1994, Session #5, Utterance #2] Joey: Great story! But, I uh, I gotta go, I got a date with Andrea--Angela--Andrea... Oh man,\n[Date: September 22, 1994, Session #3, Utterance #13] Joey: Ross, let me ask you a question. She got the furniture, the stereo, the good TV- what did you get?\n[Date: September 22, 1994, Session #2, Utterance #27] Monica: Maybe. Joey: Wait. Your 'not a real date' tonight is with Paul the Wine Guy?\n[Date: September 22, 1994, Session #2, Utterance #25] Joey: Who's Paul?\n[Date: September 22, 1994, Session #2, Utterance #19] Joey: And hey, you need anything, you can always come to Joey. Me and Chandler live across the hall. And he's away a lot.\n[Date: September 22, 1994, Session #1, Utterance #24] Joey: Ohh.\n[Date: September 22, 1994, Session #5, Utterance #4] Joey: Right. Thanks. It's June. I'm outta here.\n[Date: September 22, 1994, Session #2, Utterance #45] Joey: Hey, Paul!\n[Date: September 22, 1994, Session #1, Utterance #40] Joey: Strip joint! C'mon, you're single! Have some hormones!\n[Date: September 22, 1994, Session #3, Utterance #4] Joey: What's this?\n[Date: September 22, 1994, Session #2, Utterance #48] Monica: Shut up, Joey!\n[Date: September 22, 1994, Session #3, Utterance #6] Joey: Which goes where?\n[Date: September 22, 1994, Session #3, Utterance #16] Joey: You got screwed.\n[Date: September 22, 1994, Session #3, Utterance #11] Joey: Hey-hey-hey-hey, if you're gonna start with that stuff we're outta here.\n[Date: September 22, 1994, Session #1, Utterance #11] Joey: Instead of...?\n[Date: September 22, 1994, Session #1, Utterance #46] Monica: De-caff. Okay, everybody, this is Rachel, another Lincoln High survivor. This is everybody, this is Chandler, and Phoebe, and Joey, and- you remember my brother Ross?\n[Date: September 22, 1994, Session #4, Utterance #2] Joey: Shut up!\n[Date: September 22, 1994, Session #2, Utterance #55] Joey: Hey Pheebs, you wanna help?\n", 130 | "Joey: Hey, just outta curiosity, who was goin' out with Joey on September 22, 1994?", 131 | "Andrea", 132 | "([N/A])" 133 | ], 134 | [ 135 | "[Date: September 22, 1994, Session #2, Utterance #12] Monica: Well, I guess we've established who's staying here with Monica...\n[Date: September 22, 1994, Session #2, Utterance #27] Monica: Maybe. Joey: Wait. Your 'not a real date' tonight is with Paul the Wine Guy?\n[Date: September 22, 1994, Session #2, Utterance #20] Monica: Joey, stop hitting on her! It's her wedding day!\n[Date: September 22, 1994, Session #1, Utterance #1] Monica: There's nothing to tell! He's just some guy I work with!\n[Date: September 22, 1994, Session #1, Utterance #2] Joey: C'mon, you're going out with the guy! There's gotta be something wrong with him!\n[Date: September 22, 1994, Session #1, Utterance #5] Phoebe: Just, 'cause, I don't want her to go through what I went through with Carl- oh!\n[Date: September 22, 1994, Session #1, Utterance #23] Monica: Carol moved her stuff out today.\n[Date: September 22, 1994, Session #1, Utterance #43] Monica: Rachel?!\n[Date: September 22, 1994, Session #5, Utterance #2] Joey: Great story! But, I uh, I gotta go, I got a date with Andrea--Angela--Andrea... Oh man,\n[Date: September 22, 1994, Session #2, Utterance #1] Monica: Now I'm guessing that he bought her the big pipe organ, and she's really not happy about it.\n[Date: September 23, 1994, Session #1, Utterance #19] Monica: And who pays for that?\n[Date: September 23, 1994, Session #2, Utterance #42] Monica: See ya.... Waitwait, what's with you?\n[Date: September 23, 1994, Session #1, Utterance #8] Monica: I just thought he was nice, y'know?\n[Date: September 22, 1994, Session #1, Utterance #44] Rachel: Oh God Monica hi! Thank God! I just went to your building and you weren't there and then this guy with a big hammer said you might be here and you are, you are!\n[Date: September 22, 1994, Session #2, Utterance #35] Monica: Really?\n[Date: September 22, 1994, Session #1, Utterance #20] Monica: Are you okay, sweetie?\n[Date: September 22, 1994, Session #1, Utterance #51] Monica: Who wasn't invited to the wedding.\n[Date: September 23, 1994, Session #1, Utterance #2] Monica: Why?! Why? Why, why would anybody do something like that?\n[Date: September 22, 1994, Session #2, Utterance #29] Monica: Yes!\n[Date: September 22, 1994, Session #1, Utterance #49] Monica: So you wanna tell us now, or are we waiting for four wet bridesmaids?\n", 136 | "Monica: Oh my god! So, who's the current lucky guy dating Carl right now?", 137 | "I don't know.", 138 | "([N/A])" 139 | ], 140 | [ 141 | "[Date: September 22, 1994, Session #5, Utterance #2] Joey: Great story! But, I uh, I gotta go, I got a date with Andrea--Angela--Andrea... Oh man,\n[Date: September 30, 1994, Session #1, Utterance #6] Marsha: Speaking of issues, isn't that your ex-wife?\n[Date: September 30, 1994, Session #1, Utterance #2] Marsha: Well, she has issues.\n[Date: September 30, 1994, Session #1, Utterance #4] Marsha: He's out banging other women over the head with a club, while she sits at home trying to get the mastodon smell out of the carpet!\n[Date: September 22, 1994, Session #2, Utterance #27] Monica: Maybe. Joey: Wait. Your 'not a real date' tonight is with Paul the Wine Guy?\n[Date: September 22, 1994, Session #5, Utterance #3] Chandler: Angela's the screamer, Andrea has cats.\n[Date: September 22, 1994, Session #1, Utterance #51] Monica: Who wasn't invited to the wedding.\n[Date: September 22, 1994, Session #1, Utterance #6] Monica: Okay, everybody relax. This is not even a date. It's just two people going out to dinner and- not having sex.\n[Date: September 22, 1994, Session #1, Utterance #43] Monica: Rachel?!\n[Date: September 22, 1994, Session #2, Utterance #12] Monica: Well, I guess we've established who's staying here with Monica...\n[Date: September 23, 1994, Session #1, Utterance #17] Monica: How'd you pay for them?\n[Date: September 22, 1994, Session #2, Utterance #49] Ross: So Rachel, what're you, uh... what're you up to tonight?\n[Date: September 23, 1994, Session #1, Utterance #19] Monica: And who pays for that?\n[Date: September 22, 1994, Session #1, Utterance #50] Rachel: Oh God... well, it started about a half hour before the wedding. I was in the room where we were keeping all the presents, and I was looking at this gravy boat. This really gorgeous Lamauge gravy boat. When all of a sudden- Sweet 'n' Lo?- I realized that I was more turned on by this gravy boat than by Barry! And then I got really freaked out, and that's when it hit me: how much Barry looks like Mr. Potato Head. Y'know, I mean, I always knew looked familiar, but... Anyway, I just had to get out of there, and I started wondering 'Why am I doing this, and who am I doing this for?'. So anyway I just didn't know where to go, and I know that you and I have kinda drifted apart, but you're the only person I knew who lived here in the city.\n[Date: September 22, 1994, Session #2, Utterance #28] Ross: He finally asked you out?\n[Date: September 22, 1994, Session #2, Utterance #35] Monica: Really?\n[Date: September 22, 1994, Session #1, Utterance #7] Chandler: Sounds like a date to me.\n[Date: September 22, 1994, Session #1, Utterance #34] Ross: No!! Okay?! Why does everyone keep fixating on that? She didn't know, how should I know?\n[Date: September 23, 1994, Session #2, Utterance #16] Monica: Rachel! That was a library card!\n[Date: September 23, 1994, Session #1, Utterance #2] Monica: Why?! Why? Why, why would anybody do something like that?\n", 142 | "Marsha: So, who was Andrea's date on the night of September 22, 1994?", 143 | "Joey", 144 | "([N/A])" 145 | ], 146 | [ 147 | "[Date: September 22, 1994, Session #1, Utterance #46] Monica: De-caff. Okay, everybody, this is Rachel, another Lincoln High survivor. This is everybody, this is Chandler, and Phoebe, and Joey, and- you remember my brother Ross?\n[Date: September 22, 1994, Session #2, Utterance #8] Phoebe, Ross, Chandler, and Joey: Push her down the stairs! Push her down the stairs! Push her down the stairs!\n[Date: September 23, 1994, Session #2, Utterance #35] Ross: You did! Oh.... I always figured you just thought I was Monica's geeky older brother.\n[Date: September 30, 1994, Session #2, Utterance #10] Phoebe: Monica- Hi! Um, Monica, you're scaring me. I mean, you're like, you're like all chaotic and twirly. And not-not in a good way.\n[Date: September 30, 1994, Session #2, Utterance #25] Phoebe: Doy! Probably right before she lost it!\n[Date: September 22, 1994, Session #2, Utterance #49] Ross: So Rachel, what're you, uh... what're you up to tonight?\n[Date: September 22, 1994, Session #1, Utterance #43] Monica: Rachel?!\n[Date: September 22, 1994, Session #3, Utterance #13] Joey: Ross, let me ask you a question. She got the furniture, the stereo, the good TV- what did you get?\n[Date: September 22, 1994, Session #4, Utterance #5] Chandler: Look, Ross, you gotta understand, between us we haven't had a relationship that has lasted longer than a Mento. You, however have had the love of a woman for four years. Four years of closeness and sharing at the end of which she ripped your heart out, and that is why we don't do it! I don't think that was my point!\n[Date: September 22, 1994, Session #4, Utterance #7] Joey: What are you talking about? 'One woman'? That's like saying there's only one flavor of ice cream for you. Lemme tell you something, Ross. There's lots of flavors out there. There's Rocky Road, and Cookie Dough, and Bing! Cherry Vanilla. You could get 'em with Jimmies, or nuts, or whipped cream! This is the best thing that ever happened to you! You got married, you were, like, what, eight? Welcome back to the world! Grab a spoon!\n[Date: September 30, 1994, Session #1, Utterance #3] Ross: Does she.\n[Date: September 29, 1994, Session #1, Utterance #3] Phoebe: Oh, yeah!\n[Date: September 22, 1994, Session #1, Utterance #27] Phoebe: Ooh! Oh!\n[Date: September 24, 1994, Session #1, Utterance #6] Phoebe: Oh, was I doing it again?\n[Date: September 22, 1994, Session #1, Utterance #14] Phoebe: No.\n[Date: September 22, 1994, Session #1, Utterance #5] Phoebe: Just, 'cause, I don't want her to go through what I went through with Carl- oh!\n[Date: September 30, 1994, Session #2, Utterance #7] Phoebe: She's already fluffed that pillow... Monica, you know, you've already fluffed that- -but, it's fine!\n[Date: September 22, 1994, Session #5, Utterance #5] Ross: Y'know, here's the thing. Even if I could get it together enough to- to ask a woman out,... who am I gonna ask?\n[Date: September 22, 1994, Session #2, Utterance #17] Phoebe: I helped!\n[Date: September 23, 1994, Session #2, Utterance #33] Ross: Okay. You know you probably didn't know this, but back in high school, I had a, um, major crush on you.\n", 148 | "Phoebe: Oh my gosh, so who's the woman that used to be like, Mrs. Ross?", 149 | "Carol", 150 | "([N/A])" 151 | ], 152 | [ 153 | "[Date: September 30, 1994, Session #3, Utterance #16] Mrs. Geller: What that Rachel did to her life.... We ran into her parents at the club, they were not playing very well.\n[Date: September 30, 1994, Session #3, Utterance #6] Mrs. Geller: They all had a thing for him.\n[Date: September 30, 1994, Session #3, Utterance #18] Mrs. Geller: Well, at least she had the chance to leave a man at the altar...\n[Date: September 30, 1994, Session #3, Utterance #26] Ross: Okay! Okay. Look, I, uh- I realise you guys have been wondering what exactly happened between Carol and me, and, so, well, here's the deal. Carol's a lesbian. She's living with a woman named Susan. She's pregnant with my child, and she and Susan are going to raise the baby.\n[Date: September 30, 1994, Session #3, Utterance #3] Mrs. Geller: Mmmm!\n[Date: September 30, 1994, Session #3, Utterance #5] Mr. Geller: Do you remember the Ludwins? The big one had a thing for you, didn't she?\n[Date: September 30, 1994, Session #3, Utterance #9] Mrs. Geller: Oh, she just graduated, and she wants to be something in cooking, or food, or.... I don't know. Anyway, I told her you had a restaurant-\n[Date: September 30, 1994, Session #1, Utterance #18] Ross: Why- why are you here, Carol?\n[Date: September 30, 1994, Session #3, Utterance #1] Mrs. Geller: Oh, Martha Ludwin's daughter is gonna call you. Mmm! What's that curry taste?\n[Date: September 30, 1994, Session #3, Utterance #11] Mrs. Geller: Well, they don't have to know that...\n[Date: September 30, 1994, Session #1, Utterance #15] Carol: A lesbian?\n[Date: September 30, 1994, Session #1, Utterance #6] Marsha: Speaking of issues, isn't that your ex-wife?\n[Date: September 30, 1994, Session #2, Utterance #39] Ross: Carol's pregnant.\n[Date: September 30, 1994, Session #3, Utterance #20] Mrs. Geller: Nothing! It's an expression.\n[Date: September 30, 1994, Session #3, Utterance #17] Mr. Geller: I'm not gonna tell you what they spent on that wedding... but forty thousand dollars is a lot of money!\n[Date: September 30, 1994, Session #3, Utterance #14] Mrs. Geller: Oh, we're having spaghetti! That's.... easy.\n[Date: September 30, 1994, Session #1, Utterance #11] Carol: So.\n[Date: September 30, 1994, Session #1, Utterance #17] Carol: Marty's still totally paranoid. Oh, and, uh-\n[Date: September 30, 1994, Session #1, Utterance #8] Marsha: Yes, it is. Carol! Hi!\n[Date: September 30, 1994, Session #3, Utterance #24] Mr. Geller: ...And I read about these women trying to have it all, and I thank God 'Our Little Harmonica' doesn't seem to have that problem.\n", 154 | "Mrs. Geller: Hey, just out of curiosity, who was Carol's ex-husband?", 155 | "Ross", 156 | "([N/A])" 157 | ], 158 | [ 159 | "[Date: September 22, 1994, Session #2, Utterance #27] Monica: Maybe. Joey: Wait. Your 'not a real date' tonight is with Paul the Wine Guy?\n[Date: September 22, 1994, Session #2, Utterance #25] Joey: Who's Paul?\n[Date: September 22, 1994, Session #1, Utterance #14] Phoebe: No.\n[Date: September 29, 1994, Session #1, Utterance #3] Phoebe: Oh, yeah!\n[Date: September 22, 1994, Session #1, Utterance #5] Phoebe: Just, 'cause, I don't want her to go through what I went through with Carl- oh!\n[Date: September 30, 1994, Session #2, Utterance #46] Phoebe: She is so great! I miss her.\n[Date: September 30, 1994, Session #2, Utterance #25] Phoebe: Doy! Probably right before she lost it!\n[Date: September 22, 1994, Session #1, Utterance #27] Phoebe: Ooh! Oh!\n[Date: September 23, 1994, Session #2, Utterance #4] Phoebe: Give her a break, it's hard being on your own for the first time.\n[Date: September 30, 1994, Session #2, Utterance #10] Phoebe: Monica- Hi! Um, Monica, you're scaring me. I mean, you're like, you're like all chaotic and twirly. And not-not in a good way.\n[Date: September 24, 1994, Session #1, Utterance #6] Phoebe: Oh, was I doing it again?\n[Date: September 30, 1994, Session #2, Utterance #2] Phoebe:...Then I've already seen this one!\n[Date: September 24, 1994, Session #1, Utterance #4] Phoebe: What I said you had...\n[Date: September 30, 1994, Session #2, Utterance #40] Phoebe: Ooh! I found it!\n[Date: September 22, 1994, Session #1, Utterance #4] Phoebe: Wait, does he eat chalk?\n[Date: September 22, 1994, Session #2, Utterance #17] Phoebe: I helped!\n[Date: September 22, 1994, Session #1, Utterance #46] Monica: De-caff. Okay, everybody, this is Rachel, another Lincoln High survivor. This is everybody, this is Chandler, and Phoebe, and Joey, and- you remember my brother Ross?\n[Date: September 22, 1994, Session #5, Utterance #2] Joey: Great story! But, I uh, I gotta go, I got a date with Andrea--Angela--Andrea... Oh man,\n[Date: September 22, 1994, Session #2, Utterance #45] Joey: Hey, Paul!\n[Date: September 22, 1994, Session #2, Utterance #23] Paul: It's, uh, it's Paul.\n", 160 | "Phoebe: So, like, you know how Paul went on a date with that person on September 22, 1994? Who's the mama bear of that person?", 161 | "Mrs. geller", 162 | "([N/A])" 163 | ], 164 | [ 165 | "[Date: September 30, 1994, Session #3, Utterance #16] Mrs. Geller: What that Rachel did to her life.... We ran into her parents at the club, they were not playing very well.\n[Date: September 30, 1994, Session #3, Utterance #27] Mrs. Geller: And you knew about this?!\n[Date: September 30, 1994, Session #4, Utterance #25] Rachel: Oh, you've got Carol tomorrow.. When did it get so complicated?\n[Date: September 30, 1994, Session #3, Utterance #5] Mr. Geller: Do you remember the Ludwins? The big one had a thing for you, didn't she?\n[Date: September 30, 1994, Session #3, Utterance #6] Mrs. Geller: They all had a thing for him.\n[Date: September 30, 1994, Session #3, Utterance #26] Ross: Okay! Okay. Look, I, uh- I realise you guys have been wondering what exactly happened between Carol and me, and, so, well, here's the deal. Carol's a lesbian. She's living with a woman named Susan. She's pregnant with my child, and she and Susan are going to raise the baby.\n[Date: September 30, 1994, Session #1, Utterance #15] Carol: A lesbian?\n[Date: September 30, 1994, Session #3, Utterance #1] Mrs. Geller: Oh, Martha Ludwin's daughter is gonna call you. Mmm! What's that curry taste?\n[Date: September 30, 1994, Session #1, Utterance #18] Ross: Why- why are you here, Carol?\n[Date: September 22, 1994, Session #1, Utterance #23] Monica: Carol moved her stuff out today.\n[Date: September 30, 1994, Session #4, Utterance #27] Rachel: Remember when we were in high school together?\n[Date: September 30, 1994, Session #3, Utterance #11] Mrs. Geller: Well, they don't have to know that...\n[Date: September 30, 1994, Session #1, Utterance #17] Carol: Marty's still totally paranoid. Oh, and, uh-\n[Date: September 30, 1994, Session #2, Utterance #39] Ross: Carol's pregnant.\n[Date: September 22, 1994, Session #1, Utterance #50] Rachel: Oh God... well, it started about a half hour before the wedding. I was in the room where we were keeping all the presents, and I was looking at this gravy boat. This really gorgeous Lamauge gravy boat. When all of a sudden- Sweet 'n' Lo?- I realized that I was more turned on by this gravy boat than by Barry! And then I got really freaked out, and that's when it hit me: how much Barry looks like Mr. Potato Head. Y'know, I mean, I always knew looked familiar, but... Anyway, I just had to get out of there, and I started wondering 'Why am I doing this, and who am I doing this for?'. So anyway I just didn't know where to go, and I know that you and I have kinda drifted apart, but you're the only person I knew who lived here in the city.\n[Date: September 30, 1994, Session #3, Utterance #9] Mrs. Geller: Oh, she just graduated, and she wants to be something in cooking, or food, or.... I don't know. Anyway, I told her you had a restaurant-\n[Date: September 30, 1994, Session #3, Utterance #18] Mrs. Geller: Well, at least she had the chance to leave a man at the altar...\n[Date: September 30, 1994, Session #1, Utterance #8] Marsha: Yes, it is. Carol! Hi!\n[Date: September 30, 1994, Session #3, Utterance #3] Mrs. Geller: Mmmm!\n[Date: September 22, 1994, Session #3, Utterance #10] Ross: This was Carol's favorite beer. She always drank it out of the can, I should have known.\n", 166 | "Carol: Hey, just curious, do you know who Mr. geller's daughter was sharing her apartment with back on September 22, 1994?", 167 | "Rachel", 168 | "([N/A])" 169 | ], 170 | [ 171 | "[Date: September 30, 1994, Session #3, Utterance #16] Mrs. Geller: What that Rachel did to her life.... We ran into her parents at the club, they were not playing very well.\n[Date: September 30, 1994, Session #3, Utterance #27] Mrs. Geller: And you knew about this?!\n[Date: September 30, 1994, Session #3, Utterance #5] Mr. Geller: Do you remember the Ludwins? The big one had a thing for you, didn't she?\n[Date: September 30, 1994, Session #4, Utterance #25] Rachel: Oh, you've got Carol tomorrow.. When did it get so complicated?\n[Date: September 30, 1994, Session #3, Utterance #6] Mrs. Geller: They all had a thing for him.\n[Date: September 30, 1994, Session #3, Utterance #26] Ross: Okay! Okay. Look, I, uh- I realise you guys have been wondering what exactly happened between Carol and me, and, so, well, here's the deal. Carol's a lesbian. She's living with a woman named Susan. She's pregnant with my child, and she and Susan are going to raise the baby.\n[Date: September 30, 1994, Session #3, Utterance #1] Mrs. Geller: Oh, Martha Ludwin's daughter is gonna call you. Mmm! What's that curry taste?\n[Date: September 30, 1994, Session #1, Utterance #15] Carol: A lesbian?\n[Date: September 30, 1994, Session #1, Utterance #18] Ross: Why- why are you here, Carol?\n[Date: September 30, 1994, Session #4, Utterance #27] Rachel: Remember when we were in high school together?\n[Date: September 30, 1994, Session #3, Utterance #9] Mrs. Geller: Oh, she just graduated, and she wants to be something in cooking, or food, or.... I don't know. Anyway, I told her you had a restaurant-\n[Date: September 30, 1994, Session #3, Utterance #11] Mrs. Geller: Well, they don't have to know that...\n[Date: September 22, 1994, Session #1, Utterance #50] Rachel: Oh God... well, it started about a half hour before the wedding. I was in the room where we were keeping all the presents, and I was looking at this gravy boat. This really gorgeous Lamauge gravy boat. When all of a sudden- Sweet 'n' Lo?- I realized that I was more turned on by this gravy boat than by Barry! And then I got really freaked out, and that's when it hit me: how much Barry looks like Mr. Potato Head. Y'know, I mean, I always knew looked familiar, but... Anyway, I just had to get out of there, and I started wondering 'Why am I doing this, and who am I doing this for?'. So anyway I just didn't know where to go, and I know that you and I have kinda drifted apart, but you're the only person I knew who lived here in the city.\n[Date: September 22, 1994, Session #1, Utterance #23] Monica: Carol moved her stuff out today.\n[Date: September 30, 1994, Session #1, Utterance #17] Carol: Marty's still totally paranoid. Oh, and, uh-\n[Date: September 30, 1994, Session #2, Utterance #39] Ross: Carol's pregnant.\n[Date: September 30, 1994, Session #3, Utterance #3] Mrs. Geller: Mmmm!\n[Date: September 30, 1994, Session #1, Utterance #8] Marsha: Yes, it is. Carol! Hi!\n[Date: September 30, 1994, Session #3, Utterance #18] Mrs. Geller: Well, at least she had the chance to leave a man at the altar...\n[Date: October 1, 1994, Session #1, Utterance #3] Carol: Ross, you remember Susan.\n", 172 | "Carol: Hey, just curious, do you know who Mr. geller's daughter was sharing her apartment with back in September 1994?", 173 | "Rachel", 174 | "([N/A])" 175 | ], 176 | [ 177 | "[Date: September 30, 1994, Session #2, Utterance #21] Rachel: Oh, like I wasn't dreading tomorrow enough, having to give it back to him... 'Hi Barry! Remember me? I'm the girl in the veil who stomped on your heart in front of your entire family!' Oh God and now I'm gonna have to return the ring, without the ring, which makes it so much harder...\n[Date: September 30, 1994, Session #4, Utterance #25] Rachel: Oh, you've got Carol tomorrow.. When did it get so complicated?\n[Date: September 22, 1994, Session #1, Utterance #50] Rachel: Oh God... well, it started about a half hour before the wedding. I was in the room where we were keeping all the presents, and I was looking at this gravy boat. This really gorgeous Lamauge gravy boat. When all of a sudden- Sweet 'n' Lo?- I realized that I was more turned on by this gravy boat than by Barry! And then I got really freaked out, and that's when it hit me: how much Barry looks like Mr. Potato Head. Y'know, I mean, I always knew looked familiar, but... Anyway, I just had to get out of there, and I started wondering 'Why am I doing this, and who am I doing this for?'. So anyway I just didn't know where to go, and I know that you and I have kinda drifted apart, but you're the only person I knew who lived here in the city.\n[Date: September 30, 1994, Session #3, Utterance #16] Mrs. Geller: What that Rachel did to her life.... We ran into her parents at the club, they were not playing very well.\n[Date: September 30, 1994, Session #4, Utterance #29] Rachel: I mean, didn't you think you were just gonna meet somone, fall in love- and that'd be it? ..Ross?\n[Date: September 30, 1994, Session #4, Utterance #27] Rachel: Remember when we were in high school together?\n[Date: September 23, 1994, Session #2, Utterance #24] Rachel: Hey Mon, look what I just found on the floor. What?\n[Date: September 30, 1994, Session #3, Utterance #26] Ross: Okay! Okay. Look, I, uh- I realise you guys have been wondering what exactly happened between Carol and me, and, so, well, here's the deal. Carol's a lesbian. She's living with a woman named Susan. She's pregnant with my child, and she and Susan are going to raise the baby.\n[Date: September 30, 1994, Session #2, Utterance #17] Rachel: Has anybody seen my engagement ring?\n[Date: September 23, 1994, Session #2, Utterance #34] Rachel: I knew.\n[Date: September 22, 1994, Session #1, Utterance #43] Monica: Rachel?!\n[Date: September 22, 1994, Session #1, Utterance #44] Rachel: Oh God Monica hi! Thank God! I just went to your building and you weren't there and then this guy with a big hammer said you might be here and you are, you are!\n[Date: September 30, 1994, Session #2, Utterance #39] Ross: Carol's pregnant.\n[Date: September 30, 1994, Session #4, Utterance #23] Rachel: So, got any advice? Y'know, as someone who's recently been- dumped?\n[Date: September 23, 1994, Session #2, Utterance #16] Monica: Rachel! That was a library card!\n[Date: September 30, 1994, Session #2, Utterance #25] Phoebe: Doy! Probably right before she lost it!\n[Date: September 23, 1994, Session #1, Utterance #10] Rachel: Guess what?\n[Date: September 30, 1994, Session #4, Utterance #31] Rachel: Oh! Man, I never thought I'd be here..\n[Date: September 22, 1994, Session #3, Utterance #10] Ross: This was Carol's favorite beer. She always drank it out of the can, I should have known.\n[Date: September 30, 1994, Session #2, Utterance #19] Rachel: Oh God, oh God, oh God oh God oh God oh God....\n", 178 | "Carol: Oh my God, can you remember what Rachel was freaking out about losing but then finally found?", 179 | "An engagement ring", 180 | "([N/A])" 181 | ], 182 | [ 183 | "[Date: September 22, 1994, Session #2, Utterance #25] Joey: Who's Paul?\n[Date: September 22, 1994, Session #2, Utterance #20] Monica: Joey, stop hitting on her! It's her wedding day!\n[Date: September 22, 1994, Session #3, Utterance #13] Joey: Ross, let me ask you a question. She got the furniture, the stereo, the good TV- what did you get?\n[Date: September 22, 1994, Session #4, Utterance #7] Joey: What are you talking about? 'One woman'? That's like saying there's only one flavor of ice cream for you. Lemme tell you something, Ross. There's lots of flavors out there. There's Rocky Road, and Cookie Dough, and Bing! Cherry Vanilla. You could get 'em with Jimmies, or nuts, or whipped cream! This is the best thing that ever happened to you! You got married, you were, like, what, eight? Welcome back to the world! Grab a spoon!\n[Date: September 22, 1994, Session #1, Utterance #3] Chandler: All right Joey, be nice. So does he have a hump? A hump and a hairpiece?\n[Date: September 22, 1994, Session #1, Utterance #2] Joey: C'mon, you're going out with the guy! There's gotta be something wrong with him!\n[Date: September 22, 1994, Session #2, Utterance #27] Monica: Maybe. Joey: Wait. Your 'not a real date' tonight is with Paul the Wine Guy?\n[Date: September 22, 1994, Session #2, Utterance #19] Joey: And hey, you need anything, you can always come to Joey. Me and Chandler live across the hall. And he's away a lot.\n[Date: September 22, 1994, Session #1, Utterance #33] Joey: And you never knew she was a lesbian...\n[Date: September 30, 1994, Session #4, Utterance #8] Chandler: What does she do?\n[Date: September 22, 1994, Session #1, Utterance #46] Monica: De-caff. Okay, everybody, this is Rachel, another Lincoln High survivor. This is everybody, this is Chandler, and Phoebe, and Joey, and- you remember my brother Ross?\n[Date: September 30, 1994, Session #4, Utterance #9] Phoebe: She's a waitress.\n[Date: September 30, 1994, Session #3, Utterance #1] Mrs. Geller: Oh, Martha Ludwin's daughter is gonna call you. Mmm! What's that curry taste?\n[Date: September 22, 1994, Session #2, Utterance #55] Joey: Hey Pheebs, you wanna help?\n[Date: September 22, 1994, Session #2, Utterance #48] Monica: Shut up, Joey!\n[Date: September 22, 1994, Session #1, Utterance #24] Joey: Ohh.\n[Date: September 22, 1994, Session #5, Utterance #2] Joey: Great story! But, I uh, I gotta go, I got a date with Andrea--Angela--Andrea... Oh man,\n[Date: September 30, 1994, Session #1, Utterance #6] Marsha: Speaking of issues, isn't that your ex-wife?\n[Date: September 22, 1994, Session #3, Utterance #4] Joey: What's this?\n[Date: September 30, 1994, Session #3, Utterance #11] Mrs. Geller: Well, they don't have to know that...\n", 184 | "Joey: Hey, just outta curiosity, who's Mrs. waltham's hubby?", 185 | "I don't know.", 186 | "([N/A])" 187 | ] 188 | ], 189 | "save_time_list": [ 190 | 0.3327913284301758, 191 | 0.3942553997039795, 192 | 0.31287312507629395, 193 | 0.34005165100097656, 194 | 0.29903578758239746, 195 | 0.2886030673980713, 196 | 0.30900025367736816, 197 | 0.3351249694824219, 198 | 0.32923197746276855, 199 | 0.4834303855895996, 200 | 0.329481840133667, 201 | 0.3038666248321533, 202 | 0.284437894821167, 203 | 0.5323793888092041, 204 | 0.2826194763183594, 205 | 0.7156615257263184, 206 | 0.3103823661804199 207 | ], 208 | "retrieve_search_time_list": [ 209 | 0.32669925689697266, 210 | 0.3535313606262207, 211 | 0.3175649642944336, 212 | 0.3314173221588135, 213 | 0.29174089431762695, 214 | 0.32009029388427734, 215 | 0.321483850479126, 216 | 0.28399038314819336, 217 | 0.544675350189209, 218 | 0.8056018352508545, 219 | 0.3517446517944336, 220 | 0.3448023796081543, 221 | 0.3273906707763672, 222 | 0.34744691848754883, 223 | 0.48077964782714844, 224 | 0.5534074306488037, 225 | 0.28615593910217285 226 | ], 227 | "ans_time_list": [ 228 | 0.6777677536010742, 229 | 0.6197292804718018, 230 | 0.6257314682006836, 231 | 0.8246088027954102, 232 | 1.136108160018921, 233 | 0.904289722442627, 234 | 0.5269069671630859, 235 | 1.4355406761169434, 236 | 0.6494314670562744, 237 | 1.1438720226287842, 238 | 0.6256275177001953, 239 | 0.6279885768890381, 240 | 1.0527541637420654, 241 | 0.9095523357391357, 242 | 0.6113986968994141, 243 | 0.8785920143127441, 244 | 0.7710082530975342 245 | ], 246 | "calibrated_result_list": [ 247 | "Correct", 248 | "Correct", 249 | "Correct", 250 | "Correct", 251 | "Wrong", 252 | "Correct", 253 | "Wrong", 254 | "Wrong", 255 | "Correct", 256 | "Wrong", 257 | "Wrong", 258 | "Wrong", 259 | "Wrong", 260 | "Wrong", 261 | "Wrong", 262 | "Correct", 263 | "Correct" 264 | ], 265 | "calibrated_distilled_answer_list": [ 266 | "([N/A])", 267 | "([N/A])", 268 | "([N/A])", 269 | "([N/A])", 270 | "([N/A])", 271 | "([N/A])", 272 | "([N/A])", 273 | "([N/A])", 274 | "([N/A])", 275 | "([N/A])", 276 | "([N/A])", 277 | "([N/A])", 278 | "([N/A])", 279 | "([N/A])", 280 | "([N/A])", 281 | "([N/A])", 282 | "([N/A])" 283 | ], 284 | "target_level_list": [ 285 | { 286 | "current_type": "before_event_unans" 287 | }, 288 | { 289 | "current_type": "fu" 290 | }, 291 | { 292 | "current_type": "before_event_unans" 293 | }, 294 | { 295 | "current_type": "past" 296 | }, 297 | { 298 | "current_type": "cur" 299 | }, 300 | { 301 | "current_type": "dont_know_unans_time" 302 | }, 303 | { 304 | "current_type": "past" 305 | }, 306 | { 307 | "current_type": "past" 308 | }, 309 | { 310 | "current_type": "fu" 311 | }, 312 | { 313 | "current_type": "past" 314 | }, 315 | { 316 | "current_type": "cur" 317 | }, 318 | { 319 | "current_type": "cur" 320 | }, 321 | { 322 | "current_type": "past_cur" 323 | }, 324 | { 325 | "current_type": "past_past" 326 | }, 327 | { 328 | "current_type": "past_past" 329 | }, 330 | { 331 | "current_type": "ans_wo_time" 332 | }, 333 | { 334 | "current_type": "fu" 335 | } 336 | ] 337 | } -------------------------------------------------------------------------------- /results/results-friends-model_gpt-4o-mini-debug_True-quantization_4bit-time_limit_6.0-history_type_utts-openai-emb_original-version_0.json: -------------------------------------------------------------------------------- 1 | { 2 | "score": 0.47058823529411764, 3 | "calibrated_score": 0.47058823529411764, 4 | "calibrated_result_list": [ 5 | "Correct", 6 | "Correct", 7 | "Correct", 8 | "Correct", 9 | "Wrong", 10 | "Correct", 11 | "Wrong", 12 | "Wrong", 13 | "Correct", 14 | "Wrong", 15 | "Wrong", 16 | "Wrong", 17 | "Wrong", 18 | "Wrong", 19 | "Wrong", 20 | "Correct", 21 | "Correct" 22 | ], 23 | "avg_answer_time": 1.5760387112112606, 24 | "result_list": [ 25 | "Correct", 26 | "Correct", 27 | "Correct", 28 | "Correct", 29 | "Wrong", 30 | "Correct", 31 | "Wrong", 32 | "Wrong", 33 | "Correct", 34 | "Wrong", 35 | "Wrong", 36 | "Wrong", 37 | "Wrong", 38 | "Wrong", 39 | "Wrong", 40 | "Correct", 41 | "Correct" 42 | ], 43 | "result_time_list": [ 44 | 1.3372583389282227, 45 | 1.367516040802002, 46 | 1.2561695575714111, 47 | 1.4960777759552002, 48 | 1.7268848419189453, 49 | 1.5129830837249756, 50 | 1.15739107131958, 51 | 2.0546560287475586, 52 | 1.523338794708252, 53 | 2.4329042434692383, 54 | 1.306854009628296, 55 | 1.2766575813293457, 56 | 1.6645827293395996, 57 | 1.7893786430358887, 58 | 1.3747978210449219, 59 | 2.147660970687866, 60 | 1.367546558380127 61 | ], 62 | "ambiguous_idx_list": [], 63 | "ambiguous_answer_list": [], 64 | "ambiguous_gold_answer_list": [] 65 | } -------------------------------------------------------------------------------- /results/results-friends-model_gpt-4o-mini-debug_True-quantization_4bit-time_limit_600.0-history_type_session-entire-bm25_original-version_2.json: -------------------------------------------------------------------------------- 1 | { 2 | "score": 0.7647058823529411, 3 | "calibrated_score": 0.7647058823529411, 4 | "calibrated_result_list": [ 5 | "Correct", 6 | "Correct", 7 | "Correct", 8 | "Correct", 9 | "Correct", 10 | "Correct", 11 | "Correct", 12 | "Correct", 13 | "Correct", 14 | "Wrong", 15 | "Correct", 16 | "Correct", 17 | "Correct", 18 | "Wrong", 19 | "Wrong", 20 | "Correct", 21 | "Wrong" 22 | ], 23 | "avg_answer_time": 0.788747605155496, 24 | "result_list": [ 25 | "Correct", 26 | "Correct", 27 | "Correct", 28 | "Correct", 29 | "Correct", 30 | "Correct", 31 | "Correct", 32 | "Correct", 33 | "Correct", 34 | "Wrong", 35 | "Correct", 36 | "Correct", 37 | "Correct", 38 | "Wrong", 39 | "Wrong", 40 | "Correct", 41 | "Wrong" 42 | ], 43 | "result_time_list": [ 44 | 0.6705124378204346, 45 | 0.6907761096954346, 46 | 0.588655948638916, 47 | 0.663825273513794, 48 | 0.57029128074646, 49 | 0.6687910556793213, 50 | 0.5266945362091064, 51 | 1.866330862045288, 52 | 0.844897985458374, 53 | 0.6778724193572998, 54 | 0.7247071266174316, 55 | 0.8085505962371826, 56 | 0.7705795764923096, 57 | 0.734687328338623, 58 | 0.9593427181243896, 59 | 0.7516252994537354, 60 | 0.890568733215332 61 | ], 62 | "ambiguous_idx_list": [], 63 | "ambiguous_answer_list": [], 64 | "ambiguous_gold_answer_list": [] 65 | } -------------------------------------------------------------------------------- /results/results-friends-model_mistral-7b-it-debug_True-quantization_4bit-time_limit_600.0-history_type_session-entire-bm25_original-version_1.json: -------------------------------------------------------------------------------- 1 | { 2 | "score": 0.4411764705882353, 3 | "calibrated_score": 0.4411764705882353, 4 | "calibrated_result_list": [ 5 | "Wrong", 6 | "Correct", 7 | "Correct", 8 | "Correct", 9 | "Correct", 10 | "Correct", 11 | "Correct", 12 | "Wrong", 13 | "Correct", 14 | "Wrong", 15 | "Wrong", 16 | "Wrong", 17 | "Correct", 18 | "Wrong", 19 | "Wrong", 20 | "Wrong", 21 | "Correct", 22 | "Wrong", 23 | "Wrong", 24 | "Wrong", 25 | "Correct", 26 | "Wrong", 27 | "Wrong", 28 | "Correct", 29 | "Correct", 30 | "Wrong", 31 | "Wrong", 32 | "Wrong", 33 | "Correct", 34 | "Wrong", 35 | "Correct", 36 | "Correct", 37 | "Wrong", 38 | "Wrong" 39 | ], 40 | "avg_answer_time": 3.5602791870341584, 41 | "result_list": [ 42 | "Wrong", 43 | "Correct", 44 | "Correct", 45 | "Correct", 46 | "Correct", 47 | "Correct", 48 | "Correct", 49 | "Wrong", 50 | "Correct", 51 | "Wrong", 52 | "Wrong", 53 | "Wrong", 54 | "Correct", 55 | "Wrong", 56 | "Wrong", 57 | "Wrong", 58 | "Correct", 59 | "Wrong", 60 | "Wrong", 61 | "Wrong", 62 | "Correct", 63 | "Wrong", 64 | "Wrong", 65 | "Correct", 66 | "Correct", 67 | "Wrong", 68 | "Wrong", 69 | "Wrong", 70 | "Correct", 71 | "Wrong", 72 | "Correct", 73 | "Correct", 74 | "Wrong", 75 | "Wrong" 76 | ], 77 | "result_time_list": [ 78 | 3.1548521518707275, 79 | 2.560687303543091, 80 | 2.433565855026245, 81 | 1.167703628540039, 82 | 3.2000246047973633, 83 | 3.1250967979431152, 84 | 3.4482481479644775, 85 | 1.4094173908233643, 86 | 2.819232702255249, 87 | 4.01602578163147, 88 | 3.993535041809082, 89 | 3.051607131958008, 90 | 2.1200222969055176, 91 | 4.386370420455933, 92 | 3.8467817306518555, 93 | 2.657038927078247, 94 | 2.3905858993530273, 95 | 3.784196615219116, 96 | 3.5588788986206055, 97 | 5.592507839202881, 98 | 3.568309783935547, 99 | 4.250510215759277, 100 | 7.20405387878418, 101 | 3.401456117630005, 102 | 2.392115354537964, 103 | 7.146764755249023, 104 | 3.8354439735412598, 105 | 3.5823769569396973, 106 | 4.315815687179565, 107 | 4.740592002868652, 108 | 4.35824728012085, 109 | 1.88535475730896, 110 | 3.443660020828247, 111 | 4.208412408828735 112 | ], 113 | "ambiguous_idx_list": [], 114 | "ambiguous_answer_list": [], 115 | "ambiguous_gold_answer_list": [] 116 | } -------------------------------------------------------------------------------- /scripts/script_1.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=0 python simulator.py --model_name "llama2-7b-chat" --quantization "4bit" --script_name "friends" --ret_method "bm25" --history_type "session-entire" --sleep_time 600 --trial_version 1 --sh_number 0 2 | CUDA_VISIBLE_DEVICES=0 python simulator.py --model_name "mixtral-it" --quantization "4bit" --script_name "friends" --ret_method "bm25" --history_type "session-entire" --sleep_time 600 --trial_version 2 --sh_number 0 3 | -------------------------------------------------------------------------------- /simulator.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | from copy import deepcopy 4 | import time 5 | import random 6 | import pandas as pd 7 | from openai import OpenAI 8 | from anthropic import Anthropic 9 | import time 10 | import signal 11 | import warnings 12 | import argparse 13 | from rank_bm25 import BM25Okapi 14 | from nltk.tokenize import word_tokenize 15 | import nltk 16 | 17 | from logging_results.logging import log_results, log_everything 18 | from post_processing.process_answer import judge_eq, distill_answer, calibrate 19 | from models.api_based_inference import gpt_inference, claude_inference, gemini_inference 20 | from models.open_source_model_inference import open_source_model_inference 21 | from models.load_opensource_model import load_opensource_tokenizer 22 | from models.load_model import load_model 23 | from utils.utils import get_embedding, search_history, open_file, name_change, extract_gt_sessions_bm25_date 24 | warnings.filterwarnings('ignore') 25 | from func_timeout import func_set_timeout, FunctionTimedOut, func_timeout 26 | 27 | 28 | def parse_args(): 29 | parser = argparse.ArgumentParser() 30 | parser.add_argument("--model_name", type=str, default="gpt-3.5-turbo", help="name of the model. Default: 'gpt-3.5-turbo'.") 31 | parser.add_argument("--debug", action=argparse.BooleanOptionalAction, help="if set, use truncated dataset for debugging.") 32 | parser.add_argument("--debug_n_episodes", type=int, default=5, help="number of episodes to evalutate on debug mode.") 33 | parser.add_argument("--quantization", type=str, default="no", help="either to quantize the model or not. Default: False") 34 | parser.add_argument("--script_name", type=str, default='friends', help="name of the script to evaluate. Should be one of ('friends', 'bigbang', 'theoffice'). Default: 'friends'") 35 | parser.add_argument("--sleep_time", type=float, default=5, help="time limit in seconds for model response. Default: 5") 36 | parser.add_argument('--history_type', type=str, default='session-entire', help="How to store conversation history.") 37 | parser.add_argument('--num_ret_history', type=int, default=10, help="Number of histories we are going to retrieve. Default: 10.") 38 | parser.add_argument('--ret_method', type=str, default='bm25', help=" Default: openai-emb. Should be one of ('openai-emb', 'bm25', 'no_ret')") 39 | parser.add_argument('--name_shuffle', type=str, default='original', help=" Default: original. Should be one of ('original', 'shuffle', 'new_name')") 40 | parser.add_argument('--trial_version', type=int, default=0, help= "version number of the experiment.") 41 | parser.add_argument('--sh_number', type=int, default=0, help='shell script number') 42 | parser.add_argument('--num_cores', type=int, default=10, help='upper bound of number of cpu cores') 43 | parser.add_argument('--openai_api_key', type=str, default="", help="OpenAI API Key") 44 | parser.add_argument('--gemini_api_key', type=str, default="", help="Gemini API key") 45 | parser.add_argument('--antrhopic_api_key', type=str, default="", help="Anthropic API key") 46 | parser.add_argument('--fast_eval', type=str, default="yes", help="When set to 'yes', the simulator proceeds to the next utterance without waiting for the time interval if the history has already been updated. Should be one of ('yes', 'no')") 47 | parser.add_argument('--answer_format', type=str, default='multi_choice_structured', help="the format of the answer of the agent.") 48 | return parser.parse_args() 49 | 50 | def answer_question(model_name, client, model, tokenizer, config, prompt): 51 | answer = "" 52 | try: 53 | if "gpt" in model_name.lower(): 54 | answer = gpt_inference(prompt, model_name, client) 55 | elif model_name == "claude-3" or model_name == "claude-2.1": 56 | answer = claude_inference(prompt, model_name, client) 57 | elif model_name == "gemini": 58 | answer = gemini_inference(prompt, model) 59 | else: 60 | answer = open_source_model_inference(prompt, model_name, model, tokenizer, config) 61 | except: 62 | pass 63 | return answer 64 | 65 | def retrieve_history(ret_method, num_ret_history, openai_client, max_token_len, save_result, char_ask_sh, real_question_sh, data_dict, gt_sessions): 66 | ret_histories = '' 67 | if ret_method == 'openai-emb': 68 | if len(data_dict['history']) == 0: 69 | ret_histories = "No history.\n" 70 | else: 71 | res = search_history(save_result, f'{char_ask_sh}: {real_question_sh}', client=openai_client, n=num_ret_history) 72 | for ret_history in list(res['history']): 73 | ret_histories = ret_histories + ret_history + '\n' 74 | elif ret_method == 'bm25': 75 | if len(data_dict['history']) == 0: 76 | ret_histories = "No history.\n" 77 | else: 78 | tokenized_query = word_tokenize(f'{char_ask_sh}: {real_question_sh}'.lower()) 79 | doc_scores = save_result.get_scores(tokenized_query) 80 | top_doc_indices = sorted(range(len(doc_scores)), key=lambda i: doc_scores[i], reverse=True)[:num_ret_history] 81 | top_docs = [data_dict['history'][i] for i in top_doc_indices] 82 | for ret_history in top_docs: 83 | ret_histories = ret_histories + ret_history + '\n' 84 | elif ret_method == 'no_ret': 85 | total_token_len = 0 86 | ret_his_inds = [] 87 | if len(data_dict['history']) == 0: 88 | ret_histories = "No history.\n" 89 | else: 90 | for h_ind in range(len(data_dict['ada_embedding'])): 91 | total_token_len += data_dict['ada_embedding'][-1-h_ind] 92 | if total_token_len > max_token_len - 500: 93 | break 94 | ret_his_inds.append(-1-h_ind) 95 | ret_histories = data_dict['history'][-1-h_ind] + '\n' + ret_histories 96 | elif ret_method == 'oracle': 97 | ret_histories = gt_sessions 98 | return ret_histories 99 | 100 | 101 | def save_history(history_num, history, history_type, date, cur_conv_num, un, post_utterances, utter_post, model_name, client, model, tokenizer, config, data_dict, ret_method, llama_tokenizer): 102 | if history_type == "utts": 103 | processed_history = f"[Date: {date}, Session #{cur_conv_num}, Utterance #{history_num+1}] {history}" 104 | history_num += 1 105 | elif history_type == "session-entire": 106 | processed_history = f"[Date: {date}, Session #{cur_conv_num}]\n{history}" 107 | elif history_type == "session-summary": 108 | history_sum = "" 109 | if un == len(post_utterances)-1: 110 | sum_prompt = open_file('./prompt/chatgpt_summarize_prompt.txt').replace('<<>>', history) 111 | try: 112 | if "gpt" in model_name.lower(): 113 | history_sum = gpt_inference(sum_prompt, model_name, client) 114 | elif model_name == "claude-3" or model_name == "claude-2.1": 115 | history_sum = claude_inference(sum_prompt, model_name, client) 116 | elif model_name == "gemini": 117 | history_sum = gemini_inference(sum_prompt, model) 118 | else: 119 | history_sum = open_source_model_inference(sum_prompt, model_name, model, tokenizer, config) 120 | except: 121 | pass 122 | else: 123 | history_sum = history 124 | processed_history = f"[Date: {date}, Session #{cur_conv_num}]\n{history_sum}\n" 125 | 126 | data_dict['history'].append(processed_history) 127 | if ret_method == 'openai-emb': 128 | embedding_vec = get_embedding(processed_history, client=client, model="text-embedding-3-small") 129 | data_dict['ada_embedding'].append(embedding_vec) 130 | data_df = pd.DataFrame(data_dict) 131 | return data_df, history_num#, data_dict 132 | elif ret_method == 'bm25': 133 | tokenized_docs = [word_tokenize(doc.lower()) for doc in data_dict['history']] 134 | bm25 = BM25Okapi(tokenized_docs) 135 | return bm25, history_num#, data_dict 136 | elif ret_method == 'no_ret': 137 | token_len = llama_tokenizer(processed_history, return_tensors="pt", truncation=True).input_ids.shape[1] 138 | data_dict['ada_embedding'].append(token_len) 139 | return None, history_num#, data_dict 140 | elif ret_method == "oracle": 141 | return None, history_num#, data_dict 142 | else: 143 | return AssertionError("Incorrect `ret_method`.") 144 | 145 | 146 | def simulator( 147 | script_name, 148 | sleep_time=5, 149 | tkg_ratio=0.7, 150 | num_ret_history = 5, 151 | model_name:str="gpt-3.5-turbo", 152 | debug:bool=False, 153 | debug_n_episodes:int=5, 154 | quantization:str="no", 155 | history_type:str="session-entire", 156 | ret_method:str='bm25', 157 | name_shuffle:str='original', 158 | openai_api_key:str="", 159 | gemini_api_key:str="", 160 | antrhopic_api_key:str="", 161 | fast_eval:str="yes", 162 | answer_format:str="multi_choice_structured" 163 | ): 164 | """ 165 | script_name: script name ('friends', 'bigbang', 'theoffice') 166 | sleep_time: time for one utterance in the simulator. we do not use this parameter in unlimited simulator. (e.g. 3) 167 | tkg_ratio: the ratio of KG based question among the whole question set (0.0-1.0) 168 | mode: question type ('select' or 'free text') 169 | num_ret_history: the number of the retrieved utterances 170 | ret_method: retrieval method. openai embedding based: 'openai-emb', BM25 based: 'bm25', Naive LLM inference: 'no_ret'. 171 | """ 172 | 173 | model, tokenizer, config = load_model(model_name, quantization, gemini_api_key=gemini_api_key) 174 | simulator_start_time = time.time() 175 | if ret_method in ['no_ret', 'oracle']: 176 | history_type = 'session-entire' 177 | 178 | ####For hyperparameter 179 | if history_type == "utts": 180 | num_ret_history = 20 181 | elif history_type == "session-entire": 182 | if 'llama' in model_name.lower(): 183 | num_ret_history = 3 184 | if ret_method == 'bm25': 185 | num_ret_history = 1 186 | elif 'tulu' in model_name.lower() or 'gemma' in model_name.lower(): 187 | if ret_method == 'bm25': 188 | num_ret_history = 2 189 | else: 190 | num_ret_history = 5 191 | else: 192 | num_ret_history = 10 193 | elif history_type == "session-summary": 194 | if 'llama' in model_name.lower(): 195 | num_ret_history = 8 196 | else: 197 | num_ret_history = 15 198 | 199 | if ret_method == 'no_ret': 200 | llama_tokenizer = load_opensource_tokenizer("llama2-7b-chat") 201 | else: 202 | llama_tokenizer = "" 203 | 204 | max_token_len = 0 205 | if model_name == "gpt-3.5-turbo": 206 | max_token_len = 16000 207 | elif "gpt-4" in model_name.lower(): 208 | max_token_len = 128000 209 | elif model_name == "claude-3" or model_name == "claude-2.1": 210 | max_token_len = 200000 211 | elif model_name == "gemini": 212 | max_token_len = 32000 213 | elif 'tulu' in model_name.lower(): 214 | max_token_len = 6000 215 | else: 216 | try: 217 | max_token_len = config.max_position_embeddings 218 | except: 219 | max_token_len = 4000 220 | 221 | if ret_method == "oracle": 222 | if "gpt" in model_name.lower(): 223 | num_ret_history = 20 224 | elif model_name == "claude-3" or model_name == "claude-2.1": 225 | num_ret_history = 20 226 | elif "gemini" in model_name.lower(): 227 | num_ret_history = 20 228 | elif 'tulu' in model_name.lower(): 229 | num_ret_history = 4 230 | elif 'llama2' in model_name.lower(): 231 | num_ret_history = 2 232 | elif 'gemma' in model_name.lower(): 233 | num_ret_history = 10 234 | else: 235 | num_ret_history = 20 236 | client = None 237 | openai_client = None 238 | if 'gpt' in model_name.lower() or 'openai' in ret_method: 239 | openai_client = OpenAI(api_key=openai_api_key) 240 | 241 | elif answer_format in ['multi_choice_unstructured', 'open_ended']: 242 | openai_client = OpenAI(api_key=openai_api_key) 243 | if "claude" not in model_name.lower() or "gemini" not in model_name.lower(): 244 | client = openai_client 245 | anthropic_client = None 246 | if "claude" in model_name.lower(): 247 | anthropic_client = Anthropic(api_key=antrhopic_api_key) 248 | 249 | elif "gpt" in model_name.lower(): 250 | client = openai_client 251 | if answer_format in ['multi_choice_unstructured', 'open_ended']: 252 | client = OpenAI(api_key=openai_api_key) 253 | elif "claude" in model_name.lower(): 254 | client = anthropic_client 255 | 256 | with open(f'./data/{script_name}_dialsim.pickle', 'rb') as f: 257 | data = pickle.load(f) 258 | with open(f'./data/{script_name}_oracle_tkg.pickle', 'rb') as f_h: 259 | oracle_tkg = pickle.load(f_h) 260 | with open(f'./data/{script_name}_oracle_fan.pickle', 'rb') as f_e: 261 | oracle_fan = pickle.load(f_e) 262 | 263 | if script_name == 'friends': 264 | chatbot = 'Ross' 265 | elif script_name == 'bigbang': 266 | chatbot = 'Sheldon' 267 | elif script_name == 'theoffice': 268 | chatbot = 'Michael' 269 | else: 270 | assert 0 271 | 272 | data_dict = { 273 | 'ada_embedding': [], ### openai-emb -> embedding vector, no_ret -> token length 274 | 'history': [] 275 | } 276 | 277 | episodes = list(data) 278 | if debug: 279 | episodes = episodes[:debug_n_episodes] 280 | before_date = '' 281 | cur_conv_num = 1 282 | 283 | result_list = [] 284 | result_time_list = [] 285 | ambiguous_idx_list = [] # list of indices of the data (episode, session, question_prompt) where the model's output is ambiguous. 286 | ambiguous_answer_list = [] # list of answers(model output) that are ambiguous. 287 | ambiguous_gold_answer_list = [] # list of ground truth answers for the ambiguous answers. 288 | answer_list = [] # list of answers generated by the models. TODO: implement logging answers too. 289 | gold_answer_list = [] # list of ground truth (gold) answers 290 | ret_histories_question_answer_list = [] # list of (ret_histories, question) 291 | save_time_list = [] # list of saving time 292 | retrieve_search_time_list = [] # list of time spent in `search_history` 293 | ans_time_list = [] # list of time spent in answering 294 | calibrated_result_list = [] # list of calibrated answers 295 | calibrated_distilled_answer_list = [] # list of calibrated distilled answers 296 | epi_session_date_to_sessions = {} 297 | date_to_sessions = {} 298 | target_level_list = [] 299 | 300 | for epi in episodes: 301 | epi_session_date_to_sessions[epi] = {} 302 | epi_data = data[epi] 303 | session_nums = list(epi_data) 304 | 305 | for sc_num in session_nums: 306 | already_asked = 0 307 | script = epi_data[sc_num]['script'] 308 | date = epi_data[sc_num]['date'] 309 | date_splitted = date.replace(',', '').split() 310 | cannot_tkg = 0 311 | cannot_fan = 0 312 | temp_script = name_change(script_name, script, name_shuffle) 313 | epi_session_date_to_sessions[epi][sc_num] = {date: temp_script} 314 | 315 | try: 316 | date_to_sessions[date].append(temp_script) 317 | except: 318 | date_to_sessions[date] = [temp_script] 319 | 320 | ###Whether it is possible to ask tkg-based questions 321 | try: 322 | question_dict = epi_data[sc_num]['hard_q'] 323 | final_tkg_list = [] 324 | tkg_list = list(question_dict) 325 | for tkg in tkg_list: 326 | if len(question_dict[tkg]) > 0: 327 | final_tkg_list.append(tkg) 328 | tkg_target_type = random.choice(final_tkg_list) 329 | 330 | tkg_q_list = question_dict[tkg_target_type] 331 | target_question = random.choice(tkg_q_list) 332 | except: 333 | cannot_tkg=1 334 | pass 335 | 336 | ###Whether it is possible to ask fan quiz-based questions 337 | try: 338 | question_dict = epi_data[sc_num]['easy_q'] 339 | final_fan_list = [] 340 | fan_list = list(question_dict) 341 | for fan in fan_list: 342 | if len(list(question_dict[fan])) > 0: 343 | final_fan_list.append(fan) 344 | fan_target_type = random.choice(final_fan_list) 345 | 346 | fan_q_list = list(question_dict[fan_target_type]) 347 | fan_q_target_num = random.choice(fan_q_list) 348 | target_question = question_dict[fan_target_type][fan_q_target_num] 349 | except: 350 | cannot_fan = 1 351 | pass 352 | 353 | target_question_list = [] 354 | current_type = '' 355 | gt_sessions = "" 356 | target_dates_list = [] 357 | 358 | #### Question Selection (tkg or fan) 359 | rand_val = random.random() 360 | if cannot_fan == 1 and cannot_tkg == 1: 361 | target_question_list = ['cannot ask' for _ in range(20)] 362 | elif (cannot_fan == 1 and cannot_tkg == 0) or rand_val < tkg_ratio: 363 | question_dict = epi_data[sc_num]['hard_q'] 364 | final_tkg_list = [] 365 | fu_num = 0 366 | not_fu_list = [] 367 | tkg_list = list(question_dict) 368 | for tkg in tkg_list: 369 | if len(question_dict[tkg]) > 0: 370 | final_tkg_list.append(tkg) 371 | if 'fu' in tkg: 372 | fu_num += 1 373 | else: 374 | not_fu_list.append(tkg) 375 | if len(not_fu_list) > 0: 376 | random.shuffle(not_fu_list) 377 | while True: 378 | should_stop = 0 379 | for not_fu in not_fu_list: 380 | if fu_num/len(final_tkg_list) < 0.215: 381 | should_stop = 1 382 | break 383 | final_tkg_list.append(not_fu) 384 | if should_stop == 1: 385 | break 386 | tkg_target_type = random.choice(final_tkg_list) 387 | tkg_q_list = question_dict[tkg_target_type] 388 | 389 | current_type = tkg_target_type 390 | for _ in range(20): 391 | target_question = random.choice(tkg_q_list) 392 | ran_q = target_question['questions'][list(target_question['questions'])[0]] 393 | if 'n '+ date_splitted[2] in ran_q or date_splitted[0] + ' ' + date_splitted[2] in ran_q: 394 | continue 395 | final_target_question = deepcopy(target_question) 396 | target_question_list.append(final_target_question) 397 | 398 | try: 399 | target_dates_list.append(oracle_tkg[epi][sc_num][current_type][tkg_q_list.index(target_question)]) 400 | except: 401 | try: 402 | target_dates_list.append(oracle_tkg[epi][sc_num][current_type][target_question['questions'][list(target_question['questions'])[0]]]) 403 | except: 404 | target_dates_list.append([]) 405 | 406 | elif (cannot_fan == 0 and cannot_tkg == 1) or rand_val >= tkg_ratio: 407 | question_dict = epi_data[sc_num]['easy_q'] 408 | final_fan_list = [] 409 | unans_num = 0 410 | ans_list = [] 411 | fan_list = list(question_dict) 412 | for fan in fan_list: 413 | if len(list(question_dict[fan])) > 0: 414 | final_fan_list.append(fan) 415 | if 'unans' in fan: 416 | unans_num += 1 417 | else: 418 | ans_list.append(fan) 419 | 420 | if len(ans_list) > 0: 421 | random.shuffle(ans_list) 422 | while True: 423 | should_stop = 0 424 | for ans_ele in ans_list: 425 | if unans_num/len(final_fan_list) < 0.27: 426 | should_stop = 1 427 | break 428 | final_fan_list.append(ans_ele) 429 | if should_stop == 1: 430 | break 431 | 432 | fan_target_type = random.choice(final_fan_list) 433 | fan_q_list = list(question_dict[fan_target_type]) 434 | current_type = fan_target_type 435 | 436 | for _ in range(20): 437 | fan_q_target_num = random.choice(fan_q_list) 438 | target_question = deepcopy(question_dict[fan_target_type][fan_q_target_num]) 439 | target_question_list.append(target_question) 440 | if current_type in ['ans_w_time', 'dont_know_unans_time']: 441 | try: 442 | target_dates_list.append(oracle_fan[epi][sc_num][current_type][fan_q_target_num]) 443 | except: 444 | target_dates_list.append([]) 445 | else: 446 | target_dates_list.append([]) 447 | 448 | if before_date != date: 449 | cur_conv_num = 1 450 | before_date = date 451 | 452 | utterances = script.split('\n') 453 | post_utterances = [] 454 | temp_utter = '' 455 | 456 | chatbot_utters = [] 457 | characters = [] 458 | 459 | for utter in utterances: 460 | if len(utter.strip()) == 0: 461 | continue 462 | if 'Teleplay: ' in utter or 'Story: ' in utter: 463 | continue 464 | if ':' in utter: 465 | characters.append(utter.split(':')[0].strip()) 466 | if chatbot+':' in utter: 467 | chatbot_utters.append(utter.strip()) 468 | if ':' in utter: 469 | post_utterances.append(utter.strip()) 470 | temp_utter = deepcopy(utter.strip()) 471 | else: 472 | post_utterances.pop() 473 | temp_utter += '\n'+utter.strip() 474 | post_utterances.append(temp_utter) 475 | 476 | if sc_num != session_nums[0]: 477 | print() 478 | 479 | print('###########################################') 480 | print(f'Date: {date}, Conversation #{cur_conv_num}') 481 | print('###########################################\n') 482 | 483 | try: 484 | if len(chatbot_utters) > 1: 485 | chatbot_utters = chatbot_utters[1:] 486 | random_chatbot_utter = random.choice(chatbot_utters) 487 | bot_indices = [i for i, s in enumerate(post_utterances) if random_chatbot_utter in s] 488 | range_indices = [i for i in range(max(0, bot_indices[0]-3), min(len(post_utterances), bot_indices[0]+3))] 489 | close_chars = [] 490 | for idx in range_indices: 491 | if ':' in post_utterances[idx]: 492 | close_chars.append(post_utterances[idx].split(':')[0]) 493 | characters = list(set(close_chars)) 494 | close_chars = list(set(close_chars)) 495 | 496 | for char_ in close_chars: 497 | if chatbot.lower() in char_.lower() or 'all' == char_.lower(): 498 | try: 499 | characters.remove(char_) 500 | except: 501 | pass 502 | except: 503 | pass 504 | 505 | if len(characters) > 0: 506 | char_ask = random.choice(characters) 507 | else: 508 | char_ask = "" 509 | 510 | history_num = 0 511 | script_history = "" 512 | 513 | for un, utter_post in enumerate(post_utterances): 514 | print(name_change(script_name, utter_post, name_shuffle)) 515 | history = "" 516 | if history_type == "utts": 517 | history = name_change(script_name, utter_post, name_shuffle) 518 | elif history_type == "session-entire": 519 | if not utter_post.endswith("\n"): 520 | utter_post += "\n" 521 | script_history += name_change(script_name, utter_post, name_shuffle) 522 | history = script_history 523 | elif history_type == "session-summary": 524 | if not utter_post.endswith("\n"): 525 | utter_post += "\n" 526 | script_history += name_change(script_name, utter_post, name_shuffle) 527 | history = script_history 528 | else: 529 | return AssertionError("Incorrect `history_type`.") 530 | 531 | embedding_vec = None 532 | 533 | save_timeout_flag = False 534 | search_timeout_flag = False 535 | ans_timeout_flag = False 536 | save_start_time = None 537 | save_end_time = None 538 | save_time = None 539 | 540 | # below are what we are actually going to log 541 | time_in_saving = None 542 | time_in_retrieval_searching = None 543 | time_in_answering = None 544 | result_time = None 545 | ans_time = None 546 | answer = "" 547 | 548 | already_pop = False 549 | history_before_save_len = len(data_dict['history']) 550 | embedding_before_save_len = len(data_dict['ada_embedding']) 551 | save_start_time = time.time() 552 | save_result = None 553 | 554 | try: 555 | 556 | save_result, history_num = func_timeout(sleep_time, save_history, args=(history_num, history, history_type, date, cur_conv_num, un, post_utterances, utter_post, model_name, openai_client, model, tokenizer, config, data_dict, ret_method, llama_tokenizer)) 557 | save_end_time = time.time() 558 | save_time = save_end_time - save_start_time 559 | 560 | 561 | 562 | except FunctionTimedOut: 563 | history_after_save_len = len(data_dict['history']) 564 | embedding_after_save_len = len(data_dict['ada_embedding']) 565 | save_timeout_flag = True 566 | print("\nTimeout (saving history)!!!\n") 567 | print("Corresponding history couldn't be saved.\n") 568 | if len(data_dict['history']) > 0 and history_after_save_len > history_before_save_len: 569 | data_dict['history'].pop() 570 | if ret_method in ["openai-emb", "no_ret"]: 571 | if len(data_dict['ada_embedding']) > 0 and embedding_after_save_len > embedding_before_save_len: 572 | data_dict['ada_embedding'].pop() 573 | if ret_method == "openai-emb": 574 | save_result = pd.DataFrame(data_dict) 575 | if ret_method == "bm25": 576 | if len(data_dict['history']) > 0: 577 | tokenized_docs = [word_tokenize(doc.lower()) for doc in data_dict['history']] 578 | save_result = BM25Okapi(tokenized_docs) 579 | ret_histories = "No history.\n" 580 | already_pop = True 581 | result = "Wrong (Timeout in saving history)" 582 | is_ambiguous = False 583 | answer = "<<>>" 584 | time_in_saving = "<<>>" 585 | time_in_retrieval_searching = "<<>>" 586 | time_in_ans = "<<>>" 587 | result_time = "<<>>" 588 | 589 | #### Question 590 | if random_chatbot_utter.lower() in utter_post.lower() and len(characters) > 0 and target_question_list[0] != 'cannot ask': 591 | if already_asked == 1: 592 | continue 593 | real_question = '' 594 | real_tar_id = -1 595 | for tar_id in range(len(target_question_list)): 596 | if char_ask in list(target_question_list[tar_id]['questions']): 597 | real_question = target_question_list[tar_id]['questions'][char_ask] 598 | elif 'default' in list(target_question_list[tar_id]['questions']): 599 | real_question = target_question_list[tar_id]['questions']['default'] 600 | else: 601 | continue 602 | 603 | try: 604 | true_answer = target_question_list[tar_id]['answer'] 605 | real_tar_id = tar_id 606 | assert(len(target_dates_list)==len(target_question_list)) 607 | gt_sessions = extract_gt_sessions_bm25_date(date_to_sessions, epi_session_date_to_sessions, current_type, target_dates_list[tar_id], epi, sc_num, num_ret_history, real_question) 608 | break 609 | except: 610 | continue 611 | 612 | if real_question == '' or real_tar_id == -1 or gt_sessions == "": 613 | continue 614 | 615 | true_answer_op = '' 616 | 617 | for oi, op in enumerate(['(A)', '(B)', '(C)', '(D)', '(E)']): 618 | if true_answer.lower() == target_question_list[real_tar_id]['options'][oi].lower(): 619 | true_answer_op = op 620 | break 621 | 622 | if true_answer_op == '': 623 | continue 624 | 625 | if answer_format in ['multi_choice_unstructured', 'open_ended']: 626 | if true_answer_op == "(E)": 627 | true_answer_op = "I don't know." 628 | else: 629 | true_answer_op = true_answer 630 | 631 | 632 | question_part_prompt = '' 633 | 634 | question_part_prompt += f'{char_ask}: {real_question}' 635 | options = target_question_list[real_tar_id]['options'] 636 | if answer_format == 'multi_choice_structured': 637 | question_part_prompt += '\n' 638 | question_part_prompt += f'\t(A) {options[0]}\n' 639 | question_part_prompt += f'\t(B) {options[1]}\n' 640 | question_part_prompt += f'\t(C) {options[2]}\n' 641 | question_part_prompt += f'\t(D) {options[3]}\n' 642 | question_part_prompt += f'\t(E) {options[4]}' 643 | elif answer_format == 'open_ended': 644 | pass 645 | elif answer_format == 'multi_choice_unstructured': 646 | question_part_prompt += ' ' 647 | question_part_prompt += f'{options[0]}? or ' 648 | question_part_prompt += f'{options[1]}? or ' 649 | question_part_prompt += f'{options[2]}? or ' 650 | question_part_prompt += f'{options[3]}? or ' 651 | question_part_prompt += f"you don't know?" 652 | else: 653 | raise ValueError("Invalid answer format. Should be one of ('multi_choice_structured', 'multi_choice_unstructured', 'open_ended')") 654 | question_part_prompt_sh = name_change(script_name, question_part_prompt, name_shuffle) 655 | """Start of Answering. Time measure starts HERE""" 656 | # time measure START 657 | ans_timeout_flag = False 658 | retrieve_save_start_time = None 659 | ans_start_time = None 660 | 661 | char_ask_sh = name_change(script_name, char_ask, name_shuffle) 662 | real_question_sh = name_change(script_name, real_question, name_shuffle) 663 | 664 | if not save_timeout_flag: 665 | ret_search_start_time = time.time() 666 | try: 667 | ret_histories = func_timeout(sleep_time-save_time, retrieve_history, args=(ret_method, num_ret_history, openai_client, max_token_len, save_result, char_ask_sh, real_question_sh, data_dict, gt_sessions)) 668 | retrieve_search_time = time.time()-ret_search_start_time 669 | except FunctionTimedOut: # timeout during searching history. Note that saving history was done correctly though. 670 | ret_histories = "No history.\n" 671 | print("\nTimeout (searching history)!!!\n") 672 | search_timeout_flag = True 673 | result = "Wrong (Timeout in searching history)" 674 | is_ambiguous = False 675 | answer = "<<>>" 676 | time_in_saving = save_time # record actual time taken in saving 677 | time_in_retrieval_searching = "<<>>" 678 | time_in_ans = "<<>>" 679 | result_time = "<<>>" 680 | if not search_timeout_flag: 681 | # Model inference 682 | #question_part_prompt_sh = name_change(script_name, question_part_prompt, name_shuffle) 683 | chatbot_sh = name_change(script_name, chatbot, name_shuffle) 684 | if answer_format not in ['multi_choice_structured', 'multi_choice_unstructured', 'open_ended']: 685 | raise ValueError("Invalid answer format. Should be one of ('multi_choice_structured', 'multi_choice_unstructured', 'open_ended')") 686 | if ret_method == 'no_ret': 687 | prompt = open_file(f'./prompt/naive_llm_inference_{answer_format}.txt').replace('<<>>', date).replace('<<>>', ret_histories).replace('<<>>', question_part_prompt_sh).replace('<<>>', chatbot_sh) 688 | else: 689 | prompt = open_file(f'./prompt/RAG_qa_prompt_{answer_format}.txt').replace('<<>>', date).replace('<<>>', ret_histories).replace('<<>>', question_part_prompt_sh).replace('<<>>', chatbot_sh) 690 | 691 | ans_start_time = time.time() 692 | try: 693 | answer = func_timeout(sleep_time-save_time-retrieve_search_time, answer_question, args=(model_name, client, model, tokenizer, config, prompt)) 694 | ans_time = time.time() - ans_start_time 695 | time_in_saving = save_time 696 | time_in_retrieval_searching = retrieve_search_time 697 | time_in_answering = ans_time 698 | result_time = save_time + retrieve_search_time + ans_time 699 | except FunctionTimedOut: 700 | print("\nTimeout (answering)!!!\n") 701 | ans_timeout_flag = True 702 | result = "Wrong (Timeout in answering)" 703 | is_ambiguous = False 704 | answer = "<<>>" 705 | time_in_saving = save_time 706 | time_in_retrieval_searching = retrieve_search_time 707 | time_in_answering = "<<>>" 708 | result_time = "<<>>" 709 | """Measuring time for timeout stops HERE""" 710 | 711 | is_ambiguous = False 712 | if not ans_timeout_flag and not save_timeout_flag and not search_timeout_flag: 713 | result, is_ambiguous = judge_eq(true_answer_op, answer, question_part_prompt_sh, client, answer_format=answer_format) 714 | if result_time >= sleep_time: 715 | result = "Wrong (Timeout)" 716 | else: 717 | if fast_eval == "no": 718 | time.sleep(sleep_time-result_time) 719 | 720 | already_asked = 1 721 | # log results 722 | answer_list.append(answer) 723 | gold_answer_list.append(true_answer_op) 724 | result_list.append(result) 725 | result_time_list.append(result_time) 726 | save_time_list.append(time_in_saving) 727 | retrieve_search_time_list.append(time_in_retrieval_searching) 728 | ans_time_list.append(time_in_answering) 729 | target_level_list.append({"current_type" : current_type}) 730 | print(question_part_prompt_sh) 731 | print(f'------------------------------- Q&A result -------------------------------') 732 | print(f'result: {result}, ambiguous answer: {is_ambiguous}') 733 | print(f'true answer: {true_answer_op}\t model answer: {answer}') 734 | print(f'time spent in saving: {time_in_saving}') 735 | print(f'time spent in searching history: {time_in_retrieval_searching}') 736 | print(f'time spent in answering: {time_in_answering}') 737 | print(f'time spent overall: {result_time}') 738 | print(f'time limit: {sleep_time}') 739 | print(f'model name: {model_name}') 740 | print(f'--------------------------------------------------------------------------') 741 | 742 | if is_ambiguous: 743 | ambiguous_idx_list.append((epi, sc_num, question_part_prompt_sh)) 744 | ambiguous_answer_list.append(answer) 745 | ambiguous_gold_answer_list.append(true_answer_op) 746 | 747 | distilled_answer = distill_answer(answer) 748 | ret_histories_question_answer_list.append((ret_histories, question_part_prompt_sh, true_answer_op, distilled_answer)) 749 | 750 | calibration = calibrate(result, is_ambiguous, true_answer_op, answer, question_part_prompt_sh, distilled_answer, answer_format=answer_format, lenient=True) # (result, is_ambiguous, calibrated_distilled_answer) 751 | if isinstance(result_time, float) and result_time >= sleep_time: 752 | calibrated_result_list.append("Wrong (Timeout)") 753 | calibrated_distilled_answer_list.append("Wrong (Timeout)") 754 | else: 755 | calibrated_result_list.append(calibration[0]) 756 | calibrated_distilled_answer_list.append(calibration[2]) 757 | 758 | else: 759 | if fast_eval == "no": 760 | if save_time == None: 761 | pass 762 | else: 763 | time.sleep(sleep_time-save_time) 764 | 765 | if not already_pop and "session" in history_type and un < len(post_utterances) - 1: 766 | if ret_method == 'openai-emb' or ret_method == 'no_ret': 767 | try: 768 | data_dict["history"].pop() 769 | data_dict["ada_embedding"].pop() 770 | except: 771 | AssertionError("Unexpected error(probable cause: couldn't save even one embedding using openai-emb in time). Please run the program again.") 772 | else: 773 | try: 774 | data_dict["history"].pop() 775 | except: 776 | pass 777 | cur_conv_num += 1 778 | 779 | simulator_running_time = time.time() - simulator_start_time 780 | 781 | if "Correct" in result_list: 782 | score_total = result_list.count('Correct') / len(result_list) 783 | else: 784 | score_total = 0 785 | 786 | valid_result_time_list = [] 787 | for result_time in result_time_list: 788 | if isinstance(result_time, float): 789 | valid_result_time_list.append(result_time) 790 | 791 | if len(valid_result_time_list) == 0: 792 | result_time_mean = 0 793 | else: 794 | result_time_mean = sum(valid_result_time_list) / len(valid_result_time_list) 795 | 796 | if "Correct" in calibrated_result_list: 797 | calibrated_score = calibrated_result_list.count('Correct') / len(calibrated_result_list) 798 | else: 799 | calibrated_score = 0 800 | 801 | log_info = { 802 | "score" : score_total, 803 | "calibrated_score" : calibrated_score, 804 | "result_time_mean" : result_time_mean, 805 | "simulator_running_time" : simulator_running_time, 806 | "result_list" : result_list, 807 | "result_time_list" : result_time_list, 808 | "ambiguous_idx_list" : ambiguous_idx_list, 809 | "ambiguous_answer_list" : ambiguous_answer_list, 810 | "ambiguous_gold_answer_list" : ambiguous_gold_answer_list, 811 | "answer_list" : answer_list, 812 | "gold_answer_list" : gold_answer_list, 813 | "ret_histories_question_answer_list" : ret_histories_question_answer_list, 814 | "save_time_list" : save_time_list, 815 | "retrieve_search_time_list": retrieve_search_time_list, 816 | "ans_time_list" : ans_time_list, 817 | "calibrated_result_list" : calibrated_result_list, 818 | "calibrated_distilled_answer_list" : calibrated_distilled_answer_list, 819 | "target_level_list" : target_level_list 820 | } 821 | 822 | return log_info 823 | 824 | 825 | 826 | if __name__ == "__main__": 827 | args = parse_args() 828 | print(args) 829 | def set_affinity(num_cores, sh_number): 830 | cpu_list = range(num_cores*sh_number, num_cores*(sh_number+1)) 831 | os.sched_setaffinity(os.getpid(), set(cpu_list)) 832 | 833 | set_affinity(args.num_cores, args.sh_number) 834 | cpu_count = os.sched_getaffinity(os.getpid()) 835 | print(f"Available CPUs: {cpu_count}") 836 | 837 | log_info = simulator(script_name=args.script_name, history_type=args.history_type, sleep_time=args.sleep_time, num_ret_history=args.num_ret_history, model_name=args.model_name, \ 838 | debug=args.debug, debug_n_episodes=args.debug_n_episodes, quantization=args.quantization, ret_method=args.ret_method, name_shuffle=args.name_shuffle, openai_api_key=args.openai_api_key, gemini_api_key=args.gemini_api_key, antrhopic_api_key=args.antrhopic_api_key, fast_eval=args.fast_eval, answer_format=args.answer_format) 839 | 840 | print() 841 | print('SCORE: ', log_info["score"]) 842 | print(f'SCORE(calibrated): {log_info["calibrated_score"]}') 843 | print('Answer Time Mean: ', log_info["result_time_mean"]) 844 | 845 | log_results_path = \ 846 | f"./results/results-{args.script_name}-model_{args.model_name}-debug_{args.debug}-quantization_{args.quantization}-time_limit_{args.sleep_time}-history_type_{args.history_type}-{args.ret_method}_{args.name_shuffle}-version_{args.trial_version}.json" 847 | log_total_path = \ 848 | f"./results/entire_log-{args.script_name}-model_{args.model_name}-debug_{args.debug}-quantization_{args.quantization}-time_limit_{args.sleep_time}-history_type_{args.history_type}-{args.ret_method}_{args.name_shuffle}-version_{args.trial_version}.json" 849 | 850 | log_results(log_info, log_file_path=log_results_path) 851 | log_everything(log_info, log_file_path=log_total_path) -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .utils import get_embedding, search_history, open_file -------------------------------------------------------------------------------- /utils/__pycache__/__init__.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiho283/Simulator/d36924acbb65008e07dff83be43de596a4afef49/utils/__pycache__/__init__.cpython-311.pyc -------------------------------------------------------------------------------- /utils/__pycache__/utils.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiho283/Simulator/d36924acbb65008e07dff83be43de596a4afef49/utils/__pycache__/utils.cpython-311.pyc -------------------------------------------------------------------------------- /utils/utils.py: -------------------------------------------------------------------------------- 1 | from sklearn.metrics.pairwise import cosine_similarity 2 | import numpy as np 3 | from rank_bm25 import BM25Okapi 4 | from nltk.tokenize import word_tokenize 5 | 6 | 7 | def extract_gt_sessions_bm25_date(date_to_sessions, epi_scene_date_to_sessions, current_type, target_dates, epi, sc_num, num_history, question): 8 | gt_sessions = "" 9 | cur_conv_num = 0 10 | total_history = 0 11 | gt_docs = [] 12 | if current_type in ['ans_w_time', 'dont_know_unans_time']: 13 | if len(target_dates) == 0: 14 | gt_sessions = "No Relevant History\n" 15 | return gt_sessions 16 | sessions = date_to_sessions[target_dates[0]] 17 | for session in sessions: 18 | cur_conv_num += 1 19 | gt_sessions += f'[Date: {target_dates[0]}, Session #{cur_conv_num}]{session}' 20 | total_history += 1 21 | gt_docs.append(f'[Date: {target_dates[0]}, Session #{cur_conv_num}]{session}') 22 | #if total_history == num_history: 23 | # return gt_sessions 24 | 25 | elif current_type in ['ans_wo_time', 'before_event_unans', 'dont_know_unans']: 26 | #import pdb; pdb.set_trace() 27 | scene_nums = list(epi_scene_date_to_sessions[epi]) 28 | before_date = '' 29 | for session_num in scene_nums: 30 | date = list(epi_scene_date_to_sessions[epi][session_num])[0] 31 | if date != before_date: 32 | cur_conv_num = 0 33 | before_date = date 34 | session = epi_scene_date_to_sessions[epi][session_num][date] 35 | cur_conv_num += 1 36 | 37 | gt_sessions += f'[Date: {date}, Session #{cur_conv_num}]{session}' 38 | total_history += 1 39 | gt_docs.append(f'[Date: {date}, Session #{cur_conv_num}]{session}') 40 | #if total_history == num_history: 41 | # return gt_sessions 42 | else: 43 | for date in target_dates: 44 | if 'fu' in date: 45 | continue 46 | sessions = date_to_sessions[date] 47 | cur_conv_num = 0 48 | for session in sessions: 49 | cur_conv_num += 1 50 | gt_sessions += f'[Date: {date}, Session #{cur_conv_num}]{session}' 51 | total_history += 1 52 | gt_docs.append(f'[Date: {date}, Session #{cur_conv_num}]{session}') 53 | #if total_history == num_history: 54 | # return gt_sessions 55 | if gt_sessions == "": 56 | gt_sessions = "No Relevant History\n" 57 | return gt_sessions 58 | tokenized_docs = [word_tokenize(gt_instance.lower()) for gt_instance in gt_docs] 59 | bm25 = BM25Okapi(tokenized_docs) 60 | tokenized_question = word_tokenize(question.lower()) 61 | doc_scores = bm25.get_scores(tokenized_question) 62 | top_doc_indices = sorted(range(len(doc_scores)), key=lambda i: doc_scores[i], reverse=True)[:num_history] 63 | top_docs = [gt_docs[i] for i in top_doc_indices] 64 | top_docs_date = sorted(top_docs, key=lambda x: x.split("]")[0], reverse=False) 65 | result = "".join(top_docs_date) 66 | return result 67 | 68 | 69 | def get_embedding(text, client, model="text-embedding-3-small"): 70 | text = text.replace("\n", " ") 71 | return client.embeddings.create(input = [text], model=model).data[0].embedding 72 | 73 | def search_history(df, product_description, client, n=5, model="text-embedding-3-small"): 74 | embedding = get_embedding(product_description, client, model=model) 75 | embeddings_matrix = np.vstack(np.array(df.ada_embedding.values)) 76 | similarities = cosine_similarity(embeddings_matrix, np.array(embedding)[None, :]) 77 | df['similarities'] = similarities.flatten() 78 | res = df.sort_values('similarities', ascending=False).head(n) 79 | return res 80 | 81 | def open_file(filepath): 82 | with open(filepath, 'r', encoding='utf-8') as infile: 83 | return infile.read() 84 | 85 | def name_change(script, input, mode): 86 | 87 | shuffle_mapping_dict = { 88 | 'friends': {'Monica': 'Joey', 89 | 'Chandler': 'Rachel', 90 | 'Joey': 'Monica', 91 | 'Phoebe': 'Ross', 92 | 'Rachel': 'Chandler', 93 | 'Ross': 'Phoebe'}, 94 | 'bigbang': {'Howard': 'Amy', 95 | 'Leonard': 'Howard', 96 | 'Raj': 'Penny', 97 | 'Penny': 'Raj', 98 | 'Bernadette': 'Sheldon', 99 | 'Amy': 'Leonard', 100 | 'Sheldon': 'Bernadette'}, 101 | 'theoffice': {'Dwight': 'Ryan', 102 | 'Jim': 'Michael', 103 | 'Pam': 'Dwight', 104 | 'Ryan': 'Jim', 105 | 'Michael': 'Pam'} 106 | } 107 | 108 | new_name_mapping_dict = { 109 | 'friends': {'Monica': 'Patricia', 110 | 'Chandler': 'James', 111 | 'Joey': 'John', 112 | 'Phoebe': 'Jennifer', 113 | 'Rachel': 'Linda', 114 | 'Ross': 'Robert'}, 115 | 'bigbang': {'Howard': 'Robert', 116 | 'Leonard': 'James', 117 | 'Raj': 'Michael', 118 | 'Penny': 'Jennifer', 119 | 'Bernadette': 'Linda', 120 | 'Amy': 'Patricia', 121 | 'Sheldon': 'John'}, 122 | 'theoffice': {'Dwight': 'John', 123 | 'Jim': 'Robert', 124 | 'Pam': 'Jennifer', 125 | 'Ryan': 'William', 126 | 'Michael': 'James'} 127 | } 128 | 129 | if mode == 'shuffle': 130 | chars = list(shuffle_mapping_dict[script]) 131 | sub_dict = {} 132 | for i, char in enumerate(chars): 133 | sub_dict[f'[MASK {i}]'] = shuffle_mapping_dict[script][char] 134 | input = input.replace(char, f'[MASK {i}]') 135 | for i, char in enumerate(chars): 136 | input = input.replace(f'[MASK {i}]', sub_dict[f'[MASK {i}]']) 137 | 138 | elif mode == 'new_name': 139 | chars = list(new_name_mapping_dict[script]) 140 | sub_dict = {} 141 | for i, char in enumerate(chars): 142 | sub_dict[f'[MASK {i}]'] = new_name_mapping_dict[script][char] 143 | input = input.replace(char, f'[MASK {i}]') 144 | for i, char in enumerate(chars): 145 | input = input.replace(f'[MASK {i}]', sub_dict[f'[MASK {i}]']) 146 | 147 | return input 148 | 149 | 150 | 151 | --------------------------------------------------------------------------------