├── assets
│   ├── LLM-Agora.png
│   ├── GSM8K_performance.png
│   ├── MMLU_performance.png
│   └── Math_performance.png
├── requirements.txt
├── GSM8K
│   ├── gsm_performance.json
│   ├── gsm_performance_cot.json
│   ├── gsm_evaluation.py
│   └── gsm_inference.py
├── Math
│   ├── math_performance.json
│   ├── math_performance_cot.json
│   ├── math_evaluation.py
│   └── math_inference.py
├── MMLU
│   ├── mmlu_performance.json
│   ├── mmlu_performance_cot.json
│   ├── mmlu_evaluation.py
│   └── mmlu_inference.py
├── src
│   ├── inference_endpoint.json
│   └── prompt_template.json
├── inference
│   └── inference.py
└── README.md

/assets/LLM-Agora.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gauss5930/LLM-Agora/HEAD/assets/LLM-Agora.png

/assets/GSM8K_performance.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gauss5930/LLM-Agora/HEAD/assets/GSM8K_performance.png

/assets/MMLU_performance.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gauss5930/LLM-Agora/HEAD/assets/MMLU_performance.png

/assets/Math_performance.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gauss5930/LLM-Agora/HEAD/assets/Math_performance.png

/requirements.txt:
--------------------------------------------------------------------------------
requests
openai
tqdm
numpy
pandas

/GSM8K/gsm_performance.json:
--------------------------------------------------------------------------------
[
    {"1_performance": "0.26"},
    {"2_performance": "0.26"},
    {"3_performance": "0.26"}
]

/Math/math_performance.json:
--------------------------------------------------------------------------------
[
    {"1_performance": "0.05"},
    {"2_performance": "0.11"},
    {"3_performance": "0.1"}
]

/GSM8K/gsm_performance_cot.json:
--------------------------------------------------------------------------------
[
    {"1_performance": "0.28"},
    {"2_performance": "0.26"},
    {"3_performance": "0.23"}
]

/Math/math_performance_cot.json:
--------------------------------------------------------------------------------
[
    {"1_performance": "0.05"},
    {"2_performance": "0.11"},
    {"3_performance": "0.17"}
]

/MMLU/mmlu_performance.json:
--------------------------------------------------------------------------------
[
    {"1_performance": 0.48},
    {"2_performance": 0.0},
    {"3_performance": 0.54}
]

/MMLU/mmlu_performance_cot.json:
-------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "1_performance": 0.5 4 | }, 5 | { 6 | "2_performance": 0.0 7 | }, 8 | { 9 | "3_performance": 0.58 10 | } 11 | ] -------------------------------------------------------------------------------- /src/inference_endpoint.json: -------------------------------------------------------------------------------- 1 | { 2 | "llama": { 3 | "API_URL": "", 4 | "headers": { 5 | "Authorization": "Bearer ", 6 | "Content-Type": "application/json" 7 | } 8 | }, 9 | "llama-chat": { 10 | "API_URL": "", 11 | "headers": { 12 | "Authorization": "Bearer ", 13 | "Content-Type": "application/json" 14 | } 15 | }, 16 | "vicuna": { 17 | "API_URL": "", 18 | "headers": { 19 | "Authorization": "Bearer ", 20 | "Content-Type": "application/json" 21 | } 22 | }, 23 | "falcon": { 24 | "API_URL": "", 25 | "headers": { 26 | "Authorization": "Bearer ", 27 | "Content-Type": "application/json" 28 | } 29 | }, 30 | "falcon-instruct": { 31 | "API_URL": "", 32 | "headers": { 33 | "Authorization": "Bearer ", 34 | "Content-Type": "application/json" 35 | } 36 | }, 37 | "orca": { 38 | "API_URL": "", 39 | "headers": { 40 | "Authorization": "Bearer ", 41 | "Content-Type": "application/json" 42 | } 43 | }, 44 | "wizardlm": { 45 | "API_URL": "", 46 | "headers": { 47 | "Authorization": "Bearer ", 48 | "Content-Type": "application/json" 49 | } 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /Math/math_evaluation.py: -------------------------------------------------------------------------------- 1 | import json 2 | import openai 3 | import numpy as np 4 | import time 5 | import re 6 | import argparse 7 | 8 | def args_parse(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument( 11 | "--model_1", 12 | type=str, 13 | help="It should be the same model used in gsm_inference.py" 14 | ) 15 | parser.add_argument( 16 | "--model_2", 17 | type=str, 18 | help="It should be the same model used in gsm_inference.py" 19 | ) 20 | parser.add_argument( 21 | "--model_3", 22 | type=str, 23 | help="It should be the same model used in gsm_inference.py" 24 | ) 25 | parser.add_argument( 26 | "--cot", 27 | action="store_true" 28 | ) 29 | parser.add_argument( 30 | "--output_dir", 31 | default="Math", 32 | type=str 33 | ) 34 | 35 | return parser.parse_args() 36 | 37 | def parse_answer(input_str): 38 | pattern = r"([0-9]*)" 39 | matches = re.findall(pattern, input_str) 40 | 41 | solution = None 42 | 43 | for match_str in matches[::-1]: 44 | solution = re.sub(r"[^0-9.]", "", match_str) 45 | if solution: 46 | break 47 | 48 | if solution: 49 | return float(solution) 50 | else: 51 | return solution 52 | 53 | def answer_check(List, answer): 54 | if answer in List: 55 | return 1.0 56 | else: 57 | return 0.0 58 | 59 | def compute_accuracy(gt, pred_solutions): 60 | 61 | if type(pred_solutions) == list: 62 | pred_answers = [] 63 | 64 | for pred_solution in pred_solutions: 65 | pred_answer = parse_answer(pred_solution) 66 | 67 | pred_answers.append(pred_answer) 68 | 69 | return answer_check(pred_answers, gt) 70 | 71 | if __name__ == "__main__": 72 | args = args_parse() 73 | model_list = [args.model_1, args.model_2, args.model_3] 74 | 75 | if args.cot: 76 | file_name = "_cot.json" 77 | else: 78 | file_name = ".json" 79 | 80 | with open(f"Math/math_result{file_name}", "r") as f: 81 | response_dict = json.load(f) 82 | 83 | questions = [response_dict[i]["question"] for i in range(len(response_dict))] 84 | 85 | performance = [] 86 
| 87 | for turn in range(3): 88 | accuracies = [] 89 | for idx in range(len(questions)): 90 | responses = [response_dict[idx]["agent_response"][model][turn] for model in model_list] 91 | gt = float(response_dict[idx]["answer"]) 92 | 93 | accurate = compute_accuracy(gt, responses) 94 | 95 | if accurate is not None: 96 | accuracies.append(float(accurate)) 97 | else: 98 | accuracies.append(0.0) 99 | 100 | performance.append({f"{turn+1}_performance": np.mean(accuracies)}) 101 | print(performance) 102 | 103 | 104 | print(f"The performance file 'math_performance{file_name}' is saving...") 105 | with open(args.output_dir + f"/math_performance{file_name}", "x") as f: 106 | json.dump(performance, f, indent=4) 107 | 108 | print("All done!!") 109 | -------------------------------------------------------------------------------- /GSM8K/gsm_evaluation.py: -------------------------------------------------------------------------------- 1 | import json 2 | import openai 3 | import numpy as np 4 | import time 5 | import re 6 | import argparse 7 | 8 | def args_parse(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument( 11 | "--model_1", 12 | type=str, 13 | help="It should be the same model used in gsm_inference.py" 14 | ) 15 | parser.add_argument( 16 | "--model_2", 17 | type=str, 18 | help="It should be the same model used in gsm_inference.py" 19 | ) 20 | parser.add_argument( 21 | "--model_3", 22 | type=str, 23 | help="It should be the same model used in gsm_inference.py" 24 | ) 25 | parser.add_argument( 26 | "--cot", 27 | action="store_true" 28 | ) 29 | parser.add_argument( 30 | "--output_dir", 31 | default="GSM8K", 32 | type=str 33 | ) 34 | 35 | return parser.parse_args() 36 | 37 | def solve_math_problems(input_str): 38 | pattern = r"\d+\.?\d*" 39 | 40 | matches = re.findall(pattern, input_str) 41 | if matches: 42 | return float(matches[-1]) 43 | 44 | return None 45 | 46 | def parse_answer(input_str): 47 | pattern = r"([0-9]*)" 48 | matches = re.findall(pattern, input_str) 49 | 50 | solution = None 51 | 52 | for match_str in matches[::-1]: 53 | solution = re.sub(r"[^0-9.]", "", match_str) 54 | if solution: 55 | break 56 | 57 | if solution: 58 | return float(solution) 59 | else: 60 | return solution 61 | 62 | def answer_check(List, answer): 63 | if answer in List: 64 | return 1.0 65 | else: 66 | return 0.0 67 | 68 | def compute_accuracy(gt, pred_solutions): 69 | answers = solve_math_problems(gt) 70 | 71 | if not answers: 72 | return None 73 | 74 | if type(pred_solutions) == list: 75 | pred_answers = [] 76 | 77 | for pred_solution in pred_solutions: 78 | pred_answer = parse_answer(pred_solution) 79 | 80 | if not pred_answer: 81 | pred_answer = solve_math_problems(pred_solution) 82 | 83 | pred_answers.append(pred_answer) 84 | 85 | return answer_check(pred_answers, answers) 86 | 87 | 88 | if __name__ == "__main__": 89 | args = args_parse() 90 | model_list = [args.model_1, args.model_2, args.model_3] 91 | 92 | if args.cot: 93 | file_name = "_cot.json" 94 | else: 95 | file_name = ".json" 96 | 97 | with open(f"GSM8K/gsm_result{file_name}", "r") as f: 98 | response_dict = json.load(f) 99 | 100 | questions = [response_dict[i]["question"] for i in range(len(response_dict))] 101 | 102 | performance = [] 103 | 104 | for turn in range(3): 105 | accuracies = [] 106 | for idx in range(len(questions)): 107 | responses = [response_dict[idx]["agent_response"][model][turn] for model in model_list] 108 | gt = response_dict[idx]["answer"] 109 | 110 | accurate = compute_accuracy(gt, responses) 111 | 112 | if 
accurate is not None: 113 | accuracies.append(float(accurate)) 114 | else: 115 | accuracies.append(0.0) 116 | 117 | performance.append({f"{turn+1}_performance": np.mean(accuracies)}) 118 | print({f"{turn+1}_performance": np.mean(accuracies)}) 119 | 120 | print(f"The performance file 'gsm_performance{file_name}' is saving...") 121 | with open(args.output_dir + f"/gsm_performance{file_name}", "x") as f: 122 | json.dump(performance, f, indent=4) 123 | 124 | print("All done!!") 125 | -------------------------------------------------------------------------------- /MMLU/mmlu_evaluation.py: -------------------------------------------------------------------------------- 1 | import json 2 | import openai 3 | import numpy as np 4 | import time 5 | import re 6 | import argparse 7 | 8 | def args_parse(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument( 11 | "--model_1", 12 | type=str, 13 | help="It should be the same model used in gsm_inference.py" 14 | ) 15 | parser.add_argument( 16 | "--model_2", 17 | type=str, 18 | help="It should be the same model used in gsm_inference.py" 19 | ) 20 | parser.add_argument( 21 | "--model_3", 22 | type=str, 23 | help="It should be the same model used in gsm_inference.py" 24 | ) 25 | parser.add_argument( 26 | "--cot", 27 | action="store_true" 28 | ) 29 | parser.add_argument( 30 | "--output_dir", 31 | default="MMLU", 32 | type=str 33 | ) 34 | 35 | return parser.parse_args() 36 | 37 | def solve_math_problems(input_str): 38 | pattern = r"\d+\.?\d*" 39 | 40 | matches = re.findall(pattern, input_str) 41 | if matches: 42 | return matches[-1] 43 | 44 | return None 45 | 46 | def parse_answer(input_str): 47 | pattern = r'([A-Za-z])\)' 48 | matches = re.findall(pattern, input_str) 49 | 50 | solution = None 51 | 52 | for match_str in matches[::-1]: 53 | solution = match_str.upper() 54 | if solution: 55 | break 56 | 57 | return solution 58 | 59 | def compute_accuracy(gt, pred_solutions): 60 | if type(pred_solutions) == list: 61 | pred_answers = [] 62 | 63 | for pred_solution in pred_solutions: 64 | pred_answer = parse_answer(pred_solution) 65 | 66 | if pred_answer is None: 67 | pred_answer = solve_math_problems(pred_solution) 68 | 69 | if pred_answer is not None: 70 | pred_answers.append(pred_answer) 71 | 72 | if pred_answer is None: 73 | return 0 74 | pred_answer = answer_check(pred_answers, gt) 75 | else: 76 | pred_answer = parse_answer(pred_solutions) 77 | if pred_answer is None: 78 | pred_answer = solve_math_problems(pred_solutions) 79 | pred_answer = answer_check(pred_answer, gt) 80 | 81 | return pred_answer 82 | 83 | def answer_check(List, answer): 84 | if answer in List: 85 | return 1.0 86 | else: 87 | return 0.0 88 | 89 | if __name__ == "__main__": 90 | args = args_parse() 91 | 92 | model_list = [args.model_1, args.model_2, args.model_3] 93 | 94 | if args.cot: 95 | file_name = "_cot.json" 96 | else: 97 | file_name = ".json" 98 | 99 | with open(f"MMLU/mmlu_result{file_name}", "r") as f: 100 | response_dict = json.load(f) 101 | 102 | questions = [response_dict[i]["question"] for i in range(len(response_dict))] 103 | 104 | performance = [] 105 | 106 | for turn in range(3): 107 | accuracies = [] 108 | for idx in range(len(questions)): 109 | responses = [response_dict[idx]["agent_response"][model][turn] for model in model_list] 110 | gt = response_dict[idx]["answer"] 111 | 112 | accurate = compute_accuracy(gt, responses) 113 | 114 | if accurate is not None: 115 | accuracies.append(float(accurate)) 116 | else: 117 | accuracies.append(0.0) 118 | 119 | 
performance.append({f"{turn+1}_performance": np.mean(accuracies)}) 120 | print(performance[-1]) 121 | 122 | print(f"The performance file 'mmlu_performance{file_name}' is saving...") 123 | with open(args.output_dir + f"/mmlu_performance{file_name}", "x") as f: 124 | json.dump(performance, f, indent=4) 125 | 126 | print("All done!!") 127 | -------------------------------------------------------------------------------- /src/prompt_template.json: -------------------------------------------------------------------------------- 1 | { 2 | "llama": { 3 | "prompt": "SYSTEM: You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\nUSER: {instruction}\nASSISTANT: ", 4 | "response_split": "ASSISTANT: " 5 | }, 6 | "llama-chat": { 7 | "prompt": "[INST] <>\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n<>\n{instruction}[/INST]", 8 | "repsonse_split": "[/INST]" 9 | }, 10 | "alpaca": { 11 | "prompt_input": "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n", 12 | "prompt_no_input": "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Response:\n", 13 | "response_split": "### Response:" 14 | }, 15 | "vicuna": { 16 | "prompt": "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.\nUser:\n{instruction}\nASSISTANT:\n", 17 | "response_split": "ASSISTANT:" 18 | }, 19 | "koala": { 20 | "prompt": "BEGINNING OF CONVERSATION:\nUser:\n{instruction}\nGPT:\n", 21 | "response_split": "GPT:" 22 | }, 23 | "falcon": { 24 | "prompt": "You are a helpful AI assistant and provide the answer for the question.\n>>QUESTION<<\n{instruction}\n>>ANSWER<<\n", 25 | "response_split": ">>ANSWER<<" 26 | }, 27 | "falcon-instruct": { 28 | "prompt": "You are a helpful AI assistant and provide the answer for the question.\n>>QUESTION<<\n{instruction}\n>>ANSWER<<\n", 29 | "response_split": ">>ANSWER<<" 30 | }, 31 | "orca": { 32 | "prompt_input": "### System:\nYou are an AI assistant that follows instruction extremely well. Help as much as you can.\n\n### User:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n", 33 | "prompt_no_input": "### System:\nYou are an AI assistant that follows instruction extremely well. 
Help as much as you can.\n\n### User:\n{instruction}\n\n### Response:\n", 34 | "response_split": "### Response:" 35 | }, 36 | "wizardlm": { 37 | "prompt": "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: {instruction} ASSISTANT: ", 38 | "response_split": "Assistant: " 39 | }, 40 | "baize": { 41 | "prompt": "The following is a conversation between a human and an AI assistant named Baize (named after a mythical creature in Chinese folklore). Baize is an open-source AI assistant developed by UCSD and Sun Yat-Sen University. The human and the AI assistant take turns chatting. Human statements start with [|Human|] and AI assistant statements start with [|AI|]. The AI assistant always provides responses in as much detail as possible, and in Markdown format. The AI assistant always declines to engage with topics, questions and instructions related to unethical, controversial, or sensitive issues. Complete the transcript in exactly that format.\n[|Human|]{instruction}\n[|AI|]", 42 | "response_split": "[|AI|]" 43 | }, 44 | "phi": { 45 | "prompt": "User:\n{instruction}\n\nAssistant:\n", 46 | "response_split": "Assistant:" 47 | } 48 | } -------------------------------------------------------------------------------- /inference/inference.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import openai 3 | import json 4 | import numpy as np 5 | import time 6 | import argparse 7 | 8 | def args_parse(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument("--question", type=str) 11 | parser.add_argument("--model_1", type=str) 12 | parser.add_argument("--model_2", type=str) 13 | parser.add_argument("--model_3", type=str) 14 | parser.add_argument( 15 | "--API_KEY", 16 | type=str, 17 | help="your OpenAI API key to use gpt-3.5-turbo" 18 | ) 19 | parser.add_argument("--round", default=2, type=int) 20 | parser.add_argument( 21 | "--cot", 22 | default=False, 23 | action='store_true', 24 | help="If this is True, you can use Chain-of-Thought during inference." 25 | ) 26 | parser.add_argument( 27 | "--output_dir", 28 | default="inference", 29 | type=str, 30 | help="Directory to save the result file" 31 | ) 32 | 33 | return parser.parse_args() 34 | 35 | def load_json(prompt_path, endpoint_path): 36 | with open(prompt_path, "r") as prompt_file: 37 | prompt_dict = json.load(prompt_file) 38 | 39 | with open(endpoint_path, "r") as endpoint_file: 40 | endpoint_dict = json.load(endpoint_file) 41 | 42 | return prompt_dict, endpoint_dict 43 | 44 | def construct_message(agent_context, instruction, idx): 45 | prefix_string = "Here are a list of opinions from different agents: " 46 | 47 | prefix_string = prefix_string + agent_context + "\n\n Write a summary of the different opinions from each of the individual agent." 48 | 49 | message = [{"role": "user", "content": prefix_string}] 50 | 51 | try: 52 | completion = openai.ChatCompletion.create( 53 | model="gpt-3.5-turbo-0613", 54 | messages=message, 55 | max_tokens=256, 56 | n=1 57 | )['choices'][0]['message']['content'] 58 | except: 59 | print("retrying ChatGPT due to an error......") 60 | time.sleep(5) 61 | return construct_message(agent_context, instruction, idx) 62 | 63 | prefix_string = f"Here is a summary of responses from other agents: {completion}" 64 | prefix_string = prefix_string + "\n\n Use this summarization carefully as additional advice, can you provide an updated answer? 
Make sure to state your answer at the end of the response." + instruction 65 | return prefix_string 66 | 67 | def summarize_message(agent_contexts, instruction, idx): 68 | prefix_string = "Here are a list of opinions from different agents: " 69 | 70 | for agent in agent_contexts: 71 | agent_response = agent[-1]["content"] 72 | response = "\n\n One agent response: ```{}```".format(agent_response) 73 | 74 | prefix_string = prefix_string + response 75 | 76 | prefix_string = prefix_string + "\n\n Write a summary of the different opinions from each of the individual agent." 77 | completion = construct_message(prefix_string, instruction, idx) 78 | 79 | return completion 80 | 81 | def generate_question(agents, question): 82 | agent_contexts = [[{"model": agent, "content": question}] for agent in agents] 83 | 84 | content = agent_contexts[0][0]["content"] 85 | 86 | return agent_contexts, content 87 | 88 | if __name__ == "__main__": 89 | args = args_parse() 90 | openai.api_key = args.API_KEY 91 | model_list = [args.model_1, args.model_2, args.model_3] 92 | 93 | prompt_dict, endpoint_dict = load_json("src/prompt_template.json", "src/inference_endpoint.json") 94 | 95 | def generate_answer(model, formatted_prompt): 96 | API_URL = endpoint_dict[model]["API_URL"] 97 | headers = endpoint_dict[model]["headers"] 98 | payload = { 99 | "inputs": formatted_prompt, 100 | "parameters": { 101 | "max_new_tokens": 256 102 | } 103 | } 104 | try: 105 | resp = requests.post(API_URL, json=payload, headers=headers) 106 | response = resp.json() 107 | except: 108 | print("retrying due to an error......") 109 | time.sleep(5) 110 | return generate_answer(model, formatted_prompt) 111 | 112 | return {"model": model, "content": response[0]["generated_text"]} 113 | 114 | def prompt_formatting(model, instruction, cot): 115 | if model == "alpaca" or model == "orca": 116 | prompt = prompt_dict[model]["prompt_no_input"] 117 | else: 118 | prompt = prompt_dict[model]["prompt"] 119 | 120 | if cot: 121 | instruction += "Let's think step by step." 
122 | 123 | return {"model": model, "content": prompt.format(instruction=instruction)} 124 | 125 | agents = len(model_list) 126 | rounds = args.round 127 | 128 | generated_description = [] 129 | 130 | agent_contexts, content = generate_question(agents=model_list, question=args.question) 131 | 132 | print(f"# Question starts...") 133 | 134 | message = [] 135 | 136 | # Debate 137 | for debate in range(rounds+1): 138 | # Refer to the summarized previous response 139 | if debate != 0: 140 | message.append(summarize_message(agent_contexts, content, 2 * debate - 1)) 141 | for i in range(len(agent_contexts)): 142 | agent_contexts[i].append(prompt_formatting(agent_contexts[i][-1]["model"], message, args.cot)) 143 | 144 | # Generate new response based on summarized response 145 | for agent_context in agent_contexts: 146 | completion = generate_answer(agent_context[-1]["model"], agent_context[-1]["content"]) 147 | agent_context.append(completion) 148 | 149 | print(f"# Question debate is ended.") 150 | 151 | models_response = { 152 | f"{args.model_1}": [agent_contexts[0][1]["content"], agent_contexts[0][3]["content"], agent_contexts[0][-1]["content"]], 153 | f"{args.model_2}": [agent_contexts[1][1]["content"], agent_contexts[1][3]["content"], agent_contexts[1][-1]["content"]], 154 | f"{args.model_3}": [agent_contexts[2][1]["content"], agent_contexts[2][3]["content"], agent_contexts[2][-1]["content"]] 155 | } 156 | response_summarization = [ 157 | message[0], message[1] 158 | ] 159 | generated_description.append({"question": content, "agent_response": models_response, "summarization": response_summarization}) 160 | 161 | if args.cot: 162 | file_name = "_cot.json" 163 | else: 164 | file_name = ".json" 165 | 166 | print(f"The result file 'inference_result{file_name}' is saving...") 167 | with open(args.output_dir + f"/inference_result{file_name}", "x") as f: 168 | json.dump(generated_description, f, indent=4) 169 | 170 | print(f"All done!! Please check the inference/inference_result{file_name}!!") -------------------------------------------------------------------------------- /Math/math_inference.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import openai 3 | import json 4 | import numpy as np 5 | import time 6 | from tqdm import tqdm 7 | import argparse 8 | 9 | def args_parse(): 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument("--model_1", type=str) 12 | parser.add_argument("--model_2", type=str) 13 | parser.add_argument("--model_3", type=str) 14 | parser.add_argument( 15 | "--API_KEY", 16 | type=str, 17 | help="your OpenAI API key to use gpt-3.5-turbo" 18 | ) 19 | parser.add_argument("--round", default=2, type=int) 20 | parser.add_argument( 21 | "--cot", 22 | default=False, 23 | action='store_true', 24 | help="If this is True, you can use Chain-of-Thought during inference." 
25 | ) 26 | parser.add_argument( 27 | "--output_dir", 28 | default="Math", 29 | type=str, 30 | help="Directory to save the result file" 31 | ) 32 | 33 | return parser.parse_args() 34 | 35 | def load_json(prompt_path, endpoint_path): 36 | with open(prompt_path, "r") as prompt_file: 37 | prompt_dict = json.load(prompt_file) 38 | 39 | with open(endpoint_path, "r") as endpoint_file: 40 | endpoint_dict = json.load(endpoint_file) 41 | 42 | return prompt_dict, endpoint_dict 43 | 44 | def construct_message(agent_context, instruction, idx): 45 | prefix_string = "Here are a list of opinions from different agents: " 46 | 47 | prefix_string = prefix_string + agent_context + "\n\n Write a summary of the different opinions from each of the individual agent." 48 | 49 | message = [{"role": "user", "content": prefix_string}] 50 | 51 | try: 52 | completion = openai.ChatCompletion.create( 53 | model="gpt-3.5-turbo-0613", 54 | messages=message, 55 | max_tokens=256, 56 | n=1 57 | )['choices'][0]['message']['content'] 58 | except: 59 | print("retrying ChatGPT due to an error......") 60 | time.sleep(5) 61 | return construct_message(agent_context, instruction, idx) 62 | 63 | prefix_string = f"Here is a summary of responses from other agents: {completion}" 64 | prefix_string = prefix_string + "\n\n Use this summarization carefully as additional advice, can you provide an updated answer? Make sure to state your answer at the end of the response." + instruction 65 | return prefix_string 66 | 67 | def summarize_message(agent_contexts, instruction, idx): 68 | prefix_string = "Here are a list of opinions from different agents: " 69 | 70 | for agent in agent_contexts: 71 | agent_response = agent[-1]["content"] 72 | response = "\n\n One agent response: ```{}```".format(agent_response) 73 | 74 | prefix_string = prefix_string + response 75 | 76 | prefix_string = prefix_string + "\n\n Write a summary of the different opinions from each of the individual agent." 77 | completion = construct_message(prefix_string, instruction, idx) 78 | 79 | return completion 80 | 81 | def generate_math(agents): 82 | a, b, c, d, e, f = np.random.randint(0, 30, size=6) 83 | 84 | answer = a + b * c + d - e * f 85 | question = "What is the result of {}+{}*{}+{}-{}*{}? Make sure to state your answer at the end of the response." 86 | 87 | agent_contexts = [[{"model": agent, "content": question.format(a, b, c, d, e, f)}] for agent in agents] 88 | 89 | content = agent_contexts[0][0]["content"] 90 | 91 | question_prompt = f"We seek to find the result of {a}+{b}*{c}+{d}-{e}*{f}?" 
92 | 93 | return agent_contexts, content, question_prompt, answer 94 | 95 | if __name__ == "__main__": 96 | args = args_parse() 97 | openai.api_key = args.API_KEY 98 | model_list = [args.model_1, args.model_2, args.model_3] 99 | 100 | prompt_dict, endpoint_dict = load_json("src/prompt_template.json", "src/inference_endpoint.json") 101 | 102 | def generate_answer(model, formatted_prompt): 103 | API_URL = endpoint_dict[model]["API_URL"] 104 | headers = endpoint_dict[model]["headers"] 105 | payload = { 106 | "inputs": formatted_prompt, 107 | "parameters": { 108 | "max_new_tokens": 256 109 | } 110 | } 111 | try: 112 | resp = requests.post(API_URL, json=payload, headers=headers) 113 | response = resp.json() 114 | except: 115 | print("retrying due to an error......") 116 | time.sleep(5) 117 | return generate_answer(model, formatted_prompt) 118 | 119 | return {"model": model, "content": response[0]["generated_text"]} 120 | 121 | def prompt_formatting(model, instruction, cot): 122 | if model == "alpaca" or model == "orca": 123 | prompt = prompt_dict[model]["prompt_no_input"] 124 | else: 125 | prompt = prompt_dict[model]["prompt"] 126 | 127 | if cot: 128 | instruction += "Let's think step by step." 129 | 130 | return {"model": model, "content": prompt.format(instruction=instruction)} 131 | 132 | agents = len(model_list) 133 | rounds = args.round 134 | np.random.seed(0) 135 | 136 | evaluation = 100 137 | scores = [] 138 | 139 | generated_description = [] 140 | 141 | for round in tqdm(range(evaluation)): 142 | agent_contexts, content, question_prompt, answer = generate_math(agents=model_list) 143 | 144 | print(f"# Question No.{round+1} starts...") 145 | 146 | message = [] 147 | 148 | # Debate 149 | for debate in range(rounds+1): 150 | # Refer to the summarized previous response 151 | if debate != 0: 152 | message.append(summarize_message(agent_contexts, question_prompt, 2 * debate - 1)) 153 | for i in range(len(agent_contexts)): 154 | agent_contexts[i].append(prompt_formatting(agent_contexts[i][-1]["model"], message[-1], args.cot)) 155 | 156 | # Generate new response based on summarized response 157 | for agent_context in agent_contexts: 158 | completion = generate_answer(agent_context[-1]["model"], agent_context[-1]["content"]) 159 | agent_context.append(completion) 160 | 161 | print(f"# Question No.{round+1} debate is ended.") 162 | 163 | models_response = { 164 | f"{args.model_1}": [agent_contexts[0][1]["content"], agent_contexts[0][3]["content"], agent_contexts[0][-1]["content"]], 165 | f"{args.model_2}": [agent_contexts[1][1]["content"], agent_contexts[1][3]["content"], agent_contexts[1][-1]["content"]], 166 | f"{args.model_3}": [agent_contexts[2][1]["content"], agent_contexts[2][3]["content"], agent_contexts[2][-1]["content"]] 167 | } 168 | response_summarization = [ 169 | message[0], message[1] 170 | ] 171 | generated_description.append({"question_id": round, "question": content, "agent_response": models_response, "summarization": response_summarization, "answer": str(answer)}) 172 | 173 | if args.cot: 174 | file_name = "_cot.json" 175 | else: 176 | file_name = ".json" 177 | 178 | print(f"The result file 'math_result{file_name}' is saving...") 179 | with open(args.output_dir + f"/math_result{file_name}", "x") as f: 180 | json.dump(generated_description, f, indent=4) 181 | 182 | print("All done!!") 183 | -------------------------------------------------------------------------------- /GSM8K/gsm_inference.py: -------------------------------------------------------------------------------- 1 | 
import requests 2 | import openai 3 | import json 4 | import numpy as np 5 | import random 6 | import time 7 | from tqdm import tqdm 8 | import argparse 9 | 10 | def args_parse(): 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument("--model_1", type=str) 13 | parser.add_argument("--model_2", type=str) 14 | parser.add_argument("--model_3", type=str) 15 | parser.add_argument( 16 | "--API_KEY", 17 | type=str, 18 | help="your OpenAI API key to use gpt-3.5-turbo" 19 | ) 20 | parser.add_argument("--round", default=2, type=int) 21 | parser.add_argument( 22 | "--cot", 23 | default=False, 24 | action='store_true', 25 | help="If this is True, you can use Chain-of-Thought during inference." 26 | ) 27 | parser.add_argument( 28 | "--output_dir", 29 | default="Math", 30 | type=str, 31 | help="Directory to save the result file" 32 | ) 33 | 34 | return parser.parse_args() 35 | 36 | def load_json(prompt_path, endpoint_path): 37 | with open(prompt_path, "r") as prompt_file: 38 | prompt_dict = json.load(prompt_file) 39 | 40 | with open(endpoint_path, "r") as endpoint_file: 41 | endpoint_dict = json.load(endpoint_file) 42 | 43 | return prompt_dict, endpoint_dict 44 | 45 | def construct_message(agent_context, instruction, idx): 46 | prefix_string = "Here are a list of opinions from different agents: " 47 | 48 | prefix_string = prefix_string + agent_context + "\n\n Write a summary of the different opinions from each of the individual agent." 49 | 50 | message = [{"role": "user", "content": prefix_string}] 51 | 52 | try: 53 | completion = openai.ChatCompletion.create( 54 | model="gpt-3.5-turbo-0613", 55 | messages=message, 56 | max_tokens=256, 57 | n=1 58 | )['choices'][0]['message']['content'] 59 | except: 60 | print("retrying ChatGPT due to an error......") 61 | time.sleep(5) 62 | return construct_message(agent_context, instruction, idx) 63 | 64 | prefix_string = f"Here is a summary of responses from other agents: {completion}" 65 | prefix_string = prefix_string + "\n\n Use this summarization carefully as additional advice, can you provide an updated answer? Make sure to state your answer at the end of the response." + instruction 66 | return prefix_string 67 | 68 | def summarize_message(agent_contexts, instruction, idx): 69 | prefix_string = "Here are a list of opinions from different agents: " 70 | 71 | for agent in agent_contexts: 72 | agent_response = agent[-1]["content"] 73 | response = "\n\n One agent response: ```{}```".format(agent_response) 74 | 75 | prefix_string = prefix_string + response 76 | 77 | prefix_string = prefix_string + "\n\n Write a summary of the different opinions from each of the individual agent." 78 | completion = construct_message(prefix_string, instruction, idx) 79 | 80 | return completion 81 | 82 | def generate_gsm(agents, question): 83 | agent_contexts = [[{"model": agent, "content": f"Can you solve the following math problem? {question} Explain your reasoning. 
Your final answer should be a single numerical number, in the form \\boxed{{answer}}, at the end of your response."}] for agent in agents] 84 | return agent_contexts 85 | 86 | def read_jsonl(path: str): 87 | with open(path, "r") as fh: 88 | return [json.loads(line) for line in fh.readlines() if line] 89 | 90 | if __name__ == "__main__": 91 | args = args_parse() 92 | openai.api_key = args.API_KEY 93 | model_list = [args.model_1, args.model_2, args.model_3] 94 | 95 | prompt_dict, endpoint_dict = load_json("src/prompt_template.json", "src/inference_endpoint.json") 96 | 97 | def generate_answer(model, formatted_prompt): 98 | API_URL = endpoint_dict[model]["API_URL"] 99 | headers = endpoint_dict[model]["headers"] 100 | payload = { 101 | "inputs": formatted_prompt, 102 | "parameters": { 103 | "max_new_tokens": 256 104 | } 105 | } 106 | try: 107 | resp = requests.post(API_URL, json=payload, headers=headers) 108 | response = resp.json() 109 | except: 110 | print("retrying due to an error......") 111 | time.sleep(5) 112 | return generate_answer(model, formatted_prompt) 113 | 114 | return {"model": model, "content": response[0]["generated_text"]} 115 | 116 | def prompt_formatting(model, instruction, cot): 117 | if model == "alpaca" or model == "orca": 118 | prompt = prompt_dict[model]["prompt_no_input"] 119 | else: 120 | prompt = prompt_dict[model]["prompt"] 121 | 122 | if cot: 123 | instruction += "Let's think step by step." 124 | 125 | return {"model": model, "content": prompt.format(instruction=instruction)} 126 | 127 | agents = len(model_list) 128 | rounds = args.round 129 | random.seed(0) 130 | 131 | evaluation = 100 132 | 133 | generated_description = [] 134 | 135 | questions = read_jsonl("data/GSM8K/gsm8k_test.jsonl") 136 | random.shuffle(questions) 137 | 138 | for idx in tqdm(range(evaluation)): 139 | question = questions[idx]["question"] 140 | answer = questions[idx]["answer"] 141 | 142 | agent_contexts = generate_gsm(model_list, question) 143 | 144 | print(f"# Question No.{idx+1} starts...") 145 | 146 | message = [] 147 | 148 | # Debate 149 | for debate in range(rounds+1): 150 | # Refer to the summarized previous response 151 | if debate != 0: 152 | message.append(summarize_message(agent_contexts, question, 2 * debate - 1)) 153 | for i in range(len(agent_contexts)): 154 | agent_contexts[i].append(prompt_formatting(agent_contexts[i][-1]["model"], message, args.cot)) 155 | 156 | for agent_context in agent_contexts: 157 | # Generate new response based on summarized response 158 | completion = generate_answer(agent_context[-1]["model"], agent_context[-1]["content"]) 159 | agent_context.append(completion) 160 | 161 | print(f"# Question No.{idx+1} debate is ended.") 162 | 163 | models_response = { 164 | f"{args.model_1}": [agent_contexts[0][1]["content"], agent_contexts[0][3]["content"], agent_contexts[0][-1]["content"]], 165 | f"{args.model_2}": [agent_contexts[1][1]["content"], agent_contexts[1][3]["content"], agent_contexts[1][-1]["content"]], 166 | f"{args.model_3}": [agent_contexts[2][1]["content"], agent_contexts[2][3]["content"], agent_contexts[2][-1]["content"]] 167 | } 168 | response_summarization = [ 169 | message[0], message[1] 170 | ] 171 | generated_description.append({"question_id": idx, "question": question, "agent_response": models_response, "summarization": response_summarization, "answer": answer}) 172 | 173 | if args.cot: 174 | file_name = "_cot.json" 175 | else: 176 | file_name = ".json" 177 | 178 | print(f"The result file 'gsm_result{file_name}' is saving...") 179 | with 
open(args.output_dir + f"/gsm_result{file_name}", "x") as f: 180 | json.dump(generated_description, f, indent=4) 181 | 182 | print("All done!!") -------------------------------------------------------------------------------- /MMLU/mmlu_inference.py: -------------------------------------------------------------------------------- 1 | from glob import glob 2 | import pandas as pd 3 | from tqdm import tqdm 4 | import json 5 | import time 6 | import random 7 | import openai 8 | import argparse 9 | import requests 10 | 11 | def args_parse(): 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument("--model_1", type=str) 14 | parser.add_argument("--model_2", type=str) 15 | parser.add_argument("--model_3", type=str) 16 | parser.add_argument( 17 | "--API_KEY", 18 | type=str, 19 | help="your OpenAI API key to use gpt-3.5-turbo" 20 | ) 21 | parser.add_argument("--round", default=2, type=int) 22 | parser.add_argument( 23 | "--cot", 24 | default=False, 25 | action='store_true', 26 | help="If this is True, you can use Chain-of-Thought during inference." 27 | ) 28 | parser.add_argument( 29 | "--output_dir", 30 | default="MMLU", 31 | type=str, 32 | help="Directory to save the result file" 33 | ) 34 | 35 | return parser.parse_args() 36 | 37 | def load_json(prompt_path, endpoint_path): 38 | with open(prompt_path, "r") as prompt_file: 39 | prompt_dict = json.load(prompt_file) 40 | 41 | with open(endpoint_path, "r") as endpoint_file: 42 | endpoint_dict = json.load(endpoint_file) 43 | 44 | return prompt_dict, endpoint_dict 45 | 46 | def construct_message(agent_context, instruction, idx): 47 | prefix_string = "Here are a list of opinions from different agents: " 48 | 49 | prefix_string = prefix_string + agent_context + "\n\n Write a summary of the different opinions from each of the individual agent." 50 | 51 | message = [{"role": "user", "content": prefix_string}] 52 | 53 | try: 54 | completion = openai.ChatCompletion.create( 55 | model="gpt-3.5-turbo-0613", 56 | messages=message, 57 | max_tokens=256, 58 | n=1 59 | )['choices'][0]['message']['content'] 60 | except: 61 | print("retrying ChatGPT due to an error......") 62 | time.sleep(5) 63 | return construct_message(agent_context, instruction, idx) 64 | 65 | prefix_string = f"Here is a summary of responses from other agents: {completion}" 66 | prefix_string = prefix_string + "\n\n Use this summarization carefully as additional advice, can you provide an updated answer? Make sure to state your answer at the end of the response." + instruction 67 | return prefix_string 68 | 69 | def summarize_message(agent_contexts, instruction, idx): 70 | prefix_string = "Here are a list of opinions from different agents: " 71 | 72 | for agent in agent_contexts: 73 | agent_response = agent[-1]["content"] 74 | response = "\n\n One agent response: ```{}```".format(agent_response) 75 | 76 | prefix_string = prefix_string + response 77 | 78 | prefix_string = prefix_string + "\n\n Write a summary of the different opinions from each of the individual agent." 79 | completion = construct_message(prefix_string, instruction, idx) 80 | 81 | return completion 82 | 83 | def parse_question_answer(df): 84 | question = f"Can you answer the following question as accurately as possible? {df['question']}: A) {df['A']}, B) {df['B']}, C) {df['C']}, D) {df['D']} Explain your answer, putting the answer in the form (X) at the end of your response." 
85 | answer = df["answer"] 86 | return question, answer 87 | 88 | def generate_mmlu(agents, question): 89 | agent_contexts = [[{"model": agent, "content": question}] for agent in agents] 90 | return agent_contexts 91 | 92 | if __name__ == "__main__": 93 | args = args_parse() 94 | openai.api_key = args.API_KEY 95 | model_list = [args.model_1, args.model_2, args.model_3] 96 | 97 | prompt_dict, endpoint_dict = load_json("src/prompt_template.json", "src/inference_endpoint.json") 98 | 99 | def generate_answer(model, formatted_prompt): 100 | API_URL = endpoint_dict[model]["API_URL"] 101 | headers = endpoint_dict[model]["headers"] 102 | payload = { 103 | "inputs": formatted_prompt, 104 | "parameters": { 105 | "max_new_tokens": 256 106 | } 107 | } 108 | try: 109 | resp = requests.post(API_URL, json=payload, headers=headers) 110 | response = resp.json() 111 | except: 112 | print("retrying due to an error......") 113 | time.sleep(5) 114 | return generate_answer(API_URL, headers, payload) 115 | 116 | return {"model": model, "content": response[0]["generated_text"]} 117 | 118 | def prompt_formatting(model, instruction, cot): 119 | if model == "alpaca" or model == "orca": 120 | prompt = prompt_dict[model]["prompt_no_input"] 121 | else: 122 | prompt = prompt_dict[model]["prompt"] 123 | 124 | if cot: 125 | instruction += "Let's think step by step." 126 | 127 | return {"model": model, "content": prompt.format(instruction=instruction)} 128 | 129 | agents = len(model_list) 130 | rounds = args.round 131 | 132 | with open("data/MMLU/MMLU_test.json", "r") as f: 133 | mmlu_questions = json.load(f) 134 | 135 | random.seed(0) 136 | random.shuffle(mmlu_questions) 137 | generated_description = [] 138 | 139 | evaluation = 100 140 | 141 | for idx in tqdm(range(evaluation)): 142 | question, answer = parse_question_answer(mmlu_questions[idx]) 143 | 144 | agent_contexts = generate_mmlu(model_list, question) 145 | 146 | print(f"# Question No.{idx+1} starts...") 147 | 148 | message = [] 149 | 150 | for debate in range(rounds+1): 151 | # Refer to the summarized previous response 152 | if debate != 0: 153 | message.append(summarize_message(agent_contexts, question, 2 * debate - 1)) 154 | for i in range(len(agent_contexts)): 155 | agent_contexts[i].append(prompt_formatting(agent_contexts[i][-1]["model"], message, args.cot)) 156 | 157 | for agent_context in agent_contexts: 158 | # Generate new response based on summarized response 159 | completion = generate_answer(agent_context[-1]["model"], agent_context[-1]["content"]) 160 | agent_context.append(completion) 161 | 162 | print(f"# Question No.{idx+1} debate is ended.") 163 | 164 | models_response = { 165 | f"{args.model_1}": [agent_contexts[0][1]["content"], agent_contexts[0][2]["content"], agent_contexts[0][3]["content"]], 166 | f"{args.model_2}": [agent_contexts[1][1]["content"], agent_contexts[1][2]["content"], agent_contexts[1][3]["content"]], 167 | f"{args.model_3}": [agent_contexts[2][1]["content"], agent_contexts[2][2]["content"], agent_contexts[2][3]["content"]] 168 | } 169 | response_summarization = [ 170 | message[0], message[1] 171 | ] 172 | generated_description.append({"question_id": idx, "question": question, "agent_response": models_response, "summarization": response_summarization, "answer": answer}) 173 | 174 | if args.cot: 175 | file_name = "_cot.json" 176 | else: 177 | file_name = ".json" 178 | 179 | print(f"The result file 'mmlu_result{file_name}' is saving...") 180 | with open(args.output_dir + f"/mmlu_result{file_name}", "x") as f: 181 | 
json.dump(generated_description, f, indent=4) 182 | 183 | print("All done!!") 184 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

# LLM Agora

LLM Agora is a place for open-source LLMs to debate and revise their responses!

**LLM Agora** 🗣️🏦 aims to improve the quality of open-source LLMs' responses through the debate & revision process introduced in [Improving Factuality and Reasoning in Language Models through Multiagent Debate](https://arxiv.org/abs/2305.14325).
We would like to thank the authors of the paper for the brilliant idea that allowed us to pursue this project.

Did you know? 🤔 **LLMs can also improve their responses by debating with other LLMs**! 😮 We applied this concept to several open-source LLMs to verify that open-source models, not only proprietary ones, can meaningfully improve their responses through debate. 🤗
For this, we developed **LLM Agora**!
You can try LLM Agora and check the example responses in [LLM Agora Spaces](https://huggingface.co/spaces/Cartinoe5930/LLMAgora)!

We follow the overall framework of [llm_multiagent_debate](https://github.com/composable-models/llm_multiagent_debate) and add extra components such as CoT.
The LLM Agora experiments confirm that, although shortcomings remain, open-source LLMs can also improve the quality of their responses through multi-agent debate.

## ToC

1. [Introduction & Motivation](#introduction--motivation)
2. [What is LLM Agora?](#what-is-llm-agora)
3. [Experiments](#experiments)
4. [Analysis](#analysis)
5. [Future work](#future-work)
6. [How to do?](#how-to-do)

## Introduction & Motivation

As mentioned above, the LLM Agora project is inspired by the multi-agent debate introduced in the paper '[Improving Factuality and Reasoning in Language Models through Multiagent Debate](https://arxiv.org/abs/2305.14325)'.
Therefore, before introducing LLM Agora, we would like to explain the concept of multi-agent debate.

With the remarkable development of LLMs, they have become capable of producing responses of a significantly higher level.
For example, GPT-4 can pass even difficult exams.
Despite the brilliant performance of proprietary LLMs, their first responses still contain errors or mistakes.
Then, how can we correct and revise these responses?
The paper suggests that a debate between several agents can revise the responses and improve performance!
Several experiments in the paper show that this method can correct errors in responses and improve their quality. (If you want to know more, please check the official [GitHub page of the paper](https://composable-models.github.io/llm_debate/)!)

In the paper, the main experiments are conducted with a single model, but the analysis section reports a synergy effect, with further improved performance, when different types of LLMs are used.
This is exactly the point that inspired LLM Agora!

We started the LLM Agora project with the expectation that if several open-source LLMs create a synergy effect by debating with each other, this could compensate for the shortcomings that open-source LLMs still have.
We therefore carried out the LLM Agora project, believing that multi-agent debate could be a groundbreaking method if it improves the quality of open-source LLMs' responses.

## What is LLM Agora?
The meaning of '[Agora](https://en.wikipedia.org/wiki/Agora)' is a place where meetings were held in ancient Greece.
We thought this meaning was similar to a multi-agent debate, so we named the project **LLM Agora**.
The differences between the original multi-agent debate and LLM Agora are summarized as follows:

1. **Models**: **Several open-source LLMs** were utilized, unlike the paper, which used a proprietary LLM (ChatGPT).
In addition, we analyzed whether using open-source LLMs in a multi-agent debate is effective, and used various models to check the synergy effect.
2. **Summarization**: The paper concatenates the responses to form the debate sentence. However, according to the paper's experimental results, it is more effective to summarize the models' responses and use the summary as the debate sentence. Therefore, we summarize the models' responses with ChatGPT and use that summary as the debate sentence.
3. **Chain-of-Thought**: We used **Chain-of-Thought** in the multi-agent debate to confirm whether open-source LLMs can achieve performance improvements through Chain-of-Thought and to determine its impact on the debate.
4. **HuggingFace Space**: We implemented LLM Agora in a HuggingFace Space so that people can use LLM Agora directly and check the responses generated through the experiments.
It's open to everyone, so check it out! [LLM Agora Space](https://huggingface.co/spaces/Cartinoe5930/LLMAgora)

We hope that LLM Agora will be used in the future as a way to improve the performance of open-source models as well as proprietary models.
Once again, we would like to thank the authors of '[Improving Factuality and Reasoning in Language Models through Multiagent Debate](https://arxiv.org/abs/2305.14325)' for suggesting the idea of multi-agent debate.

## Experiments

We followed the experiments in the paper to verify the effectiveness of multi-agent debate with various open-source LLMs.
The goals of the experiments are as follows:

- Effects of using open-source models for multi-agent debate
- Impact of CoT on open-source models and multi-agent debate
- Synergies of using diverse models

### Experimental setup

#### Tasks

We experimented with the same tasks as in the paper.
The tasks on which the experiments were performed are as follows:

- **Math**: Arithmetic problems over six randomly selected numbers, in the format `{}+{}*{}+{}-{}*{}=?`
- **GSM8K**: GSM8K is a dataset of high-quality, linguistically diverse grade school math word problems.
- **MMLU**: MMLU is a benchmark covering 57 subjects across STEM, the humanities, the social sciences, and more.

For all tasks, only 100 questions were sampled and used in the experiments.

#### The number of agents & rounds

The multi-agent debate has some special parameters: the number of **agents** and the number of **rounds**.
They denote **the number of models participating in the debate** and **the number of debate rounds to be conducted**, respectively.
The number of agents and rounds were set to **3** and **2**, respectively, due to resource constraints, so each model produces an initial response plus two revised responses. A condensed sketch of this debate loop is shown below.
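To make the debate procedure concrete, here is a condensed, hedged sketch of the loop that the inference scripts (`GSM8K/gsm_inference.py`, `Math/math_inference.py`, `MMLU/mmlu_inference.py`) implement. The helpers `query_endpoint` and `summarize_with_chatgpt` are hypothetical stand-ins for the scripts' `generate_answer` and `summarize_message` functions, and the sketch omits the per-model prompt templating from `src/prompt_template.json`.

```python
# Simplified sketch (not a drop-in replacement for the inference scripts):
# 3 agents and 2 debate rounds yield 3 recorded responses per model.

def run_debate(question, models, rounds=2, cot=False,
               query_endpoint=None, summarize_with_chatgpt=None):
    """query_endpoint(model, prompt) -> str wraps the Inference Endpoint call;
    summarize_with_chatgpt(responses) -> str wraps the gpt-3.5-turbo summarizer."""
    if cot:
        question = question + " Let's think step by step."

    # Round 0: every agent answers the question independently.
    histories = {model: [query_endpoint(model, question)] for model in models}

    # Rounds 1..N: summarize the latest answers, then ask each agent to revise.
    for _ in range(rounds):
        latest = [history[-1] for history in histories.values()]
        summary = summarize_with_chatgpt(latest)
        revision_prompt = (
            f"Here is a summary of responses from other agents: {summary}\n\n"
            "Use this summarization carefully as additional advice, can you "
            "provide an updated answer? Make sure to state your answer at the "
            "end of the response. " + question
        )
        for model in models:
            histories[model].append(query_endpoint(model, revision_prompt))

    return histories  # {model: [1st response, 2nd response, 3rd response]}
```

The three entries recorded per model correspond to the "1st/2nd/3rd response" rows reported in the results tables below.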
#### Baselines & Summarizer model

The models were deployed with [HuggingFace Inference Endpoints](https://huggingface.co/inference-endpoints). Since LLaMA-1-based models could not be deployed on Inference Endpoints, LLaMA-2-based models were mainly used.
In addition, GPTQ-quantized models were used to reduce the model size for deployment as an Inference Endpoint.
Thank you to [TheBloke](https://huggingface.co/TheBloke) for uploading the GPTQ models.
The models in **bold** are the baseline models used in the LLM Agora experiments.

- **Llama2-13B**: https://huggingface.co/TheBloke/Carl-Llama-2-13B-GPTQ
- Llama2-13B-Chat: https://huggingface.co/TheBloke/Llama-2-13B-chat-GPTQ
- Vicuna2-13B: https://huggingface.co/TheBloke/vicuna-13B-v1.5-GPTQ
- **WizardLM2-13B**: https://huggingface.co/TheBloke/WizardLM-13B-V1.2-GPTQ
- **Orca2-13B**: https://huggingface.co/TheBloke/OpenOrcaxOpenChat-Preview2-13B-GPTQ
- Falcon-7B: https://huggingface.co/tiiuae/falcon-7b
- Falcon-7B-Instruct: https://huggingface.co/tiiuae/falcon-7b-instruct

We also used ChatGPT (gpt-3.5-turbo) as the summarizer model that summarizes the models' responses.

#### Prompt Format

Please check `src/prompt_template.json`!

### Results: Math

The Math task asks for the final value of four arithmetic operations applied to six randomly chosen numbers
(e.g., `What is the result of 13+17*4+30-12*22?`).
Accuracy was relatively low because arithmetic is a known weakness of LLMs, but performance improved through the debate process both with and without CoT.
In addition, the effect of CoT is small at the beginning of the debate, but after the final debate round the positive effect of CoT becomes clear.
You can check each model's responses for Math in [LLM Agora Space](https://huggingface.co/spaces/Cartinoe5930/LLMAgora) or in `Math/math_result.json` and `Math/math_result_cot.json`.
The result of the Math task is as follows:

**Math Result** (accuracy, %)

|Response|Without CoT|With CoT|
|---|---|---|
|1st response|5|5|
|2nd response|11|11|
|3rd response|10|**17**|

### Results: GSM8K

GSM8K is a dataset of high-quality, linguistically diverse grade school math word problems.
Unlike the other tasks, the GSM8K results show that the models did not gain much benefit from debate and CoT.
Performance did not change across debate rounds when CoT was not used, and it got worse as the debate progressed when CoT was used.
You can check each model's responses for GSM8K in [LLM Agora Space](https://huggingface.co/spaces/Cartinoe5930/LLMAgora) or in `GSM8K/gsm_result.json` and `GSM8K/gsm_result_cot.json`.
The result of GSM8K is as follows:

**GSM8K Results** (accuracy, %)

|Response|Without CoT|With CoT|
|---|---|---|
|1st response|26|**28**|
|2nd response|26|26|
|3rd response|26|23|

### Results: MMLU

MMLU is a benchmark that covers 57 subjects across STEM, the humanities, the social sciences, and more.
The MMLU results show that performance improves through CoT and debate, but strangely, the measured accuracy plummets to 0 after the first debate round.
You can check each model's responses for MMLU in [LLM Agora Space](https://huggingface.co/spaces/Cartinoe5930/LLMAgora) or in `MMLU/mmlu_result.json` and `MMLU/mmlu_result_cot.json`.
The result of MMLU is as follows:

**MMLU Results** (accuracy, %)

|Response|Without CoT|With CoT|
|---|---|---|
|1st response|48|50|
|2nd response|0|0|
|3rd response|54|**58**|

## Analysis

The LLM Agora experiments were performed on Math, GSM8K, and MMLU.
The results show that although open-source LLMs have some shortcomings when performing multi-agent debate, the method is also effective for open-source LLMs.
In addition, we confirmed that performance improved when CoT was used.
However, the performance improvement was not large, and in the case of GSM8K, performance was not affected by debate & CoT.

In addition, the quality of the models' responses to each task was not good.
When the quality of the responses is poor, the multi-agent debate appears to hurt the quality of the final responses.
The analysis of the experimental results is summarized as follows:

- Open-source LLMs can benefit from multi-agent debate and CoT when the models produce responses of reasonable quality.
- We did not directly measure the synergy effect of using diverse models. However, judging from the cases where debate degraded performance, a good synergy effect may appear when the models' responses are high-quality, while low-quality responses can make performance worse.

Although LLM Agora with open-source LLMs has some shortcomings, we confirmed that multi-agent debate can improve the performance of models.
Therefore, multi-agent debate could become an effective method to improve the quality of models' responses if additional improvements are made to open-source models and to multi-agent debate itself.
We hope that LLM Agora helps research on methods to improve the performance of open-source models, and we appreciate the authors of '[Improving Factuality and Reasoning in Language Models through Multiagent Debate](https://arxiv.org/abs/2305.14325)' for suggesting multi-agent debate, the motivation of LLM Agora.

## Future work

As mentioned in 'Analysis', there were some obstacles to performing multi-agent debate with open-source LLMs.
Therefore, we will try to utilize more capable open-source LLMs, or research methods that improve the quality of the models' responses, so that multi-agent debate becomes effective enough to improve the performance of open-source LLMs.
In addition, due to resource constraints, LLM Agora currently supports just 7 models; we will try to extend LLM Agora to support a wider variety of open-source LLMs!

## How to do?

The following describes the process of our experiments. Please follow it if you want to reproduce them!
Note that we do not provide the inference endpoint APIs.
Therefore, you should create your own inference endpoint APIs if you want to conduct the experiments.

0. [**Setup inference endpoint**](#setup-inference-endpoint)
1. [**Requirements**](#requirements)
2. [**Inference**](#inference)
3. [**Evaluation**](#evaluation)

### Setup inference endpoint

As mentioned above, we don't provide any inference endpoint API.
Therefore, you should create your own inference endpoint APIs if you want to conduct the experiments.
The process of setting up the inference endpoints is as follows:

1. Create your inference endpoint APIs using [HuggingFace Inference Endpoints](https://huggingface.co/inference-endpoints) for the models mentioned in the **Experimental setup**.
2. Fill in the blanks of `src/inference_endpoint.json` with your endpoint URLs and tokens. `src/inference_endpoint.json` is read when performing inference; a sketch of how it is used is shown below.
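To make step 2 concrete, here is a minimal, hedged sketch of how the filled-in `src/inference_endpoint.json` is consumed by the `generate_answer` helper inside the inference scripts. The endpoint URL and the `hf_xxx` token in the comment are placeholders for your own values.

```python
import json

import requests

# Example of a filled-in entry in src/inference_endpoint.json (placeholders):
# "llama": {
#     "API_URL": "https://your-endpoint-name.endpoints.huggingface.cloud",
#     "headers": {
#         "Authorization": "Bearer hf_xxx",
#         "Content-Type": "application/json"
#     }
# }

with open("src/inference_endpoint.json", "r") as f:
    endpoint_dict = json.load(f)

def query_endpoint(model, prompt, max_new_tokens=256):
    """Mirrors generate_answer() in the inference scripts: POST the formatted
    prompt to the model's Inference Endpoint and return the generated text."""
    entry = endpoint_dict[model]
    payload = {"inputs": prompt, "parameters": {"max_new_tokens": max_new_tokens}}
    resp = requests.post(entry["API_URL"], json=payload, headers=entry["headers"])
    return resp.json()[0]["generated_text"]
```

If a quick `query_endpoint("llama", "Hello!")` call returns text, the endpoint entry is configured correctly and the inference scripts below should be able to use it.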
### Requirements

To install the required libraries, just run these two lines:

```
cd LLM-Agora
pip install -r requirements.txt
```

### Inference

You can run inference by executing the following Math, GSM8K, and MMLU commands.
To use CoT, just add the `--cot` flag.
In addition, you can run inference on your own instructions with the 'Custom Inference' command.

**Math**
```
python Math/math_inference.py \
    --model_1 llama \
    --model_2 wizardlm \
    --model_3 orca \
    --API_KEY your_OpenAI_API_KEY \
    --cot  # optional: add --cot to use CoT; omit it otherwise
```

**GSM8K**
```
python GSM8K/gsm_inference.py \
    --model_1 llama \
    --model_2 wizardlm \
    --model_3 orca \
    --API_KEY your_OpenAI_API_KEY \
    --cot  # optional: add --cot to use CoT; omit it otherwise
```

**MMLU**
```
python MMLU/mmlu_inference.py \
    --model_1 llama \
    --model_2 wizardlm \
    --model_3 orca \
    --API_KEY your_OpenAI_API_KEY \
    --cot  # optional: add --cot to use CoT; omit it otherwise
```

**Custom Inference**
```
python inference/inference.py \
    --model_1 model_you_want \
    --model_2 model_you_want \
    --model_3 model_you_want \
    --API_KEY your_OpenAI_API_KEY \
    --cot  # optional: add --cot to use CoT; omit it otherwise
```

You can check the results of the multi-agent debate in each task's folder or in [LLM Agora Space](https://huggingface.co/spaces/Cartinoe5930/LLMAgora).

### Evaluation

Evaluation is performed as follows on the debate responses generated during inference.
Remember to pass the same models you used for inference, and to set `--cot` the same way.

**Math**
```
python Math/math_evaluation.py \
    --model_1 llama \
    --model_2 wizardlm \
    --model_3 orca \
    --cot  # required if you used --cot during inference
```

**GSM8K**
```
python GSM8K/gsm_evaluation.py \
    --model_1 llama \
    --model_2 wizardlm \
    --model_3 orca \
    --cot  # required if you used --cot during inference
```

**MMLU**
```
python MMLU/mmlu_evaluation.py \
    --model_1 llama \
    --model_2 wizardlm \
    --model_3 orca \
    --cot  # required if you used --cot during inference
```
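Each evaluation script reports per-round accuracy. If you want to compare the plain and CoT runs programmatically, the sketch below is one way to do it; it assumes the accuracy files are JSON lists with one `{"<round>_performance": value}` entry per debate round (check the files in each task folder for the exact schema), and the helper name is hypothetical.

```
# Hypothetical helper for comparing per-round accuracies across runs.
# Assumes each accuracy file is a JSON list of single-key dicts such as
# {"1_performance": 0.26}; verify the schema before relying on this.
import json

def load_round_accuracies(path):
    with open(path) as f:
        entries = json.load(f)
    accuracies = {}
    for entry in entries:
        for key, value in entry.items():
            round_idx = int(key.split("_")[0])  # "2_performance" -> 2
            accuracies[round_idx] = float(value)
    return accuracies

plain = load_round_accuracies("Math/math_performance.json")
cot = load_round_accuracies("Math/math_performance_cot.json")
for round_idx in sorted(plain):
    cot_value = cot.get(round_idx, float("nan"))
    print(f"Round {round_idx}: none={plain[round_idx]:.2f}, cot={cot_value:.2f}")
```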
## Citation

```
@article{du2023improving,
  title={Improving Factuality and Reasoning in Language Models through Multiagent Debate},
  author={Du, Yilun and Li, Shuang and Torralba, Antonio and Tenenbaum, Joshua B and Mordatch, Igor},
  journal={arXiv preprint arXiv:2305.14325},
  year={2023}
}
```

```
@misc{touvron2023llama,
  title={Llama 2: Open Foundation and Fine-Tuned Chat Models},
  author={Hugo Touvron and Louis Martin and Kevin Stone and Peter Albert and Amjad Almahairi and Yasmine Babaei and Nikolay Bashlykov},
  year={2023},
  eprint={2307.09288},
  archivePrefix={arXiv}
}
```

```
@misc{vicuna2023,
  title={Vicuna: An Open-Source Chatbot Impressing GPT-4 with 90\%* ChatGPT Quality},
  url={https://lmsys.org/blog/2023-03-30-vicuna/},
  author={Chiang, Wei-Lin and Li, Zhuohan and Lin, Zi and Sheng, Ying and Wu, Zhanghao and Zhang, Hao and Zheng, Lianmin and Zhuang, Siyuan and Zhuang, Yonghao and Gonzalez, Joseph E. and Stoica, Ion and Xing, Eric P.},
  month={March},
  year={2023}
}
```

```
@misc{xu2023wizardlm,
  title={WizardLM: Empowering Large Language Models to Follow Complex Instructions},
  author={Can Xu and Qingfeng Sun and Kai Zheng and Xiubo Geng and Pu Zhao and Jiazhan Feng and Chongyang Tao and Daxin Jiang},
  year={2023},
  eprint={2304.12244},
  archivePrefix={arXiv},
  primaryClass={cs.CL}
}
```

```
@article{falcon40b,
  title={{Falcon-40B}: an open large language model with state-of-the-art performance},
  author={Almazrouei, Ebtesam and Alobeidli, Hamza and Alshamsi, Abdulaziz and Cappelli, Alessandro and Cojocaru, Ruxandra and Debbah, Merouane and Goffinet, Etienne and Heslow, Daniel and Launay, Julien and Malartic, Quentin and Noune, Badreddine and Pannier, Baptiste and Penedo, Guilherme},
  year={2023}
}
```

```
@misc{mukherjee2023orca,
  title={Orca: Progressive Learning from Complex Explanation Traces of GPT-4},
  author={Subhabrata Mukherjee and Arindam Mitra and Ganesh Jawahar and Sahaj Agarwal and Hamid Palangi and Ahmed Awadallah},
  year={2023},
  eprint={2306.02707},
  archivePrefix={arXiv},
  primaryClass={cs.CL}
}
```
--------------------------------------------------------------------------------