├── requirements.txt
├── results.png
├── benchmark_agents.png
├── README.md
├── LICENSE
├── scripts
│   ├── prompts.py
│   ├── agents.py
│   ├── evaluation.py
│   ├── modified_calculator.py
│   └── run_agents.py
├── create_evaluation_dataset.ipynb
└── benchmark.ipynb

/requirements.txt:
--------------------------------------------------------------------------------


--------------------------------------------------------------------------------
/results.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aymeric-roucher/benchmark_agents/HEAD/results.png
--------------------------------------------------------------------------------
/benchmark_agents.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aymeric-roucher/benchmark_agents/HEAD/benchmark_agents.png
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# First LangChain agents based on open-source models

This repo builds on the integration of open-source models into LangChain (see [this PR](https://github.com/langchain-ai/langchain/pull/14040)) to benchmark agents based on open-source models vs. closed-source solutions.

![benchmark](benchmark_agents.png)

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2024 Aymeric Roucher

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/scripts/prompts.py:
--------------------------------------------------------------------------------
from langchain.prompts.chat import ChatPromptTemplate, HumanMessagePromptTemplate
from langchain.schema import SystemMessage

SYSTEM_PROMPT = """Answer the following questions as best you can. You have access to the following tools:

{tool_description_with_args}

The way you use the tools is by specifying a json blob.
Specifically, this json should have an `action` key (name of the tool to use) and an `action_input` key (input to the tool).

The only values that should be in the "action" field are: {tool_names}

The $JSON_BLOB should only contain a SINGLE action and MUST be formatted as markdown; do NOT return a list of multiple actions. Here is an example of a valid $JSON_BLOB:

```
{{
  "action": $TOOL_NAME,
  "action_input": $INPUT
}}
```
Make sure to have the $INPUT in the right format for the tool you are using, and do not put variable names as input if you can find the right values.

You will be given:

Question: the input question you must answer

You should ALWAYS use the following format:

Thought: you should always think about one action to take. Then use the action as follows:
Action:
```
$JSON_BLOB
```
Observation: the result of the action
... (this Thought/Action/Observation can repeat N times; you should take several steps when needed. The $JSON_BLOB must only use a SINGLE action at a time.)

You must always end your output with the following format:

Thought: I now know the final answer.
Final Answer: the final answer to the original input question

ALWAYS use the exact characters `Final Answer:` when you provide a definitive answer, and provide no additional explanations in the final answer: only the answer. MAKE SURE TO PROVIDE ONLY ONE ANSWER IN THE PROPER UNIT.

Now begin! """


HUMAN_PROMPT = "Question: {input}"

SCRATCHPAD_PROMPT = "{agent_scratchpad}"


evaluation_prompt = """###Task Description:
An instruction (might include an Input inside it), a response to evaluate, a reference answer that gets a score of 5, and a score rubric representing an evaluation criterion are given.
1. Write a detailed feedback that assesses the quality of the response strictly based on the given score rubric, not evaluating in general.
2. After writing the feedback, write a score that is an integer between 1 and 5. You should refer to the score rubric.
3. The output format should look as follows: \"Feedback: {{write a feedback for criteria}} [RESULT] {{an integer number between 1 and 5}}\"
4. Please do not generate any other opening, closing, or explanations. Be sure to include [RESULT] in your output.

###The instruction to evaluate:
{instruction}

###Response to evaluate:
{response}

###Reference Answer (Score 5):
{reference_answer}

###Score Rubrics:
[Is the response correct, accurate, and factual based on the reference answer?]
Score 1: The response is completely incorrect, inaccurate, and/or not factual.
Score 2: The response is mostly incorrect, inaccurate, and/or not factual.
Score 3: The response is somewhat correct, accurate, and/or factual.
Score 4: The response is mostly correct, accurate, and factual.
Score 5: The response is completely correct, accurate, and factual.
75 | 76 | ###Feedback:""" 77 | 78 | EVALUATION_PROMPT_TEMPLATE = ChatPromptTemplate.from_messages( 79 | [ 80 | SystemMessage(content="You are a fair evaluator language model."), 81 | HumanMessagePromptTemplate.from_template(evaluation_prompt), 82 | ] 83 | ) 84 | -------------------------------------------------------------------------------- /scripts/agents.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional 2 | 3 | 4 | from langchain.agents.output_parsers import ( 5 | ReActJsonSingleInputOutputParser, 6 | OpenAIFunctionsAgentOutputParser, 7 | ) 8 | from langchain.llms import HuggingFaceEndpoint 9 | from langchain.chat_models import ChatOpenAI 10 | from langchain.tools.render import ( 11 | render_text_description_and_args, 12 | format_tool_to_openai_function, 13 | ) 14 | from langchain.agents.format_scratchpad import ( 15 | format_to_openai_function_messages, 16 | format_log_to_str, 17 | ) 18 | from langchain.prompts import ( 19 | ChatPromptTemplate, 20 | HumanMessagePromptTemplate, 21 | SystemMessagePromptTemplate, 22 | AIMessagePromptTemplate, 23 | MessagesPlaceholder, 24 | ) 25 | from langchain.agents import AgentExecutor, load_tools 26 | from langchain.schema import HumanMessage 27 | from langchain.chat_models.base import BaseChatModel 28 | from langchain_community.chat_models.huggingface import ChatHuggingFace 29 | from langchain_core.tools import Tool 30 | 31 | from scripts.prompts import HUMAN_PROMPT, SYSTEM_PROMPT, SCRATCHPAD_PROMPT 32 | from scripts.modified_calculator import LLMMathChain 33 | 34 | 35 | def init_tools_with_llm(llm: BaseChatModel) -> List[Tool]: 36 | tools = load_tools(["serpapi", "llm-math"], llm=llm) 37 | # Rename tools in the same format used by other tools 38 | tools[0].name = "search" 39 | # llm_math_tool = Tool( 40 | # name="Calculator", 41 | # description="Useful for when you need to answer questions about math.", 42 | # func=LLMMathChain.from_llm(llm=llm).run, 43 | # coroutine=LLMMathChain.from_llm(llm=llm).arun, 44 | # ) 45 | # tools.append(llm_math_tool) 46 | tools[1].name = "calculator" 47 | tools = [tools[1]] 48 | return tools 49 | 50 | 51 | def build_openai_agent_with_tools(model_id: Optional[str] = "gpt-4-1106-preview") -> AgentExecutor: 52 | llm = ChatOpenAI(model=model_id, temperature=0.1) 53 | tools = init_tools_with_llm(llm) 54 | 55 | 56 | llm_with_tools = llm.bind(functions=[format_tool_to_openai_function(t) for t in tools]) 57 | prompt = ChatPromptTemplate.from_messages( 58 | [ 59 | ("system", "You are a helpful assistant. Answer the following question:"), 60 | ("user", "{input}"), 61 | MessagesPlaceholder(variable_name="agent_scratchpad"), 62 | ] 63 | ) 64 | agent = ( 65 | { 66 | "input": lambda x: x["input"], 67 | "agent_scratchpad": lambda x: format_to_openai_function_messages( 68 | x["intermediate_steps"] 69 | ), 70 | } 71 | | prompt 72 | | llm_with_tools 73 | | OpenAIFunctionsAgentOutputParser() 74 | ) 75 | return AgentExecutor( 76 | agent=agent, 77 | tools=tools, 78 | verbose=True, 79 | return_intermediate_steps=True, 80 | handle_parsing_errors=True, 81 | max_iterations=5, 82 | ) 83 | 84 | 85 | def build_hf_agent_with_tools(hf_endpoint_url: Optional[str] = None, repo_id: Optional[str] = None) -> AgentExecutor: 86 | """ 87 | Build a zero-shot ReAct chat agent from HF endpoint. 88 | 89 | Args: 90 | hf_endpoint_url (str): The endpoint URL for the Hugging Face model. 91 | 92 | Returns: 93 | AgentExecutor: An agent executor object that can be used to run the agent. 
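
    Example:
        A minimal usage sketch (not from the original repo): the endpoint URL below is a
        placeholder, and the deployed model is assumed to define a chat template.

        .. code-block:: python

            agent_executor = build_hf_agent_with_tools(
                hf_endpoint_url="https://<your-endpoint>.us-east-1.aws.endpoints.huggingface.cloud"
            )
            result = agent_executor.invoke({"input": "What is 7 raised to the power 0.43?"})
            print(result["output"])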
94 | 95 | """ 96 | assert hf_endpoint_url or repo_id, "hf_endpoint_url or repo_id must be provided." 97 | assert not (hf_endpoint_url and repo_id), "Only one of hf_endpoint_url or repo_id can be provided." 98 | 99 | # instantiate LLM and chat model 100 | if hf_endpoint_url: 101 | llm = HuggingFaceEndpoint( 102 | endpoint_url=hf_endpoint_url, 103 | task="text-generation", 104 | max_new_tokens= 512, 105 | do_sample= False, 106 | repetition_penalty= 1.03, 107 | ) 108 | else: 109 | llm = HuggingFaceEndpoint( 110 | repo_id=repo_id, 111 | task="text-generation", 112 | max_new_tokens= 512, 113 | do_sample= False, 114 | repetition_penalty= 1.03, 115 | ) 116 | 117 | chat_model = ChatHuggingFace(llm=llm) 118 | tools = init_tools_with_llm(llm) 119 | 120 | # # TODO: remove 121 | # tools = [tools[1]] # only use calculator for now 122 | 123 | 124 | # define the prompt depending on whether the chat model supports system prompts 125 | system_prompt_supported = check_supports_system_prompt(chat_model) 126 | 127 | if system_prompt_supported: 128 | prompt = ChatPromptTemplate.from_messages( 129 | [ 130 | SystemMessagePromptTemplate.from_template(SYSTEM_PROMPT), 131 | HumanMessagePromptTemplate.from_template(HUMAN_PROMPT), 132 | SystemMessagePromptTemplate.from_template(SCRATCHPAD_PROMPT), 133 | ] 134 | ) 135 | else: 136 | prompt = ChatPromptTemplate.from_messages( 137 | [ 138 | HumanMessagePromptTemplate.from_template( 139 | SYSTEM_PROMPT + "\nSo, here is my question:" + HUMAN_PROMPT 140 | ), 141 | AIMessagePromptTemplate.from_template(SCRATCHPAD_PROMPT), 142 | HumanMessage(content="Now give your next thoughts: "), 143 | ] 144 | ) 145 | 146 | prompt = prompt.partial( 147 | tool_description_with_args=render_text_description_and_args(tools), 148 | tool_names=", ".join([t.name for t in tools]), 149 | ) 150 | 151 | # define the agent 152 | chat_model_with_stop = chat_model.bind(stop=["\nObservation"]) 153 | agent = ( 154 | { 155 | "input": lambda x: x["input"], 156 | "agent_scratchpad": lambda x: format_log_to_str(x["intermediate_steps"]), 157 | } 158 | | prompt 159 | | chat_model_with_stop 160 | | ReActJsonSingleInputOutputParser() 161 | ) 162 | 163 | return AgentExecutor( 164 | agent=agent, 165 | tools=tools, 166 | verbose=True, 167 | return_intermediate_steps=True, 168 | handle_parsing_errors=True, 169 | max_iterations=5, 170 | ) 171 | 172 | 173 | def check_supports_system_prompt(chat_model): 174 | """ 175 | Checks if the given chat model supports system prompts. 176 | 177 | Args: 178 | chat_model: The chat model to be checked. 179 | 180 | Returns: 181 | True if the chat model supports system prompts, False otherwise. 
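
    Example:
        A minimal sketch (not from the original repo); assumes `llm` is an already-instantiated
        HuggingFaceEndpoint.

        .. code-block:: python

            chat_model = ChatHuggingFace(llm=llm)
            if check_supports_system_prompt(chat_model):
                print("Using a dedicated system message in the agent prompt.")
            else:
                print("Folding the system prompt into the first human message.")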
    """
    messages = ChatPromptTemplate.from_messages(
        [
            SystemMessagePromptTemplate.from_template(SYSTEM_PROMPT),
            HumanMessagePromptTemplate.from_template(HUMAN_PROMPT),
            SystemMessagePromptTemplate.from_template(SCRATCHPAD_PROMPT),
        ]
    )
    try:
        chat_model._to_chat_prompt(messages)
        print("System prompt supported")
        return True
    except Exception as e:
        print(e)
        print("System prompt not supported")
        return False
--------------------------------------------------------------------------------
/scripts/evaluation.py:
--------------------------------------------------------------------------------
import os
import re
import json
from langchain.llms import HuggingFaceEndpoint
from langchain.prompts.chat import ChatPromptTemplate
import pandas as pd
import asyncio
from typing import Optional, List
import tqdm.asyncio
import numpy as np
from threading import Thread
from queue import Queue
import datasets

_SENTINEL_KILL_CONSUMERS = object()


def build_evaluator(hf_endpoint_url: str) -> HuggingFaceEndpoint:
    """
    Build an evaluator language model using the given Hugging Face endpoint URL.

    Args:
        hf_endpoint_url (str): The URL of the Hugging Face endpoint.

    Returns:
        HuggingFaceEndpoint: The evaluator language model client.
    """
    eval_chat_model = HuggingFaceEndpoint(
        endpoint_url=hf_endpoint_url,
        task="text-generation",
        model_kwargs={
            "max_new_tokens": 488,
            "do_sample": False,
            "repetition_penalty": 1.03,
        },
    )
    return eval_chat_model


async def evaluate_single_example(
    example: dict, evaluator, eval_prompt_template, evaluator_name, eval_split_string="[RESULT]", writer_queue: Optional[Queue] = None
):
    if f"eval_score_{evaluator_name}" in example:
        try:
            el = float(example[f"eval_score_{evaluator_name}"])
            assert not np.isnan(el)
            return example
        except:
            pass
    eval_prompt = eval_prompt_template.format_messages(
        instruction=example["question"],
        response=example["prediction"],
        reference_answer=example["gt_answer"],
    )
    print("Evaluating example")
    eval_result = await evaluator.ainvoke(eval_prompt)
    eval_result = eval_result.content
    try:
        feedback, score = [item.strip() for item in eval_result.split(eval_split_string)]
    except:
        print(eval_result)
        segments = [
            segment.strip() for segment in eval_result.split(eval_split_string) if segment.strip()
        ]
        # Fall back to a null score so a malformed judge output does not crash the whole run
        feedback, score = "", None
        # Search for a segment that contains a numerical score
        for segment in segments:
            if segment.isdigit():
                feedback = ""
                score = int(segment)
    example[f"eval_score_{evaluator_name}"] = score
    example[f"eval_feedback_{evaluator_name}"] = feedback
    if writer_queue:
        writer_queue.put(example)
    return example


async def evaluate_answers(
    examples: List,
    evaluator,
    evaluator_name: str,
    eval_prompt_template: ChatPromptTemplate,
    eval_split_string: str = "[RESULT]",
    output_file_path: Optional[str] = None,
) -> List:
    """
    Evaluate the given answered examples with an LLM judge, scoring each prediction against its reference answer.
    Uses safe writing in multithreading, from options suggested here:
    https://stackoverflow.com/questions/33107019/multiple-threads-writing-to-the-same-csv-in-python

    Args:
        examples (List): The answered examples to evaluate.
        evaluator: The judge LLM used to score each answer.
        evaluator_name (str): Name of the judge, used to prefix the evaluation columns.
        eval_prompt_template (ChatPromptTemplate): Prompt template for the judge.
        eval_split_string (str): Separator between the feedback and the score in the judge output.
        output_file_path (str): Path of a JSON-lines file used to cache evaluations across runs.

    Returns:
        List: The annotated examples, each with an evaluation score and feedback.
    """
    if output_file_path and os.path.isfile(output_file_path):
        previous_evaluations = pd.read_json(output_file_path, lines=True)
        if f"eval_score_{evaluator_name}" in previous_evaluations.columns:
            previous_evaluations = previous_evaluations.loc[previous_evaluations[f"eval_score_{evaluator_name}"].notna()]
        print(f"Found {len(previous_evaluations)} previous evaluations.")

        examples = [example for example in examples if not len(previous_evaluations.loc[
            (previous_evaluations["question"] == example["question"]) & (previous_evaluations["agent_name"] == example["agent_name"])
        ]) > 0]

    print(f"Launching evaluation for {len(examples)} examples...")

    writer_queue = Queue()

    with open(output_file_path, "a") as output_file:
        def write_line():
            while True:
                # a blocking get avoids busy-waiting on an empty queue
                annotated_example = writer_queue.get()

                if annotated_example is _SENTINEL_KILL_CONSUMERS:
                    writer_queue.put(_SENTINEL_KILL_CONSUMERS)  # put it back so that other consumers see it
                    return

                annotated_example = {k: str(v) for k, v in annotated_example.items()}

                # Row comes out of writer_queue; JSON writing goes here
                json.dump(annotated_example, output_file)
                output_file.write('\n')

        consumer = Thread(target=write_line)
        consumer.daemon = True
        consumer.start()

        tasks = [
            evaluate_single_example(
                example,
                evaluator,
                eval_prompt_template,
                evaluator_name,
                eval_split_string,
                writer_queue,
            )
            for example in examples
        ]

        evaluation_results = [await f for f in tqdm.tqdm(asyncio.as_completed(tasks), total=len(tasks))]
        writer_queue.put(_SENTINEL_KILL_CONSUMERS)

    return evaluation_results


def extract_number(string):
    try:
        found_strings = [el.strip() for el in re.findall(r"(?:[,\d]+.?\d*)", string)]

        found_strings = [
            "".join(ch for ch in el if (ch.isalnum() or ch == "."))
            for el in found_strings
            if el[0].isdigit() or el[0] == "."
157 | ] 158 | found_strings = [el for el in found_strings if len(el) > 0] 159 | 160 | found_string = found_strings[-1] 161 | return float(found_string) 162 | except Exception as e: 163 | print("Error when extracting string:", e) 164 | return 0 165 | 166 | 167 | def split_answer(row): 168 | splitted = row["answer"].split("####") 169 | row["true_reasoning"] = splitted[0] 170 | row["true_answer"] = float(splitted[1].strip()) 171 | return row 172 | 173 | 174 | def load_math_datasets(): 175 | math_dataset = ( 176 | datasets.load_dataset("gsm8k", "main")["train"].shuffle(seed=42).select(range(100)) 177 | ) 178 | math_dataset = pd.DataFrame(math_dataset) 179 | 180 | math_dataset = math_dataset.apply(split_answer, axis=1) 181 | math_dataset = math_dataset.drop(columns=["answer"]) 182 | math_dataset = datasets.Dataset.from_pandas(math_dataset) 183 | 184 | eval_dataset = math_dataset.select(range(30)) 185 | fewshot_dataset = math_dataset.select(range(10)) 186 | return eval_dataset, fewshot_dataset -------------------------------------------------------------------------------- /create_evaluation_dataset.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from datasets import Dataset, load_dataset, concatenate_datasets\n", 10 | "import pandas as pd\n", 11 | "import numpy as np" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "### HotpotQA" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "hotpotqa_dataset = load_dataset(\"hotpot_qa\", \"distractor\")\n", 28 | "\n", 29 | "# let's sample a few examples from each level (of difficulty) and type (comparion or bridge)\n", 30 | "dataset_df = pd.DataFrame(hotpotqa_dataset[\"train\"])\n", 31 | "sample_indicies = (\n", 32 | " dataset_df.groupby([\"level\", \"type\"]).sample(4, random_state=10).index.values\n", 33 | ")\n", 34 | "hotpotqa_dataset.reset_format()\n", 35 | "hotpotqa_dataset_leftout = hotpotqa_dataset[\"train\"].select(\n", 36 | " [i for i in range(len(hotpotqa_dataset[\"train\"])) if i not in sample_indicies]\n", 37 | ")\n", 38 | "hotpotqa_dataset = hotpotqa_dataset[\"train\"].select(sample_indicies)\n", 39 | "\n", 40 | "hotpotqa_dataset_leftout_df = pd.DataFrame(hotpotqa_dataset_leftout)\n", 41 | "hotpotqa_dataset_leftout_df = (\n", 42 | " hotpotqa_dataset_leftout_df.groupby([\"level\", \"type\"])\n", 43 | " .sample(6, random_state=42)\n", 44 | " .reset_index(drop=True)\n", 45 | ")\n", 46 | "hotpotqa_dataset_leftout = Dataset.from_pandas(hotpotqa_dataset_leftout_df)\n", 47 | "\n", 48 | "task_column = [f\"HotpotQA-{level}\" for level in hotpotqa_dataset[\"level\"]]\n", 49 | "hotpotqa_dataset = hotpotqa_dataset.add_column(\"task\", task_column).select_columns(\n", 50 | " [\"question\", \"answer\", \"task\"]\n", 51 | ")\n", 52 | "\n", 53 | "task_column = [f\"HotpotQA-{level}\" for level in hotpotqa_dataset_leftout[\"level\"]]\n", 54 | "hotpotqa_dataset_leftout = hotpotqa_dataset_leftout.add_column(\n", 55 | " \"task\", task_column\n", 56 | ").select_columns([\"question\", \"answer\", \"task\"])" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "hotpotqa_dataset = concatenate_datasets([hotpotqa_dataset, hotpotqa_dataset_leftout])\n", 66 | 
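    "# sanity check: the two counts printed below should match, i.e. no question was sampled twice\n",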
"print(len(hotpotqa_dataset), len(pd.Series(hotpotqa_dataset[\"question\"]).unique()))" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": {}, 72 | "source": [ 73 | "### GSM8K" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "np.random.seed(42)\n", 83 | "\n", 84 | "math_dataset = load_dataset(\"gsm8k\", \"main\")[\"train\"]\n", 85 | "\n", 86 | "first_selection = np.random.randint(0, len(math_dataset), 15)\n", 87 | "second_selection = np.random.randint(0, len(math_dataset), 15)\n", 88 | "second_selection_first_excluded = [\n", 89 | " i for i in second_selection if i not in first_selection\n", 90 | "][:20]\n", 91 | "\n", 92 | "math_dataset = math_dataset.select(\n", 93 | " list(first_selection) + list(second_selection_first_excluded)[:5]\n", 94 | ")\n", 95 | "\n", 96 | "\n", 97 | "task_column = [\"GSM8K\"] * len(math_dataset)\n", 98 | "math_dataset = math_dataset.add_column(\"task\", task_column).select_columns(\n", 99 | " [\"question\", \"answer\", \"task\"]\n", 100 | ")" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": {}, 106 | "source": [ 107 | "### GAIA" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "gaia_dataset = load_dataset(\"gaia-benchmark/GAIA\", \"2023_level1\")[\"validation\"]\n", 117 | "gaia_dataset.set_format(\"pandas\")\n", 118 | "gaia_dataset_df = gaia_dataset[:]\n", 119 | "gaia_dataset_df[\"number_of_steps\"] = gaia_dataset_df[\"Annotator Metadata\"].apply(\n", 120 | " lambda row: int(row[\"Number of steps\"])\n", 121 | ")\n", 122 | "gaia_dataset_df[\"tools_used\"] = gaia_dataset_df[\"Annotator Metadata\"].apply(\n", 123 | " lambda row: row[\"Tools\"]\n", 124 | ")\n", 125 | "gaia_dataset_df = gaia_dataset_df.loc[\n", 126 | " ~gaia_dataset_df[\"tools_used\"]\n", 127 | " .str.lower()\n", 128 | " .str.contains(\n", 129 | " \"pdf|excel|image|video|parsing|audio|word|file|speech|viewer|markdown|python|editor|model\"\n", 130 | " )\n", 131 | "]" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "metadata": {}, 138 | "outputs": [], 139 | "source": [ 140 | "pd.set_option(\"display.max_colwidth\", None)\n", 141 | "gaia_dataset_df" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [ 150 | "selected_indicies = [\n", 151 | " 0,\n", 152 | " 2,\n", 153 | " 11,\n", 154 | " 12,\n", 155 | " 23,\n", 156 | " 28,\n", 157 | " 25,\n", 158 | " 29,\n", 159 | " 32,\n", 160 | " 35,\n", 161 | " 36,\n", 162 | " 37,\n", 163 | " 39,\n", 164 | " 40,\n", 165 | " 41,\n", 166 | " 42,\n", 167 | " 43,\n", 168 | " 47,\n", 169 | " 49,\n", 170 | " 52,\n", 171 | "]\n", 172 | "print(len(selected_indicies))\n", 173 | "gaia_dataset = gaia_dataset.rename_columns(\n", 174 | " {\"Question\": \"question\", \"Final answer\": \"answer\"}\n", 175 | ").select_columns([\"question\", \"answer\"])\n", 176 | "gaia_dataset.reset_format()\n", 177 | "gaia_dataset = gaia_dataset.select(selected_indicies)\n", 178 | "\n", 179 | "task_column = [\"GAIA\"] * len(gaia_dataset)\n", 180 | "gaia_dataset = gaia_dataset.add_column(\"task\", task_column)" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": null, 186 | "metadata": {}, 187 | "outputs": [], 188 | "source": [ 189 | "full_eval_dataset = concatenate_datasets([math_dataset, hotpotqa_dataset, 
gaia_dataset])\n", 190 | "pd.Series(full_eval_dataset[\"task\"]).value_counts()" 191 | ] 192 | }, 193 | { 194 | "cell_type": "markdown", 195 | "metadata": {}, 196 | "source": [ 197 | "### Export" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": null, 203 | "metadata": {}, 204 | "outputs": [], 205 | "source": [ 206 | "full_eval_dataset.push_to_hub(\"m-ric/agents_small_benchmark\")" 207 | ] 208 | } 209 | ], 210 | "metadata": { 211 | "kernelspec": { 212 | "display_name": "ml2", 213 | "language": "python", 214 | "name": "ml2" 215 | }, 216 | "language_info": { 217 | "codemirror_mode": { 218 | "name": "ipython", 219 | "version": 3 220 | }, 221 | "file_extension": ".py", 222 | "mimetype": "text/x-python", 223 | "name": "python", 224 | "nbconvert_exporter": "python", 225 | "pygments_lexer": "ipython3", 226 | "version": "3.10.9" 227 | } 228 | }, 229 | "nbformat": 4, 230 | "nbformat_minor": 2 231 | } 232 | -------------------------------------------------------------------------------- /scripts/modified_calculator.py: -------------------------------------------------------------------------------- 1 | """Chain that interprets a prompt and executes python code to do math.""" 2 | 3 | import math 4 | import re 5 | import warnings 6 | from typing import Any, Dict, List, Optional 7 | 8 | from langchain_core.callbacks import ( 9 | AsyncCallbackManagerForChainRun, 10 | CallbackManagerForChainRun, 11 | ) 12 | from langchain_core.language_models import BaseLanguageModel 13 | from langchain_core.prompts import BasePromptTemplate 14 | from langchain_core.pydantic_v1 import Extra, root_validator 15 | 16 | from langchain.chains.base import Chain 17 | from langchain.chains.llm import LLMChain 18 | from langchain.chains.llm_math.prompt import PROMPT 19 | 20 | 21 | class LLMMathChain(Chain): 22 | """Chain that interprets a prompt and executes python code to do math. 23 | 24 | Example: 25 | .. code-block:: python 26 | 27 | from langchain.chains import LLMMathChain 28 | from langchain_community.llms import OpenAI 29 | llm_math = LLMMathChain.from_llm(OpenAI()) 30 | """ 31 | 32 | llm_chain: LLMChain 33 | llm: Optional[BaseLanguageModel] = None 34 | """[Deprecated] LLM wrapper to use.""" 35 | prompt: BasePromptTemplate = PROMPT 36 | """[Deprecated] Prompt to use to translate to python if necessary.""" 37 | input_key: str = "question" #: :meta private: 38 | output_key: str = "answer" #: :meta private: 39 | 40 | class Config: 41 | """Configuration for this pydantic object.""" 42 | 43 | extra = Extra.forbid 44 | arbitrary_types_allowed = True 45 | 46 | @root_validator(pre=True) 47 | def raise_deprecation(cls, values: Dict) -> Dict: 48 | try: 49 | import numexpr # noqa: F401 50 | except ImportError: 51 | raise ImportError( 52 | "LLMMathChain requires the numexpr package. " 53 | "Please install it with `pip install numexpr`." 54 | ) 55 | if "llm" in values: 56 | warnings.warn( 57 | "Directly instantiating an LLMMathChain with an llm is deprecated. " 58 | "Please instantiate with llm_chain argument or using the from_llm " 59 | "class method." 60 | ) 61 | if "llm_chain" not in values and values["llm"] is not None: 62 | prompt = values.get("prompt", PROMPT) 63 | values["llm_chain"] = LLMChain(llm=values["llm"], prompt=prompt) 64 | return values 65 | 66 | @property 67 | def input_keys(self) -> List[str]: 68 | """Expect input key. 69 | 70 | :meta private: 71 | """ 72 | return [self.input_key] 73 | 74 | @property 75 | def output_keys(self) -> List[str]: 76 | """Expect output key. 

        :meta private:
        """
        return [self.output_key]

    def _evaluate_expression(self, expression: str) -> str:
        import numexpr  # noqa: F401

        try:
            local_dict = {"pi": math.pi, "e": math.e}
            output = str(
                numexpr.evaluate(
                    expression.strip(),
                    global_dict={},  # restrict access to globals
                    local_dict=local_dict,  # add common mathematical constants
                )
            )
        except Exception as e:
            print(
                f'LLMMathChain._evaluate("{expression}") raised error: {e}.'
                " Please try again with a valid numerical expression"
            )
            output = f"Error when evaluating {expression}: {e}"

        # Remove any leading and trailing brackets from the output
        return re.sub(r"^\[|\]$", "", output)

    def _process_llm_result(
        self, llm_output: str, run_manager: CallbackManagerForChainRun
    ) -> Dict[str, str]:
        run_manager.on_text(llm_output, color="green", verbose=self.verbose)
        llm_output = llm_output.strip()
        text_match = re.search(r"^```text(.*?)```", llm_output, re.DOTALL)
        if text_match:
            expression = text_match.group(1)
            output = self._evaluate_expression(expression)
            run_manager.on_text("\nAnswer: ", verbose=self.verbose)
            run_manager.on_text(output, color="yellow", verbose=self.verbose)
            answer = output
        elif llm_output.startswith("Answer:"):
            answer = llm_output.split("Answer:")[-1]
        elif "Answer:" in llm_output:
            answer = llm_output.split("Answer:")[-1]
        else:
            return {self.output_key: f"Not a valid expression: could not process the output {llm_output}"}
        return {self.output_key: answer}

    async def _aprocess_llm_result(
        self,
        llm_output: str,
        run_manager: AsyncCallbackManagerForChainRun,
    ) -> Dict[str, str]:
        await run_manager.on_text(llm_output, color="green", verbose=self.verbose)
        llm_output = llm_output.strip()
        text_match = re.search(r"^```text(.*?)```", llm_output, re.DOTALL)
        if text_match:
            expression = text_match.group(1)
            output = self._evaluate_expression(expression)
            await run_manager.on_text("\nAnswer: ", verbose=self.verbose)
            await run_manager.on_text(output, color="yellow", verbose=self.verbose)
            answer = output
        elif llm_output.startswith("Answer:"):
            answer = llm_output.split("Answer:")[-1]
        elif "Answer:" in llm_output:
            answer = llm_output.split("Answer:")[-1]
        else:
            return {self.output_key: f"Not a valid expression: could not process the output {llm_output}"}
        return {self.output_key: answer}

    def _call(
        self,
        inputs: Dict[str, str],
        run_manager: Optional[CallbackManagerForChainRun] = None,
    ) -> Dict[str, str]:
        _run_manager = run_manager or CallbackManagerForChainRun.get_noop_manager()
        _run_manager.on_text(inputs[self.input_key])
        llm_output = self.llm_chain.predict(
            question=inputs[self.input_key],
            stop=["```output"],
            callbacks=_run_manager.get_child(),
        )
        return self._process_llm_result(llm_output, _run_manager)

    async def _acall(
        self,
        inputs: Dict[str, str],
        run_manager: Optional[AsyncCallbackManagerForChainRun] = None,
    ) -> Dict[str, str]:
        _run_manager = run_manager or AsyncCallbackManagerForChainRun.get_noop_manager()
        await _run_manager.on_text(inputs[self.input_key])
        llm_output = await self.llm_chain.apredict(
            question=inputs[self.input_key],
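            # stop generation before the model writes its own "```output" block: the expression is evaluated with numexpr instead of being guessed by the LLM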
stop=["```output"], 170 | callbacks=_run_manager.get_child(), 171 | ) 172 | return await self._aprocess_llm_result(llm_output, _run_manager) 173 | 174 | @property 175 | def _chain_type(self) -> str: 176 | return "llm_math_chain" 177 | 178 | @classmethod 179 | def from_llm( 180 | cls, 181 | llm: BaseLanguageModel, 182 | prompt: BasePromptTemplate = PROMPT, 183 | **kwargs: Any, 184 | ): 185 | llm_chain = LLMChain(llm=llm, prompt=prompt) 186 | return cls(llm_chain=llm_chain, **kwargs) 187 | -------------------------------------------------------------------------------- /scripts/run_agents.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from datetime import datetime 3 | from typing import Any, Dict, List, Callable 4 | import json 5 | import pandas as pd 6 | from tqdm import tqdm 7 | from datasets import Dataset 8 | 9 | from langchain.agents import AgentExecutor 10 | from langchain.tools.base import ToolException 11 | 12 | 13 | def acall_langchain_agent(agent: AgentExecutor, question: str) -> str: 14 | return agent.ainvoke({"input": question}) 15 | 16 | def call_langchain_agent(agent: AgentExecutor, question: str) -> str: 17 | return agent.invoke({"input": question}) 18 | 19 | async def arun_agent( 20 | question: str, 21 | agent_executor: AgentExecutor, 22 | agent_name: str, 23 | agent_call_function: Callable, 24 | ) -> dict: 25 | """ 26 | Runs the execution process for a given question and ground truth answer. 27 | 28 | Args: 29 | question (str): The input question to be evaluated. 30 | agent_executor (AgentExecutor): The agent executor object used to run the agent. 31 | agent_name (str): The name of the agent model. 32 | 33 | Returns: 34 | dict: A dictionary containing the evaluation results, including the agent model ID, evaluator model ID, 35 | question, ground truth answer, prediction, intermediate steps, evaluation score, evaluation feedback, 36 | tool call parsing error flag, iteration limit exceeded flag, and agent error (if any). 37 | """ 38 | start_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S") 39 | try: 40 | # run executor agent 41 | response = await agent_call_function(agent_executor, question) 42 | 43 | # check for parsing errors which indicate the LLM failed to follow the ReACT format 44 | # this could be due to an issue with the tool calling format or ReACT formatting (i.e. Thought, Action, Observation, etc.) 45 | parsing_error = ( 46 | True 47 | if any( 48 | [ 49 | "Could not parse LLM output" in step[0].log 50 | for step in response["intermediate_steps"] 51 | ] 52 | ) 53 | else False 54 | ) 55 | 56 | # check if iteration limit exceeded 57 | iteration_limit_exceeded = ( 58 | True 59 | if "Agent stopped due to iteration limit or time limit." 
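            # this is the default output AgentExecutor returns when it stops early (max_iterations or time limit reached)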
in response["output"] 60 | else False 61 | ) 62 | raised_exception = False 63 | 64 | except (ValueError, ToolException) as e: 65 | print("Error on ", question, e) 66 | response = {"output": None, "intermediate_steps": None} 67 | parsing_error = False 68 | iteration_limit_exceeded = False 69 | exception = e 70 | raised_exception = True 71 | 72 | end_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S") 73 | # collect results 74 | if response["intermediate_steps"] is not None: 75 | intermediate_steps = [ 76 | { 77 | "tool": response[0].tool, 78 | "tool_input": response[0].tool_input, 79 | "tool_output": response[1], 80 | } 81 | for response in response["intermediate_steps"] 82 | ] 83 | else: 84 | intermediate_steps = None 85 | return { 86 | "agent_name": agent_name, 87 | "question": question, 88 | "prediction": response["output"], 89 | "intermediate_steps": intermediate_steps, 90 | "parsing_error": parsing_error, 91 | "iteration_limit_exceeded": iteration_limit_exceeded, 92 | "agent_error": repr(exception) if raised_exception else None, 93 | "start_time": start_time, 94 | "end_time": end_time, 95 | } 96 | 97 | 98 | 99 | def run_agent( 100 | question: str, 101 | agent_executor: AgentExecutor, 102 | agent_name: str, 103 | agent_call_function: Callable, 104 | ) -> dict: 105 | """ 106 | Runs the execution process for a given question and ground truth answer. 107 | 108 | Args: 109 | question (str): The input question to be evaluated. 110 | agent_executor (AgentExecutor): The agent executor object used to run the agent. 111 | agent_name (str): The name of the agent model. 112 | 113 | Returns: 114 | dict: A dictionary containing the evaluation results, including the agent model ID, evaluator model ID, 115 | question, ground truth answer, prediction, intermediate steps, evaluation score, evaluation feedback, 116 | tool call parsing error flag, iteration limit exceeded flag, and agent error (if any). 117 | """ 118 | start_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S") 119 | try: 120 | # run executor agent 121 | response = agent_call_function(agent_executor, question) 122 | 123 | # check for parsing errors which indicate the LLM failed to follow the ReACT format 124 | # this could be due to an issue with the tool calling format or ReACT formatting (i.e. Thought, Action, Observation, etc.) 125 | parsing_error = ( 126 | True 127 | if any( 128 | [ 129 | "Could not parse LLM output" in step[0].log 130 | for step in response["intermediate_steps"] 131 | ] 132 | ) 133 | else False 134 | ) 135 | 136 | # check if iteration limit exceeded 137 | iteration_limit_exceeded = ( 138 | True 139 | if "Agent stopped due to iteration limit or time limit." 
in response["output"] 140 | else False 141 | ) 142 | raised_exception = False 143 | 144 | except Exception as e: 145 | response = {"output": None, "intermediate_steps": None} 146 | parsing_error = False 147 | iteration_limit_exceeded = False 148 | exception = e 149 | raised_exception = True 150 | 151 | end_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S") 152 | # collect results 153 | if response["intermediate_steps"] is not None: 154 | intermediate_steps = [ 155 | { 156 | "tool": response[0].tool, 157 | "tool_input": response[0].tool_input, 158 | "tool_output": response[1], 159 | } 160 | for response in response["intermediate_steps"] 161 | ] 162 | else: 163 | intermediate_steps = None 164 | return { 165 | "agent_name": agent_name, 166 | "question": question, 167 | "prediction": response["output"], 168 | "intermediate_steps": intermediate_steps, 169 | "parsing_error": parsing_error, 170 | "iteration_limit_exceeded": iteration_limit_exceeded, 171 | "agent_error": repr(exception) if raised_exception else None, 172 | "start_time": start_time, 173 | "end_time": end_time, 174 | } 175 | 176 | 177 | async def answer_questions( 178 | dataset: Dataset, 179 | agent_executor: AgentExecutor, 180 | agent_name: str, 181 | output_folder: str = "output", 182 | agent_call_function: Callable = call_langchain_agent, 183 | key_for_answer: str = "answer", 184 | ) -> List[Dict[str, Any]]: 185 | """ 186 | Evaluates the agent on a given dataset. 187 | 188 | Args: 189 | dataset (Dataset): The dataset to test the agent on. 190 | agent_executor (AgentExecutor): The agent executor object used to run the agent. 191 | agent_name (str): The name of the agent model. 192 | 193 | Returns: 194 | List[Dict[str, Any]]: A list of dictionaries containing the evaluation results for each example in the dataset. 195 | Each dictionary includes the agent model ID, evaluator model ID, question, ground truth answer, prediction, 196 | intermediate steps, evaluation score, evaluation feedback, tool call parsing error flag, iteration limit 197 | exceeded flag, agent error (if any), and example metadata (task). 198 | """ 199 | try: 200 | with open(f"{output_folder}/{agent_name}.json", "r") as f: 201 | results = json.load(f) 202 | except FileNotFoundError: 203 | results = [] 204 | 205 | results_df = pd.DataFrame(results) 206 | 207 | for i, example in tqdm(enumerate(dataset), total=len(dataset)): 208 | if len(results_df) > 0: 209 | if example["question"] in results_df["question"].unique(): 210 | continue 211 | 212 | # run agent 213 | result = await arun_agent( 214 | question=example["question"], 215 | agent_executor=agent_executor, 216 | agent_name=agent_name, 217 | agent_call_function=agent_call_function, 218 | ) 219 | 220 | # add in example metadata 221 | result.update( 222 | { 223 | "gt_answer": example[key_for_answer], 224 | "task": example["task"], 225 | } 226 | ) 227 | results.append(result) 228 | 229 | with open(f"{output_folder}/{agent_name}.json", "w") as f: 230 | json.dump(results, f) 231 | return results 232 | 233 | 234 | async def run_full_tests( 235 | dataset: Dataset, 236 | agents: Dict[str, AgentExecutor], 237 | ) -> pd.DataFrame: 238 | """ 239 | Run a full evaluation on the given dataset using multiple agent models. 240 | 241 | Args: 242 | dataset (Dataset): The dataset to test on. 243 | agents (Dict[str, AgentExecutor]): A dictionary of agent executors to test on the dataset 244 | 245 | Returns: 246 | pd.DataFrame: The evaluation results as a pandas DataFrame. 
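
    Example:
        A minimal sketch (not from the original repo): the endpoint URL is a placeholder, and an
        `output/` folder is assumed to exist for the per-agent result files.

        .. code-block:: python

            from datasets import load_dataset
            from scripts.agents import build_hf_agent_with_tools, build_openai_agent_with_tools

            dataset = load_dataset("m-ric/agents_small_benchmark", split="train")
            agents = {
                "zephyr-7b-beta": build_hf_agent_with_tools(
                    hf_endpoint_url="https://<your-endpoint>.us-east-1.aws.endpoints.huggingface.cloud"
                ),
                "GPT3.5": build_openai_agent_with_tools(model_id="gpt-3.5-turbo-1106"),
            }
            # `await` works at the top level of a notebook; in a script, wrap the call in asyncio.run(...)
            results_df = await run_full_tests(dataset=dataset, agents=agents)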
247 | """ 248 | results = [] 249 | 250 | tasks = [ 251 | answer_questions( 252 | dataset=dataset, 253 | agent_executor=agent_executor, 254 | agent_name=agent_name, 255 | agent_call_function=acall_langchain_agent, 256 | ) 257 | for agent_name, agent_executor in agents.items() 258 | ] 259 | 260 | results = await asyncio.gather(*tasks) 261 | 262 | return pd.DataFrame([element for sublist in results for element in sublist]) 263 | -------------------------------------------------------------------------------- /benchmark.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Open-source LLMs as Agents with `ChatHuggingFace`\n", 8 | "\n", 9 | "\n", 10 | "Open source LLMs are becoming viable general purpose agents.\n", 11 | "\n", 12 | "The goal of this notebook is to demonstrate how to make use of open-source LLMs as chat models to enable their usage and experimentation with agent-based workflows.\n", 13 | "\n", 14 | "We use [Hugging Face Inference Endpoints](https://huggingface.co/docs/inference-endpoints/index) with [LangChain's `ChatHuggingFace`]().\n", 15 | "\n", 16 | "In particular, we will:\n", 17 | "1. Utilize the [HuggingFaceEndpoint](https://github.com/langchain-ai/langchain/blob/master/libs/langchain/langchain/llms/huggingface_endpoint.py) (or [HuggingFaceTextGenInference](https://github.com/langchain-ai/langchain/blob/master/libs/langchain/langchain/llms/huggingface_text_gen_inference.py) or [HuggingFaceHub](https://github.com/langchain-ai/langchain/blob/master/libs/langchain/langchain/llms/huggingface_hub.py)) integration to call a [HF Inference Endpoint](https://huggingface.co/inference-endpoints) that's serving an LLM via [Text Generation Inference (TGI)](https://huggingface.co/docs/text-generation-inference/index)\n", 18 | "2. Utilize the `ChatHuggingFace` class that interfaces between LangChain's [Chat Messages](https://python.langchain.com/docs/modules/model_io/chat/#messages) and the hosted LLM by leveraging [Hugging Face's Chat Templates](https://huggingface.co/docs/transformers/chat_templating) to power a `ChatAgent` pipeline.\n", 19 | "4. Demonstrate how to use an open-source LLM in a zero-shot ReAct Agent workflow.\n", 20 | "5. Understand how several different LLMs perform as general purpose agents by running an asynchronous evaluation pipeline using LLM as the judge. 
\n", 21 | "\n", 22 | "\n", 23 | "> Note: To run this notebook, you'll need to have:\n", 24 | "> - an LLM deployed via a Hugging Face Inference Endpoint (the LLM must have a `chat_template` defined in its `tokenizer_config.json`)\n", 25 | "> - A Hugging Face Token with access to the deployed endpoint saved as an environment variable: `HUGGINGFACEHUB_API_TOKEN`\n", 26 | "> - A SerpAPI key saved as an environment variable: `SERPAPI_API_KEY`\n", 27 | "> - An OpenAI API key saved as an environment variable: `OPENAI_API_KEY`" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "## Setup" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 1, 40 | "metadata": {}, 41 | "outputs": [ 42 | { 43 | "name": "stdout", 44 | "output_type": "stream", 45 | "text": [ 46 | "\u001b[33mWARNING: Ignoring invalid distribution -ackaging (/opt/conda/envs/pytorch/lib/python3.10/site-packages)\u001b[0m\u001b[33m\n", 47 | "\u001b[0m\u001b[33mWARNING: Ignoring invalid distribution -ackaging (/opt/conda/envs/pytorch/lib/python3.10/site-packages)\u001b[0m\u001b[33m\n", 48 | "\u001b[0m" 49 | ] 50 | } 51 | ], 52 | "source": [ 53 | "!pip install -q --upgrade transformers langchain langchain_community text-generation python-dotenv numexpr datasets tqdm openai sentencepiece protobuf plotly kaleido" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 3, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "%load_ext autoreload\n", 63 | "%autoreload 2" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 2, 69 | "metadata": {}, 70 | "outputs": [ 71 | { 72 | "data": { 73 | "text/plain": [ 74 | "True" 75 | ] 76 | }, 77 | "execution_count": 2, 78 | "metadata": {}, 79 | "output_type": "execute_result" 80 | } 81 | ], 82 | "source": [ 83 | "from dotenv import load_dotenv\n", 84 | "import numpy as np\n", 85 | "import pandas as pd\n", 86 | "import glob\n", 87 | "import plotly.express as px\n", 88 | "from datasets import load_dataset\n", 89 | "\n", 90 | "from langchain.agents.format_scratchpad import format_log_to_str\n", 91 | "from langchain.agents import AgentExecutor\n", 92 | "from langchain.agents.output_parsers import (\n", 93 | " ReActJsonSingleInputOutputParser,\n", 94 | ")\n", 95 | "from langchain.prompts.chat import (\n", 96 | " ChatPromptTemplate,\n", 97 | " HumanMessagePromptTemplate,\n", 98 | ")\n", 99 | "from langchain.agents import load_tools\n", 100 | "from langchain.tools.render import render_text_description_and_args\n", 101 | "from langchain.chat_models import ChatOpenAI\n", 102 | "\n", 103 | "from scripts.prompts import SYSTEM_PROMPT, HUMAN_PROMPT, EVALUATION_PROMPT_TEMPLATE\n", 104 | "from scripts.evaluation import evaluate_answers\n", 105 | "from scripts.run_agents import run_full_tests\n", 106 | "from scripts.agents import build_hf_agent_with_tools, build_openai_agent_with_tools\n", 107 | "\n", 108 | "load_dotenv(override=True)" 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "metadata": {}, 114 | "source": [ 115 | "# 1. 
Creating an agent with Langchain\n", 116 | "\n", 117 | "### Instantiate an LLM" 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": {}, 123 | "source": [ 124 | "#### `HuggingFaceHub`" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "from langchain.llms.huggingface_hub import HuggingFaceHub\n", 134 | "from langchain.llms.huggingface_endpoint import HuggingFaceEndpoint\n", 135 | "\n", 136 | "llm = HuggingFaceHub(\n", 137 | " repo_id=\"HuggingFaceH4/zephyr-7b-beta\",\n", 138 | " task=\"text-generation\",\n", 139 | " model_kwargs={\n", 140 | " \"max_new_tokens\": 512,\n", 141 | " \"top_k\": 50,\n", 142 | " \"temperature\": 0.1,\n", 143 | " \"repetition_penalty\": 1.03,\n", 144 | " },\n", 145 | ")\n", 146 | "\n", 147 | "\n", 148 | "# llm = HuggingFaceEndpoint(\n", 149 | "# endpoint_url=\"https://ytjpei7t003tedav.us-east-1.aws.endpoints.huggingface.cloud\",\n", 150 | "# task=\"text-generation\",\n", 151 | "# model_kwargs={\n", 152 | "# \"max_new_tokens\": 512,\n", 153 | "# \"top_k\": 50,\n", 154 | "# \"temperature\": 0.1,\n", 155 | "# \"repetition_penalty\": 1.03,\n", 156 | "# },\n", 157 | "# )\n", 158 | "\n", 159 | "agent = build_hf_agent_with_tools(\n", 160 | " \"https://fayjubiy2xqn36z0.us-east-1.aws.endpoints.huggingface.cloud\"\n", 161 | ")" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 25, 167 | "metadata": {}, 168 | "outputs": [ 169 | { 170 | "data": { 171 | "text/plain": [ 172 | "'HuggingFaceH4/zephyr-7b-beta'" 173 | ] 174 | }, 175 | "execution_count": 25, 176 | "metadata": {}, 177 | "output_type": "execute_result" 178 | } 179 | ], 180 | "source": [ 181 | "from langchain_community.chat_models.huggingface import ChatHuggingFace\n", 182 | "\n", 183 | "llm = HuggingFaceEndpoint(\n", 184 | " endpoint_url=\"https://fayjubiy2xqn36z0.us-east-1.aws.endpoints.huggingface.cloud\",\n", 185 | " task=\"text-generation\",\n", 186 | ")\n", 187 | "chat_model = ChatHuggingFace(llm=llm)\n", 188 | "chat_model.model_id" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "metadata": {}, 195 | "outputs": [], 196 | "source": [ 197 | "agent([{\"user\": \"ok\"}])" 198 | ] 199 | }, 200 | { 201 | "cell_type": "markdown", 202 | "metadata": {}, 203 | "source": [ 204 | "## Create a wrapper for `BaseChatModel` to apply chat templates" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": 4, 210 | "metadata": {}, 211 | "outputs": [], 212 | "source": [ 213 | "from langchain.schema import HumanMessage\n", 214 | "from langchain_community.chat_models.huggingface import ChatHuggingFace" 215 | ] 216 | }, 217 | { 218 | "cell_type": "markdown", 219 | "metadata": {}, 220 | "source": [ 221 | "Instantiate the model and some messages to pass." 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": 21, 227 | "metadata": {}, 228 | "outputs": [ 229 | { 230 | "data": { 231 | "text/plain": [ 232 | "'HuggingFaceH4/zephyr-7b-beta'" 233 | ] 234 | }, 235 | "execution_count": 21, 236 | "metadata": {}, 237 | "output_type": "execute_result" 238 | } 239 | ], 240 | "source": [ 241 | "messages = [\n", 242 | " HumanMessage(\n", 243 | " content=\"You're a helpful assistant. 
What happens when an unstoppable force meets an immovable object?\"\n", 244 | " ),\n", 245 | "]\n", 246 | "\n", 247 | "chat_model = ChatHuggingFace(llm=llm)\n", 248 | "chat_model.model_id" 249 | ] 250 | }, 251 | { 252 | "cell_type": "markdown", 253 | "metadata": {}, 254 | "source": [ 255 | "Call the model." 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": 22, 261 | "metadata": {}, 262 | "outputs": [ 263 | { 264 | "name": "stdout", 265 | "output_type": "stream", 266 | "text": [ 267 | "\n", 268 | "\n", 269 | "Assistant: According to Newton's third law of motion, when an unstoppable force meets an immovable object, both remain unchanged in their respective states. This is because when two objects interact, they exert equal and opposite forces on each other, and neither object can change the other's state unless there is a net force acting on it. Therefore, an unstoppable force and an immovable object cannot both remain unchanged in their respective states\n" 270 | ] 271 | } 272 | ], 273 | "source": [ 274 | "res = llm.invoke(messages)\n", 275 | "print(res)" 276 | ] 277 | }, 278 | { 279 | "cell_type": "markdown", 280 | "metadata": {}, 281 | "source": [ 282 | "## Tests\n", 283 | "\n", 284 | "Here we'll test out our model as a zero-shot ReAct Agent.\n", 285 | "\n", 286 | "We set up the agent with a `react-json` style prompt and access to a search engine and calculator." 287 | ] 288 | }, 289 | { 290 | "cell_type": "markdown", 291 | "metadata": {}, 292 | "source": [ 293 | "### Define tools" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": 113, 299 | "metadata": {}, 300 | "outputs": [], 301 | "source": [ 302 | "TOOLS = load_tools([\"serpapi\", \"llm-math\"], llm=llm)" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": 115, 308 | "metadata": {}, 309 | "outputs": [], 310 | "source": [ 311 | "prompt = ChatPromptTemplate.from_messages(\n", 312 | " [\n", 313 | " HumanMessagePromptTemplate.from_template(\n", 314 | " SYSTEM_PROMPT + \"\\nSo, here is my question:\" + HUMAN_PROMPT\n", 315 | " ),\n", 316 | " ]\n", 317 | ")\n", 318 | "prompt = prompt.partial(\n", 319 | " tool_description_with_args=render_text_description_and_args(TOOLS),\n", 320 | " tool_names=\", \".join([t.name for t in TOOLS]),\n", 321 | ")\n", 322 | "\n", 323 | "\n", 324 | "# define the agent\n", 325 | "chat_model_with_stop = chat_model.bind(stop=[\"\\nObservation\"])\n", 326 | "agent = (\n", 327 | " {\n", 328 | " \"input\": lambda x: x[\"input\"],\n", 329 | " \"agent_scratchpad\": lambda x: format_log_to_str(x[\"intermediate_steps\"]),\n", 330 | " }\n", 331 | " | prompt\n", 332 | " | chat_model_with_stop\n", 333 | " | ReActJsonSingleInputOutputParser()\n", 334 | ")\n", 335 | "\n", 336 | "# instantiate AgentExecutor\n", 337 | "agent_executor = AgentExecutor(\n", 338 | " agent=agent,\n", 339 | " tools=TOOLS,\n", 340 | " verbose=True,\n", 341 | " return_intermediate_steps=True,\n", 342 | " handle_parsing_errors=True,\n", 343 | " max_iterations=5,\n", 344 | ")" 345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "execution_count": 118, 350 | "metadata": {}, 351 | "outputs": [ 352 | { 353 | "name": "stdout", 354 | "output_type": "stream", 355 | "text": [ 356 | "\n", 357 | "\n", 358 | "\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n", 359 | "\u001b[32;1m\u001b[1;3mThought: To calculate Leonardo DiCaprio's age raised to the power of 0.43, we first need to know his current age. 
After some research, I found out that Leonardo DiCaprio was born on November 11, 1974. Using the calculator tool, I can now perform the calculation.\n", 360 | "\n", 361 | "Action:\n", 362 | "```\n", 363 | "{\n", 364 | " \"action\": \"calculator\",\n", 365 | " \"action_input\": \"pow(46, 0.43)\"\n", 366 | "}\n", 367 | "```\n", 368 | "\u001b[0m\u001b[33;1m\u001b[1;3m5.187831569313503\u001b[0m\u001b[32;1m\u001b[1;3mThought: Based on the calculation, it seems that raising Leonardo DiCaprio's age to the power of 0.43 would result in a value less than 10. This could potentially indicate that his age, when raised to such a low power, becomes insignificant in comparison to other factors that may affect him. However, further analysis is required to draw any concrete conclusions.\n", 369 | "\n", 370 | "Thought: As an alternative approach, we could also consider the biological and physiological aspects of aging, which may provide more insight into the significance of Leonardo DiCaprio's age in different contexts. This could involve consulting medical and scientific literature, as well as analyzing his lifestyle and health habits.\n", 371 | "\n", 372 | "Thought: In any case, it's clear that the calculation we performed using the calculator tool provides us with a useful starting point for further exploration and analysis. By combining different tools and approaches, we can gain a more comprehensive understanding of the complex and multifaceted nature of aging, and how it affects individuals like Leonardo DiCaprio in various contexts.\n", 373 | "\n", 374 | "Final Answer: The calculation shows that raising Leonardo DiCaprio's age to the power of 0.43 results in a value of approximately 5.19. However, the significance of this value in different contexts requires further analysis and consideration of biological, physiological, and lifestyle factors.\u001b[0m\n", 375 | "\n", 376 | "\u001b[1m> Finished chain.\u001b[0m\n" 377 | ] 378 | } 379 | ], 380 | "source": [ 381 | "example = {\"input\": \"What is the age of Leonardo DiCaprio, raised to the power 0.43?\"}\n", 382 | "\n", 383 | "out = agent.invoke(example)" 384 | ] 385 | }, 386 | { 387 | "cell_type": "markdown", 388 | "metadata": {}, 389 | "source": [ 390 | "# 2. Benchmark agents\n", 391 | "\n", 392 | "### Evaluation dataset" 393 | ] 394 | }, 395 | { 396 | "cell_type": "code", 397 | "execution_count": 29, 398 | "metadata": {}, 399 | "outputs": [], 400 | "source": [ 401 | "full_eval_dataset = load_dataset(\"m-ric/agents_small_benchmark\", split=\"train\")" 402 | ] 403 | }, 404 | { 405 | "cell_type": "markdown", 406 | "metadata": {}, 407 | "source": [ 408 | "### 2.1. 
Run tests" 409 | ] 410 | }, 411 | { 412 | "cell_type": "code", 413 | "execution_count": null, 414 | "metadata": {}, 415 | "outputs": [], 416 | "source": [ 417 | "from huggingface_hub import notebook_login\n", 418 | "\n", 419 | "notebook_login()" 420 | ] 421 | }, 422 | { 423 | "cell_type": "code", 424 | "execution_count": null, 425 | "metadata": {}, 426 | "outputs": [], 427 | "source": [ 428 | "agent_endpoints = {\n", 429 | " # \"Zephyr-7b-beta\": \"https://ytjpei7t003tedav.us-east-1.aws.endpoints.huggingface.cloud\",\n", 430 | " # \"Mixtral-8x7B-Instruct-v0.1\": \"https://iw8z8uxlp03gvuxc.us-east-1.aws.endpoints.huggingface.cloud\",\n", 431 | " # \"OpenHermes-2.5-Mistral-7B\": \"https://ou70xe634aa21gsc.us-east-1.aws.endpoints.huggingface.cloud\"\n", 432 | " # \"SOLAR-10.7B\": \"https://tj7v24gjtvozke28.us-east-1.aws.endpoints.huggingface.cloud\",\n", 433 | " # 'Llama-2-70b-chat': 'https://xwabjkpbpoqtnd4a.us-east-1.aws.endpoints.huggingface.cloud',\n", 434 | " \"zephyr\": \"https://fayjubiy2xqn36z0.us-east-1.aws.endpoints.huggingface.cloud\"\n", 435 | "}\n", 436 | "\n", 437 | "agents = {\n", 438 | " name: build_hf_agent_with_tools(hf_endpoint_url=endpoint)\n", 439 | " for name, endpoint in agent_endpoints.items()\n", 440 | "}\n", 441 | "\n", 442 | "# uncomment below to test GPT as an agent\n", 443 | "# agents[\"GPT4\"] = build_openai_agent(model_id=\"gpt-4-1106-preview\")\n", 444 | "agents[\"GPT3.5-2\"] = build_openai_agent_with_tools(model_id=\"gpt-3.5-turbo-1106\")\n", 445 | "\n", 446 | "# run eval\n", 447 | "await run_full_tests(dataset=full_eval_dataset, agents=agents)\n", 448 | "print(\"Question answering is complete!\")" 449 | ] 450 | }, 451 | { 452 | "cell_type": "markdown", 453 | "metadata": {}, 454 | "source": [ 455 | "### 2.2. Evaluate with LLM-as-a-judge" 456 | ] 457 | }, 458 | { 459 | "cell_type": "code", 460 | "execution_count": 121, 461 | "metadata": {}, 462 | "outputs": [], 463 | "source": [ 464 | "eval_chat_model = ChatOpenAI(model=\"gpt-4-1106-preview\", temperature=0)\n", 465 | "eval_model_name = \"GPT4\"" 466 | ] 467 | }, 468 | { 469 | "cell_type": "code", 470 | "execution_count": 128, 471 | "metadata": {}, 472 | "outputs": [], 473 | "source": [ 474 | "result_df = pd.concat([pd.read_json(f) for f in glob.glob(\"output/*.json\")])\n", 475 | "answers = result_df.to_dict(orient=\"records\")" 476 | ] 477 | }, 478 | { 479 | "cell_type": "code", 480 | "execution_count": 129, 481 | "metadata": {}, 482 | "outputs": [ 483 | { 484 | "name": "stderr", 485 | "output_type": "stream", 486 | "text": [ 487 | "100%|██████████| 700/700 [00:00<00:00, 97694.50it/s]" 488 | ] 489 | }, 490 | { 491 | "name": "stdout", 492 | "output_type": "stream", 493 | "text": [ 494 | "Evaluation is complete!\n" 495 | ] 496 | }, 497 | { 498 | "name": "stderr", 499 | "output_type": "stream", 500 | "text": [ 501 | "\n" 502 | ] 503 | } 504 | ], 505 | "source": [ 506 | "from scripts.evaluation import evaluate_answers\n", 507 | "\n", 508 | "results = await evaluate_answers(\n", 509 | " answers, eval_chat_model, \"GPT4\", EVALUATION_PROMPT_TEMPLATE\n", 510 | ")\n", 511 | "result_df = pd.DataFrame.from_records(results)\n", 512 | "print(\"Evaluation is complete!\")" 513 | ] 514 | }, 515 | { 516 | "cell_type": "markdown", 517 | "metadata": {}, 518 | "source": [ 519 | "### 2.3. Visualize results" 520 | ] 521 | }, 522 | { 523 | "cell_type": "code", 524 | "execution_count": 130, 525 | "metadata": {}, 526 | "outputs": [ 527 | { 528 | "data": { 529 | "text/html": [ 530 | "
\n", 531 | "\n", 544 | "\n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | "
agent_name | task | parsing_error | iteration_limit_exceeded | no_prediction | has_agent_error
GPT3.5 | GAIA | 0 | 1 | 0 | 0
GPT3.5 | GSM8K | 0 | 0 | 0 | 0
GPT3.5 | HotpotQA | 0 | 0 | 0 | 0
GPT4 | GAIA | 0 | 2 | 1 | 1
GPT4 | GSM8K | 0 | 0 | 0 | 0
GPT4 | HotpotQA | 0 | 0 | 0 | 0
Llama-2-70b-chat | GAIA | 10 | 11 | 6 | 6
Llama-2-70b-chat | GSM8K | 7 | 6 | 4 | 4
Llama-2-70b-chat | HotpotQA | 12 | 15 | 5 | 5
Mixtral-8x7B-Instruct-v0.1 | GAIA | 2 | 6 | 3 | 3
Mixtral-8x7B-Instruct-v0.1 | GSM8K | 2 | 2 | 0 | 0
Mixtral-8x7B-Instruct-v0.1 | HotpotQA | 9 | 2 | 1 | 1
OpenHermes-2.5-Mistral-7B | GAIA | 2 | 11 | 1 | 1
OpenHermes-2.5-Mistral-7B | GSM8K | 3 | 1 | 0 | 0
OpenHermes-2.5-Mistral-7B | HotpotQA | 11 | 3 | 1 | 1
SOLAR-10.7B | GAIA | 9 | 8 | 3 | 3
SOLAR-10.7B | GSM8K | 8 | 0 | 1 | 1
SOLAR-10.7B | HotpotQA | 32 | 13 | 0 | 0
Zephyr-7b-beta | GAIA | 2 | 4 | 15 | 15
Zephyr-7b-beta | GSM8K | 4 | 0 | 1 | 1
Zephyr-7b-beta | HotpotQA | 26 | 25 | 2 | 2
\n", 720 | "
" 721 | ], 722 | "text/plain": [ 723 | " parsing_error iteration_limit_exceeded \\\n", 724 | "agent_name task \n", 725 | "GPT3.5 GAIA 0 1 \n", 726 | " GSM8K 0 0 \n", 727 | " HotpotQA 0 0 \n", 728 | "GPT4 GAIA 0 2 \n", 729 | " GSM8K 0 0 \n", 730 | " HotpotQA 0 0 \n", 731 | "Llama-2-70b-chat GAIA 10 11 \n", 732 | " GSM8K 7 6 \n", 733 | " HotpotQA 12 15 \n", 734 | "Mixtral-8x7B-Instruct-v0.1 GAIA 2 6 \n", 735 | " GSM8K 2 2 \n", 736 | " HotpotQA 9 2 \n", 737 | "OpenHermes-2.5-Mistral-7B GAIA 2 11 \n", 738 | " GSM8K 3 1 \n", 739 | " HotpotQA 11 3 \n", 740 | "SOLAR-10.7B GAIA 9 8 \n", 741 | " GSM8K 8 0 \n", 742 | " HotpotQA 32 13 \n", 743 | "Zephyr-7b-beta GAIA 2 4 \n", 744 | " GSM8K 4 0 \n", 745 | " HotpotQA 26 25 \n", 746 | "\n", 747 | " no_prediction has_agent_error \n", 748 | "agent_name task \n", 749 | "GPT3.5 GAIA 0 0 \n", 750 | " GSM8K 0 0 \n", 751 | " HotpotQA 0 0 \n", 752 | "GPT4 GAIA 1 1 \n", 753 | " GSM8K 0 0 \n", 754 | " HotpotQA 0 0 \n", 755 | "Llama-2-70b-chat GAIA 6 6 \n", 756 | " GSM8K 4 4 \n", 757 | " HotpotQA 5 5 \n", 758 | "Mixtral-8x7B-Instruct-v0.1 GAIA 3 3 \n", 759 | " GSM8K 0 0 \n", 760 | " HotpotQA 1 1 \n", 761 | "OpenHermes-2.5-Mistral-7B GAIA 1 1 \n", 762 | " GSM8K 0 0 \n", 763 | " HotpotQA 1 1 \n", 764 | "SOLAR-10.7B GAIA 3 3 \n", 765 | " GSM8K 1 1 \n", 766 | " HotpotQA 0 0 \n", 767 | "Zephyr-7b-beta GAIA 15 15 \n", 768 | " GSM8K 1 1 \n", 769 | " HotpotQA 2 2 " 770 | ] 771 | }, 772 | "execution_count": 130, 773 | "metadata": {}, 774 | "output_type": "execute_result" 775 | } 776 | ], 777 | "source": [ 778 | "result_df[\"no_prediction\"] = result_df[\"prediction\"].apply(\n", 779 | " lambda x: True if x is None else False\n", 780 | ")\n", 781 | "result_df[\"has_agent_error\"] = result_df[\"agent_error\"].apply(\n", 782 | " lambda x: True if isinstance(x, str) else False\n", 783 | ")\n", 784 | "\n", 785 | "\n", 786 | "def interpret_result(x):\n", 787 | " try:\n", 788 | " return int(x) - 1\n", 789 | " except:\n", 790 | " return 0\n", 791 | "\n", 792 | "\n", 793 | "result_df[\"eval_score_Mixtral\"] = result_df[\"eval_score_Mixtral\"].apply(\n", 794 | " interpret_result\n", 795 | ")\n", 796 | "\n", 797 | "# Override results with human evaluation if there is one\n", 798 | "result_df[\"eval_score_GPT4\"] = result_df.apply(\n", 799 | " lambda row: (\n", 800 | " row[\"eval_score_human\"]\n", 801 | " if not np.isnan(row[\"eval_score_human\"])\n", 802 | " else row[\"eval_score_GPT4\"]\n", 803 | " ),\n", 804 | " axis=1,\n", 805 | ")\n", 806 | "result_df[\"eval_score_GPT4\"] = result_df[\"eval_score_GPT4\"].apply(interpret_result)\n", 807 | "\n", 808 | "result_df[\"task\"] = result_df[\"task\"].apply(\n", 809 | " lambda x: (\"HotpotQA\" if \"HotpotQA\" in x else x)\n", 810 | ")\n", 811 | "result_df.groupby([\"agent_name\", \"task\"]).agg(\n", 812 | " {\n", 813 | " \"parsing_error\": \"sum\",\n", 814 | " \"iteration_limit_exceeded\": \"sum\",\n", 815 | " \"no_prediction\": \"sum\",\n", 816 | " \"has_agent_error\": \"sum\",\n", 817 | " }\n", 818 | ")" 819 | ] 820 | }, 821 | { 822 | "cell_type": "code", 823 | "execution_count": 131, 824 | "metadata": {}, 825 | "outputs": [ 826 | { 827 | "data": { 828 | "text/html": [ 829 | "
\n", 830 | "\n", 847 | "\n", 848 | " \n", 849 | " \n", 850 | " \n", 851 | " \n", 852 | " \n", 853 | " \n", 854 | " \n", 855 | " \n", 856 | " \n", 857 | " \n", 858 | " \n", 859 | " \n", 860 | " \n", 861 | " \n", 862 | " \n", 863 | " \n", 864 | " \n", 865 | " \n", 866 | " \n", 867 | " \n", 868 | " \n", 869 | " \n", 870 | " \n", 871 | " \n", 872 | " \n", 873 | " \n", 874 | " \n", 875 | " \n", 876 | " \n", 877 | " \n", 878 | " \n", 879 | " \n", 880 | " \n", 881 | " \n", 882 | " \n", 883 | " \n", 884 | " \n", 885 | " \n", 886 | " \n", 887 | " \n", 888 | " \n", 889 | " \n", 890 | " \n", 891 | " \n", 892 | " \n", 893 | " \n", 894 | " \n", 895 | " \n", 896 | " \n", 897 | " \n", 898 | " \n", 899 | " \n", 900 | " \n", 901 | " \n", 902 | " \n", 903 | " \n", 904 | " \n", 905 | " \n", 906 | " \n", 907 | " \n", 908 | " \n", 909 | " \n", 910 | "
eval_score_GPT4 (mean score per task, in %)
agent_name | HotpotQA | GSM8K | GAIA
OpenHermes-2.5-Mistral-7B | 65.833333 | 46.25 | 10.00
Llama-2-70b-chat | 49.583333 | 17.50 | 0.00
Zephyr-7b-beta | 36.666667 | 46.25 | 0.00
SOLAR-10.7B | 53.750000 | 73.75 | 7.50
GPT4 | 80.833333 | 95.00 | 40.00
GPT3.5 | 74.583333 | 62.50 | 16.25
Mixtral-8x7B-Instruct-v0.1 | 77.083333 | 72.50 | 16.25
\n", 911 | "
" 912 | ], 913 | "text/plain": [ 914 | " eval_score_GPT4 \n", 915 | "task HotpotQA GSM8K GAIA\n", 916 | "agent_name \n", 917 | "OpenHermes-2.5-Mistral-7B 65.833333 46.25 10.00\n", 918 | "Llama-2-70b-chat 49.583333 17.50 0.00\n", 919 | "Zephyr-7b-beta 36.666667 46.25 0.00\n", 920 | "SOLAR-10.7B 53.750000 73.75 7.50\n", 921 | "GPT4 80.833333 95.00 40.00\n", 922 | "GPT3.5 74.583333 62.50 16.25\n", 923 | "Mixtral-8x7B-Instruct-v0.1 77.083333 72.50 16.25" 924 | ] 925 | }, 926 | "metadata": {}, 927 | "output_type": "display_data" 928 | } 929 | ], 930 | "source": [ 931 | "table_result = result_df.groupby([\"agent_name\", \"task\"], sort=False)[\n", 932 | " [\"eval_score_GPT4\"]\n", 933 | "].mean()\n", 934 | "table_result = table_result / 4 * 100 # set results in percentage\n", 935 | "display(table_result.unstack())\n", 936 | "table_result = table_result.reset_index()" 937 | ] 938 | }, 939 | { 940 | "cell_type": "code", 941 | "execution_count": 132, 942 | "metadata": {}, 943 | "outputs": [ 944 | { 945 | "data": { 946 | "application/vnd.plotly.v1+json": { 947 | "config": { 948 | "plotlyServerURL": "https://plot.ly" 949 | }, 950 | "data": [ 951 | { 952 | "alignmentgroup": "True", 953 | "hovertemplate": "Task=GAIA
Agent=%{x}
Score=%{y}", 954 | "legendgroup": "GAIA", 955 | "marker": { 956 | "color": "#3366CC", 957 | "pattern": { 958 | "shape": "" 959 | } 960 | }, 961 | "name": "GAIA", 962 | "offsetgroup": "GAIA", 963 | "orientation": "v", 964 | "showlegend": true, 965 | "textposition": "outside", 966 | "texttemplate": "%{y:.0f}", 967 | "type": "bar", 968 | "x": [ 969 | "GPT4", 970 | "Mixtral-8x7B-Instruct-v0.1", 971 | "GPT3.5", 972 | "SOLAR-10.7B", 973 | "OpenHermes-2.5-Mistral-7B", 974 | "Zephyr-7b-beta", 975 | "Llama-2-70b-chat" 976 | ], 977 | "xaxis": "x", 978 | "y": [ 979 | 40, 980 | 16.25, 981 | 16.25, 982 | 7.5, 983 | 10, 984 | 0, 985 | 0 986 | ], 987 | "yaxis": "y" 988 | }, 989 | { 990 | "alignmentgroup": "True", 991 | "hovertemplate": "Task=GSM8K
Agent=%{x}
Score=%{y}", 992 | "legendgroup": "GSM8K", 993 | "marker": { 994 | "color": "#DC3912", 995 | "pattern": { 996 | "shape": "" 997 | } 998 | }, 999 | "name": "GSM8K", 1000 | "offsetgroup": "GSM8K", 1001 | "orientation": "v", 1002 | "showlegend": true, 1003 | "textposition": "outside", 1004 | "texttemplate": "%{y:.0f}", 1005 | "type": "bar", 1006 | "x": [ 1007 | "GPT4", 1008 | "Mixtral-8x7B-Instruct-v0.1", 1009 | "GPT3.5", 1010 | "SOLAR-10.7B", 1011 | "OpenHermes-2.5-Mistral-7B", 1012 | "Zephyr-7b-beta", 1013 | "Llama-2-70b-chat" 1014 | ], 1015 | "xaxis": "x", 1016 | "y": [ 1017 | 95, 1018 | 72.5, 1019 | 62.5, 1020 | 73.75, 1021 | 46.25, 1022 | 46.25, 1023 | 17.5 1024 | ], 1025 | "yaxis": "y" 1026 | }, 1027 | { 1028 | "alignmentgroup": "True", 1029 | "hovertemplate": "Task=HotpotQA
Agent=%{x}
Score=%{y}", 1030 | "legendgroup": "HotpotQA", 1031 | "marker": { 1032 | "color": "#FF9900", 1033 | "pattern": { 1034 | "shape": "" 1035 | } 1036 | }, 1037 | "name": "HotpotQA", 1038 | "offsetgroup": "HotpotQA", 1039 | "orientation": "v", 1040 | "showlegend": true, 1041 | "textposition": "outside", 1042 | "texttemplate": "%{y:.0f}", 1043 | "type": "bar", 1044 | "x": [ 1045 | "GPT4", 1046 | "Mixtral-8x7B-Instruct-v0.1", 1047 | "GPT3.5", 1048 | "SOLAR-10.7B", 1049 | "OpenHermes-2.5-Mistral-7B", 1050 | "Zephyr-7b-beta", 1051 | "Llama-2-70b-chat" 1052 | ], 1053 | "xaxis": "x", 1054 | "y": [ 1055 | 80.83333333333333, 1056 | 77.08333333333334, 1057 | 74.58333333333333, 1058 | 53.75, 1059 | 65.83333333333333, 1060 | 36.666666666666664, 1061 | 49.583333333333336 1062 | ], 1063 | "yaxis": "y" 1064 | } 1065 | ], 1066 | "layout": { 1067 | "bargap": 0.35, 1068 | "bargroupgap": 0, 1069 | "barmode": "group", 1070 | "height": 600, 1071 | "legend": { 1072 | "title": { 1073 | "text": "Task" 1074 | }, 1075 | "tracegroupgap": 0 1076 | }, 1077 | "template": { 1078 | "data": { 1079 | "bar": [ 1080 | { 1081 | "error_x": { 1082 | "color": "#2a3f5f" 1083 | }, 1084 | "error_y": { 1085 | "color": "#2a3f5f" 1086 | }, 1087 | "marker": { 1088 | "line": { 1089 | "color": "#E5ECF6", 1090 | "width": 0.5 1091 | }, 1092 | "pattern": { 1093 | "fillmode": "overlay", 1094 | "size": 10, 1095 | "solidity": 0.2 1096 | } 1097 | }, 1098 | "type": "bar" 1099 | } 1100 | ], 1101 | "barpolar": [ 1102 | { 1103 | "marker": { 1104 | "line": { 1105 | "color": "#E5ECF6", 1106 | "width": 0.5 1107 | }, 1108 | "pattern": { 1109 | "fillmode": "overlay", 1110 | "size": 10, 1111 | "solidity": 0.2 1112 | } 1113 | }, 1114 | "type": "barpolar" 1115 | } 1116 | ], 1117 | "carpet": [ 1118 | { 1119 | "aaxis": { 1120 | "endlinecolor": "#2a3f5f", 1121 | "gridcolor": "white", 1122 | "linecolor": "white", 1123 | "minorgridcolor": "white", 1124 | "startlinecolor": "#2a3f5f" 1125 | }, 1126 | "baxis": { 1127 | "endlinecolor": "#2a3f5f", 1128 | "gridcolor": "white", 1129 | "linecolor": "white", 1130 | "minorgridcolor": "white", 1131 | "startlinecolor": "#2a3f5f" 1132 | }, 1133 | "type": "carpet" 1134 | } 1135 | ], 1136 | "choropleth": [ 1137 | { 1138 | "colorbar": { 1139 | "outlinewidth": 0, 1140 | "ticks": "" 1141 | }, 1142 | "type": "choropleth" 1143 | } 1144 | ], 1145 | "contour": [ 1146 | { 1147 | "colorbar": { 1148 | "outlinewidth": 0, 1149 | "ticks": "" 1150 | }, 1151 | "colorscale": [ 1152 | [ 1153 | 0, 1154 | "#0d0887" 1155 | ], 1156 | [ 1157 | 0.1111111111111111, 1158 | "#46039f" 1159 | ], 1160 | [ 1161 | 0.2222222222222222, 1162 | "#7201a8" 1163 | ], 1164 | [ 1165 | 0.3333333333333333, 1166 | "#9c179e" 1167 | ], 1168 | [ 1169 | 0.4444444444444444, 1170 | "#bd3786" 1171 | ], 1172 | [ 1173 | 0.5555555555555556, 1174 | "#d8576b" 1175 | ], 1176 | [ 1177 | 0.6666666666666666, 1178 | "#ed7953" 1179 | ], 1180 | [ 1181 | 0.7777777777777778, 1182 | "#fb9f3a" 1183 | ], 1184 | [ 1185 | 0.8888888888888888, 1186 | "#fdca26" 1187 | ], 1188 | [ 1189 | 1, 1190 | "#f0f921" 1191 | ] 1192 | ], 1193 | "type": "contour" 1194 | } 1195 | ], 1196 | "contourcarpet": [ 1197 | { 1198 | "colorbar": { 1199 | "outlinewidth": 0, 1200 | "ticks": "" 1201 | }, 1202 | "type": "contourcarpet" 1203 | } 1204 | ], 1205 | "heatmap": [ 1206 | { 1207 | "colorbar": { 1208 | "outlinewidth": 0, 1209 | "ticks": "" 1210 | }, 1211 | "colorscale": [ 1212 | [ 1213 | 0, 1214 | "#0d0887" 1215 | ], 1216 | [ 1217 | 0.1111111111111111, 1218 | "#46039f" 1219 | ], 1220 | [ 1221 | 0.2222222222222222, 1222 | 
"#7201a8" 1223 | ], 1224 | [ 1225 | 0.3333333333333333, 1226 | "#9c179e" 1227 | ], 1228 | [ 1229 | 0.4444444444444444, 1230 | "#bd3786" 1231 | ], 1232 | [ 1233 | 0.5555555555555556, 1234 | "#d8576b" 1235 | ], 1236 | [ 1237 | 0.6666666666666666, 1238 | "#ed7953" 1239 | ], 1240 | [ 1241 | 0.7777777777777778, 1242 | "#fb9f3a" 1243 | ], 1244 | [ 1245 | 0.8888888888888888, 1246 | "#fdca26" 1247 | ], 1248 | [ 1249 | 1, 1250 | "#f0f921" 1251 | ] 1252 | ], 1253 | "type": "heatmap" 1254 | } 1255 | ], 1256 | "heatmapgl": [ 1257 | { 1258 | "colorbar": { 1259 | "outlinewidth": 0, 1260 | "ticks": "" 1261 | }, 1262 | "colorscale": [ 1263 | [ 1264 | 0, 1265 | "#0d0887" 1266 | ], 1267 | [ 1268 | 0.1111111111111111, 1269 | "#46039f" 1270 | ], 1271 | [ 1272 | 0.2222222222222222, 1273 | "#7201a8" 1274 | ], 1275 | [ 1276 | 0.3333333333333333, 1277 | "#9c179e" 1278 | ], 1279 | [ 1280 | 0.4444444444444444, 1281 | "#bd3786" 1282 | ], 1283 | [ 1284 | 0.5555555555555556, 1285 | "#d8576b" 1286 | ], 1287 | [ 1288 | 0.6666666666666666, 1289 | "#ed7953" 1290 | ], 1291 | [ 1292 | 0.7777777777777778, 1293 | "#fb9f3a" 1294 | ], 1295 | [ 1296 | 0.8888888888888888, 1297 | "#fdca26" 1298 | ], 1299 | [ 1300 | 1, 1301 | "#f0f921" 1302 | ] 1303 | ], 1304 | "type": "heatmapgl" 1305 | } 1306 | ], 1307 | "histogram": [ 1308 | { 1309 | "marker": { 1310 | "pattern": { 1311 | "fillmode": "overlay", 1312 | "size": 10, 1313 | "solidity": 0.2 1314 | } 1315 | }, 1316 | "type": "histogram" 1317 | } 1318 | ], 1319 | "histogram2d": [ 1320 | { 1321 | "colorbar": { 1322 | "outlinewidth": 0, 1323 | "ticks": "" 1324 | }, 1325 | "colorscale": [ 1326 | [ 1327 | 0, 1328 | "#0d0887" 1329 | ], 1330 | [ 1331 | 0.1111111111111111, 1332 | "#46039f" 1333 | ], 1334 | [ 1335 | 0.2222222222222222, 1336 | "#7201a8" 1337 | ], 1338 | [ 1339 | 0.3333333333333333, 1340 | "#9c179e" 1341 | ], 1342 | [ 1343 | 0.4444444444444444, 1344 | "#bd3786" 1345 | ], 1346 | [ 1347 | 0.5555555555555556, 1348 | "#d8576b" 1349 | ], 1350 | [ 1351 | 0.6666666666666666, 1352 | "#ed7953" 1353 | ], 1354 | [ 1355 | 0.7777777777777778, 1356 | "#fb9f3a" 1357 | ], 1358 | [ 1359 | 0.8888888888888888, 1360 | "#fdca26" 1361 | ], 1362 | [ 1363 | 1, 1364 | "#f0f921" 1365 | ] 1366 | ], 1367 | "type": "histogram2d" 1368 | } 1369 | ], 1370 | "histogram2dcontour": [ 1371 | { 1372 | "colorbar": { 1373 | "outlinewidth": 0, 1374 | "ticks": "" 1375 | }, 1376 | "colorscale": [ 1377 | [ 1378 | 0, 1379 | "#0d0887" 1380 | ], 1381 | [ 1382 | 0.1111111111111111, 1383 | "#46039f" 1384 | ], 1385 | [ 1386 | 0.2222222222222222, 1387 | "#7201a8" 1388 | ], 1389 | [ 1390 | 0.3333333333333333, 1391 | "#9c179e" 1392 | ], 1393 | [ 1394 | 0.4444444444444444, 1395 | "#bd3786" 1396 | ], 1397 | [ 1398 | 0.5555555555555556, 1399 | "#d8576b" 1400 | ], 1401 | [ 1402 | 0.6666666666666666, 1403 | "#ed7953" 1404 | ], 1405 | [ 1406 | 0.7777777777777778, 1407 | "#fb9f3a" 1408 | ], 1409 | [ 1410 | 0.8888888888888888, 1411 | "#fdca26" 1412 | ], 1413 | [ 1414 | 1, 1415 | "#f0f921" 1416 | ] 1417 | ], 1418 | "type": "histogram2dcontour" 1419 | } 1420 | ], 1421 | "mesh3d": [ 1422 | { 1423 | "colorbar": { 1424 | "outlinewidth": 0, 1425 | "ticks": "" 1426 | }, 1427 | "type": "mesh3d" 1428 | } 1429 | ], 1430 | "parcoords": [ 1431 | { 1432 | "line": { 1433 | "colorbar": { 1434 | "outlinewidth": 0, 1435 | "ticks": "" 1436 | } 1437 | }, 1438 | "type": "parcoords" 1439 | } 1440 | ], 1441 | "pie": [ 1442 | { 1443 | "automargin": true, 1444 | "type": "pie" 1445 | } 1446 | ], 1447 | "scatter": [ 1448 | { 1449 | "fillpattern": { 1450 | 
"fillmode": "overlay", 1451 | "size": 10, 1452 | "solidity": 0.2 1453 | }, 1454 | "type": "scatter" 1455 | } 1456 | ], 1457 | "scatter3d": [ 1458 | { 1459 | "line": { 1460 | "colorbar": { 1461 | "outlinewidth": 0, 1462 | "ticks": "" 1463 | } 1464 | }, 1465 | "marker": { 1466 | "colorbar": { 1467 | "outlinewidth": 0, 1468 | "ticks": "" 1469 | } 1470 | }, 1471 | "type": "scatter3d" 1472 | } 1473 | ], 1474 | "scattercarpet": [ 1475 | { 1476 | "marker": { 1477 | "colorbar": { 1478 | "outlinewidth": 0, 1479 | "ticks": "" 1480 | } 1481 | }, 1482 | "type": "scattercarpet" 1483 | } 1484 | ], 1485 | "scattergeo": [ 1486 | { 1487 | "marker": { 1488 | "colorbar": { 1489 | "outlinewidth": 0, 1490 | "ticks": "" 1491 | } 1492 | }, 1493 | "type": "scattergeo" 1494 | } 1495 | ], 1496 | "scattergl": [ 1497 | { 1498 | "marker": { 1499 | "colorbar": { 1500 | "outlinewidth": 0, 1501 | "ticks": "" 1502 | } 1503 | }, 1504 | "type": "scattergl" 1505 | } 1506 | ], 1507 | "scattermapbox": [ 1508 | { 1509 | "marker": { 1510 | "colorbar": { 1511 | "outlinewidth": 0, 1512 | "ticks": "" 1513 | } 1514 | }, 1515 | "type": "scattermapbox" 1516 | } 1517 | ], 1518 | "scatterpolar": [ 1519 | { 1520 | "marker": { 1521 | "colorbar": { 1522 | "outlinewidth": 0, 1523 | "ticks": "" 1524 | } 1525 | }, 1526 | "type": "scatterpolar" 1527 | } 1528 | ], 1529 | "scatterpolargl": [ 1530 | { 1531 | "marker": { 1532 | "colorbar": { 1533 | "outlinewidth": 0, 1534 | "ticks": "" 1535 | } 1536 | }, 1537 | "type": "scatterpolargl" 1538 | } 1539 | ], 1540 | "scatterternary": [ 1541 | { 1542 | "marker": { 1543 | "colorbar": { 1544 | "outlinewidth": 0, 1545 | "ticks": "" 1546 | } 1547 | }, 1548 | "type": "scatterternary" 1549 | } 1550 | ], 1551 | "surface": [ 1552 | { 1553 | "colorbar": { 1554 | "outlinewidth": 0, 1555 | "ticks": "" 1556 | }, 1557 | "colorscale": [ 1558 | [ 1559 | 0, 1560 | "#0d0887" 1561 | ], 1562 | [ 1563 | 0.1111111111111111, 1564 | "#46039f" 1565 | ], 1566 | [ 1567 | 0.2222222222222222, 1568 | "#7201a8" 1569 | ], 1570 | [ 1571 | 0.3333333333333333, 1572 | "#9c179e" 1573 | ], 1574 | [ 1575 | 0.4444444444444444, 1576 | "#bd3786" 1577 | ], 1578 | [ 1579 | 0.5555555555555556, 1580 | "#d8576b" 1581 | ], 1582 | [ 1583 | 0.6666666666666666, 1584 | "#ed7953" 1585 | ], 1586 | [ 1587 | 0.7777777777777778, 1588 | "#fb9f3a" 1589 | ], 1590 | [ 1591 | 0.8888888888888888, 1592 | "#fdca26" 1593 | ], 1594 | [ 1595 | 1, 1596 | "#f0f921" 1597 | ] 1598 | ], 1599 | "type": "surface" 1600 | } 1601 | ], 1602 | "table": [ 1603 | { 1604 | "cells": { 1605 | "fill": { 1606 | "color": "#EBF0F8" 1607 | }, 1608 | "line": { 1609 | "color": "white" 1610 | } 1611 | }, 1612 | "header": { 1613 | "fill": { 1614 | "color": "#C8D4E3" 1615 | }, 1616 | "line": { 1617 | "color": "white" 1618 | } 1619 | }, 1620 | "type": "table" 1621 | } 1622 | ] 1623 | }, 1624 | "layout": { 1625 | "annotationdefaults": { 1626 | "arrowcolor": "#2a3f5f", 1627 | "arrowhead": 0, 1628 | "arrowwidth": 1 1629 | }, 1630 | "autotypenumbers": "strict", 1631 | "coloraxis": { 1632 | "colorbar": { 1633 | "outlinewidth": 0, 1634 | "ticks": "" 1635 | } 1636 | }, 1637 | "colorscale": { 1638 | "diverging": [ 1639 | [ 1640 | 0, 1641 | "#8e0152" 1642 | ], 1643 | [ 1644 | 0.1, 1645 | "#c51b7d" 1646 | ], 1647 | [ 1648 | 0.2, 1649 | "#de77ae" 1650 | ], 1651 | [ 1652 | 0.3, 1653 | "#f1b6da" 1654 | ], 1655 | [ 1656 | 0.4, 1657 | "#fde0ef" 1658 | ], 1659 | [ 1660 | 0.5, 1661 | "#f7f7f7" 1662 | ], 1663 | [ 1664 | 0.6, 1665 | "#e6f5d0" 1666 | ], 1667 | [ 1668 | 0.7, 1669 | "#b8e186" 1670 | ], 1671 | [ 1672 | 
0.8, 1673 | "#7fbc41" 1674 | ], 1675 | [ 1676 | 0.9, 1677 | "#4d9221" 1678 | ], 1679 | [ 1680 | 1, 1681 | "#276419" 1682 | ] 1683 | ], 1684 | "sequential": [ 1685 | [ 1686 | 0, 1687 | "#0d0887" 1688 | ], 1689 | [ 1690 | 0.1111111111111111, 1691 | "#46039f" 1692 | ], 1693 | [ 1694 | 0.2222222222222222, 1695 | "#7201a8" 1696 | ], 1697 | [ 1698 | 0.3333333333333333, 1699 | "#9c179e" 1700 | ], 1701 | [ 1702 | 0.4444444444444444, 1703 | "#bd3786" 1704 | ], 1705 | [ 1706 | 0.5555555555555556, 1707 | "#d8576b" 1708 | ], 1709 | [ 1710 | 0.6666666666666666, 1711 | "#ed7953" 1712 | ], 1713 | [ 1714 | 0.7777777777777778, 1715 | "#fb9f3a" 1716 | ], 1717 | [ 1718 | 0.8888888888888888, 1719 | "#fdca26" 1720 | ], 1721 | [ 1722 | 1, 1723 | "#f0f921" 1724 | ] 1725 | ], 1726 | "sequentialminus": [ 1727 | [ 1728 | 0, 1729 | "#0d0887" 1730 | ], 1731 | [ 1732 | 0.1111111111111111, 1733 | "#46039f" 1734 | ], 1735 | [ 1736 | 0.2222222222222222, 1737 | "#7201a8" 1738 | ], 1739 | [ 1740 | 0.3333333333333333, 1741 | "#9c179e" 1742 | ], 1743 | [ 1744 | 0.4444444444444444, 1745 | "#bd3786" 1746 | ], 1747 | [ 1748 | 0.5555555555555556, 1749 | "#d8576b" 1750 | ], 1751 | [ 1752 | 0.6666666666666666, 1753 | "#ed7953" 1754 | ], 1755 | [ 1756 | 0.7777777777777778, 1757 | "#fb9f3a" 1758 | ], 1759 | [ 1760 | 0.8888888888888888, 1761 | "#fdca26" 1762 | ], 1763 | [ 1764 | 1, 1765 | "#f0f921" 1766 | ] 1767 | ] 1768 | }, 1769 | "colorway": [ 1770 | "#636efa", 1771 | "#EF553B", 1772 | "#00cc96", 1773 | "#ab63fa", 1774 | "#FFA15A", 1775 | "#19d3f3", 1776 | "#FF6692", 1777 | "#B6E880", 1778 | "#FF97FF", 1779 | "#FECB52" 1780 | ], 1781 | "font": { 1782 | "color": "#2a3f5f" 1783 | }, 1784 | "geo": { 1785 | "bgcolor": "white", 1786 | "lakecolor": "white", 1787 | "landcolor": "#E5ECF6", 1788 | "showlakes": true, 1789 | "showland": true, 1790 | "subunitcolor": "white" 1791 | }, 1792 | "hoverlabel": { 1793 | "align": "left" 1794 | }, 1795 | "hovermode": "closest", 1796 | "mapbox": { 1797 | "style": "light" 1798 | }, 1799 | "paper_bgcolor": "white", 1800 | "plot_bgcolor": "#E5ECF6", 1801 | "polar": { 1802 | "angularaxis": { 1803 | "gridcolor": "white", 1804 | "linecolor": "white", 1805 | "ticks": "" 1806 | }, 1807 | "bgcolor": "#E5ECF6", 1808 | "radialaxis": { 1809 | "gridcolor": "white", 1810 | "linecolor": "white", 1811 | "ticks": "" 1812 | } 1813 | }, 1814 | "scene": { 1815 | "xaxis": { 1816 | "backgroundcolor": "#E5ECF6", 1817 | "gridcolor": "white", 1818 | "gridwidth": 2, 1819 | "linecolor": "white", 1820 | "showbackground": true, 1821 | "ticks": "", 1822 | "zerolinecolor": "white" 1823 | }, 1824 | "yaxis": { 1825 | "backgroundcolor": "#E5ECF6", 1826 | "gridcolor": "white", 1827 | "gridwidth": 2, 1828 | "linecolor": "white", 1829 | "showbackground": true, 1830 | "ticks": "", 1831 | "zerolinecolor": "white" 1832 | }, 1833 | "zaxis": { 1834 | "backgroundcolor": "#E5ECF6", 1835 | "gridcolor": "white", 1836 | "gridwidth": 2, 1837 | "linecolor": "white", 1838 | "showbackground": true, 1839 | "ticks": "", 1840 | "zerolinecolor": "white" 1841 | } 1842 | }, 1843 | "shapedefaults": { 1844 | "line": { 1845 | "color": "#2a3f5f" 1846 | } 1847 | }, 1848 | "ternary": { 1849 | "aaxis": { 1850 | "gridcolor": "white", 1851 | "linecolor": "white", 1852 | "ticks": "" 1853 | }, 1854 | "baxis": { 1855 | "gridcolor": "white", 1856 | "linecolor": "white", 1857 | "ticks": "" 1858 | }, 1859 | "bgcolor": "#E5ECF6", 1860 | "caxis": { 1861 | "gridcolor": "white", 1862 | "linecolor": "white", 1863 | "ticks": "" 1864 | } 1865 | }, 1866 | "title": { 1867 | "x": 
0.05 1868 | }, 1869 | "xaxis": { 1870 | "automargin": true, 1871 | "gridcolor": "white", 1872 | "linecolor": "white", 1873 | "ticks": "", 1874 | "title": { 1875 | "standoff": 15 1876 | }, 1877 | "zerolinecolor": "white", 1878 | "zerolinewidth": 2 1879 | }, 1880 | "yaxis": { 1881 | "automargin": true, 1882 | "gridcolor": "white", 1883 | "linecolor": "white", 1884 | "ticks": "", 1885 | "title": { 1886 | "standoff": 15 1887 | }, 1888 | "zerolinecolor": "white", 1889 | "zerolinewidth": 2 1890 | } 1891 | } 1892 | }, 1893 | "title": { 1894 | "text": "Average Evaluation Score (LLM-as-a-judge)" 1895 | }, 1896 | "width": 1000, 1897 | "xaxis": { 1898 | "anchor": "y", 1899 | "domain": [ 1900 | 0, 1901 | 1 1902 | ], 1903 | "title": { 1904 | "text": "Agent" 1905 | } 1906 | }, 1907 | "yaxis": { 1908 | "anchor": "x", 1909 | "domain": [ 1910 | 0, 1911 | 1 1912 | ], 1913 | "range": [ 1914 | 0, 1915 | 100 1916 | ], 1917 | "ticksuffix": "%", 1918 | "title": { 1919 | "text": "Score" 1920 | } 1921 | } 1922 | } 1923 | }, 1924 | "text/html": [ 1925 | "
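The plotting cell whose source appears just after this figure output orders the agents with a hand-written `sorter` list and a `column.map(lambda e: sorter.index(e))` sort key. An ordered pandas `Categorical` achieves the same ordering and is arguably more idiomatic; a minimal sketch on toy data, not a change to the notebook:

```python
# Alternative (sketch) to sorting with sorter.index(): an ordered Categorical.
import pandas as pd

sorter = ["GPT4", "Mixtral-8x7B-Instruct-v0.1", "GPT3.5"]  # truncated order, for illustration
df = pd.DataFrame(
    {"agent_name": ["GPT3.5", "Mixtral-8x7B-Instruct-v0.1", "GPT4"], "score": [74.6, 77.1, 80.8]}
)
df["agent_name"] = pd.Categorical(df["agent_name"], categories=sorter, ordered=True)
print(df.sort_values("agent_name"))  # rows now follow the sorter order
```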
" 1950 | ] 1951 | }, 1952 | "metadata": {}, 1953 | "output_type": "display_data" 1954 | } 1955 | ], 1956 | "source": [ 1957 | "import kaleido\n", 1958 | "\n", 1959 | "sorter = [\n", 1960 | " \"GPT4\",\n", 1961 | " \"Mixtral-8x7B-Instruct-v0.1\",\n", 1962 | " \"GPT3.5\",\n", 1963 | " \"SOLAR-10.7B\",\n", 1964 | " \"OpenHermes-2.5-Mistral-7B\",\n", 1965 | " \"Zephyr-7b-beta\",\n", 1966 | " \"Llama-2-70b-chat\",\n", 1967 | "]\n", 1968 | "table_result = table_result.sort_values(\n", 1969 | " \"agent_name\", key=lambda column: column.map(lambda e: sorter.index(e))\n", 1970 | ")\n", 1971 | "# Plot results\n", 1972 | "fig = px.bar(\n", 1973 | " table_result,\n", 1974 | " x=\"agent_name\",\n", 1975 | " y=\"eval_score_GPT4\",\n", 1976 | " color=\"task\",\n", 1977 | " title=f\"Average Evaluation Score (LLM-as-a-judge)\",\n", 1978 | " labels={\n", 1979 | " \"agent_name\": \"Agent\",\n", 1980 | " \"task\": \"Task\",\n", 1981 | " \"score\": \"Performance\",\n", 1982 | " \"eval_score_GPT4\": \"Score\",\n", 1983 | " },\n", 1984 | " color_discrete_sequence=px.colors.qualitative.G10,\n", 1985 | ")\n", 1986 | "fig.update_layout(\n", 1987 | " width=1000,\n", 1988 | " height=600,\n", 1989 | " barmode=\"group\",\n", 1990 | " bargap=0.35,\n", 1991 | " bargroupgap=0.0,\n", 1992 | " yaxis_range=[0, 100],\n", 1993 | ")\n", 1994 | "fig.update_traces(texttemplate=\"%{y:.0f}\", textposition=\"outside\")\n", 1995 | "fig.layout.yaxis.ticksuffix = \"%\"\n", 1996 | "fig.write_image(\"benchmark_agents.png\", scale=4)\n", 1997 | "fig.show()" 1998 | ] 1999 | }, 2000 | { 2001 | "cell_type": "markdown", 2002 | "metadata": {}, 2003 | "source": [ 2004 | "#### Study intermediate steps" 2005 | ] 2006 | }, 2007 | { 2008 | "cell_type": "code", 2009 | "execution_count": 28, 2010 | "metadata": {}, 2011 | "outputs": [ 2012 | { 2013 | "name": "stderr", 2014 | "output_type": "stream", 2015 | "text": [ 2016 | "/tmp/ipykernel_46633/2687269850.py:9: SettingWithCopyWarning:\n", 2017 | "\n", 2018 | "\n", 2019 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 2020 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 2021 | "\n", 2022 | "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", 2023 | "\n", 2024 | "/tmp/ipykernel_46633/2687269850.py:12: SettingWithCopyWarning:\n", 2025 | "\n", 2026 | "\n", 2027 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 2028 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 2029 | "\n", 2030 | "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", 2031 | "\n" 2032 | ] 2033 | } 2034 | ], 2035 | "source": [ 2036 | "try:\n", 2037 | " result_df[\"tools_used\"] = result_df[\"intermediate_steps\"].apply(\n", 2038 | " lambda row: ([step[\"tool\"] for step in row] if row is not None else None)\n", 2039 | " )\n", 2040 | "except:\n", 2041 | " pass\n", 2042 | "result_df[\"has_answer\"] = result_df[\"prediction\"].apply(lambda row: (row is not None))\n", 2043 | "result_df_answers_only = result_df.loc[result_df[\"has_answer\"] == True]\n", 2044 | "result_df_answers_only[\"correct_answer\"] = (\n", 2045 | " result_df_answers_only[f\"eval_score_{eval_model_name}\"] >= 4\n", 2046 | ")\n", 2047 | "result_df_answers_only[\"number_of_steps\"] = result_df_answers_only[\"tools_used\"].apply(\n", 2048 | " lambda x: len(x) + 1\n", 2049 | ")\n", 2050 | "aggregated_resuts = 
result_df_answers_only.groupby(\n", 2051 | " [\"agent_name\", \"task\", \"correct_answer\"]\n", 2052 | ").agg({\"number_of_steps\": \"mean\"})" 2053 | ] 2054 | }, 2055 | { 2056 | "cell_type": "code", 2057 | "execution_count": 29, 2058 | "metadata": {}, 2059 | "outputs": [ 2060 | { 2061 | "data": { 2062 | "application/vnd.plotly.v1+json": { 2063 | "config": { 2064 | "plotlyServerURL": "https://plot.ly" 2065 | }, 2066 | "data": [ 2067 | { 2068 | "alignmentgroup": "True", 2069 | "hovertemplate": "Answer correctness=False
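The step-counting cell above raises pandas `SettingWithCopyWarning`s because new columns (`correct_answer`, `number_of_steps`) are assigned on a boolean-filtered view of `result_df`. One common way to avoid the warning, shown here on a toy frame rather than as a change to the notebook, is to take an explicit `.copy()` of the filtered rows before adding columns:

```python
# Sketch: avoid SettingWithCopyWarning by copying the filtered slice first.
import pandas as pd

result_df = pd.DataFrame(
    {"prediction": ["42", None, "Paris"], "eval_score_GPT4": [4, 0, 5]}
)
answers_only = result_df.loc[result_df["prediction"].notna()].copy()  # explicit copy
answers_only["correct_answer"] = answers_only["eval_score_GPT4"] >= 4
print(answers_only)
```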
Agent=GPT3.5
Task=%{x}
Mean Number of Steps=%{y}", 2070 | "legendgroup": "False", 2071 | "marker": { 2072 | "color": "#636efa", 2073 | "pattern": { 2074 | "shape": "" 2075 | } 2076 | }, 2077 | "name": "False", 2078 | "offsetgroup": "False", 2079 | "orientation": "v", 2080 | "showlegend": true, 2081 | "textposition": "auto", 2082 | "type": "bar", 2083 | "x": [ 2084 | "GAIA", 2085 | "GSM8K", 2086 | "HotpotQA" 2087 | ], 2088 | "xaxis": "x", 2089 | "y": [ 2090 | 2.111111111111111, 2091 | 2.125, 2092 | 1.7 2093 | ], 2094 | "yaxis": "y" 2095 | }, 2096 | { 2097 | "alignmentgroup": "True", 2098 | "hovertemplate": "Answer correctness=False
Agent=GPT4
Task=%{x}
Mean Number of Steps=%{y}", 2099 | "legendgroup": "False", 2100 | "marker": { 2101 | "color": "#636efa", 2102 | "pattern": { 2103 | "shape": "" 2104 | } 2105 | }, 2106 | "name": "False", 2107 | "offsetgroup": "False", 2108 | "orientation": "v", 2109 | "showlegend": false, 2110 | "textposition": "auto", 2111 | "type": "bar", 2112 | "x": [ 2113 | "GAIA", 2114 | "GSM8K", 2115 | "HotpotQA" 2116 | ], 2117 | "xaxis": "x2", 2118 | "y": [ 2119 | 2.3846153846153846, 2120 | 3, 2121 | 1.8823529411764706 2122 | ], 2123 | "yaxis": "y2" 2124 | }, 2125 | { 2126 | "alignmentgroup": "True", 2127 | "hovertemplate": "Answer correctness=False
Agent=Llama-2-70b-chat
Task=%{x}
Mean Number of Steps=%{y}", 2128 | "legendgroup": "False", 2129 | "marker": { 2130 | "color": "#636efa", 2131 | "pattern": { 2132 | "shape": "" 2133 | } 2134 | }, 2135 | "name": "False", 2136 | "offsetgroup": "False", 2137 | "orientation": "v", 2138 | "showlegend": false, 2139 | "textposition": "auto", 2140 | "type": "bar", 2141 | "x": [ 2142 | "GAIA", 2143 | "GSM8K", 2144 | "HotpotQA" 2145 | ], 2146 | "xaxis": "x3", 2147 | "y": [ 2148 | 5.428571428571429, 2149 | 4.571428571428571, 2150 | 4.5 2151 | ], 2152 | "yaxis": "y3" 2153 | }, 2154 | { 2155 | "alignmentgroup": "True", 2156 | "hovertemplate": "Answer correctness=False
Agent=Mixtral-8x7B-Instruct-v0.1
Task=%{x}
Mean Number of Steps=%{y}", 2157 | "legendgroup": "False", 2158 | "marker": { 2159 | "color": "#636efa", 2160 | "pattern": { 2161 | "shape": "" 2162 | } 2163 | }, 2164 | "name": "False", 2165 | "offsetgroup": "False", 2166 | "orientation": "v", 2167 | "showlegend": false, 2168 | "textposition": "auto", 2169 | "type": "bar", 2170 | "x": [ 2171 | "GAIA", 2172 | "GSM8K", 2173 | "HotpotQA" 2174 | ], 2175 | "xaxis": "x4", 2176 | "y": [ 2177 | 4.5, 2178 | 4.166666666666667, 2179 | 3.409090909090909 2180 | ], 2181 | "yaxis": "y4" 2182 | }, 2183 | { 2184 | "alignmentgroup": "True", 2185 | "hovertemplate": "Answer correctness=False
Agent=OpenHermes-2.5-Mistral-7B
Task=%{x}
Mean Number of Steps=%{y}", 2186 | "legendgroup": "False", 2187 | "marker": { 2188 | "color": "#636efa", 2189 | "pattern": { 2190 | "shape": "" 2191 | } 2192 | }, 2193 | "name": "False", 2194 | "offsetgroup": "False", 2195 | "orientation": "v", 2196 | "showlegend": false, 2197 | "textposition": "auto", 2198 | "type": "bar", 2199 | "x": [ 2200 | "GAIA", 2201 | "GSM8K", 2202 | "HotpotQA" 2203 | ], 2204 | "xaxis": "x5", 2205 | "y": [ 2206 | 5.176470588235294, 2207 | 4.181818181818182, 2208 | 3.5 2209 | ], 2210 | "yaxis": "y5" 2211 | }, 2212 | { 2213 | "alignmentgroup": "True", 2214 | "hovertemplate": "Answer correctness=False
Agent=SOLAR-10.7B
Task=%{x}
Mean Number of Steps=%{y}", 2215 | "legendgroup": "False", 2216 | "marker": { 2217 | "color": "#636efa", 2218 | "pattern": { 2219 | "shape": "" 2220 | } 2221 | }, 2222 | "name": "False", 2223 | "offsetgroup": "False", 2224 | "orientation": "v", 2225 | "showlegend": false, 2226 | "textposition": "auto", 2227 | "type": "bar", 2228 | "x": [ 2229 | "GAIA", 2230 | "GSM8K", 2231 | "HotpotQA" 2232 | ], 2233 | "xaxis": "x6", 2234 | "y": [ 2235 | 4.375, 2236 | 2.2, 2237 | 4.258064516129032 2238 | ], 2239 | "yaxis": "y6" 2240 | }, 2241 | { 2242 | "alignmentgroup": "True", 2243 | "hovertemplate": "Answer correctness=False
Agent=Zephyr-7b-beta
Task=%{x}
Mean Number of Steps=%{y}", 2244 | "legendgroup": "False", 2245 | "marker": { 2246 | "color": "#636efa", 2247 | "pattern": { 2248 | "shape": "" 2249 | } 2250 | }, 2251 | "name": "False", 2252 | "offsetgroup": "False", 2253 | "orientation": "v", 2254 | "showlegend": false, 2255 | "textposition": "auto", 2256 | "type": "bar", 2257 | "x": [ 2258 | "GAIA", 2259 | "GSM8K", 2260 | "HotpotQA" 2261 | ], 2262 | "xaxis": "x7", 2263 | "y": [ 2264 | 5.2, 2265 | 2.4, 2266 | 4.868421052631579 2267 | ], 2268 | "yaxis": "y7" 2269 | }, 2270 | { 2271 | "alignmentgroup": "True", 2272 | "hovertemplate": "Answer correctness=True
Agent=GPT3.5
Task=%{x}
Mean Number of Steps=%{y}", 2273 | "legendgroup": "True", 2274 | "marker": { 2275 | "color": "#EF553B", 2276 | "pattern": { 2277 | "shape": "" 2278 | } 2279 | }, 2280 | "name": "True", 2281 | "offsetgroup": "True", 2282 | "orientation": "v", 2283 | "showlegend": true, 2284 | "textposition": "auto", 2285 | "type": "bar", 2286 | "x": [ 2287 | "GAIA", 2288 | "GSM8K", 2289 | "HotpotQA" 2290 | ], 2291 | "xaxis": "x", 2292 | "y": [ 2293 | 1, 2294 | 2.1666666666666665, 2295 | 1.875 2296 | ], 2297 | "yaxis": "y" 2298 | }, 2299 | { 2300 | "alignmentgroup": "True", 2301 | "hovertemplate": "Answer correctness=True
Agent=GPT4
Task=%{x}
Mean Number of Steps=%{y}", 2302 | "legendgroup": "True", 2303 | "marker": { 2304 | "color": "#EF553B", 2305 | "pattern": { 2306 | "shape": "" 2307 | } 2308 | }, 2309 | "name": "True", 2310 | "offsetgroup": "True", 2311 | "orientation": "v", 2312 | "showlegend": false, 2313 | "textposition": "auto", 2314 | "type": "bar", 2315 | "x": [ 2316 | "GAIA", 2317 | "GSM8K", 2318 | "HotpotQA" 2319 | ], 2320 | "xaxis": "x2", 2321 | "y": [ 2322 | 1.6666666666666667, 2323 | 2.473684210526316, 2324 | 1.6511627906976745 2325 | ], 2326 | "yaxis": "y2" 2327 | }, 2328 | { 2329 | "alignmentgroup": "True", 2330 | "hovertemplate": "Answer correctness=True
Agent=Llama-2-70b-chat
Task=%{x}
Mean Number of Steps=%{y}", 2331 | "legendgroup": "True", 2332 | "marker": { 2333 | "color": "#EF553B", 2334 | "pattern": { 2335 | "shape": "" 2336 | } 2337 | }, 2338 | "name": "True", 2339 | "offsetgroup": "True", 2340 | "orientation": "v", 2341 | "showlegend": false, 2342 | "textposition": "auto", 2343 | "type": "bar", 2344 | "x": [ 2345 | "GSM8K", 2346 | "HotpotQA" 2347 | ], 2348 | "xaxis": "x3", 2349 | "y": [ 2350 | 2, 2351 | 2.88 2352 | ], 2353 | "yaxis": "y3" 2354 | }, 2355 | { 2356 | "alignmentgroup": "True", 2357 | "hovertemplate": "Answer correctness=True
Agent=Mixtral-8x7B-Instruct-v0.1
Task=%{x}
Mean Number of Steps=%{y}", 2358 | "legendgroup": "True", 2359 | "marker": { 2360 | "color": "#EF553B", 2361 | "pattern": { 2362 | "shape": "" 2363 | } 2364 | }, 2365 | "name": "True", 2366 | "offsetgroup": "True", 2367 | "orientation": "v", 2368 | "showlegend": false, 2369 | "textposition": "auto", 2370 | "type": "bar", 2371 | "x": [ 2372 | "GAIA", 2373 | "GSM8K", 2374 | "HotpotQA" 2375 | ], 2376 | "xaxis": "x4", 2377 | "y": [ 2378 | 3, 2379 | 3.0714285714285716, 2380 | 3.027027027027027 2381 | ], 2382 | "yaxis": "y4" 2383 | }, 2384 | { 2385 | "alignmentgroup": "True", 2386 | "hovertemplate": "Answer correctness=True
Agent=OpenHermes-2.5-Mistral-7B
Task=%{x}
Mean Number of Steps=%{y}", 2387 | "legendgroup": "True", 2388 | "marker": { 2389 | "color": "#EF553B", 2390 | "pattern": { 2391 | "shape": "" 2392 | } 2393 | }, 2394 | "name": "True", 2395 | "offsetgroup": "True", 2396 | "orientation": "v", 2397 | "showlegend": false, 2398 | "textposition": "auto", 2399 | "type": "bar", 2400 | "x": [ 2401 | "GAIA", 2402 | "GSM8K", 2403 | "HotpotQA" 2404 | ], 2405 | "xaxis": "x5", 2406 | "y": [ 2407 | 3.5, 2408 | 3.5555555555555554, 2409 | 3.0303030303030303 2410 | ], 2411 | "yaxis": "y5" 2412 | }, 2413 | { 2414 | "alignmentgroup": "True", 2415 | "hovertemplate": "Answer correctness=True
Agent=SOLAR-10.7B
Task=%{x}
Mean Number of Steps=%{y}", 2416 | "legendgroup": "True", 2417 | "marker": { 2418 | "color": "#EF553B", 2419 | "pattern": { 2420 | "shape": "" 2421 | } 2422 | }, 2423 | "name": "True", 2424 | "offsetgroup": "True", 2425 | "orientation": "v", 2426 | "showlegend": false, 2427 | "textposition": "auto", 2428 | "type": "bar", 2429 | "x": [ 2430 | "GAIA", 2431 | "GSM8K", 2432 | "HotpotQA" 2433 | ], 2434 | "xaxis": "x6", 2435 | "y": [ 2436 | 4, 2437 | 2.357142857142857, 2438 | 2.896551724137931 2439 | ], 2440 | "yaxis": "y6" 2441 | }, 2442 | { 2443 | "alignmentgroup": "True", 2444 | "hovertemplate": "Answer correctness=True
Agent=Zephyr-7b-beta
Task=%{x}
Mean Number of Steps=%{y}", 2445 | "legendgroup": "True", 2446 | "marker": { 2447 | "color": "#EF553B", 2448 | "pattern": { 2449 | "shape": "" 2450 | } 2451 | }, 2452 | "name": "True", 2453 | "offsetgroup": "True", 2454 | "orientation": "v", 2455 | "showlegend": false, 2456 | "textposition": "auto", 2457 | "type": "bar", 2458 | "x": [ 2459 | "GSM8K", 2460 | "HotpotQA" 2461 | ], 2462 | "xaxis": "x7", 2463 | "y": [ 2464 | 1.75, 2465 | 2.45 2466 | ], 2467 | "yaxis": "y7" 2468 | } 2469 | ], 2470 | "layout": { 2471 | "annotations": [ 2472 | { 2473 | "font": {}, 2474 | "showarrow": false, 2475 | "text": "Agent=GPT3.5", 2476 | "x": 0.06285714285714285, 2477 | "xanchor": "center", 2478 | "xref": "paper", 2479 | "y": 1, 2480 | "yanchor": "bottom", 2481 | "yref": "paper" 2482 | }, 2483 | { 2484 | "font": {}, 2485 | "showarrow": false, 2486 | "text": "Agent=GPT4", 2487 | "x": 0.20857142857142852, 2488 | "xanchor": "center", 2489 | "xref": "paper", 2490 | "y": 1, 2491 | "yanchor": "bottom", 2492 | "yref": "paper" 2493 | }, 2494 | { 2495 | "font": {}, 2496 | "showarrow": false, 2497 | "text": "Agent=Llama-2-70b-chat", 2498 | "x": 0.3542857142857142, 2499 | "xanchor": "center", 2500 | "xref": "paper", 2501 | "y": 1, 2502 | "yanchor": "bottom", 2503 | "yref": "paper" 2504 | }, 2505 | { 2506 | "font": {}, 2507 | "showarrow": false, 2508 | "text": "Agent=Mixtral-8x7B-Instruct-v0.1", 2509 | "x": 0.5, 2510 | "xanchor": "center", 2511 | "xref": "paper", 2512 | "y": 1, 2513 | "yanchor": "bottom", 2514 | "yref": "paper" 2515 | }, 2516 | { 2517 | "font": {}, 2518 | "showarrow": false, 2519 | "text": "Agent=OpenHermes-2.5-Mistral-7B", 2520 | "x": 0.6457142857142856, 2521 | "xanchor": "center", 2522 | "xref": "paper", 2523 | "y": 1, 2524 | "yanchor": "bottom", 2525 | "yref": "paper" 2526 | }, 2527 | { 2528 | "font": {}, 2529 | "showarrow": false, 2530 | "text": "Agent=SOLAR-10.7B", 2531 | "x": 0.7914285714285713, 2532 | "xanchor": "center", 2533 | "xref": "paper", 2534 | "y": 1, 2535 | "yanchor": "bottom", 2536 | "yref": "paper" 2537 | }, 2538 | { 2539 | "font": {}, 2540 | "showarrow": false, 2541 | "text": "Agent=Zephyr-7b-beta", 2542 | "x": 0.9371428571428569, 2543 | "xanchor": "center", 2544 | "xref": "paper", 2545 | "y": 1, 2546 | "yanchor": "bottom", 2547 | "yref": "paper" 2548 | } 2549 | ], 2550 | "barmode": "group", 2551 | "legend": { 2552 | "title": { 2553 | "text": "Answer correctness" 2554 | }, 2555 | "tracegroupgap": 0 2556 | }, 2557 | "template": { 2558 | "data": { 2559 | "bar": [ 2560 | { 2561 | "error_x": { 2562 | "color": "#2a3f5f" 2563 | }, 2564 | "error_y": { 2565 | "color": "#2a3f5f" 2566 | }, 2567 | "marker": { 2568 | "line": { 2569 | "color": "#E5ECF6", 2570 | "width": 0.5 2571 | }, 2572 | "pattern": { 2573 | "fillmode": "overlay", 2574 | "size": 10, 2575 | "solidity": 0.2 2576 | } 2577 | }, 2578 | "type": "bar" 2579 | } 2580 | ], 2581 | "barpolar": [ 2582 | { 2583 | "marker": { 2584 | "line": { 2585 | "color": "#E5ECF6", 2586 | "width": 0.5 2587 | }, 2588 | "pattern": { 2589 | "fillmode": "overlay", 2590 | "size": 10, 2591 | "solidity": 0.2 2592 | } 2593 | }, 2594 | "type": "barpolar" 2595 | } 2596 | ], 2597 | "carpet": [ 2598 | { 2599 | "aaxis": { 2600 | "endlinecolor": "#2a3f5f", 2601 | "gridcolor": "white", 2602 | "linecolor": "white", 2603 | "minorgridcolor": "white", 2604 | "startlinecolor": "#2a3f5f" 2605 | }, 2606 | "baxis": { 2607 | "endlinecolor": "#2a3f5f", 2608 | "gridcolor": "white", 2609 | "linecolor": "white", 2610 | "minorgridcolor": "white", 2611 | "startlinecolor": "#2a3f5f" 
[default Plotly template: standard trace styles, colorscales, and layout defaults; no benchmark data]
"title": { 3460 | "text": "Mean Number of Steps" 3461 | } 3462 | }, 3463 | "yaxis2": { 3464 | "anchor": "x2", 3465 | "domain": [ 3466 | 0, 3467 | 1 3468 | ], 3469 | "matches": "y", 3470 | "showticklabels": false 3471 | }, 3472 | "yaxis3": { 3473 | "anchor": "x3", 3474 | "domain": [ 3475 | 0, 3476 | 1 3477 | ], 3478 | "matches": "y", 3479 | "showticklabels": false 3480 | }, 3481 | "yaxis4": { 3482 | "anchor": "x4", 3483 | "domain": [ 3484 | 0, 3485 | 1 3486 | ], 3487 | "matches": "y", 3488 | "showticklabels": false 3489 | }, 3490 | "yaxis5": { 3491 | "anchor": "x5", 3492 | "domain": [ 3493 | 0, 3494 | 1 3495 | ], 3496 | "matches": "y", 3497 | "showticklabels": false 3498 | }, 3499 | "yaxis6": { 3500 | "anchor": "x6", 3501 | "domain": [ 3502 | 0, 3503 | 1 3504 | ], 3505 | "matches": "y", 3506 | "showticklabels": false 3507 | }, 3508 | "yaxis7": { 3509 | "anchor": "x7", 3510 | "domain": [ 3511 | 0, 3512 | 1 3513 | ], 3514 | "matches": "y", 3515 | "showticklabels": false 3516 | } 3517 | } 3518 | }, 3519 | "text/html": [ 3520 | "
" 3545 | ] 3546 | }, 3547 | "metadata": {}, 3548 | "output_type": "display_data" 3549 | } 3550 | ], 3551 | "source": [ 3552 | "px.bar(\n", 3553 | " aggregated_resuts.reset_index(),\n", 3554 | " x=\"task\",\n", 3555 | " y=\"number_of_steps\",\n", 3556 | " facet_col=\"agent_name\",\n", 3557 | " color=\"correct_answer\",\n", 3558 | " title=\"Mean Number of Steps to Solve a Task\",\n", 3559 | " barmode=\"group\",\n", 3560 | " labels={\n", 3561 | " \"agent_name\": \"Agent\",\n", 3562 | " \"number_of_steps\": \"Mean Number of Steps\",\n", 3563 | " \"correct_answer\": \"Answer correctness\",\n", 3564 | " \"task\": \"Task\",\n", 3565 | " },\n", 3566 | " width=1500,\n", 3567 | ")" 3568 | ] 3569 | }, 3570 | { 3571 | "cell_type": "markdown", 3572 | "metadata": {}, 3573 | "source": [ 3574 | "Here we can see that on average, __tasks that failed have a higher number of steps taken than successful tasks__: taking many steps seems to be an indication that the agent is trying wrong directions to solve the problem." 3575 | ] 3576 | }, 3577 | { 3578 | "cell_type": "markdown", 3579 | "metadata": {}, 3580 | "source": [ 3581 | "#### Bonus: LLM-as-a-judge - Prometheus-13B vs GPT4" 3582 | ] 3583 | }, 3584 | { 3585 | "cell_type": "code", 3586 | "execution_count": 30, 3587 | "metadata": {}, 3588 | "outputs": [], 3589 | "source": [ 3590 | "res = pd.read_json(\"output/GPT4.json\")\n", 3591 | "res = res.loc[~res[\"eval_score_human\"].isnull()]\n", 3592 | "res = res[\n", 3593 | " [\"eval_score_human\", \"eval_score_GPT4\", \"eval_score_Prometheus-13B-v1.0\"]\n", 3594 | "].reset_index(drop=True)" 3595 | ] 3596 | }, 3597 | { 3598 | "cell_type": "code", 3599 | "execution_count": 31, 3600 | "metadata": {}, 3601 | "outputs": [ 3602 | { 3603 | "data": { 3604 | "application/vnd.plotly.v1+json": { 3605 | "config": { 3606 | "plotlyServerURL": "https://plot.ly" 3607 | }, 3608 | "data": [ 3609 | { 3610 | "alignmentgroup": "True", 3611 | "hovertemplate": "variable=eval_score_human
 {
  "cell_type": "markdown",
  "metadata": {},
  "source": [
   "#### Bonus: LLM-as-a-judge - Prometheus-13B vs GPT4"
  ]
 },
 {
  "cell_type": "code",
  "execution_count": 30,
  "metadata": {},
  "outputs": [],
  "source": [
   "res = pd.read_json(\"output/GPT4.json\")\n",
   "res = res.loc[~res[\"eval_score_human\"].isnull()]\n",
   "res = res[\n",
   "    [\"eval_score_human\", \"eval_score_GPT4\", \"eval_score_Prometheus-13B-v1.0\"]\n",
   "].reset_index(drop=True)"
  ]
 },
 {
  "cell_type": "code",
  "execution_count": 31,
  "metadata": {},
  "outputs": [
   [... Plotly figure JSON and HTML widget output omitted: grouped bar chart of eval_score_human, eval_score_GPT4 and eval_score_Prometheus-13B-v1.0 for each of the 27 human-scored answers (x axis: index, y axis: Score with ticks at 1, 3, 5) ...]
  ],
  "source": [
   "fig = px.bar(\n",
   "    res,\n",
   ")\n",
   "fig.update_layout(\n",
   "    width=1100,\n",
   "    height=300,\n",
   "    barmode=\"group\",\n",
   "    yaxis_range=[0.5, 5.5],\n",
   "    yaxis_title=\"Score\",\n",
   "    yaxis=dict(\n",
   "        tickmode=\"array\",\n",
   "        tickvals=[1, 3, 5],\n",
   "    ),\n",
   ")\n",
   "fig.show()"
  ]
 },
 {
  "cell_type": "code",
  "execution_count": 32,
  "metadata": {},
  "outputs": [
   {
    "name": "stdout",
    "output_type": "stream",
    "text": [
     "Proportion of cases matching human eval:\n",
     "0.333 for Prometheus\n",
     "0.963 for GPT4\n"
    ]
   }
  ],
  "source": [
   "print(\"Proportion of cases matching human eval:\")\n",
   "print(\n",
   "    (res[\"eval_score_human\"] == res[\"eval_score_Prometheus-13B-v1.0\"]).mean().round(3),\n",
   "    \"for Prometheus\",\n",
   ")\n",
   "print((res[\"eval_score_human\"] == res[\"eval_score_GPT4\"]).mean().round(3), \"for GPT4\")"
  ]
 },
 {
  "cell_type": "markdown",
  "metadata": {},
  "source": [
   "Given Prometheus-13B's high error rate (it matches the human score in only 33% of cases, versus 96% for GPT-4), we could not use it as a judge for this evaluation."
  ]
 },
" 4728 | ] 4729 | }, 4730 | "metadata": {}, 4731 | "output_type": "display_data" 4732 | } 4733 | ], 4734 | "source": [ 4735 | "fig = px.bar(\n", 4736 | " res,\n", 4737 | ")\n", 4738 | "fig.update_layout(\n", 4739 | " width=1100,\n", 4740 | " height=300,\n", 4741 | " barmode=\"group\",\n", 4742 | " yaxis_range=[0.5, 5.5],\n", 4743 | " yaxis_title=\"Score\",\n", 4744 | " yaxis=dict(\n", 4745 | " tickmode=\"array\",\n", 4746 | " tickvals=[1, 3, 5],\n", 4747 | " ),\n", 4748 | ")\n", 4749 | "fig.show()" 4750 | ] 4751 | }, 4752 | { 4753 | "cell_type": "code", 4754 | "execution_count": 32, 4755 | "metadata": {}, 4756 | "outputs": [ 4757 | { 4758 | "name": "stdout", 4759 | "output_type": "stream", 4760 | "text": [ 4761 | "Proportion of cases matching human eval:\n", 4762 | "0.333 for Prometheus\n", 4763 | "0.963 for GPT4\n" 4764 | ] 4765 | } 4766 | ], 4767 | "source": [ 4768 | "print(\"Proportion of cases matching human eval:\")\n", 4769 | "print(\n", 4770 | " (res[\"eval_score_human\"] == res[\"eval_score_Prometheus-13B-v1.0\"]).mean().round(3),\n", 4771 | " \"for Prometheus\",\n", 4772 | ")\n", 4773 | "print((res[\"eval_score_human\"] == res[\"eval_score_GPT4\"]).mean().round(3), \"for GPT4\")" 4774 | ] 4775 | }, 4776 | { 4777 | "cell_type": "markdown", 4778 | "metadata": {}, 4779 | "source": [ 4780 | "Given the high rate of error of Prometheus-13B, we could not use it for this evaluation." 4781 | ] 4782 | } 4783 | ], 4784 | "metadata": { 4785 | "kernelspec": { 4786 | "display_name": "pytorch", 4787 | "language": "python", 4788 | "name": "python3" 4789 | }, 4790 | "language_info": { 4791 | "codemirror_mode": { 4792 | "name": "ipython", 4793 | "version": 3 4794 | }, 4795 | "file_extension": ".py", 4796 | "mimetype": "text/x-python", 4797 | "name": "python", 4798 | "nbconvert_exporter": "python", 4799 | "pygments_lexer": "ipython3", 4800 | "version": "3.1.0" 4801 | } 4802 | }, 4803 | "nbformat": 4, 4804 | "nbformat_minor": 2 4805 | } 4806 | --------------------------------------------------------------------------------