├── .gitignore
├── OpenAgent
│   ├── __init__.py
│   ├── agents
│   │   ├── __init__.py
│   │   ├── base.py
│   │   ├── function_calling.py
│   │   ├── toolgen
│   │   │   ├── __init__.py
│   │   │   ├── inference.py
│   │   │   ├── toolgen.py
│   │   │   ├── toolgen_service.py
│   │   │   └── utils.py
│   │   └── tree
│   │       ├── __init__.py
│   │       └── tree.py
│   └── tools
│       ├── __init__.py
│       ├── base.py
│       ├── retrieval
│       │   ├── __init__.py
│       │   ├── embeddings.py
│       │   └── indexers.py
│       └── src
│           ├── __init__.py
│           ├── basic_tools.py
│           └── rapidapi
│               ├── __init__.py
│               ├── rapidapi.py
│               ├── server.py
│               └── utils.py
├── README.md
├── assets
│   └── banner.png
├── evaluation
│   ├── retrieval
│   │   ├── __init__.py
│   │   ├── eval_bm25.py
│   │   ├── eval_encoder.py
│   │   ├── eval_longcontext.py
│   │   ├── eval_openai_embedding.py
│   │   ├── eval_toolgen.py
│   │   ├── eval_toolgen_atomic.py
│   │   └── metrics.py
│   ├── toolbench
│   │   ├── __init__.py
│   │   ├── inference
│   │   │   ├── Algorithms
│   │   │   │   ├── DFS.py
│   │   │   │   ├── __init__.py
│   │   │   │   ├── base_search.py
│   │   │   │   └── single_chain.py
│   │   │   ├── Downstream_tasks
│   │   │   │   ├── __init__.py
│   │   │   │   ├── base_env.py
│   │   │   │   ├── rapidapi.py
│   │   │   │   └── rapidapi_multithread.py
│   │   │   ├── LLM
│   │   │   │   ├── __init__.py
│   │   │   │   ├── base_io.py
│   │   │   │   ├── chatgpt_function_model.py
│   │   │   │   ├── davinci_model.py
│   │   │   │   ├── llama_model.py
│   │   │   │   ├── retriever.py
│   │   │   │   ├── tool_chat_model.py
│   │   │   │   ├── tool_llama_lora_model.py
│   │   │   │   ├── tool_llama_model.py
│   │   │   │   ├── toolgen.py
│   │   │   │   └── toolgen_atomic.py
│   │   │   ├── LLM_rank
│   │   │   │   ├── __init__.py
│   │   │   │   └── rank_candidate.py
│   │   │   ├── Prompts
│   │   │   │   ├── ReAct_prompts.py
│   │   │   │   ├── Tree_search_prompts.py
│   │   │   │   ├── __init__.py
│   │   │   │   └── rank_prompts.py
│   │   │   ├── Tree
│   │   │   │   ├── Tree.py
│   │   │   │   └── __init__.py
│   │   │   ├── callbacks
│   │   │   │   └── ServerEventCallback.py
│   │   │   ├── qa_pipeline.py
│   │   │   ├── qa_pipeline_multithread.py
│   │   │   ├── qa_pipeline_open_domain.py
│   │   │   ├── server.py
│   │   │   ├── toolbench_server.py
│   │   │   └── utils.py
│   │   ├── model
│   │   │   ├── __init__.py
│   │   │   ├── apply_delta.py
│   │   │   ├── compression.py
│   │   │   ├── make_delta.py
│   │   │   └── model_adapter.py
│   │   ├── retrieval
│   │   │   ├── api_evaluator.py
│   │   │   ├── inference_example.py
│   │   │   └── train.py
│   │   ├── tool_conversation.py
│   │   ├── tooleval
│   │   │   ├── README.md
│   │   │   ├── README_ZH.md
│   │   │   ├── ToolBench.code-workspace
│   │   │   ├── __init__.py
│   │   │   ├── automatic_eval_sample.py
│   │   │   ├── convert_answers.py
│   │   │   ├── convert_to_answer_format.py
│   │   │   ├── eval_and_update_leaderboard.py
│   │   │   ├── eval_pass_rate.py
│   │   │   ├── eval_preference.py
│   │   │   ├── evaluation
│   │   │   │   ├── __init__.py
│   │   │   │   ├── dataclass.py
│   │   │   │   ├── methodcls.py
│   │   │   │   └── usereval.py
│   │   │   ├── evaluators
│   │   │   │   ├── __init__.py
│   │   │   │   ├── registered_cls
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   ├── rtl.py
│   │   │   │   │   ├── tooleval.py
│   │   │   │   │   └── utils.py
│   │   │   │   ├── tooleval_gpt-3.5-turbo_default
│   │   │   │   │   ├── config.yaml
│   │   │   │   │   └── template.txt
│   │   │   │   ├── tooleval_gpt-3.5-turbo_fn
│   │   │   │   │   ├── config.yaml
│   │   │   │   │   └── template.txt
│   │   │   │   └── tooleval_gpt-3.5-turbo_normalized
│   │   │   │       ├── config.yaml
│   │   │   │       └── template.txt
│   │   │   ├── evaluators_comparison.py
│   │   │   ├── requirements.txt
│   │   │   └── utils.py
│   │   └── utils.py
│   └── utils
│       ├── __init__.py
│       ├── embedding.py
│       ├── retrieval.py
│       └── utils.py
├── requirements.txt
├── scripts
│   ├── convert_answer
│   │   └── run_convert_answer.sh
│   ├── eval_full_pipeline.sh
│   ├── eval_opendomain_full_pipeline.sh
│   ├── inference
│   │   ├── inference_gpt_pipeline_virtual.sh
│   │   ├── inference_opendomain_toolllama_pipeline_virtual.sh
│   │   ├── inference_toolgen_pipeline_virtual.sh
│   │   └── inference_toolllama_pipeline_virtual.sh
│   ├── pass_rate
│   │   └── run_pass_rate.sh
│   ├── preference
│   │   └── run_preference.sh
│   └── retrieval
│       ├── eval_bm25.sh
│       ├── eval_encoder.sh
│       ├── eval_longcontext.sh
│       ├── eval_openai_embedding.sh
│       └── eval_toolgen.sh
└── training
    ├── README.md
    ├── data
    │   ├── __init__.py
    │   ├── dataset.py
    │   ├── loading.py
    │   └── utils.py
    ├── models
    │   ├── __init__.py
    │   ├── causallm.py
    │   ├── loading.py
    │   └── utils.py
    ├── prompts
    │   ├── __init__.py
    │   ├── conversations.py
    │   ├── templates.py
    │   └── utils.py
    ├── scripts
    │   └── train_toolgen.sh
    ├── src
    │   ├── __init__.py
    │   ├── configs
    │   │   ├── ds_z2_config.json
    │   │   ├── ds_z3_config.json
    │   │   ├── ds_z3_offload_config.json
    │   │   ├── project_config.json
    │   │   └── virtual_tokens.txt
    │   ├── convert_deepspeed_to_huggingface.py
    │   └── zero_to_fp32.py
    ├── train.py
    └── utils
        ├── __init__.py
        ├── distributed.py
        ├── huggingface.py
        ├── logging.py
        └── setting.py
/.gitignore:
--------------------------------------------------------------------------------
1 | *__pycache__*
2 | *.pyc
3 | *.ipynb_checkpoints
4 | *.ipynb
5 | data/
6 | keys.json
7 | log_file.txt
8 | *tar.gz
9 | .vscode
10 | scripts/retrieval/efficiency
11 | evaluation/retrieval/efficiency
12 | training/wandb
13 | training/logs
14 | training/checkpoints
15 | !training/data
--------------------------------------------------------------------------------
/OpenAgent/__init__.py:
--------------------------------------------------------------------------------
1 | from .agents import ToolGen
2 | from .tools import RapidAPIWrapper
--------------------------------------------------------------------------------
/OpenAgent/agents/__init__.py:
--------------------------------------------------------------------------------
1 | from .toolgen.toolgen import ToolGen
--------------------------------------------------------------------------------
/OpenAgent/agents/toolgen/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Reason-Wang/ToolGen/6839374a255810efe69deea4056eec5c55e25802/OpenAgent/agents/toolgen/__init__.py
--------------------------------------------------------------------------------
/OpenAgent/agents/toolgen/inference.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 | import torch
3 | from transformers import LogitsProcessor
4 |
5 | class DisjunctiveTrie:
6 | def __init__(self, nested_token_ids: List[List[int]], no_subsets=True):
7 | r"""
8 | A helper class that builds a trie with the words represented in `nested_token_ids`.
9 | """
10 | self.max_height = max([len(one) for one in nested_token_ids])
11 |
12 | root = {}
13 | for token_ids in nested_token_ids:
14 | level = root
15 | for tidx, token_id in enumerate(token_ids):
16 | if token_id not in level:
17 | level[token_id] = {}
18 |
19 | level = level[token_id]
20 |
21 | if no_subsets and self.has_subsets(root, nested_token_ids):
22 | raise ValueError(
23 | "Each list in `nested_token_ids` can't be a complete subset of another list, but is"
24 | f" {nested_token_ids}."
25 | )
26 |
27 | self.trie = root
28 |
29 | def next_tokens(self, current_seq):
30 | """
31 | The next possible tokens that will progress the trie, given the current sequence of tokens in `current_seq`.
32 | """
33 | start = self.trie
34 |
35 | for current_token in current_seq:
36 | start = start[current_token]
37 |
38 | next_tokens = list(start.keys())
39 |
40 | return next_tokens
41 |
42 | def reached_leaf(self, current_seq):
43 | next_tokens = self.next_tokens(current_seq)
44 |
45 | return len(next_tokens) == 0
46 |
47 | def count_leaves(self, root):
48 | next_nodes = list(root.values())
49 | if len(next_nodes) == 0:
50 | return 1
51 | else:
52 | return sum([self.count_leaves(nn) for nn in next_nodes])
53 |
54 | def has_subsets(self, trie, nested_token_ids):
55 | """
56 | Returns whether # of leaves == # of words. Otherwise some word is a subset of another.
57 | """
58 | leaf_count = self.count_leaves(trie)
59 | return len(nested_token_ids) != leaf_count
60 |
61 |
62 | class AllowKeyWordsProcessor(LogitsProcessor):
 63 |     ''' renxi.wang@mbzuai.ac.ae
 64 |     A logits processor that limits generated text to a set of predefined keywords.
 65 |     tokenizer: tokenizer used to encode the keywords
 66 |     trie: DisjunctiveTrie built from the predefined keywords
 67 |     input_ids: input_ids of the prompt that the model is generating from
 68 |     return:
 69 |         scores: the logit scores, with tokens that cannot continue a keyword masked out
 70 |         (for beam search these are log-softmax values, otherwise raw logits)
 71 |     '''
72 | def __init__(self, tokenizer, trie, input_ids):
73 | self.tokenizer = tokenizer
74 | self.trie = trie
75 | self.input_ids = input_ids
76 |
77 | def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor):
78 | input_length = self.input_ids.shape[1]
79 | generated_ids = input_ids[:, input_length:].tolist()
80 | new_token_ids = []
81 | for ids in generated_ids:
82 | try:
83 | next_token_ids = self.trie.next_tokens(ids)
84 | except KeyError as e:
85 | next_token_ids = [self.tokenizer.eos_token_id]
86 | if not next_token_ids:
87 | next_token_ids = [self.tokenizer.eos_token_id]
88 | new_token_ids.append(next_token_ids)
89 |
90 | for row, token_ids in enumerate(new_token_ids):
91 | mask = torch.ones_like(scores[row], dtype=torch.bool)
92 | mask[torch.tensor(token_ids)] = False
93 | scores[row, mask] = -1e10
94 |
95 | return scores
96 |
97 |
98 | class AllowTokenIdsProcessor(LogitsProcessor):
99 | def __init__(self, allowed_token_ids: List[int]):
100 | self.allowed_token_ids = allowed_token_ids
101 |
102 | def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor):
103 | mask = torch.ones_like(scores, dtype=torch.bool)
104 | mask[:, self.allowed_token_ids] = False
105 | scores = scores.masked_fill(mask, -1e10)
106 |
107 | return scores
108 |
--------------------------------------------------------------------------------
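
Usage sketch (not a file in the repository): DisjunctiveTrie plus AllowKeyWordsProcessor constrain Hugging Face generation to a fixed set of keyword strings. The model name below is a placeholder, and the repository root is assumed to be on PYTHONPATH.

# Hypothetical usage sketch, not part of the repository.
from transformers import AutoModelForCausalLM, AutoTokenizer, LogitsProcessorList
from OpenAgent.agents.toolgen.inference import DisjunctiveTrie, AllowKeyWordsProcessor

model_name = "gpt2"  # placeholder; any causal LM works
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Allowed outputs: tokenize each keyword and terminate it with EOS so the trie has leaves.
tool_names = ["get_current_temperature", "get_rain_probability"]
nested_ids = [tokenizer.encode(name, add_special_tokens=False) + [tokenizer.eos_token_id]
              for name in tool_names]
trie = DisjunctiveTrie(nested_ids)

inputs = tokenizer("Pick exactly one API name: ", return_tensors="pt")
processor = AllowKeyWordsProcessor(tokenizer, trie, inputs["input_ids"])
output_ids = model.generate(
    **inputs,
    max_new_tokens=16,
    do_sample=False,
    logits_processor=LogitsProcessorList([processor]),
)
print(tokenizer.decode(output_ids[0, inputs["input_ids"].shape[1]:], skip_special_tokens=True))
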
/OpenAgent/agents/toolgen/utils.py:
--------------------------------------------------------------------------------
1 | import json
2 | import re
3 |
4 | import requests
5 |
6 | def standardize(string):
7 | res = re.compile("[^\\u4e00-\\u9fa5^a-z^A-Z^0-9^_]")
8 | string = res.sub("_", string)
9 | string = re.sub(r"(_)\1+","_", string).lower()
10 | while True:
11 | if len(string) == 0:
12 | return string
13 | if string[0] == "_":
14 | string = string[1:]
15 | else:
16 | break
17 | while True:
18 | if len(string) == 0:
19 | return string
20 | if string[-1] == "_":
21 | string = string[:-1]
22 | else:
23 | break
24 | if string[0].isdigit():
25 | string = "get_" + string
26 | return string
27 |
28 | def change_name(name):
29 | change_list = ["from", "class", "return", "false", "true", "id", "and"]
30 | if name in change_list:
31 | name = "is_" + name
32 | return name
33 |
34 | def get_toolbench_name(tool_name, api_name):
35 | tool_name = standardize(tool_name)
36 | api_name = change_name(standardize(api_name))
37 | toolbench_name = api_name+f"_for_{tool_name}"
38 | toolbench_name = toolbench_name[-64:]
39 | return toolbench_name
40 |
41 |
42 | def toolgen_request(endpoint_url, query, system_prompt=None):
43 | payload = {
44 | "query": query,
45 | "system_prompt": system_prompt
46 | }
47 |
48 | try:
49 | response = requests.post(endpoint_url, json=payload, stream=True) # Enable streaming
50 | response.raise_for_status() # Raise an error for HTTP errors
51 | for line in response.iter_lines(decode_unicode=True):
52 | if line: # Filter out keep-alive new lines
53 | yield json.loads(line) # Parse each line as JSON
54 | except requests.exceptions.RequestException as e:
55 | print(f"Error calling ToolGen model: {e}")
56 | yield {"error": str(e)}
--------------------------------------------------------------------------------
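
Illustration (not part of the repository) of the name normalization above; the tool/API names are invented, and the endpoint URL passed to toolgen_request is a placeholder for a running toolgen_service.py.

# Deterministic, no network needed.
print(standardize("Weather API"))                              # weather_api
print(get_toolbench_name("Weather API", "Get Current Weather"))
# -> get_current_weather_for_weather_api  (kept to its last 64 characters if longer)

# Streaming call against a locally running toolgen_service (URL is a placeholder).
for event in toolgen_request("http://localhost:8000/stream", query="What is the weather in Paris?"):
    print(event)
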
/OpenAgent/agents/tree/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Reason-Wang/ToolGen/6839374a255810efe69deea4056eec5c55e25802/OpenAgent/agents/tree/__init__.py
--------------------------------------------------------------------------------
/OpenAgent/agents/tree/tree.py:
--------------------------------------------------------------------------------
1 | from termcolor import colored
2 | import numpy as np
3 | from copy import deepcopy
4 | import math
5 |
6 | class Tree:
7 | def __init__(self):
8 | self.root = TreeNode()
9 | self.now_deal_node = self.root
10 |
11 |
12 | def to_json_recursive(self,use_messages=False):
13 | tree_structure = self.root.to_json_recursive(use_messages=use_messages)
14 | js_obj = {
15 | "size": self.root.get_size(),
16 | "max_length":self.root.get_max_depth(),
17 | "tree": tree_structure,
18 | }
19 | return js_obj
20 |
21 |
22 | class TreeNode:
23 |
24 | def __init__(self):
25 | self.is_terminal = False
26 | self.pruned = False
27 | # self.finished = False
28 | self.node_type = None
29 | self.description = ""
30 | self.observation = ""
31 | self.observation_code = None
32 | self.father = None
33 | self.children = []
34 | # self.io_state = None
35 |
36 | # openai-messages of this node
37 | self.messages = []
38 |
39 |
40 | def get_depth(self):
41 | if self.father == None:
42 | return 0
43 | return self.father.get_depth() + 1
44 |
45 | def print(self,process_id = 0):
46 | if process_id != 0:
47 | return
48 | color_converter = {"Thought":"red", "Action": "blue", "Action Input": "cyan","Final Answer": "green","Reflection":"blue"}
49 | print(colored(f"{self.node_type}: {self.description}",color = color_converter[self.node_type]))
50 | if self.observation != "":
51 | if len(self.observation) < 1536:
52 | print(colored(f"Observation: {self.observation}",color="yellow"))
53 | else:
54 | print(colored(f"Observation: {self.observation[:1536]}......(len={len(self.observation)})",color="yellow"))
55 |
56 |
57 | def to_json_recursive(self,use_messages=False):
58 | js_obj = self.to_json(use_messages=use_messages)
59 | js_obj["children"] = []
60 | for child in self.children:
61 | js_obj["children"].append(child.to_json_recursive())
62 | return js_obj
63 |
64 |
65 | def get_chain_result_from_this_node(self,use_messages=False):
66 | '''
67 | Returns chained results, starting from this node up to the root node
68 | '''
69 | now_node = self
70 | result = []
71 | while now_node.father != None:
72 | result = [now_node.to_json(use_messages=use_messages)] + result
73 | now_node = now_node.father
74 | return result
75 |
76 | def to_json(self, use_messages=False):
77 |
78 | json_obj = {}
79 | json_obj["is_terminal"] = self.is_terminal
80 | json_obj["pruned"] = self.pruned
81 |
82 | json_obj["depth"] = self.get_depth()
83 | json_obj["node_type"] = self.node_type
84 | json_obj["description"] = self.description
85 | if self.observation != "":
86 | json_obj["observation"] = self.observation
87 | if self.observation_code != None:
88 | json_obj["observation_code"] = self.observation_code
89 | json_obj["child_count"] = len(self.children)
90 |
91 | # if self.io_state != None and self.node_type == "Action Input":
92 | # json_obj["io_state"] = self.io_state.to_json()
93 |
94 |
95 | if use_messages:
96 | json_obj["messages"] = []
97 | for message in self.messages:
98 | if not ("valid" in message.keys() and message["valid"] == False):
99 | json_obj["messages"].append(message["role"])
100 | else:
101 | json_obj["messages"].append(message["role"] + "_invalid")
102 |
103 | return json_obj
--------------------------------------------------------------------------------
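
Hypothetical sketch (not part of the repository): building a two-node Thought -> Action chain by hand and serializing it with the methods defined above. It assumes the repository root is on PYTHONPATH.

from OpenAgent.agents.tree.tree import TreeNode

root = TreeNode()
root.node_type = "Thought"
root.description = "I should look up the temperature first."

child = TreeNode()
child.node_type = "Action"
child.description = "get_current_temperature"
child.observation = '{"temperature": 75}'
child.father = root
root.children.append(child)

print(child.get_depth())                         # 1
print(child.get_chain_result_from_this_node())   # json for the non-root nodes on the chain
print(root.to_json_recursive())                  # nested dict with a "children" list
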
/OpenAgent/tools/__init__.py:
--------------------------------------------------------------------------------
1 | from .src.rapidapi.rapidapi import RapidAPIWrapper
--------------------------------------------------------------------------------
/OpenAgent/tools/base.py:
--------------------------------------------------------------------------------
1 | import json
2 | from termcolor import colored
3 |
4 | class BaseTool():
5 | def __init__(self, tools, tools_map, max_observation_length=1024):
6 | self.tools = {}
7 | for tool in tools:
8 | self.tools[tool['function']['name']] = tool
9 | # self.tools = tools
10 | self.tools_map = tools_map
11 | self.max_observation_length = max_observation_length
12 | self.success = 0
13 |
14 | def call(self, action_name, action_input):
15 | # print(f"Calling {action_name} with input: {action_input}")
16 | if action_name in self.tools:
17 | obs, code = self._call(action_name, action_input)
18 | if len(obs) > self.max_observation_length:
19 | obs = obs[:self.max_observation_length] + "..."
20 | return obs, code
21 | else:
22 | return {"error": f"No such tool name: {action_name}"}, 0
23 |
24 | def check_success(self):
25 | return self.success
26 |
27 | def _call(self, action_name, action_input):
28 | """Need to return an observation string and status code:
29 | 0 means normal response
30 | 1 means there is no corresponding api name
31 | 2 means there is an error in the input
32 | 3 represents the end of the generation and the final answer appears
33 |         4 means that the model decides to prune by itself
34 | 5 represents api call timeout
35 | 6 for 404
36 | 7 means not subscribed
37 | 8 represents unauthorized
38 | 9 represents too many requests
39 | 10 stands for rate limit
40 | 11 message contains "error" field
41 | 12 error sending request
42 | """
43 | json_data = json.loads(action_input)
44 | if action_name in self.tools:
45 | function = self.tools_map[action_name]
46 | # print(function)
47 | print(colored(f"Querying: {action_name}", color="yellow"))
48 |
49 | response = function(**json_data)
50 | else:
51 | response = {
52 |                 "error": "invalid hallucination of function name."
53 | }
54 | status_code = 0
55 | return json.dumps(response), status_code
56 |
57 | if isinstance(response, dict) and "status_code" in response:
58 | status_code = response['status_code']
59 | del response['status_code']
60 | # whether generated the final answer
61 | if status_code == 3:
62 | self.success = 1
63 | else:
64 | status_code = 0
65 |
66 | return json.dumps(response), status_code
67 |
68 | def to_json(self):
69 | return {}
70 |
--------------------------------------------------------------------------------
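
Sketch (not part of the repository) of the observation/status-code protocol documented in _call, wiring BaseTool to the demo tools from OpenAgent/tools/src/basic_tools.py; it assumes the package's dependencies are installed.

import json
from OpenAgent.tools.base import BaseTool
from OpenAgent.tools.src.basic_tools import TestTools, TestToolsMap

tool = BaseTool(TestTools, TestToolsMap)
obs, code = tool.call("get_current_temperature",
                      json.dumps({"location": "San Francisco, CA", "unit": "Fahrenheit"}))
print(obs, code)                    # "75" 0  -- a normal response

obs, code = tool.call("Finish",
                      json.dumps({"return_type": "give_answer", "final_answer": "It is 75F."}))
print(code, tool.check_success())   # 3 1  -- final answer given, task marked successful
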
/OpenAgent/tools/retrieval/__init__.py:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/OpenAgent/tools/retrieval/embeddings.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 |
4 | def get_embeddings(model, device, texts, batch_size=16):
5 | model.eval()
6 | model.to(device)
7 | # tbar = tqdm(dataloader)
8 | embeddings = []
9 | with torch.no_grad():
10 | for i in range(0, len(texts), batch_size):
11 | batch = texts[i:i + batch_size]
12 | embeddings.append(model.encode(batch, device=device))
13 | return np.concatenate(embeddings)
--------------------------------------------------------------------------------
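
Sketch (not part of the repository): get_embeddings expects a model exposing encode(batch, device=...), for example a SentenceTransformer; the model name below is just a common placeholder.

from sentence_transformers import SentenceTransformer
from OpenAgent.tools.retrieval.embeddings import get_embeddings

model = SentenceTransformer("all-MiniLM-L6-v2")
texts = ["get the current temperature", "probability of rain tomorrow"]
embs = get_embeddings(model, device="cpu", texts=texts, batch_size=2)
print(embs.shape)   # (2, embedding dimension)
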
/OpenAgent/tools/retrieval/indexers.py:
--------------------------------------------------------------------------------
1 | import faiss
2 | import numpy as np
3 |
4 | class Indexer:
5 | def __init__(self, embeddings, vector_size, ids=None, similarity="cosine"):
6 | self.index = faiss.IndexFlatIP(vector_size)
7 | self.similarity = similarity
8 | if similarity == "cosine":
9 | embeddings /= np.linalg.norm(embeddings, axis=1)[:, None]
10 | self.index.add(embeddings)
11 | if ids is None:
12 | self.ids = list(range(embeddings.shape[0]))
13 | else:
14 | self.ids = ids
15 |
16 |
17 | def add(self, embeddings, ids=None):
18 | if self.similarity == "cosine":
19 | embeddings /= np.linalg.norm(embeddings, axis=1)[:, None]
20 | if len(embeddings.shape) == 1:
21 | embeddings = embeddings.reshape(1, -1)
22 | self.index.add(embeddings)
23 | if ids is None:
24 | self.ids.extend(list(range(self.ids[-1] + 1, self.ids[-1] + 1 + embeddings.shape[0])))
25 | else:
26 | self.ids.extend(ids)
27 |
28 | def search(self, queries: np.array, top_n: int):
29 | if len(queries.shape) == 1:
30 | queries = queries.reshape(1, -1)
31 | try:
32 | if self.similarity == "cosine":
33 | queries /= np.linalg.norm(queries, axis=1)[:, None]
34 | scores, indexes = self.index.search(queries, top_n)
35 | except AttributeError:
36 | print(queries)
37 | scores_ids = []
38 | for top_n_score, top_n_idx in zip(scores, indexes):
39 | top_n_score_id = []
40 | for s, i in zip(top_n_score, top_n_idx):
41 | top_n_score_id.append((s, self.ids[i]))
42 | scores_ids.append(top_n_score_id)
43 |
44 | return scores_ids
--------------------------------------------------------------------------------
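
Sketch (not part of the repository) of the FAISS wrapper above: index a few random float32 vectors and query one of them back. It assumes faiss-cpu and the package's other dependencies are installed.

import numpy as np
from OpenAgent.tools.retrieval.indexers import Indexer

dim = 8
docs = np.random.rand(5, dim).astype("float32")
indexer = Indexer(docs.copy(), dim, ids=["tool_a", "tool_b", "tool_c", "tool_d", "tool_e"])

query = docs[2:3].copy()            # should retrieve "tool_c" first with cosine score ~1.0
for score, doc_id in indexer.search(query, top_n=3)[0]:
    print(f"{doc_id}: {score:.3f}")
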
/OpenAgent/tools/src/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Reason-Wang/ToolGen/6839374a255810efe69deea4056eec5c55e25802/OpenAgent/tools/src/__init__.py
--------------------------------------------------------------------------------
/OpenAgent/tools/src/basic_tools.py:
--------------------------------------------------------------------------------
1 |
2 |
3 | import json
4 |
5 |
6 | BasicTools = [
7 | # Finish function
8 | {
9 | "type": "function",
10 | "function": {
11 | "name": "Finish",
12 | "description": "If you believe that you have obtained a result that can answer the task, please call this function to provide the final answer. Alternatively, if you recognize that you are unable to proceed with the task in the current state, call this function to restart. Remember: you must ALWAYS call this function at the end of your attempt, and the only part that will be shown to the user is the final answer, so it should contain sufficient information.",
13 | "parameters": {
14 | "type": "object",
15 | "properties": {
16 | "return_type": {
17 | "type": "string",
18 | "enum": ["give_answer","give_up_and_restart"],
19 | },
20 | "final_answer": {
21 | "type": "string",
22 | "description": "The final answer you want to give the user. You should have this field if \"return_type\"==\"give_answer\"",
23 | }
24 | },
25 | "required": ["return_type"],
26 | },
27 | }
28 | }
29 | ]
30 |
31 |
32 | TestTools = [
33 | {
34 | "type": "function",
35 | "function": {
36 | "name": "get_current_temperature",
37 | "description": "Get the current temperature for a specific location",
38 | "parameters": {
39 | "type": "object",
40 | "properties": {
41 | "location": {
42 | "type": "string",
43 | "description": "The city and state, e.g., San Francisco, CA"
44 | },
45 | "unit": {
46 | "type": "string",
47 | "enum": ["Celsius", "Fahrenheit"],
48 | "description": "The temperature unit to use. Infer this from the user's location."
49 | }
50 | },
51 | "required": ["location", "unit"]
52 | }
53 | }
54 | },
55 | {
56 | "type": "function",
57 | "function": {
58 | "name": "get_rain_probability",
59 | "description": "Get the probability of rain for a specific location",
60 | "parameters": {
61 | "type": "object",
62 | "properties": {
63 | "location": {
64 | "type": "string",
65 | "description": "The city and state, e.g., San Francisco, CA"
66 | }
67 | },
68 | "required": ["location"]
69 | }
70 | }
71 | },
72 | {
73 | "type": "function",
74 | "function": {
75 | "name": "Finish",
76 | "description": "If you believe that you have obtained a result that can answer the task, please call this function to provide the final answer. Alternatively, if you recognize that you are unable to proceed with the task in the current state, call this function to restart. Remember: you must ALWAYS call this function at the end of your attempt, and the only part that will be shown to the user is the final answer, so it should contain sufficient information.",
77 | "parameters": {
78 | "type": "object",
79 | "properties": {
80 | "return_type": {
81 | "type": "string",
82 | "enum": ["give_answer","give_up_and_restart"],
83 | },
84 | "final_answer": {
85 | "type": "string",
86 | "description": "The final answer you want to give the user. You should have this field if \"return_type\"==\"give_answer\"",
87 | }
88 | },
89 | "required": ["return_type"],
90 | },
91 | }
92 | }
93 | ]
94 |
95 | def finish(return_type=None, final_answer=None):
96 |
97 | if return_type is None:
98 | response = {"error": "must have \"return_type\""}
99 | status = 2
100 |     elif return_type == "give_up_and_restart":
101 | response = {"response": "chose to give up and restart"}
102 | status = 4
103 | elif return_type == "give_answer":
104 | if final_answer is None:
105 | response = {"error": "must have \"final_answer\""}
106 | status = 2
107 | else:
108 | response = {"response": "successfully giving the final answer."}
109 | status = 3
110 | else:
111 |         response = {"error": "\"return_type\" is not a valid choice"}
112 | status = 2
113 |
114 | response['status_code'] = status
115 | return response
116 |
117 |
118 | def get_temperature(location, unit):
119 | return 75
120 |
121 | def get_rain_probability(location):
122 | return 0.2
123 |
124 | TestToolsMap = {
125 | "get_current_temperature": get_temperature,
126 | "get_rain_probability": get_rain_probability,
127 | "Finish": finish
128 | }
129 |
--------------------------------------------------------------------------------
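
Quick check (not part of the repository) of how the Finish pseudo-tool maps return_type onto the status codes documented in OpenAgent/tools/base.py (3 = final answer given, 4 = give up and restart).

from OpenAgent.tools.src.basic_tools import finish

print(finish(return_type="give_answer", final_answer="It is 75F in San Francisco."))
# {'response': 'successfully giving the final answer.', 'status_code': 3}
print(finish(return_type="give_up_and_restart"))
# {'response': 'chose to give up and restart', 'status_code': 4}
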
/OpenAgent/tools/src/rapidapi/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Reason-Wang/ToolGen/6839374a255810efe69deea4056eec5c55e25802/OpenAgent/tools/src/rapidapi/__init__.py
--------------------------------------------------------------------------------
/OpenAgent/tools/src/rapidapi/utils.py:
--------------------------------------------------------------------------------
1 | import json
2 | import re
3 |
4 | def standardize(string):
5 | res = re.compile("[^\\u4e00-\\u9fa5^a-z^A-Z^0-9^_]")
6 | string = res.sub("_", string)
7 | string = re.sub(r"(_)\1+","_", string).lower()
8 | while True:
9 | if len(string) == 0:
10 | return string
11 | if string[0] == "_":
12 | string = string[1:]
13 | else:
14 | break
15 | while True:
16 | if len(string) == 0:
17 | return string
18 | if string[-1] == "_":
19 | string = string[:-1]
20 | else:
21 | break
22 | if string[0].isdigit():
23 | string = "get_" + string
24 | return string
25 |
26 | def change_name(name):
27 | change_list = ["from", "class", "return", "false", "true", "id", "and"]
28 | if name in change_list:
29 | name = "is_" + name
30 | return name
31 |
32 |
33 | def finish(action_input):
34 | try:
35 | json_data = json.loads(action_input, strict=False)
36 | except:
37 | json_data = {}
38 | if '"return_type": "' in action_input:
39 | if '"return_type": "give_answer"' in action_input:
40 | return_type = "give_answer"
41 | elif '"return_type": "give_up_and_restart"' in action_input:
42 | return_type = "give_up_and_restart"
43 | else:
44 | return_type = action_input[action_input.find('"return_type": "')+len('"return_type": "'):action_input.find('",')]
45 | json_data["return_type"] = return_type
46 | if '"final_answer": "' in action_input:
47 | final_answer = action_input[action_input.find('"final_answer": "')+len('"final_answer": "'):]
48 | json_data["final_answer"] = final_answer
49 | if "return_type" not in json_data.keys():
50 | return "{error:\"must have \"return_type\"\"}", 2
51 | if json_data["return_type"] == "give_up_and_restart":
52 | return "{\"response\":\"chose to give up and restart\"}",4
53 | elif json_data["return_type"] == "give_answer":
54 | if "final_answer" not in json_data.keys():
55 | return "{error:\"must have \"final_answer\"\"}", 2
56 |
57 | return "{\"response\":\"successfully giving the final answer.\"}", 3
58 | else:
59 | return "{error:\"\"return_type\" is not a valid choice\"}", 2
60 |
61 |
--------------------------------------------------------------------------------
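
Quick check (not part of the repository) of the string-based finish() above, which parses the raw Action Input; the expected outputs follow from the branches in the function.

from OpenAgent.tools.src.rapidapi.utils import finish

print(finish('{"return_type": "give_answer", "final_answer": "It is 75F."}'))
# ('{"response":"successfully giving the final answer."}', 3)
print(finish('{"return_type": "give_up_and_restart"}'))
# ('{"response":"chose to give up and restart"}', 4)
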
/assets/banner.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Reason-Wang/ToolGen/6839374a255810efe69deea4056eec5c55e25802/assets/banner.png
--------------------------------------------------------------------------------
/evaluation/retrieval/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Reason-Wang/ToolGen/6839374a255810efe69deea4056eec5c55e25802/evaluation/retrieval/__init__.py
--------------------------------------------------------------------------------
/evaluation/retrieval/metrics.py:
--------------------------------------------------------------------------------
1 |
2 | import numpy as np
3 | from tqdm import trange
 4 | import sklearn.metrics
5 | from utils.retrieval import Indexer
6 | from utils.embedding import get_embeddings
7 |
8 |
9 | def ndcg_score(model, tokenizer, queries, corpus, relevant_docs, batch_size=32, corpus_chunk_size=32):
10 | query_embeddings = get_embeddings(
11 | model,
12 | tokenizer,
13 | device="cuda",
14 | texts=queries,
15 | )
16 | doc_embeddings = get_embeddings(
17 | model,
18 | tokenizer,
19 | device="cuda",
20 | texts=list(corpus.values()),
21 | )
22 |
23 | indexer = Indexer(doc_embeddings, doc_embeddings.shape[1], ids=list(corpus.keys()))
24 | scores_docids = indexer.search(query_embeddings, 5)
25 |     corpus_ids = queries_ids = list(corpus.keys())
26 |     ndcg_scores = compute_ndcg_scores(queries_ids, scores_docids, relevant_docs, corpus_ids)
27 |
28 | return ndcg_scores
29 |
30 | def compute_ndcg(relevant_docs_ids, score_docid, corpus_ids, k):
31 | # Build the ground truth relevance scores and the model's predicted scores
32 | length = len(corpus_ids)
33 | true_relevance = np.zeros(length)
34 | predicted_scores = np.zeros(length)
35 | top_hits = score_docid
36 | for hit in top_hits:
37 | predicted_scores[corpus_ids.index(hit[1])] = hit[0]
38 | if hit[1] in relevant_docs_ids:
39 | true_relevance[corpus_ids.index(hit[1])] = 1
40 |
41 | return sklearn.metrics.ndcg_score([true_relevance], [predicted_scores], k=k)
42 |
43 |
44 | def compute_ndcg_scores(queries_ids, scores_docids, relevant_docs, corpus_ids):
45 | ndcg_scores = []
46 | for query_id, scores_docid in zip(queries_ids, scores_docids):
47 | relevant_docs_ids = relevant_docs[query_id]
48 |         ndcg_score = compute_ndcg(relevant_docs_ids, scores_docid, corpus_ids, k=5)
49 | ndcg_scores.append(ndcg_score)
50 | return np.mean(ndcg_scores)
--------------------------------------------------------------------------------
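
Standalone illustration (not part of the repository) of the quantity compute_ndcg assembles: a 0/1 relevance vector and the retrieval scores over the same corpus positions, fed to scikit-learn's ndcg_score.

import numpy as np
import sklearn.metrics

true_relevance = np.array([[1, 0, 1]])          # d1 and d3 are relevant to the query
predicted_scores = np.array([[0.9, 0.5, 0.1]])  # retrieval scores for d1, d2, d3
print(sklearn.metrics.ndcg_score(true_relevance, predicted_scores, k=5))
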
/evaluation/toolbench/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Reason-Wang/ToolGen/6839374a255810efe69deea4056eec5c55e25802/evaluation/toolbench/__init__.py
--------------------------------------------------------------------------------
/evaluation/toolbench/inference/Algorithms/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Reason-Wang/ToolGen/6839374a255810efe69deea4056eec5c55e25802/evaluation/toolbench/inference/Algorithms/__init__.py
--------------------------------------------------------------------------------
/evaluation/toolbench/inference/Algorithms/base_search.py:
--------------------------------------------------------------------------------
1 | from Downstream_tasks.base_env import base_env
2 |
3 | class base_search_method:
4 | """For the base tree search method, you need to support the following functions"""
5 |
6 | def __init__(self,llm,io_func: base_env, process_id=0, callbacks = None):
7 | """Args:
8 | llm: The interface of the LLM
9 | io_func(base_env): Interface to the environment,
10 | process_id (int, optional): In multiprocessing annotation, this describes the process id. Defaults to 0.
11 | callbacks (_type_, optional): _description_. Defaults to None.
12 | """
13 | pass
14 |
15 | def to_json(self,answer=False,process=True):
16 | '''
17 |         Return a json object.
18 |         If answer=True, it must contain the following fields for answer annotation.
19 |         If process=True, you need to provide the full information of the tree-search process.
20 |
21 | "answer_generation": {
22 | "valid_data": bool,
23 | "final_answer": string,
24 | "finish_type": enum["give_up","give_answer"]
25 | "train_messages": [ [openAI-message] ],
26 | }
27 | '''
28 | raise NotImplementedError
29 |
30 | def start(self, **args):
31 | """This is the entry point of the searching process"""
32 | raise NotImplementedError
33 |
34 |
--------------------------------------------------------------------------------
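
For reference, this is the shape of the "answer_generation" record described in the to_json docstring above; all values below are illustrative only.

answer_generation = {
    "valid_data": True,
    "final_answer": "The temperature in San Francisco is 75F.",
    "finish_type": "give_answer",   # or "give_up"
    "train_messages": [
        [{"role": "system", "content": "..."},
         {"role": "user", "content": "..."},
         {"role": "assistant", "content": "..."}],
    ],
}
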
/evaluation/toolbench/inference/Downstream_tasks/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Reason-Wang/ToolGen/6839374a255810efe69deea4056eec5c55e25802/evaluation/toolbench/inference/Downstream_tasks/__init__.py
--------------------------------------------------------------------------------
/evaluation/toolbench/inference/Downstream_tasks/base_env.py:
--------------------------------------------------------------------------------
1 | class base_env:
2 |
3 | def __init__(self):
4 | self.task_description = ""
5 | self.input_description = ""
6 | self.tool_names = []
7 | self.functions = []
8 |
9 | def restart(self):
10 | '''
11 |         Restart the environment
12 | '''
13 | raise NotImplementedError
14 |
15 | def get_score(self):
16 | '''
17 | Get the value of the current state
18 | A fake function, used to search in oracle mode, which is not actually used (and impossible to obtain)
19 | '''
20 | raise NotImplementedError
21 |
22 | def step(self, action, input_str):
23 | '''
24 | Perform an interaction in natural language mode
25 | return value (output str, status code)
26 | '''
27 | raise NotImplementedError
28 |
29 | def check_success(self):
30 | '''
31 | Returns 1 if successful, otherwise returns 0
32 | '''
33 | raise NotImplementedError
34 |
35 | def to_json(self):
36 | raise NotImplementedError
--------------------------------------------------------------------------------
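
Hedged sketch (not part of the repository) of a minimal environment implementing the interface above; the echo behaviour is invented purely for illustration, and the repository root is assumed to be on sys.path.

from evaluation.toolbench.inference.Downstream_tasks.base_env import base_env

class EchoEnv(base_env):
    def __init__(self):
        super().__init__()
        self.task_description = "Echo whatever action the agent takes."
        self.tool_names = ["echo"]
        self.success = 0

    def restart(self):
        self.success = 0

    def step(self, action, input_str):
        # (observation string, status code): 0 = normal, 3 = final answer
        if action == "Finish":
            self.success = 1
            return '{"response": "final answer recorded"}', 3
        return '{"echo": "%s: %s"}' % (action, input_str), 0

    def check_success(self):
        return self.success

    def to_json(self):
        return {"success": self.success}
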
/evaluation/toolbench/inference/LLM/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Reason-Wang/ToolGen/6839374a255810efe69deea4056eec5c55e25802/evaluation/toolbench/inference/LLM/__init__.py
--------------------------------------------------------------------------------
/evaluation/toolbench/inference/LLM/base_io.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | def base_io(input_str):
4 | pass
--------------------------------------------------------------------------------
/evaluation/toolbench/inference/LLM/davinci_model.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | from typing import Optional, List, Mapping, Any
4 | from termcolor import colored
5 | import json
6 | import random
7 | import openai
8 | from typing import Optional
9 | from evaluation.toolbench.model.model_adapter import get_conversation_template
10 | from evaluation.toolbench.inference.utils import SimpleChatIO, react_parser
11 | from evaluation.toolbench.inference.Prompts.ReAct_prompts import FORMAT_INSTRUCTIONS_SYSTEM_FUNCTION_ZEROSHOT
12 |
13 |
14 | class Davinci:
15 | def __init__(self, model="text-davinci-003", openai_key="") -> None:
16 | super().__init__()
17 | self.model = model
18 | self.openai_key = openai_key
19 | self.chatio = SimpleChatIO()
20 |
21 | def prediction(self, prompt: str, stop: Optional[List[str]] = None) -> str:
22 | max_try = 10
23 | while True:
24 | openai.api_key = self.openai_key
25 | try:
26 | response = openai.Completion.create(
27 | engine=self.model,
28 | prompt=prompt,
29 | temperature=0.5,
30 | max_tokens=512,
31 | top_p=1,
32 | frequency_penalty=0,
33 | presence_penalty=0,
34 | stop="End Action"
35 | )
36 | result = response['choices'][0]['text'].strip()
37 | break
38 | except Exception as e:
39 | print(e)
40 | max_try -= 1
41 | if max_try < 0:
 42 |                     result = "Exceed max retry times. Please check your davinci api calling."
 43 |                     return result, {"total_tokens": 0}
44 | return result, response["usage"]
45 |
46 | def add_message(self, message):
47 | self.conversation_history.append(message)
48 |
49 | def change_messages(self,messages):
50 | self.conversation_history = messages
51 |
52 | def display_conversation(self, detailed=False):
53 | role_to_color = {
54 | "system": "red",
55 | "user": "green",
56 | "assistant": "blue",
57 | "function": "magenta",
58 | }
59 | print("before_print"+"*"*50)
60 | for message in self.conversation_history:
61 | print_obj = f"{message['role']}: {message['content']} "
62 | if "function_call" in message.keys():
63 | print_obj = print_obj + f"function_call: {message['function_call']}"
64 | print_obj += ""
65 | print(
66 | colored(
67 | print_obj,
68 | role_to_color[message["role"]],
69 | )
70 | )
71 | print("end_print"+"*"*50)
72 |
73 | def parse(self,functions,process_id,**args):
74 | conv = get_conversation_template("tool-llama-single-round")
75 | roles = {"system": conv.roles[0], "user": conv.roles[1], "function": conv.roles[2], "assistant": conv.roles[3]}
76 | conversation_history = self.conversation_history
77 | question = ''
78 | for message in conversation_history:
79 | role = roles[message['role']]
80 | content = message['content']
81 | if role == "User":
82 | question = content
83 | break
84 | func_str = ""
85 | func_list = []
86 | for function_dict in functions:
87 | param_str = ""
88 | api_name = function_dict["name"]
89 | func_list.append(api_name)
90 | if "Finish" in api_name:
91 | param_str = f'"return_type": string, "final_answer": string, '
92 | api_desc = "If you believe that you have obtained a result that can answer the task, please call this function to provide the final answer. ALWAYS call this function at the end of your attempt to answer the question finally."
93 | func_str += f"{api_name}: {api_desc}. Your input should be a json (args json schema): {param_str} The Action to trigger this API should be {api_name} and the input parameters should be a json dict string. Pay attention to the type of parameters.\n\n"
94 | else:
95 | api_desc = function_dict["description"][function_dict["description"].find("The description of this function is: ")+len("The description of this function is: "):]
96 | for param_name in function_dict["parameters"]["properties"]:
97 | data_type = function_dict["parameters"]["properties"][param_name]["type"]
98 | param_str += f'"{param_name}": {data_type}, '
99 | param_str = "{{" + param_str + "}}"
100 | func_str += f"{api_name}: {api_desc}. Your input should be a json (args json schema): {param_str} The Action to trigger this API should be {api_name} and the input parameters should be a json dict string. Pay attention to the type of parameters.\n\n"
101 | func_list = str(func_list)
102 | prompt = FORMAT_INSTRUCTIONS_SYSTEM_FUNCTION_ZEROSHOT.replace("{func_str}", func_str).replace("{func_list}", func_list).replace("{func_list}", func_list).replace("{question}", question)
103 | prompt = prompt.replace("{{", "{").replace("}}", "}")
104 | for message in conversation_history:
105 | role = roles[message['role']]
106 | content = message['content']
107 | if role == "Assistant":
108 | prompt += f"\n{content}\n"
109 | elif role == "Function":
110 | prompt += f"Observation: {content}\n"
111 | if functions != []:
112 | predictions, usage = self.prediction(prompt)
113 | else:
114 | predictions, usage = self.prediction(prompt)
115 |
116 | # react format prediction
117 | thought, action, action_input = react_parser(predictions)
118 | message = {
119 | "role": "assistant",
120 | "content": thought,
121 | "function_call": {
122 | "name": action,
123 | "arguments": action_input
124 | }
125 | }
126 | return message, 0, usage["total_tokens"]
127 |
128 |
129 | if __name__ == "__main__":
130 | llm = Davinci()
131 | result = llm.prediction("How old are you?")
132 | print(result)
--------------------------------------------------------------------------------
/evaluation/toolbench/inference/LLM/llama_model.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | from typing import Optional, List, Mapping, Any
4 | from transformers import AutoTokenizer, AutoModelForCausalLM
5 | from termcolor import colored
6 | import time
7 | from typing import Optional
8 | from transformers import (
9 | AutoTokenizer,
10 | AutoModelForCausalLM
11 | )
12 | from toolbench.utils import process_system_message
13 | from toolbench.model.model_adapter import get_conversation_template
14 | from toolbench.inference.utils import SimpleChatIO, generate_stream, react_parser
15 |
16 |
17 | class LlamaModel:
18 | def __init__(self, model_name_or_path: str, template:str="tool-llama-single-round", device: str="cuda", cpu_offloading: bool=False, max_sequence_length: int=2048) -> None:
19 | super().__init__()
20 | self.model_name = model_name_or_path
21 | self.template = template
22 | self.max_sequence_length = max_sequence_length
23 | self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=False, model_max_length=self.max_sequence_length)
24 | self.model = AutoModelForCausalLM.from_pretrained(
25 | model_name_or_path, low_cpu_mem_usage=True
26 | )
27 | if self.tokenizer.pad_token_id == None:
28 | self.tokenizer.add_special_tokens({"bos_token": "", "eos_token": "", "pad_token": ""})
29 | self.model.resize_token_embeddings(len(self.tokenizer))
30 | self.use_gpu = (True if device == "cuda" else False)
31 | if (device == "cuda" and not cpu_offloading) or device == "mps":
32 | self.model.to(device)
33 | self.chatio = SimpleChatIO()
34 |
35 | def prediction(self, prompt: str, stop: Optional[List[str]] = None) -> str:
36 | gen_params = {
37 | "model": "",
38 | "prompt": prompt,
39 | "temperature": 0.5,
40 | "max_new_tokens": 512,
41 | "stop": "",
42 | "stop_token_ids": None,
43 | "echo": False
44 | }
45 | generate_stream_func = generate_stream
46 | output_stream = generate_stream_func(self.model, self.tokenizer, gen_params, "cuda", self.max_sequence_length, force_generate=True)
47 | outputs = self.chatio.return_output(output_stream)
48 | prediction = outputs.strip()
49 | return prediction
50 |
51 | def add_message(self, message):
52 | self.conversation_history.append(message)
53 |
54 | def change_messages(self,messages):
55 | self.conversation_history = messages
56 |
57 | def display_conversation(self, detailed=False):
58 | role_to_color = {
59 | "system": "red",
60 | "user": "green",
61 | "assistant": "blue",
62 | "function": "magenta",
63 | }
64 | print("before_print"+"*"*50)
65 | for message in self.conversation_history:
66 | print_obj = f"{message['role']}: {message['content']} "
67 | if "function_call" in message.keys():
68 | print_obj = print_obj + f"function_call: {message['function_call']}"
69 | print_obj += ""
70 | print(
71 | colored(
72 | print_obj,
73 | role_to_color[message["role"]],
74 | )
75 | )
76 | print("end_print"+"*"*50)
77 |
78 | def parse(self,functions,process_id,**args):
79 | conv = get_conversation_template(self.template)
80 | if self.template == "tool-llama":
81 | roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
82 | elif self.template == "tool-llama-single-round" or self.template == "tool-llama-multi-rounds":
83 | roles = {"system": conv.roles[0], "user": conv.roles[1], "function": conv.roles[2], "assistant": conv.roles[3]}
84 |
85 | self.time = time.time()
86 | conversation_history = self.conversation_history
87 | prompt = ''
88 | for message in conversation_history:
89 | role = roles[message['role']]
90 | content = message['content']
91 | if role == "System" and functions != []:
92 | content = process_system_message(content, functions)
93 | prompt += f"{role}: {content}\n"
94 | prompt += "Assistant:\n"
95 | if functions != []:
96 | predictions = self.prediction(prompt)
97 | else:
98 | predictions = self.prediction(prompt)
99 |
100 |         decoded_token_len = len(self.tokenizer(predictions)["input_ids"])
101 | if process_id == 0:
102 | print(f"[process({process_id})]total tokens: {decoded_token_len}")
103 |
104 | thought, action, action_input = react_parser(predictions)
105 | if len(thought.strip()) > 1:
106 | print(thought)
107 | # input()
108 | message = {
109 | "role": "assistant",
110 | "content": thought,
111 | "function_call": {
112 | "name": action,
113 | "arguments": action_input
114 | }
115 | }
116 | return message, 0, decoded_token_len
117 |
118 |
119 | if __name__ == "__main__":
120 | # can accept all huggingface LlamaModel family
121 | llm = LlamaModel("decapoda-research/llama-7b-hf")
122 | messages = [
123 | {'role': 'system', 'content': '''You are AutoGPT, you can use many tools(functions) to do
124 | the following task.\nFirst I will give you the task description, and your task start.\nAt each step, you need to give your thought to analyze the status now and what to do next, with a function call to actually excute your step.\nAfter the call, you will get the call result, and you are now in a new state.\nThen you will analyze your status now, then decide what to do next...\nAfter many (Thought-call) pairs, you finally perform the task, then you can give your finial answer.\nRemember: \n1.the state change is , you can\'t go
125 | back to the former state, if you want to restart the task, say "I give up and restart".\n2.All the thought is short, at most in 5 sentence.\nLet\'s Begin!\nTask description: Use numbers and basic arithmetic operations (+ - * /) to obtain exactly one number=24. Each
126 | step, you are only allowed to choose two of the left numbers to obtain a new number. For example, you can combine [3,13,9,7] as 7*9 - 3*13 = 24.\nRemember:\n1.all of the number must be used , and must be used ONCE. So Only when left numbers is exact 24, you will win. So you don\'t succeed when left number = [24, 5]. You succeed when left number = [24]. \n2.all the try takes exactly 3 steps, look
127 | at the input format'''},
128 | {'role': 'user', 'content': '\nThe real task input is: [1, 2, 4, 7]\nBegin!\n'}
129 | ]
130 | functions = [{'name': 'play_24', 'description': '''make your current conbine with the format "x operation y = z (left: aaa) " like "1+2=3, (left: 3 5 7)", then I will tell you whether you win. This is the ONLY way
131 | to interact with the game, and the total process of a input use 3 steps of call, each step you can only combine 2 of the left numbers, so the count of left numbers decrease from 4 to 1''','parameters':{'type': 'object', 'properties':{}}}]#, 'parameters': {'type': 'object', 'properties': {'input': {'type': 'string', 'description': 'describe what number you want to conbine, and how to conbine.'}}, 'required': ['input']}}]
132 |
133 | llm.change_messages(messages)
134 | output = llm.parse(functions=functions)
135 | print(output)
--------------------------------------------------------------------------------
/evaluation/toolbench/inference/LLM/retriever.py:
--------------------------------------------------------------------------------
1 | import time
2 | import pandas as pd
3 | from sentence_transformers import SentenceTransformer, util
4 | import json
5 | import re
6 | from evaluation.toolbench.utils import standardize, standardize_category, change_name, process_retrieval_ducoment
7 |
8 |
9 | class ToolRetriever:
10 | def __init__(self, corpus_tsv_path = "", model_path=""):
11 | self.corpus_tsv_path = corpus_tsv_path
12 | self.model_path = model_path
13 | self.corpus, self.corpus2tool = self.build_retrieval_corpus()
14 | self.embedder = self.build_retrieval_embedder()
15 | self.corpus_embeddings = self.build_corpus_embeddings()
16 |
17 | def build_retrieval_corpus(self):
18 | print("Building corpus...")
19 | documents_df = pd.read_csv(self.corpus_tsv_path, sep='\t')
20 | corpus, corpus2tool = process_retrieval_ducoment(documents_df)
21 | corpus_ids = list(corpus.keys())
22 | corpus = [corpus[cid] for cid in corpus_ids]
23 | return corpus, corpus2tool
24 |
25 | def build_retrieval_embedder(self):
26 | print("Building embedder...")
27 | embedder = SentenceTransformer(self.model_path)
28 | return embedder
29 |
30 | def build_corpus_embeddings(self):
31 | print("Building corpus embeddings with embedder...")
32 | corpus_embeddings = self.embedder.encode(self.corpus, convert_to_tensor=True)
33 | return corpus_embeddings
34 |
35 | def retrieving(self, query, top_k=5, excluded_tools={}):
36 | print("Retrieving...")
37 | start = time.time()
38 | query_embedding = self.embedder.encode(query, convert_to_tensor=True)
39 | hits = util.semantic_search(query_embedding, self.corpus_embeddings, top_k=10*top_k, score_function=util.cos_sim)
40 | retrieved_tools = []
41 | for rank, hit in enumerate(hits[0]):
42 | try:
43 | category, tool_name, api_name = self.corpus2tool[self.corpus[hit['corpus_id']]].split('\t')
44 | except ValueError as e:
45 | print(len(self.corpus2tool[self.corpus[hit['corpus_id']]]))
46 | print(self.corpus2tool[self.corpus[hit['corpus_id']]][0])
47 | print(self.corpus2tool[self.corpus[hit['corpus_id']]][1])
48 |                 continue
49 | category = standardize_category(category)
50 | tool_name = standardize(tool_name) # standardizing
51 | api_name = change_name(standardize(api_name)) # standardizing
52 | if category in excluded_tools:
53 | if tool_name in excluded_tools[category]:
54 | top_k += 1
55 | continue
56 | tmp_dict = {
57 | "category": category,
58 | "tool_name": tool_name,
59 | "api_name": api_name
60 | }
61 | retrieved_tools.append(tmp_dict)
62 | return retrieved_tools
--------------------------------------------------------------------------------
/evaluation/toolbench/inference/LLM_rank/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Reason-Wang/ToolGen/6839374a255810efe69deea4056eec5c55e25802/evaluation/toolbench/inference/LLM_rank/__init__.py
--------------------------------------------------------------------------------
/evaluation/toolbench/inference/LLM_rank/rank_candidate.py:
--------------------------------------------------------------------------------
1 | '''
2 | Evaluate the score of a query corresponding to different candidates
3 | '''
4 |
5 | from Prompts.rank_prompts import LLM_PAIRWISE_RANK_SUBFIX_SYSTEM_PROMPT, LLM_PAIRWISE_RANK_USER_PROMPT
6 | import random
7 | from Tree.Tree import tree_node
8 |
9 |
10 | def rank2symmetry(llm_interface, LLM_rank_args, cand1,cand2):
11 | '''
 12 |     Use the LLM to compare two candidates. Because presentation order matters, each pair is compared in both orders.
13 | '''
14 | single_rank_func = LLM_rank_args["rank_func"]
15 | score = [0,0]
16 | bigger1,query_count1, total_tokens1 = single_rank_func(llm_interface, LLM_rank_args, cand1,cand2)
17 | score[1 - bigger1] += 1
18 | bigger2,query_count2, total_tokens2 = single_rank_func(llm_interface, LLM_rank_args, cand2,cand1)
19 | score[bigger2] += 1
20 | if score[0] > score[1]:
21 | return 1 , query_count1 + query_count2, total_tokens1 + total_tokens2
22 | elif score[0] < score[1]:
23 | return -1, query_count1 + query_count2, total_tokens1 + total_tokens2
24 | else:
25 | return 0, query_count1 + query_count2, total_tokens1 + total_tokens2
26 |
27 |
28 |
29 | def rank2_subfix(llm_interface,LLM_rank_args, cand1,cand2):
30 | '''
 31 |     It is assumed that the two candidates share a long common prefix
32 | '''
33 | anscestor_interesction = tree_node.find_ancestor_intersection(cand1,cand2)
34 | assert anscestor_interesction != None
35 | intersect_trice = anscestor_interesction.get_former_trice_from_this_node(end_node=None)
36 | trice_1 = cand1.get_former_trice_from_this_node(end_node=anscestor_interesction)
37 | trice_2 = cand2.get_former_trice_from_this_node(end_node=anscestor_interesction)
38 |
39 | system_message = LLM_PAIRWISE_RANK_SUBFIX_SYSTEM_PROMPT
40 | system_message = system_message.replace("{task_description}", LLM_rank_args["task_description"])
41 | system_message = system_message.replace("{intersect_trice}", intersect_trice)
42 | system_message = system_message.replace("{candidate_A}",trice_1)
43 | system_message = system_message.replace("{candidate_B}",trice_2)
44 | llm_interface.change_messages([{"role":"system","content":system_message},
45 | {"role":"user","content":LLM_PAIRWISE_RANK_USER_PROMPT},
46 | ])
47 | output,error_code, total_tokens = llm_interface.parse(functions=LLM_rank_args["functions"],function_call="none",process_id=LLM_rank_args["process_id"])
48 | if output["content"].strip().lower()[-1] == "a":
49 | return 1, 1, total_tokens
50 | else:
51 | return 0, 1, total_tokens
52 |
53 | def sum_based_rankn(llm_interface,LLM_rank_args, candidates):
54 | '''
 55 |     All candidates are compared pairwise; the points are summed and the best one is chosen
56 | '''
57 | total_querys = 0
58 | total_tokens = 0
59 | scores = [0]*len(candidates)
60 | for i in range(len(candidates)-1):
61 | for j in range(i+1,len(candidates)):
62 | pairwise_rank,query_count,rank2_tokens = rank2symmetry(llm_interface,LLM_rank_args, candidates[i],candidates[j])
63 | total_querys += query_count
64 | total_tokens += rank2_tokens
65 | if pairwise_rank > 0:
66 | scores[i] += 1
67 | elif pairwise_rank < 0:
68 | scores[j] += 1
69 | else:
70 | scores[i] += 0.5
71 | scores[j] += 0.5
72 | return scores, total_querys, total_tokens
73 |
74 |
75 |
76 | if __name__ == "__main__":
77 | random.seed(42)
78 | # candidates = [
79 | # "234",
80 | # "66.5",
81 | # "77.1",
82 | # "88.967",
83 | # "pi",
84 | # # "e",
85 | # # "ln(2)"
86 | # ]
87 | candidates = [
88 | "77.1",
89 | "88.967",
90 | "pi",
91 | "66.5",
92 | "234",
93 | "ln(2)"
94 | ]
95 | '''
96 | starting_delta:
97 | 50 -> 42.85%
98 | 100 -> 35.99%
99 | 150 -> 29.66%
100 | 200 -> 24.03%
101 | '''
102 |
--------------------------------------------------------------------------------
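
Toy illustration (not part of the repository) of sum_based_rankn's round-robin scoring, with the LLM call stubbed out by a rank function that simply prefers the longer candidate string. It assumes evaluation/toolbench/inference is on sys.path so rank_candidate's own Prompts and Tree imports resolve.

from LLM_rank.rank_candidate import sum_based_rankn

def stub_rank(llm_interface, LLM_rank_args, cand1, cand2):
    # returns (1 if the first candidate wins else 0, query count, token count)
    return (1 if len(cand1) >= len(cand2) else 0), 1, 0

LLM_rank_args = {"rank_func": stub_rank}
scores, n_queries, n_tokens = sum_based_rankn(None, LLM_rank_args, ["a", "bbb", "cc"])
print(scores, n_queries, n_tokens)   # [0, 2, 1] 6 0 -- "bbb" collects the most points
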
/evaluation/toolbench/inference/Prompts/ReAct_prompts.py:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | FORMAT_INSTRUCTIONS_SYSTEM_FUNCTION = """You are AutoGPT, you can use many tools(functions) to do the following task.
5 | First I will give you the task description, and your task start.
6 | At each step, you need to give your thought to analyze the status now and what to do next, with a function call to actually excute your step.
7 | After the call, you will get the call result, and you are now in a new state.
8 | Then you will analyze your status now, then decide what to do next...
9 | After many (Thought-call) pairs, you finally perform the task, then you can give your finial answer.
10 | Remember:
11 | 1.the state change is irreversible, you can't go back to one of the former state, if you want to restart the task, say "I give up and restart".
12 | 2.All the thought is short, at most in 5 sentence.
13 | 3.You can do more then one trys, so if your plan is to continusly try some conditions, you can do one of the conditions per try.
14 | Let's Begin!
15 | Task description: {task_description}"""
16 |
17 | FORMAT_INSTRUCTIONS_USER_FUNCTION = """
18 | {input_description}
19 | Begin!
20 | """
21 |
22 | FORMAT_INSTRUCTIONS_SYSTEM_FUNCTION_ZEROSHOT = """Answer the following questions as best you can. Specifically, you have access to the following APIs:
23 |
24 | {func_str}
25 |
26 | Use the following format:
27 | Thought: you should always think about what to do
28 | Action: the action to take, should be one of {func_list}
29 | Action Input: the input to the action
30 | End Action
31 |
32 | Begin! Remember: (1) Follow the format, i.e,
33 | Thought:
34 | Action:
35 | Action Input:
36 | End Action
37 | (2)The Action: MUST be one of the following:{func_list}
38 | (3)If you believe that you have obtained enough information (which can be judge from the history observations) that can answer the task, please call:
39 | Action: Finish
40 | Action Input: {{"return_type": "give_answer", "final_answer": your answer string}}.
41 | Question: {question}
42 |
43 | Here are the history actions and observations:
44 | """
45 |
--------------------------------------------------------------------------------
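
Sketch (not part of the repository) of how the zero-shot system prompt is instantiated, mirroring the .replace() chain in davinci_model.parse; the function description below is illustrative.

from evaluation.toolbench.inference.Prompts.ReAct_prompts import (
    FORMAT_INSTRUCTIONS_SYSTEM_FUNCTION_ZEROSHOT,
)

func_str = ('get_rain_probability: Get the probability of rain. Your input should be a json '
            '(args json schema): {"location": string, } The Action to trigger this API should be '
            'get_rain_probability and the input parameters should be a json dict string.')
func_list = str(["get_rain_probability", "Finish"])

prompt = (FORMAT_INSTRUCTIONS_SYSTEM_FUNCTION_ZEROSHOT
          .replace("{func_str}", func_str)
          .replace("{func_list}", func_list)
          .replace("{question}", "Will it rain in Paris tomorrow?")
          .replace("{{", "{").replace("}}", "}"))
print(prompt)
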
/evaluation/toolbench/inference/Prompts/Tree_search_prompts.py:
--------------------------------------------------------------------------------
1 | DIVERSITY_PROMPT='''This is not the first time you try this task, all previous trails failed.
2 | Before you generate my thought for this state, I will first show you your previous actions for this state, and then you must generate actions that is different from all of them. Here are some previous actions candidates:
3 | {previous_candidate}
4 | Remember you are now in the intermediate state of a trail, you will first analyze the now state and previous action candidates, then make actions that is different from all the previous.'''
5 |
6 |
7 |
--------------------------------------------------------------------------------
/evaluation/toolbench/inference/Prompts/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Reason-Wang/ToolGen/6839374a255810efe69deea4056eec5c55e25802/evaluation/toolbench/inference/Prompts/__init__.py
--------------------------------------------------------------------------------
/evaluation/toolbench/inference/Prompts/rank_prompts.py:
--------------------------------------------------------------------------------
1 |
2 | LLM_PAIRWISE_RANK_SUBFIX_SYSTEM_PROMPT = '''
3 | You are value-GPT, which is an expert of defining which trail is better, which trail is more close to solving the task.
4 | All candidate tries to solve this task with some funciton calls:
5 | *******************************
6 | {{TASK_DESCRIPTION}}
7 | {task_description}
8 | {{END_TASK_DESCRIPTION}}
9 | *******************************
10 | First, all candidates do the following things:
11 | {intersect_trice}
12 | After that, there are two candidates A and B, they do different things:
13 | *******************************
14 | {{CANDIDATE_A_START}}
15 | {candidate_A}
16 | {{CANDIDATE_A_END}}
17 | *******************************
18 | {{CANDIDATE_B_START}}
19 | {candidate_B}
20 | {{CANDIDATE_B_END}}
21 | Which try do you think is more helpful for solving the task?
22 | '''
23 |
24 |
25 |
26 |
27 | LLM_PAIRWISE_RANK_USER_PROMPT = '''
28 | Tell me which candidate is better in ONE Word: "A" or "B":'''
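
A minimal sketch (an assumed usage pattern, not code from the repository) of how these two prompts could be combined into chat messages for a pairwise-ranking call; the task description and candidate strings are invented:

```python
# Hypothetical sketch; assumes the repository root is on PYTHONPATH.
from evaluation.toolbench.inference.Prompts.rank_prompts import (
    LLM_PAIRWISE_RANK_SUBFIX_SYSTEM_PROMPT,
    LLM_PAIRWISE_RANK_USER_PROMPT,
)

system = LLM_PAIRWISE_RANK_SUBFIX_SYSTEM_PROMPT.format(
    task_description="Find a pizza restaurant that is open now in Berlin.",  # made up
    intersect_trice="Thought: I should search for nearby restaurants first. ...",  # shared prefix of both trials
    candidate_A="Action: search_restaurants\nAction Input: {...}",
    candidate_B="Action: Finish\nAction Input: {...}",
)
messages = [
    {"role": "system", "content": system},
    {"role": "user", "content": LLM_PAIRWISE_RANK_USER_PROMPT},
]
# A ranking model is then expected to answer with a single word: "A" or "B".
```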
--------------------------------------------------------------------------------
/evaluation/toolbench/inference/Tree/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Reason-Wang/ToolGen/6839374a255810efe69deea4056eec5c55e25802/evaluation/toolbench/inference/Tree/__init__.py
--------------------------------------------------------------------------------
/evaluation/toolbench/inference/callbacks/ServerEventCallback.py:
--------------------------------------------------------------------------------
1 | from typing import Any, Dict, List, Union
2 | import queue
3 | class ServerEventCallback():
4 | """Base callback handler"""
5 |
6 | def __init__(self, queue: queue.Queue, *args, **kwargs):
7 | super().__init__(*args, **kwargs)
8 | self.queue = queue
9 | self.llm_block_id = 0
10 | self.tool_block_id = 0
11 | self.tool_descriptions = {}
12 |
13 | def add_to_queue(self, method_name: str, block_id, **kwargs: Any):
14 | data = {
15 | "method_name": method_name,
16 | "block_id": block_id,
17 | }
18 | data.update(kwargs)
19 | self.queue.put(data)
20 |
21 | def on_tool_retrieval_start(self):
22 | # tools should be of the form
23 | # {tool_name, tool_desc}
24 | self.add_to_queue(
25 | "on_tool_retrieval_start",
26 | "recommendation-1",
27 | )
28 | print("on_tool_retrieval_start method called")
29 |
30 | def on_tool_retrieval_end(self, tools):
31 | # tool should be of the form
32 | # {tool_name, tool_desc}
33 | self.add_to_queue(
34 | "on_tool_retrieval_end",
35 | "recommendation-1",
36 | recommendations=tools
37 | )
38 | self.tool_descriptions = {
39 | tool["name"]: tool for tool in tools
40 | }
41 | print("on_tool_retrieval_end method called")
42 | def on_request_start(self, user_input: str, method: str) -> Any:
43 | self.tool_block_id = 0
44 | self.llm_block_id = 0
45 | self.add_to_queue(
46 | "on_request_start",
47 | block_id="start",
48 | user_input=user_input,
49 | method=method
50 | )
51 | def on_request_end(self, outputs: str, chain: List[Any]):
52 | self.add_to_queue(
53 | "on_request_end",
54 | block_id="end",
55 | output=outputs,
56 | chain=chain
57 | )
58 | def on_request_error(self, error: str):
59 | self.add_to_queue(
60 | "on_request_error",
61 | block_id="error",
62 | error=error
63 | )
64 |
65 | # keep
66 | def on_chain_start(self, inputs: str, depth: int) -> Any:
67 | """Run when chain starts running."""
68 | print("on_chain_start method called")
69 | self.llm_block_id += 1
70 | block_id = "llm-" + str(self.llm_block_id)
71 | self.add_to_queue(
72 | "on_chain_start",
73 | block_id=block_id,
74 | messages=inputs,
75 | depth=depth
76 | )
77 | return block_id
78 |
79 | # this one needs the block_id memorized
80 | def on_chain_end(self, block_id: str, depth: int) -> Any:
81 | self.add_to_queue(
82 | "on_chain_end",
83 | block_id=block_id,
84 | # output=output,
85 | depth=depth
86 | )
87 | print("on_chain_end method called")
88 |
89 | def on_chain_error(self, error: Union[Exception, KeyboardInterrupt], **kwargs: Any) -> Any:
90 | method_name = "on_chain_error"
91 | self.add_to_queue(method_name, error=error, **kwargs)
92 | print("on_chain_error method called")
93 |
94 | def on_llm_start(
95 | self, messages: str, depth: int
96 | ) -> Any:
97 | """Run when LLM starts running."""
98 | self.add_to_queue(
99 | "on_llm_start",
100 | block_id="llm-" + str(self.llm_block_id),
101 | messages=messages,
102 | depth=depth
103 | )
104 | print("on_llm_start method called")
105 |
106 | def on_llm_new_token(self, token: str, **kwargs: Any) -> Any:
107 | """Run on new LLM token. Only available when streaming is enabled."""
108 | method_name = "on_llm_new_token"
109 | self.add_to_queue(method_name, token=token, **kwargs)
110 | print("on_llm_new_token method called")
111 |
112 | def on_llm_end(self, response: str, depth: int) -> Any:
113 | """Run when LLM ends running."""
114 | self.add_to_queue(
115 | "on_llm_end",
116 | block_id="llm-" + str(self.llm_block_id),
117 | response=response,
118 | depth=depth
119 | )
120 | print("on_llm_end method called")
121 |
122 | def on_llm_error(self, error: Union[Exception, KeyboardInterrupt]) -> Any:
123 | """Run when LLM errors."""
124 | self.add_to_queue(
125 | "on_llm_error",
126 | block_id="llm-" + str(self.llm_block_id),
127 | message=str(error),
128 | error=error
129 | )
130 | print("on_llm_error method called")
131 |
132 | def on_agent_action(self, action, action_input, depth: int) -> str:
133 | self.tool_block_id += 1
134 | block_id="tool-" + str(self.tool_block_id)
135 | self.add_to_queue(
136 | "on_agent_action",
137 | block_id=block_id,
138 | action=action,
139 | action_input=action_input,
140 | depth=depth
141 | )
142 | print("on_agent_action method called")
143 | return block_id
144 |
145 | def on_tool_start(self, tool_name: str, tool_input: str, depth: int) -> Any:
146 | method_name = "on_tool_start"
147 | tool_description = "Tool not found in tool descriptions"
148 | if tool_name in self.tool_descriptions:
149 | tool_description = self.tool_descriptions[tool_name]
150 | else:
151 | print(self.tool_descriptions)
152 | print("Key", tool_name, "not found in tool descriptions")
153 | self.add_to_queue(
154 | method_name,
155 | block_id="tool-" + str(self.tool_block_id),
156 | tool_name=tool_name,
157 | tool_description=tool_description,
158 | tool_input=tool_input,
159 | depth=depth
160 | )
161 | print("on_tool_start method called")
162 |
163 | def on_tool_end(self, output: str, status:int, depth: int) -> Any:
164 | method_name = "on_tool_end"
165 | self.add_to_queue(
166 | method_name,
167 | block_id="tool-" + str(self.tool_block_id),
168 | output=output,
169 | status=status,
170 | depth=depth
171 | )
172 | print("on_tool_end method called")
173 |
174 | def on_tool_error(self, error: Union[Exception, KeyboardInterrupt]) -> Any:
175 | method_name = "on_tool_error"
176 | self.add_to_queue(
177 | method_name,
178 | error=error
179 | )
180 | print("on_tool_error method called")
181 |
182 | def on_agent_end(self, block_id:str, depth: int):
183 | self.add_to_queue(
184 | "on_agent_end",
185 | block_id=block_id,
186 | depth=depth
187 | )
188 | print("on_agent_end method called")
--------------------------------------------------------------------------------
/evaluation/toolbench/inference/qa_pipeline.py:
--------------------------------------------------------------------------------
1 | '''
2 | Close-domain QA Pipeline
3 | '''
4 |
5 | import argparse
6 | from toolbench.inference.Downstream_tasks.rapidapi import pipeline_runner
7 |
8 |
9 | if __name__ == "__main__":
10 |
11 | parser = argparse.ArgumentParser()
12 | parser.add_argument('--backbone_model', type=str, default="toolllama", required=False, help='chatgpt_function or davinci or toolllama')
13 | parser.add_argument('--chatgpt_model', type=str, default=None, required=True, help='gpt-3.5-turbo or gpt-4')
14 | parser.add_argument('--base_url', type=str, default="https://api.openai.com/v1", required=False, help='openai api url')
15 | parser.add_argument('--openai_key', type=str, default="", required=False, help='openai key for chatgpt_function or davinci model')
16 | parser.add_argument('--model_path', type=str, default="your_model_path/", required=False, help='')
17 | parser.add_argument('--tool_root_dir', type=str, default="your_tools_path/", required=True, help='')
18 | parser.add_argument("--lora", action="store_true", help="Load lora model or not.")
19 | parser.add_argument('--lora_path', type=str, default="your_lora_path if lora", required=False, help='')
20 | parser.add_argument('--max_observation_length', type=int, default=1024, required=False, help='maximum observation length')
21 | parser.add_argument('--max_source_sequence_length', type=int, default=4096, required=False, help='original maximum model sequence length')
22 | parser.add_argument('--max_sequence_length', type=int, default=8192, required=False, help='maximum model sequence length')
23 | parser.add_argument('--observ_compress_method', type=str, default="truncate", choices=["truncate", "filter", "random"], required=False, help='observation compress method')
24 | parser.add_argument('--method', type=str, default="CoT@1", required=False, help='method for answer generation: CoT@n,Reflexion@n,BFS,DFS,UCT_vote')
25 | parser.add_argument('--input_query_file', type=str, default="", required=False, help='input path')
26 | parser.add_argument('--output_answer_file', type=str, default="",required=False, help='output path')
27 | parser.add_argument('--toolbench_key', type=str, default="",required=False, help='your toolbench key to request rapidapi service')
28 | parser.add_argument('--rapidapi_key', type=str, default="",required=False, help='your rapidapi key to request rapidapi service')
29 | parser.add_argument('--use_rapidapi_key', action="store_true", help="To use customized rapidapi service or not.")
30 | parser.add_argument('--api_customization', action="store_true", help="To use customized api or not.")
31 |
32 | args = parser.parse_args()
33 |
34 | pipeline_runner = pipeline_runner(args)
35 | pipeline_runner.run()
36 |
37 |
--------------------------------------------------------------------------------
/evaluation/toolbench/inference/qa_pipeline_multithread.py:
--------------------------------------------------------------------------------
1 | '''
2 | Close-domain QA Pipeline
3 | '''
4 |
5 | import argparse, os
6 | from evaluation.toolbench.inference.Downstream_tasks.rapidapi_multithread import pipeline_runner
7 |
8 |
9 | if __name__ == "__main__":
10 |
11 | parser = argparse.ArgumentParser()
12 | parser.add_argument('--corpus_tsv_path', type=str, default=None, required=False, help="your_retrival_corpus_path/")
13 | parser.add_argument('--retrieval_model_path', type=str, default=None, required=False, help="your_model_path/")
14 | parser.add_argument('--retrieved_api_nums', type=int, default=5, required=False, help='')
15 | parser.add_argument('--backbone_model', type=str, default="toolllama", required=False, help='chatgpt_function or davinci or toolllama')
16 | parser.add_argument('--chatgpt_model', type=str, default=None, required=True, help='gpt-3.5-turbo or gpt-4')
17 | parser.add_argument('--base_url', type=str, default="https://api.openai.com/v1", required=False, help='openai api url')
18 | parser.add_argument('--openai_key', type=str, default="", required=False, help='openai key for chatgpt_function or davinci model')
19 | parser.add_argument('--model_path', type=str, default="your_model_path/", required=False, help='')
20 | parser.add_argument('--tool_root_dir', type=str, default="your_tools_path/", required=True, help='')
21 | parser.add_argument("--lora", action="store_true", help="Load lora model or not.")
22 | parser.add_argument('--lora_path', type=str, default="your_lora_path if lora", required=False, help='')
23 | parser.add_argument('--max_observation_length', type=int, default=1024, required=False, help='maximum observation length')
24 | parser.add_argument('--max_source_sequence_length', type=int, default=4096, required=False, help='original maximum model sequence length')
25 | parser.add_argument('--max_sequence_length', type=int, default=8192, required=False, help='maximum model sequence length')
26 | parser.add_argument('--single_chain_max_step', type=int, default=16, required=False, help='maximum step for single chain')
27 | parser.add_argument('--max_query_count', type=int, default=200, required=False, help='maximum query count')
28 | parser.add_argument('--observ_compress_method', type=str, default="truncate", choices=["truncate", "filter", "random"], required=False, help='observation compress method')
29 | parser.add_argument('--method', type=str, default="CoT@1", required=False, help='method for answer generation: CoT@n,Reflexion@n,BFS,DFS,UCT_vote')
30 | parser.add_argument('--input_query_file', type=str, default="", required=False, help='input path')
31 | parser.add_argument('--output_answer_file', type=str, default="",required=False, help='output path')
32 | parser.add_argument('--toolbench_key', type=str, default="",required=False, help='your toolbench key to request rapidapi service')
33 | parser.add_argument('--rapidapi_key', type=str, default="",required=False, help='your rapidapi key to request rapidapi service')
34 | parser.add_argument('--use_rapidapi_key', action="store_true", help="To use customized rapidapi service or not.")
35 | parser.add_argument('--api_customization', action="store_true", help="To use customized api or not.")
36 | parser.add_argument('--num_thread', type=int, default=1, required=False, help='number of threads')
37 | parser.add_argument('--disable_tqdm', action="store_true", help="disable tqdm or not.")
38 | parser.add_argument('--overwrite', action='store_true', help='overwrite existing runs')
39 | parser.add_argument('--function_provider', type=str, default="truth", required=False, help='')
40 | parser.add_argument('--replace_file', type=str, default="", required=False, help='')
41 | parser.add_argument('--indexing', type=str, default="")
42 | parser.add_argument('--template', type=str, default="")
43 | args = parser.parse_args()
44 | if args.overwrite:
45 | os.system(f"rm -rf {args.output_answer_file}")
46 |
47 | from evaluation.utils.utils import seed_everything
48 | seed_everything(42)
49 |
50 | pipeline_runner = pipeline_runner(args, add_retrieval=True if args.retrieval_model_path else False)
51 | pipeline_runner.run()
52 |
53 |
--------------------------------------------------------------------------------
/evaluation/toolbench/inference/qa_pipeline_open_domain.py:
--------------------------------------------------------------------------------
1 | '''
2 | Open-domain QA Pipeline
3 | '''
4 | import argparse
5 | from toolbench.inference.Downstream_tasks.rapidapi import pipeline_runner
6 |
7 |
8 | if __name__ == "__main__":
9 |
10 | parser = argparse.ArgumentParser()
11 | parser.add_argument('--corpus_tsv_path', type=str, default="your_retrival_corpus_path/", required=False, help='')
12 | parser.add_argument('--retrieval_model_path', type=str, default="your_model_path/", required=False, help='')
13 | parser.add_argument('--retrieved_api_nums', type=int, default=5, required=False, help='')
14 | parser.add_argument('--backbone_model', type=str, default="toolllama", required=False, help='chatgpt_function or davinci or toolllama')
15 | parser.add_argument('--chatgpt_model', type=str, default=None, required=True, help='gpt-3.5-turbo or gpt-4')
16 | parser.add_argument('--base_url', type=str, default="https://api.openai.com/v1", required=False, help='openai api url')
17 | parser.add_argument('--openai_key', type=str, default="", required=False, help='openai key for chatgpt_function or davinci model')
18 | parser.add_argument('--model_path', type=str, default="your_model_path/", required=False, help='')
19 | parser.add_argument('--tool_root_dir', type=str, default="your_tools_path/", required=True, help='')
20 | parser.add_argument("--lora", action="store_true", help="Load lora model or not.")
21 | parser.add_argument('--lora_path', type=str, default="your_lora_path if lora", required=False, help='')
22 | parser.add_argument('--max_observation_length', type=int, default=1024, required=False, help='maximum observation length')
23 | parser.add_argument('--max_source_sequence_length', type=int, default=4096, required=False, help='original maximum model sequence length')
24 | parser.add_argument('--max_sequence_length', type=int, default=8192, required=False, help='maximum model sequence length')
25 | parser.add_argument('--observ_compress_method', type=str, default="truncate", choices=["truncate", "filter", "random"], required=False, help='observation compress method')
26 | parser.add_argument('--method', type=str, default="CoT@1", required=False, help='method for answer generation: CoT@n,Reflexion@n,BFS,DFS,UCT_vote')
27 | parser.add_argument('--input_query_file', type=str, default="", required=False, help='input path')
28 | parser.add_argument('--output_answer_file', type=str, default="",required=False, help='output path')
29 | parser.add_argument('--toolbench_key', type=str, default="",required=False, help='your toolbench key to request rapidapi service')
30 | parser.add_argument('--rapidapi_key', type=str, default="",required=False, help='your rapidapi key to request rapidapi service')
31 | parser.add_argument('--use_rapidapi_key', action="store_true", help="To use customized rapidapi service or not.")
32 | parser.add_argument('--api_customization', action="store_true", help="To use customized api or not. NOT SUPPORTED currently under open domain setting.")
33 |
34 | args = parser.parse_args()
35 |
36 | pipeline_runner = pipeline_runner(args, add_retrieval=True)
37 | pipeline_runner.run()
38 |
--------------------------------------------------------------------------------
/evaluation/toolbench/model/__init__.py:
--------------------------------------------------------------------------------
1 | from evaluation.toolbench.model.model_adapter import (
2 | load_model,
3 | get_conversation_template,
4 | add_model_args,
5 | )
6 |
--------------------------------------------------------------------------------
/evaluation/toolbench/model/apply_delta.py:
--------------------------------------------------------------------------------
1 | """
2 | Apply the delta weights on top of a base model.
3 |
4 | Usage:
5 | python3 -m evaluation.toolbench.model.apply_delta --base-model-path ~/model_weights/llama-7b --target-model-path ~/model_weights/vicuna-7b --delta-path lmsys/vicuna-7b-delta-v1.1
6 | """
7 | import argparse
8 | import gc
9 | import glob
10 | import json
11 | import os
12 | import shutil
13 | import tempfile
14 |
15 | from huggingface_hub import snapshot_download
16 | import torch
17 | from torch import nn
18 | from tqdm import tqdm
19 | from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
20 |
21 |
22 | GB = 1 << 30
23 |
24 |
25 | def split_files(model_path, tmp_path, split_size):
26 | if not os.path.exists(model_path):
27 | model_path = snapshot_download(repo_id=model_path)
28 | if not os.path.exists(tmp_path):
29 | os.makedirs(tmp_path)
30 |
31 | file_pattern = os.path.join(model_path, "pytorch_model-*.bin")
32 | files = glob.glob(file_pattern)
33 |
34 | part = 0
35 | try:
36 | for file_path in tqdm(files):
37 | state_dict = torch.load(file_path)
38 | new_state_dict = {}
39 |
40 | current_size = 0
41 | for name, param in state_dict.items():
42 | param_size = param.numel() * param.element_size()
43 |
44 | if current_size + param_size > split_size:
45 | new_file_name = f"pytorch_model-{part}.bin"
46 | new_file_path = os.path.join(tmp_path, new_file_name)
47 | torch.save(new_state_dict, new_file_path)
48 | current_size = 0
49 | new_state_dict = None
50 | gc.collect()
51 | new_state_dict = {}
52 | part += 1
53 |
54 | new_state_dict[name] = param
55 | current_size += param_size
56 |
57 | new_file_name = f"pytorch_model-{part}.bin"
58 | new_file_path = os.path.join(tmp_path, new_file_name)
59 | torch.save(new_state_dict, new_file_path)
60 | new_state_dict = None
61 | gc.collect()
62 | new_state_dict = {}
63 | part += 1
64 | except Exception as e:
65 | print(f"An error occurred during split_files: {e}")
66 | shutil.rmtree(tmp_path)
67 | raise
68 |
69 |
70 | def apply_delta_low_cpu_mem(base_model_path, target_model_path, delta_path):
71 | delta_tokenizer = AutoTokenizer.from_pretrained(delta_path, use_fast=False)
72 | delta_config = AutoConfig.from_pretrained(delta_path)
73 |
74 | if os.path.exists(target_model_path):
75 | shutil.rmtree(target_model_path)
76 | os.makedirs(target_model_path)
77 |
78 | split_size = 4 * GB
79 |
80 | with tempfile.TemporaryDirectory() as tmp_base_path, tempfile.TemporaryDirectory() as tmp_delta_path:
81 | print(f"Split files for the base model to {tmp_base_path}")
82 | split_files(base_model_path, tmp_base_path, split_size)
83 | print(f"Split files for the delta weights to {tmp_delta_path}")
84 | split_files(delta_path, tmp_delta_path, split_size)
85 |
86 | base_pattern = os.path.join(tmp_base_path, "pytorch_model-*.bin")
87 | base_files = glob.glob(base_pattern)
88 | delta_pattern = os.path.join(tmp_delta_path, "pytorch_model-*.bin")
89 | delta_files = glob.glob(delta_pattern)
90 | delta_state_dict = torch.load(delta_files[0])
91 |
92 | print("Applying the delta")
93 | weight_map = {}
94 | total_size = 0
95 |
96 | for i, base_file in tqdm(enumerate(base_files)):
97 | state_dict = torch.load(base_file)
98 | file_name = f"pytorch_model-{i}.bin"
99 | for name, param in state_dict.items():
100 | if name not in delta_state_dict:
101 | for delta_file in delta_files:
102 | delta_state_dict = torch.load(delta_file)
103 | gc.collect()
104 | if name in delta_state_dict:
105 | break
106 |
107 | state_dict[name] += delta_state_dict[name]
108 | weight_map[name] = file_name
109 | total_size += param.numel() * param.element_size()
110 | gc.collect()
111 | torch.save(state_dict, os.path.join(target_model_path, file_name))
112 |
113 | with open(
114 | os.path.join(target_model_path, "pytorch_model.bin.index.json"), "w"
115 | ) as f:
116 | json.dump(
117 | {"weight_map": weight_map, "metadata": {"total_size": total_size}}, f
118 | )
119 |
120 | print(f"Saving the target model to {target_model_path}")
121 | delta_tokenizer.save_pretrained(target_model_path)
122 | delta_config.save_pretrained(target_model_path)
123 |
124 |
125 | def apply_delta(base_model_path, target_model_path, delta_path):
126 | print(f"Loading the delta weights from {delta_path}")
127 | delta_tokenizer = AutoTokenizer.from_pretrained(delta_path, use_fast=False)
128 | delta = AutoModelForCausalLM.from_pretrained(
129 | delta_path, torch_dtype=torch.float16, low_cpu_mem_usage=True
130 | )
131 |
132 | print(f"Loading the base model from {base_model_path}")
133 | base = AutoModelForCausalLM.from_pretrained(
134 | base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True
135 | )
136 |
137 | print("Applying the delta")
138 | for name, param in tqdm(base.state_dict().items(), desc="Applying delta"):
139 | assert name in delta.state_dict()
140 | param.data += delta.state_dict()[name]
141 |
142 | print(f"Saving the target model to {target_model_path}")
143 | base.save_pretrained(target_model_path)
144 | delta_tokenizer.save_pretrained(target_model_path)
145 |
146 |
147 | if __name__ == "__main__":
148 | parser = argparse.ArgumentParser()
149 | parser.add_argument("--base-model-path", type=str, required=True)
150 | parser.add_argument("--target-model-path", type=str, required=True)
151 | parser.add_argument("--delta-path", type=str, required=True)
152 | parser.add_argument(
153 | "--low-cpu-mem",
154 | action="store_true",
155 | help="Lower the cpu memory usage. This will split large files and use "
156 | "disk as swap to reduce the memory usage below 10GB.",
157 | )
158 | args = parser.parse_args()
159 |
160 | if args.low_cpu_mem:
161 | apply_delta_low_cpu_mem(
162 | args.base_model_path, args.target_model_path, args.delta_path
163 | )
164 | else:
165 | apply_delta(args.base_model_path, args.target_model_path, args.delta_path)
166 |
--------------------------------------------------------------------------------
/evaluation/toolbench/model/compression.py:
--------------------------------------------------------------------------------
1 | import dataclasses
2 | import os
3 | import glob
4 | import torch
5 | import torch.nn as nn
6 | from torch.nn import functional as F
7 | from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
8 |
9 |
10 | @dataclasses.dataclass
11 | class CompressionConfig:
12 | """Group-wise quantization."""
13 |
14 | num_bits: int
15 | group_size: int
16 | group_dim: int
17 | symmetric: bool
18 | enabled: bool = True
19 |
20 |
21 | default_compression_config = CompressionConfig(
22 | num_bits=8, group_size=256, group_dim=1, symmetric=True, enabled=True
23 | )
24 |
25 |
26 | class CLinear(nn.Module):
27 | """Compressed Linear Layer."""
28 |
29 | def __init__(self, weight=None, bias=None, device=None):
30 | super().__init__()
31 | self.weight = weight
32 | self.bias = bias
33 |
34 | def forward(self, input):
35 | return F.linear(input.to(self.weight.dtype), self.weight, self.bias)
36 |
37 |
38 | def compress_module(module, target_device):
39 | for name, child in module.named_children():
40 | if isinstance(child, nn.Linear):
41 | setattr(
42 | module,
43 | name,
44 | CLinear(child.weight, child.bias, target_device),
45 | )
46 | compress_module(child, target_device)
47 |
48 |
49 | def get_compressed_list(module, prefix=""):
50 | compressed_list = []
51 | for name, child in module.named_children():
52 | if isinstance(child, nn.Linear):
53 | full_name = f"{prefix}.{name}.weight" if prefix else f"{name}.weight"
54 | compressed_list.append(full_name)
55 | compressed_list.extend(
56 | get_compressed_list(child, full_name)
57 | )
58 | return compressed_list
59 |
60 |
61 | def apply_compressed_weight(module, compressed_state_dict, target_device, prefix=""):
62 | for name, child in module.named_children():
63 | if isinstance(child, nn.Linear):
64 | full_name = f"{prefix}.{name}.weight" if prefix else f"{name}.weight"
65 | setattr(
66 | module,
67 | name,
68 | CLinear(
69 | compressed_state_dict[full_name], child.bias, target_device
70 | ),
71 | )
72 | apply_compressed_weight(child, compressed_state_dict, target_device, full_name)
73 |
74 |
75 | def load_compress_model(model_path, device, torch_dtype):
76 | # partially load model
77 | tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
78 | base_pattern = os.path.join(model_path, "pytorch_model-*.bin")
79 | files = glob.glob(base_pattern)
80 |
81 | config = AutoConfig.from_pretrained(
82 | model_path, low_cpu_mem_usage=True, torch_dtype=torch_dtype
83 | )
84 | model = AutoModelForCausalLM.from_config(config)
85 | linear_weights = get_compressed_list(model)
86 |
87 | compressed_state_dict = {}
88 |
89 | for filename in files:
90 | tmp_state_dict = torch.load(filename)
91 | for name in tmp_state_dict:
92 | if name in linear_weights:
93 | tensor = tmp_state_dict[name].to(device).data.to(torch_dtype)
94 | compressed_state_dict[name] = compress(
95 | tensor, default_compression_config
96 | )
97 | else:
98 | compressed_state_dict[name] = tmp_state_dict[name].to(device)
99 | tmp_state_dict[name] = None
100 | tensor = None
101 | torch.cuda.empty_cache()
102 |
103 | for name, param in model.named_parameters():
104 | if name not in linear_weights:
105 | param.data = compressed_state_dict[name]
106 | apply_compressed_weight(model, compressed_state_dict, device)
107 |
108 | model.to(device)
109 |
110 | return model, tokenizer
111 |
112 |
113 | def compress(tensor, config):
114 | """Simulate group-wise quantization."""
115 | if not config.enabled:
116 | return tensor
117 |
118 | group_size, num_bits, group_dim, symmetric = (
119 | config.group_size,
120 | config.num_bits,
121 | config.group_dim,
122 | config.symmetric,
123 | )
124 | assert num_bits <= 8
125 |
126 | original_shape = tensor.shape
127 | num_groups = (original_shape[group_dim] + group_size - 1) // group_size
128 | new_shape = (
129 | original_shape[:group_dim]
130 | + (num_groups, group_size)
131 | + original_shape[group_dim + 1 :]
132 | )
133 |
134 | # Pad
135 | pad_len = (group_size - original_shape[group_dim] % group_size) % group_size
136 | if pad_len != 0:
137 | pad_shape = (
138 | original_shape[:group_dim] + (pad_len,) + original_shape[group_dim + 1 :]
139 | )
140 | tensor = torch.cat(
141 | [tensor, torch.zeros(pad_shape, dtype=tensor.dtype, device=tensor.device)],
142 | dim=group_dim,
143 | )
144 | data = tensor.view(new_shape)
145 |
146 | # Quantize
147 | if symmetric:
148 | B = 2 ** (num_bits - 1) - 1
149 | scale = B / torch.max(data.abs(), dim=group_dim + 1, keepdim=True)[0]
150 | data = data * scale
151 | data = data.clamp_(-B, B).round_().to(torch.int8)
152 | return data, scale, original_shape
153 | else:
154 | B = 2**num_bits - 1
155 | mn = torch.min(data, dim=group_dim + 1, keepdim=True)[0]
156 | mx = torch.max(data, dim=group_dim + 1, keepdim=True)[0]
157 |
158 | scale = B / (mx - mn)
159 | data = data - mn
160 | data *= scale
161 |
162 | data = data.clamp_(0, B).round_().to(torch.uint8)
163 | return data, mn, scale, original_shape
164 |
165 |
166 | def decompress(packed_data, config):
167 | """Simulate group-wise dequantization."""
168 | if not config.enabled:
169 | return packed_data
170 |
171 | group_size, num_bits, group_dim, symmetric = (
172 | config.group_size,
173 | config.num_bits,
174 | config.group_dim,
175 | config.symmetric,
176 | )
177 |
178 | # Dequantize
179 | if symmetric:
180 | data, scale, original_shape = packed_data
181 | data = data / scale
182 | else:
183 | data, mn, scale, original_shape = packed_data
184 | data = data / scale
185 | data += mn
186 |
187 | # Unpad
188 | pad_len = (group_size - original_shape[group_dim] % group_size) % group_size
189 | if pad_len:
190 | padded_original_shape = (
191 | original_shape[:group_dim]
192 | + (original_shape[group_dim] + pad_len,)
193 | + original_shape[group_dim + 1 :]
194 | )
195 | data = data.reshape(padded_original_shape)
196 | indices = [slice(0, x) for x in original_shape]
197 | return data[indices].contiguous()
198 | else:
199 | return data.view(original_shape)
200 |
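
A minimal round-trip sketch of the simulated quantization under the default config (illustrative only; the import path and tensor size are assumptions, with the grouped dimension deliberately not a multiple of the group size):

```python
# Hypothetical sketch; assumes the repository root is on PYTHONPATH.
import torch

from evaluation.toolbench.model.compression import (
    compress,
    decompress,
    default_compression_config,
)

weight = torch.randn(4, 500)  # toy weight; dim 1 is grouped in chunks of 256

packed = compress(weight, default_compression_config)
# symmetric int8 case: packed == (quantized int8 data, per-group scale, original shape)

restored = decompress(packed, default_compression_config)
print(restored.shape)                   # torch.Size([4, 500])
print((weight - restored).abs().max())  # small per-group quantization error
```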
--------------------------------------------------------------------------------
/evaluation/toolbench/model/make_delta.py:
--------------------------------------------------------------------------------
1 | """
2 | Make the delta weights by subtracting base weights.
3 |
4 | Usage:
5 | python3 -m evaluation.toolbench.model.make_delta --base-model-path ~/model_weights/llama-13b --target-model-path ~/model_weights/vicuna-13b --delta-path ~/model_weights/vicuna-13b-delta --hub-repo-id lmsys/vicuna-13b-delta-v1.1
6 | """
7 | import argparse
8 |
9 | import torch
10 | from tqdm import tqdm
11 | from transformers import AutoTokenizer, AutoModelForCausalLM
12 |
13 |
14 | def make_delta(base_model_path, target_model_path, delta_path):
15 | print(f"Loading the base model from {base_model_path}")
16 | base = AutoModelForCausalLM.from_pretrained(
17 | base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True
18 | )
19 |
20 | print(f"Loading the target model from {target_model_path}")
21 | target = AutoModelForCausalLM.from_pretrained(
22 | target_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True
23 | )
24 | target_tokenizer = AutoTokenizer.from_pretrained(target_model_path, use_fast=False)
25 |
26 | print("Calculating the delta")
27 | for name, param in tqdm(target.state_dict().items(), desc="Calculating delta"):
28 | assert name in base.state_dict()
29 | param.data -= base.state_dict()[name]
30 |
31 | print(f"Saving the delta to {delta_path}")
32 | if args.hub_repo_id:
33 | kwargs = {"push_to_hub": True, "repo_id": args.hub_repo_id}
34 | else:
35 | kwargs = {}
36 | target.save_pretrained(delta_path, **kwargs)
37 | target_tokenizer.save_pretrained(delta_path, **kwargs)
38 |
39 |
40 | if __name__ == "__main__":
41 | parser = argparse.ArgumentParser()
42 | parser.add_argument("--base-model-path", type=str, required=True)
43 | parser.add_argument("--target-model-path", type=str, required=True)
44 | parser.add_argument("--delta-path", type=str, required=True)
45 | parser.add_argument("--hub-repo-id", type=str)
46 | args = parser.parse_args()
47 |
48 | make_delta(args.base_model_path, args.target_model_path, args.delta_path)
49 |
--------------------------------------------------------------------------------
/evaluation/toolbench/retrieval/inference_example.py:
--------------------------------------------------------------------------------
1 | from sentence_transformers import SentenceTransformer, util
2 | import json
3 | import pandas as pd
4 | from collections import defaultdict
5 | import torch
6 | from tqdm import tqdm
7 | import argparse
8 | import os
9 |
10 | # Create the argument parser and add arguments
11 | parser = argparse.ArgumentParser()
12 | parser.add_argument('model_path', type=str, help='Your trained model path')
13 | parser.add_argument('dataset_path', help='The processed dataset files path')
14 |
15 | # Parse the command-line arguments
16 | args = parser.parse_args()
17 |
18 | # Check if a GPU is available and if not, use a CPU
19 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
20 |
21 | model_path = args.model_path
22 |
23 | # Load the trained model
24 | model = SentenceTransformer(model_path).to(device)
25 |
26 | # Load test data
27 | documents_df = pd.read_csv(os.path.join(args.dataset_path, 'corpus.tsv'), sep='\t')
28 | test_queries_df = pd.read_csv(os.path.join(args.dataset_path, 'test.query.txt'), sep='\t', names=['qid', 'query_text'])
29 | test_labels_df = pd.read_csv(os.path.join(args.dataset_path, 'qrels.test.tsv'), sep='\t', names=['qid', 'useless', 'docid', 'label'])
30 |
31 | # Create mappings, get 'tool_name' and 'api_name' from the document_content
32 | ir_corpus = {row.docid: (json.loads(row.document_content)['tool_name'], json.loads(row.document_content)['api_name']) for _, row in documents_df.iterrows()}
33 | ir_test_queries = {row.qid: row.query_text for _, row in test_queries_df.iterrows()}
34 |
35 | # Create query-doc mapping from the test set
36 | ir_relevant_docs = defaultdict(list)
37 | for _, row in test_labels_df.iterrows():
38 | ir_relevant_docs[row.qid].append(row.docid)
39 |
40 | # Convert queries and documents to embeddings
41 | test_query_embeddings = model.encode(list(ir_test_queries.values()), convert_to_tensor=True).to(device)
42 | corpus_embeddings = model.encode(list(map(' '.join, ir_corpus.values())), convert_to_tensor=True).to(device)
43 |
44 | # Compute cosine similarity between queries and documents
45 | cos_scores = util.pytorch_cos_sim(test_query_embeddings, corpus_embeddings)
46 |
47 | # Get the top_k most similar documents for each query
48 | top_k = 5
49 | top_results = {}
50 | for query_index, (query_id, query) in enumerate(ir_test_queries.items()):
51 | relevant_docs_indices = cos_scores[query_index].topk(top_k).indices
52 | relevant_docs_scores = cos_scores[query_index].topk(top_k).values
53 | relevant_docs = [(list(ir_corpus.keys())[index], list(ir_corpus.values())[index]) for index in relevant_docs_indices]
54 | relevant_docs_with_scores = {str((doc_id, tool_name_api_name)): {'score': float(score)} for (doc_id, tool_name_api_name), score in zip(relevant_docs, relevant_docs_scores)}
55 |
56 | # Count the number of successful matches
57 | matches = len(set([doc_id for doc_id, _ in relevant_docs]) & set(ir_relevant_docs[query_id]))
58 |
59 | # Save query, original docs, top 5 docs with scores, and successful match count
60 | top_results[query] = {
61 | 'original_docs': [' '.join(ir_corpus[doc_id]) for doc_id in ir_relevant_docs[query_id]],
62 | 'top_docs': relevant_docs_with_scores,
63 | 'successful_matches': matches
64 | }
65 |
66 | # Save the results to a json file
67 | with open('top5_results_with_matches.json', 'w') as f:
68 | json.dump(top_results, f, indent=4)
--------------------------------------------------------------------------------
/evaluation/toolbench/retrieval/train.py:
--------------------------------------------------------------------------------
1 | import sys
2 | # Add the path to the sys.path
3 | sys.path.append('.')
4 | import logging
5 | import os
6 | import json
7 | import pandas as pd
8 | from datetime import datetime
9 | import torch
10 | import torch.nn as nn
11 | from sentence_transformers import SentenceTransformer, models, InputExample, losses, LoggingHandler
12 | from torch.utils.data import DataLoader
13 | from torch.utils.tensorboard import SummaryWriter
14 | from api_evaluator import APIEvaluator
15 | import argparse
16 |
17 | from toolbench.utils import process_retrieval_ducoment
18 |
19 |
20 |
21 | parser = argparse.ArgumentParser()
22 | parser.add_argument("--data_path", default=None, type=str, required=True,
23 | help="The input data dir. Should contain the .tsv files for the task.")
24 | parser.add_argument("--model_name", default=None, type=str, required=True,
25 | help="The base model name.")
26 | parser.add_argument("--output_path", default=None, type=str, required=True,
27 | help="The base path where the model output will be saved.")
28 | parser.add_argument("--num_epochs", default=5, type=int, required=True,
29 | help="Train epochs.")
30 | parser.add_argument("--train_batch_size", default=32, type=int, required=True,
31 | help="Train batch size.")
32 | parser.add_argument("--learning_rate", default=2e-5, type=float, required=True,
33 | help="Learning rate.")
34 | parser.add_argument("--warmup_steps", default=500, type=float, required=True,
35 | help="Warmup steps.")
36 | parser.add_argument("--max_seq_length", default=256, type=int, required=True,
37 | help="Max sequence length.")
38 | args = parser.parse_args()
39 |
40 | logging.basicConfig(format='%(asctime)s - %(message)s',
41 | datefmt='%Y-%m-%d %H:%M:%S',
42 | level=logging.INFO,
43 | handlers=[LoggingHandler()])
44 | logger = logging.getLogger(__name__)
45 |
46 | torch.manual_seed(42)
47 | torch.cuda.manual_seed(42)
48 |
49 | num_epochs = args.num_epochs
50 | train_batch_size = args.train_batch_size
51 | lr = args.learning_rate
52 | warmup_steps = args.warmup_steps
53 | data_path = args.data_path
54 | output_path = args.output_path
55 | os.makedirs(output_path, exist_ok=True)
56 |
57 | model_save_path = os.path.join(output_path, datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
58 | os.makedirs(model_save_path, exist_ok=True)
59 |
60 | tensorboard_name = 'name_desc'
61 | logs_writer = SummaryWriter(os.path.join(output_path, 'tensorboard', tensorboard_name))
62 |
63 |
64 | def log_callback_st(train_ix, global_step, training_steps, current_lr, loss_value):
65 | logs_writer.add_scalar('train_loss', loss_value, global_step)
66 | logs_writer.add_scalar('lr', current_lr[0], global_step)
67 |
68 |
69 | # Model definition
70 | model = SentenceTransformer(args.model_name)
71 | # word_embedding_model = models.Transformer(args.model_name, max_seq_length=args.max_seq_length)
72 | # pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
73 | # model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
74 |
75 | ir_train_queries = {}
76 | ir_test_queries = {}
77 | ir_relevant_docs = {}
78 | train_samples = []
79 |
80 | documents_df = pd.read_csv(os.path.join(data_path, 'corpus.tsv'), sep='\t')
81 | ir_corpus, _ = process_retrieval_ducoment(documents_df)
82 |
83 | train_queries_df = pd.read_csv(os.path.join(data_path, 'train.query.txt'), sep='\t', names=['qid', 'query'])
84 | for row in train_queries_df.itertuples():
85 | ir_train_queries[row.qid] = row.query
86 | train_queries_df = pd.read_csv(os.path.join(data_path, 'test.query.txt'), sep='\t', names=['qid', 'query'])
87 | for row in train_queries_df.itertuples():
88 | ir_test_queries[row.qid] = row.query
89 |
90 | labels_df = pd.read_csv(os.path.join(data_path, 'qrels.train.tsv'), sep='\t', names=['qid', 'useless', 'docid', 'label'])
91 | for row in labels_df.itertuples():
92 | sample = InputExample(texts=[ir_train_queries[row.qid], ir_corpus[row.docid]], label=row.label)
93 | train_samples.append(sample)
94 | labels_df = pd.read_csv(os.path.join(data_path, 'qrels.test.tsv'), sep='\t', names=['qid', 'useless', 'docid', 'label'])
95 | for row in labels_df.itertuples():
96 | ir_relevant_docs.setdefault(row.qid, set()).add(row.docid)
97 |
98 | train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size, pin_memory=True)
99 | train_loss = losses.MultipleNegativesRankingLoss(model)
100 | ir_evaluator = APIEvaluator(ir_test_queries, ir_corpus, ir_relevant_docs)
101 |
102 | # You may need to modify the .fit() method to ensure all data is moved to the correct device during parallel computations
103 |
104 | # model.fit(train_objectives=[(train_dataloader, train_loss)],
105 | # evaluator=ir_evaluator,
106 | # epochs=num_epochs,
107 | # warmup_steps=warmup_steps,
108 | # optimizer_params={'lr': lr},
109 | # output_path=model_save_path
110 | # )
111 |
112 | # evaluate
113 | ir_evaluator(model, output_path=model_save_path)
114 |
115 |
--------------------------------------------------------------------------------
/evaluation/toolbench/tooleval/README_ZH.md:
--------------------------------------------------------------------------------
1 | # 🛠️Tool Eval🤖
2 |
3 |
4 |
5 | By fine-tuning LLaMA on ToolBench, we obtain **ToolLLaMA**. Since human evaluation is very time-consuming, we follow [AlpacaEval](https://tatsu-lab.github.io/alpaca_eval/) and develop an efficient automatic evaluator, **ToolEval**, which includes two evaluation metrics:
6 |
7 | - **Pass rate**: the proportion of instructions that are completed successfully within a limited number of OpenAI API calls.
8 |
9 | - **Preference**: measured by comparing two answers (action sequences) for a given instruction. We pre-define a set of criteria for what makes a better answer and organize them into prompts for ChatGPT. We provide the evaluator with a test instruction and two candidate answers and obtain its preference. Each answer pair is evaluated multiple times to improve reliability, and we then compute the **win rate** (the percentage of times a method is chosen as better by the evaluator). See our paper for details.
10 |
11 | To verify the reliability of the ChatGPT evaluator on pass rate and win rate, we sample solution pairs from four different methods (ChatGPT+ReACT, ChatGPT+DFSDT, ToolLLaMA+DFSDT and GPT4+DFSDT) for 300 test instructions per method. We then ask human annotators to label the pass rate of ChatGPT+DFSDT, ToolLLaMA+DFSDT and GPT4+DFSDT, and the win rate between ChatGPT+ReACT and ChatGPT+DFSDT.
12 |
13 | Our ChatGPT evaluator reaches **87.1%** agreement with human annotators on pass rate and **80.3%** agreement on win rate. This shows that our evaluator's judgments are very close to human ones, and it can be regarded as a reliable evaluator that simulates human assessment of pass rate and win rate.
14 | For more details about ToolEval, please refer to our paper.
15 |
16 | ## 🚀Usage
17 |
18 | ### Install
19 | Install Package (python>=3.9)
20 | ```bash
21 | pip install -r requirements.txt
22 | ```
23 |
24 | ### Evaluation
25 | *To reproduce the results, download our `reproduction_data.zip` directly from [Google Drive](https://drive.google.com/drive/folders/1yBUQ732mPu-KclJnuQELEhtKakdXFc3J), unzip it, and place `reproduction_data` under `ToolBench/data/`; the data preparation steps can then be skipped.*
26 | - Data preparation. To evaluate your own model and method with ToolEval, first prepare all model predictions for the six test subsets. Create a directory named after your model and method, e.g. `chatgpt_cot`, then put the predictions for each test set under that directory. The file structure of the directory should be:
27 | ```
28 | ├── /chatgpt_cot/
29 | │ ├── /G1_instruction/
30 | │ │ ├── /10160_CoT@1.json
31 | │ │ └── ...
32 | │ ├── /G1_tool/
33 | │ │ ├── /10221_CoT@1.json
34 | │ │ └── ...
35 | │ ├── ...
36 | │ ├── /G3_instruction/
37 | │ │ ├── /10221_CoT@1.json
38 | │ │ └── ...
39 | ```
40 |
41 | Then preprocess the model predictions:
42 |
43 | ```bash
44 | export RAW_ANSWER_PATH=../../data/reproduction_data/model_predictions/
45 | export CONVERTED_ANSWER_PATH=../../data/reproduction_data/model_predictions_converted/
46 | export MODEL_NAME=chatgpt_cot
47 | export METHOD=CoT
48 | mkdir ${CONVERTED_ANSWER_PATH}/${MODEL_NAME}
49 | for test_set in G1_instruction G1_category G1_tool G2_category G2_instruction G3_instruction
50 | do
51 | answer_dir=${RAW_ANSWER_PATH}/${MODEL_NAME}/${test_set}
52 | output_file=${CONVERTED_ANSWER_PATH}/${MODEL_NAME}/${test_set}.json
53 | python convert_to_answer_format.py\
54 | --answer_dir ${answer_dir} \
55 | --method ${METHOD} \
56 | --output ${output_file}
57 | done
58 | ```
59 | Afterwards, check whether the preprocessed JSON files for the test sets exist under `${CONVERTED_ANSWER_PATH}/${MODEL_NAME}`. If they do, you are ready to run the evaluation steps below. If not, check whether something is wrong with the model predictions.
60 |
61 | - OpenAI Key
62 | Prepare your OpenAI key to set up our evaluator. The keys need to be stored in a JSON file, e.g. `path/to/your/openai_key_json_file.json`:
63 | ```json
64 | [
65 | {
66 | "username": "your_user_name",
67 | "passwd": "your_password",
68 | "api_key": "your_openai_key",
69 | "organization": "your_organization"
70 | },
71 | ...
72 | ]
73 | ```
74 | - Pass rate.
75 | ```bash
76 | export CONVERTED_ANSWER_PATH=../../data/reproduction_data/model_predictions_converted/
77 | export SAVE_PATH=pass_rate_results
78 | export CANDIDATE_MODEL=chatgpt_cot
79 | export API_POOL_FILE=path/to/your/openai_key_json_file.json
80 |
81 | python eval_pass_rate.py \
82 | --converted_answer_path ${CONVERTED_ANSWER_PATH} \
83 | --save_path ${SAVE_PATH} \
84 | --reference_model ${CANDIDATE_MODEL} \
85 | --test_ids ../../data/test_query_ids/ \
86 | --max_eval_threads 20 \
87 | --evaluate_times 4
88 |
89 | ```
90 |
91 | The result files will be stored under ${SAVE_PATH}.
92 |
93 | - Win rate. The following example takes ChatGPT-ReACT as the reference model and GPT4-ReACT as the candidate model. Note that you first need to obtain the pass rate results of both models, then run the following command to evaluate the win rate of GPT4-ReACT:
94 | ```bash
95 | export CONVERTED_ANSWER_PATH=../../data/reproduction_data/model_predictions_converted/
96 | export SAVE_PATH=preference_results
97 | export PASS_RATE_PATH=pass_rate_results
98 | export REFERENCE_MODEL=chatgpt_cot
99 | export CANDIDATE_MODEL=gpt-4-0613_cot
100 | export API_POOL_FILE=path/to/your/openai_key_json_file.json
101 |
102 | python eval_preference.py \
103 | --converted_answer_path ${CONVERTED_ANSWER_PATH} \
104 | --reference_model ${REFERENCE_MODEL} \
105 | --output_model ${CANDIDATE_MODEL} \
106 | --test_ids ../../data/test_query_ids/ \
107 | --save_path ${SAVE_PATH} \
108 | --pass_rate_result_path ${PASS_RATE_PATH} \
109 | --max_eval_threads 20 \
110 | --use_pass_rate true \
111 | --evaluate_times 4
112 | ```
113 |
114 | The result files will be stored under ${SAVE_PATH}.
115 |
116 | ### Evaluating a new method
117 | To evaluate methods other than ReACT and DFSDT, prepare your converted answer data following the data preparation steps above. The converted answers must follow this JSON format:
118 |
119 | ```json
120 | [
121 | {
122 | "method":"method name",
123 | "total_steps": int, // a integer count total steps in answer details
124 | "final_answer": "final answer from the method",
125 | "answer_details":[{
126 | "role":"node role, can be system, user, assistant and tool",
127 | "message":"message for the node",
128 | "next":[//next steps, can have multiple elements if the node have multiple candidates.
129 | {
130 | "role":"",
131 | "message":"",
132 | "next":[...]
133 | },
134 | ...//more candidates
135 | ]
136 | }]
137 | }
138 | ... // more answers for the given query in the test data
139 | ]
140 | ```
141 |
142 |
143 | ### Updating the leaderboard
144 |
145 | If you want your model's results to appear on the [ToolEval Leaderboard](https://openbmb.github.io/ToolBench/), please organize your result files into the format above and send them to us (urtoolbench@gmail.com), or open a pull request.
146 | We will run the evaluation scripts to update the results and add your model to the leaderboard.
147 |
148 |
149 | ### Creating a new automatic evaluator
150 | To create a new automatic evaluator, follow these steps:
151 | 1. Create an evaluator configuration directory under `toolbench/tooleval/evaluators`, named after your evaluator, and add a `config.yaml` file and a `template.txt` file to it. For the concrete configuration format, see the implementation in `toolbench/tooleval/evaluators/tooleval_gpt-3.5-turbo_normalized`.
152 | 2. Create your evaluator class and implement the `fn_completions` function in the folder `toolbench/tooleval/evaluators/registered_cls`, or use one of our predefined classes such as `OpenAINormalizedEvaluator`.
153 | Then set the `registered_cls_name` field in the config file to the name of that class.
154 | Here is an example:
155 | ```Python
156 | from evaluators import register_evaluator,BaseEvaluator
157 | from typing import Dict,List
158 |
159 | @register_evaluator
160 | class MyEvaluator(BaseEvaluator):
161 | def __init__(self,config):
162 | super().__init__(
163 | fn_completions=self.fn_completions,
164 | )
165 | # set your configures here
166 |
167 | def fn_completions(self,query:Dict,answers:List[Dict])->int:
168 | # implement your evaluator here
169 | # return the index of the preferred answer
170 | return 0
171 | ```
172 | Here `register_evaluator` is a decorator used to register the evaluator, and `BaseEvaluator` is a base class that implements the basic functionality of an evaluator.
173 | 3. Test the performance of the evaluator by running the script `evaluators_comparison.py`.
174 |
--------------------------------------------------------------------------------
/evaluation/toolbench/tooleval/ToolBench.code-workspace:
--------------------------------------------------------------------------------
1 | {
2 | "folders": [
3 | {
4 | "path": "../../../ToolBench"
5 | },
6 | {
7 | "path": "../.."
8 | },
9 | {
10 | "path": "../../../STC/RapidAPI-Server"
11 | }
12 | ],
13 | "settings": {
14 | "git.ignoreLimitWarning": true
15 | }
16 | }
--------------------------------------------------------------------------------
/evaluation/toolbench/tooleval/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Reason-Wang/ToolGen/6839374a255810efe69deea4056eec5c55e25802/evaluation/toolbench/tooleval/__init__.py
--------------------------------------------------------------------------------
/evaluation/toolbench/tooleval/automatic_eval_sample.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import time
4 | from concurrent.futures import ThreadPoolExecutor,as_completed
5 | from tqdm import tqdm
6 | import numpy as np
7 | import argparse
8 | import random
9 | from evaluation import UserEvaluation,BaseToolMethod
10 | from evaluators import load_registered_automatic_evaluator
11 | from typing import List,Dict,Callable
12 | import pandas as pd
13 |
14 | abs_dir = os.path.split(__file__)[0]
15 |
16 |
17 | def parse_args():
18 | parser = argparse.ArgumentParser()
19 | parser.add_argument('--output',default=os.path.join(abs_dir,'dataset','test.json'),help='where to store the method output.')
20 | parser.add_argument('--method',default='unknown',help='the name of the method.')
21 | parser.add_argument('--ref_method',default='gpt-3.5-turbo_CoT',help='the name of the reference method.')
22 | parser.add_argument('--ref_output',default=os.path.join(abs_dir,'dataset','ref_sample.json'),help='where the reference answer is stored.')
23 | parser.add_argument('--evaluators_cfg_path',default=os.path.join(abs_dir,'evaluators'),help='where the evaluators config files are stored')
24 | parser.add_argument('--evaluator',default='tooleval_gpt-3.5-turbo_normalized',help='which evaluator to use')
25 | parser.add_argument('--max_eval_threads',default=16,type=int,help='how many threads to use for evaluation')
26 | parser.add_argument('--evalset',default='default_evalset',help='which evaluation dataset to use')
27 | parser.add_argument('--eval_server_address',default='http://localhost:8000',help='the address of the evaluation server')
28 | parser.add_argument('--use_existed_output',default=False,action='store_true',help='whether to use the existed output')
29 |
30 | return parser.parse_args()
31 |
32 |
33 | ## !!define your method here !!
34 | class SampleMethod(BaseToolMethod):
35 | def __init__(self):
36 | super().__init__()
37 | def forward(self,query:str,tools:List[Dict],tool_func:Callable)->Dict:
38 | return {}
39 | def convert_result_to_dict(self,result):
40 | return {
41 | 'method': 'sample',
42 | 'total_steps': 0,
43 | 'final_answer': '',
44 | 'answer_details': []
45 | }
46 |
47 | if __name__=='__main__':
48 | args = parse_args()
49 |
50 | exec_generating_method_outputs = True
51 | if os.path.exists(args.output):
52 | print('Output file {} already exists!'.format(args.output))
53 | if args.use_existed_output:
54 | exec_generating_method_outputs = False
55 | else:
56 | print('Overwrite? (y/n)')
57 | exec_generating_method_outputs = input()=='y'
58 |
59 | if exec_generating_method_outputs:
60 | ## change the SampleMethod to your method
61 | usereval = UserEvaluation(SampleMethod(),args.eval_server_address,args.evalset)
62 | print('Generating method outputs...')
63 | results = usereval.run()
64 | print('Saving method outputs...')
65 | with open(args.output,'w') as f:
66 | json.dump(results,f)
67 | else:
68 | print('Use existed output.')
69 | results = json.load(open(args.output))
70 |
71 | print('Loading reference answer for evaluation...')
72 | try:
73 | ref_output = json.load(open(args.ref_output))
74 | except:
75 | raise Exception('Cannot load reference answer from {}\n Please Download before evaluation!'.format(args.ref_output))
76 |
77 | print('Loading automatic evaluators...')
78 | evaluators = [load_registered_automatic_evaluator(vars(args)) for _ in range(args.max_eval_threads)]
79 |
80 | def get_preference(qid,query,tools,ref_ans,ans,):
81 | global evaluators
82 | evaluator = random.choice(evaluators)
83 | ret = evaluator.annotate_preference(
84 | query,
85 | tools,
86 | [ref_ans,ans])
87 | return qid,ret
88 | def get_most_preferred(d:list)->np.ndarray:
89 | if np.iterable(d):
90 | d = np.asanyarray(d)
91 | bins = np.bincount(d)
92 | max_val = np.max(bins)
93 | argmax = np.where(max_val==bins)[0]
94 | return argmax
95 | else:
96 | return np.asarray([d])
97 |
98 | print('Evaluating...')
99 | prefer_dict = {}
100 | with ThreadPoolExecutor(args.max_eval_threads) as pool:
101 | future = []
102 | for qid in ref_output.keys():
103 | try:
104 | future.append(pool.submit(
105 | get_preference,
106 | qid,
107 | ref_output[qid]['query'],
108 | ref_output[qid]['available_tools'],
109 | ref_output[qid]['answer'],
110 | results[qid]['answer']
111 | ))
112 | except KeyError as e:
113 | print('Warning : Missing answer for query {} in answer file! '.format(e))
114 |
115 | for thd in tqdm(as_completed(future),total=len(future),ncols=100):
116 | qid,preference = thd.result()
117 | prefer_dict[qid] = get_most_preferred(preference)[0]
118 |
119 | prefer = list(prefer_dict.values())
120 |
121 | prefer = np.array(prefer)
122 | df = pd.DataFrame.from_dict([{
123 | 'Method':args.method,
124 | 'Win Rate':prefer.mean(),
125 | 'Std Error':np.std(prefer)/np.sqrt(len(prefer))
126 | }])
127 | print('###### Leaderboard vs {} ######'.format(args.ref_method))
128 | print(df)
129 | save_file = os.path.join(abs_dir,'results',args.evalset,args.method)
130 | os.makedirs(save_file,exist_ok=True)
131 | df.to_csv(os.path.join(save_file,'win.csv'))
132 |
--------------------------------------------------------------------------------
/evaluation/toolbench/tooleval/convert_answers.py:
--------------------------------------------------------------------------------
1 | from convert_to_answer_format import process_invalid_data,process_valid_data
2 | import json
3 | from glob import glob
4 | import os
5 |
6 | save_dir = 'path/to/save/dir'
7 |
8 | groups_dirs = ['path/to/dataset/eval/result/folders']
9 |
10 | for groups_dir in groups_dirs:
11 | method = os.path.split(groups_dir)[1]
12 | print(method)
13 | groups_save_dir = os.path.join(save_dir,method)
14 | os.makedirs(groups_save_dir,exist_ok=True)
15 | groups = [os.path.split(g)[1] for g in glob(groups_dir+'/*')]
16 | full_answer = {}
17 | for g in groups:
18 | print(g)
19 | answer_dict = {}
20 | files = glob(os.path.join(groups_dir,g,'*.json'))
21 | for file in files:
22 | qid = os.path.split(file)[1].split('_')[0]
23 | try:
24 | data = json.load(open(file))
25 | except:
26 | print('Read error: ',file)
27 | continue
28 | if not data['answer_generation']['valid_data']:
29 | answer_dict[qid] = process_invalid_data(method,data)
30 | else:
31 | answer_dict[qid] = process_valid_data(method,data['answer_generation'])
32 | json.dump(answer_dict,open(os.path.join(groups_save_dir,f'{g}.json'),'w'))
33 | full_answer.update(answer_dict)
34 | # json.dump(full_answer,open(os.path.join(groups_save_dir,f'fullanswer.json'),'w'))
--------------------------------------------------------------------------------
/evaluation/toolbench/tooleval/evaluation/__init__.py:
--------------------------------------------------------------------------------
1 | from .usereval import UserEvaluation
2 | from .methodcls import BaseToolMethod
3 | from .dataclass import ExecutionGraph,ExecutionNode,DirectedEdge
--------------------------------------------------------------------------------
/evaluation/toolbench/tooleval/evaluation/methodcls.py:
--------------------------------------------------------------------------------
1 | from typing import Dict, List,Callable
2 |
3 | class BaseToolMethod:
4 | def __init__(self):
5 | pass
6 | def convert_result_to_dict(self,result):
7 | '''Return Format
8 | --------
9 | {
10 | 'method': 'method name',
11 | 'total_steps': int,
12 | 'final_answer': 'answer',
13 | 'answer_details': [{
14 | "role": "system",
15 | "message": "",
16 | "next": [
17 | {
18 | "role": "user",
19 | "message": "I am planning ...",
20 | "next": [
21 | {
22 | "role": "tool",
23 | "message": "{'name': 'Finish', 'arguments': '{\\n \"return_type\": \"give_answer\",\\n \"final_answer\": \"I encountere...",
24 | "next": []
25 | }
26 | ]
27 | }
28 | ]
29 | }]
30 | }
31 |
32 | '''
33 | pass
34 | def forward(self,query:str,tools:List[Dict],tool_func:Callable)->Dict:
35 | pass
36 |
37 | def __call__(self,query:str,tools:List[Dict],tool_func:Callable)->Dict:
38 | result = self.forward(query,tools,tool_func)
39 | return self.convert_result_to_dict(result)
40 |
41 |
--------------------------------------------------------------------------------
/evaluation/toolbench/tooleval/evaluation/usereval.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from tqdm import tqdm
3 | from typing import Union, Dict, List, Optional,Tuple
4 | from .methodcls import BaseToolMethod
5 | from .dataclass import *
6 | import json
7 |
8 | class UserEvaluation:
9 | def __init__(self,
10 | method:BaseToolMethod,
11 | eval_server_addr='http://localhost:8000',
12 | evalset='eval20230718'):
13 | self.eval_server_addr = eval_server_addr
14 | self.evalset = evalset
15 | self.method = method
16 | res = requests.post(self.eval_server_addr+'/neweval',json=self.evalset)
17 | if res.status_code != 200:
18 | raise Exception('Failed to obtain new evaluation id! Error: '+res.text)
19 | ret = res.json()
20 | self.eval_id = ret['evaluation_id']
21 | self.len = ret['len']
22 |
23 | def get_new_question(self)->Tuple[str,List]:
24 | res = requests.post(self.eval_server_addr+'/next_question',json=self.eval_id)
25 | if res.status_code == 204:
26 | raise EvalCompleted()
27 | if res.status_code != 200:
28 | raise Exception('Failed to obtain new question!')
29 |
30 | self.question = Question(**res.json())
31 | self.tool_name_to_id = {}
32 | tools = [tool.model_dump() for tool in self.question.available_tools]
33 | for tool in tools:
34 | self.tool_name_to_id[tool['name']] = tool.pop('tid')
35 |
36 |
37 | return self.question.query,tools
38 | def tool_func(self,tool_name:str,tool_args:str)->requests.Response:
39 | tid = self.tool_name_to_id[tool_name]
40 | # res = requests.post(self.eval_server_addr+'/api',json={
41 | # 'evaluation_id':self.eval_id,
42 | # 'tool_id':tid,
43 | # 'tool_args':tool_args
44 | # })
45 | res = requests.post(self.eval_server_addr+'/rapidapi',json={
46 | 'evaluation_id':self.eval_id,
47 | 'tool_id':tid,
48 | 'tool_args':tool_args
49 | })
50 |
51 | return res
52 | def _forward(self,query:str,tools:List[Dict])->Dict:
53 | method_ret = self.method(query,tools,self.tool_func)
54 |
55 | return self.question.qid,{
56 | 'query':query,
57 | 'available_tools':tools,
58 | 'answer':method_ret
59 | }
60 |
61 |
62 | def run(self)->Dict:
63 | results = {}
64 | for _ in tqdm(range(self.len),ncols=100):
65 | try:
66 | qid,ret = self._forward(*self.get_new_question())
67 | except EvalCompleted:
68 | return results
69 | results[qid] = ret
70 | return results
71 |
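A minimal driver sketch (not in the file), assuming the ToolBench evaluation server is reachable at the default address and that `EchoMethod` is any `BaseToolMethod` subclass such as the one sketched above:

    import json
    from evaluation.toolbench.tooleval.evaluation import UserEvaluation

    evaluation = UserEvaluation(method=EchoMethod(),
                                eval_server_addr='http://localhost:8000',
                                evalset='eval20230718')
    results = evaluation.run()  # {qid: {'query', 'available_tools', 'answer'}}
    json.dump(results, open('results.json', 'w'))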
--------------------------------------------------------------------------------
/evaluation/toolbench/tooleval/evaluators/__init__.py:
--------------------------------------------------------------------------------
1 | from .registered_cls import BaseEvaluator,register_evaluator,get_evaluator_cls
2 |
3 | __all__=['register_evaluator','get_evaluator_cls','BaseEvaluator','load_registered_automatic_evaluator']
4 |
5 |
6 |
7 | def load_registered_automatic_evaluator(config:dict={},evaluator_name=None,evaluators_cfg_path=None)->BaseEvaluator:
8 | import os
9 | import yaml
10 |
11 | evaluator_name = config['evaluator'] if evaluator_name is None else evaluator_name
12 | cfg_path = config['evaluators_cfg_path'] if evaluators_cfg_path is None else evaluators_cfg_path
13 | cfg_path = os.path.join(cfg_path,evaluator_name)
14 |
15 | cls_name = yaml.load(open(os.path.join(cfg_path,'config.yaml')),Loader=yaml.FullLoader)['registered_cls_name']
16 | # print(evaluator_name)
17 | # print(cfg_path)
18 | # print(cls_name)
19 |
20 | evaluator:BaseEvaluator = get_evaluator_cls(cls_name)(cfg_path)
21 | # print(type(evaluator))
22 | return evaluator
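For reference, a hypothetical call that loads one of the evaluator configurations shipped under `evaluators/` (run from the `tooleval` directory, as `evaluators_comparison.py` does):

    from evaluators import load_registered_automatic_evaluator

    # Assumes the current working directory is evaluation/toolbench/tooleval.
    evaluator = load_registered_automatic_evaluator({
        'evaluators_cfg_path': 'evaluators',
        'evaluator': 'tooleval_gpt-3.5-turbo_normalized',
    })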
--------------------------------------------------------------------------------
/evaluation/toolbench/tooleval/evaluators/registered_cls/__init__.py:
--------------------------------------------------------------------------------
1 | from .base import BaseEvaluator
2 | from .utils import register_evaluator,get_evaluator_cls
3 |
4 | __all__ = ['register_evaluator','get_evaluator_cls','BaseEvaluator']
5 |
6 | import os
7 | import importlib
8 | current_dir = os.path.dirname(__file__)
9 |
10 | for item in os.listdir(current_dir):
11 | item_path = os.path.join(current_dir, item)
12 |
13 | if os.path.isfile(item_path) and item != '__init__.py' and item.endswith('.py'):
14 | module_name = item[:-3]
15 |
16 | full_module_path = f"{__name__}.{module_name}"
17 |
18 | imported_module = importlib.import_module(full_module_path)
19 |
20 | globals()[module_name] = imported_module
21 |
--------------------------------------------------------------------------------
/evaluation/toolbench/tooleval/evaluators/registered_cls/base.py:
--------------------------------------------------------------------------------
1 | import random
2 | from typing import List, Union, Dict, Any, Callable
3 | import os
4 | import openai
5 | import yaml
6 | from .utils import register_evaluator
7 |
8 | def process_answer(answer: Dict):
9 | answer['final_answer'] = answer['final_answer'][:1000]
10 | answer['answer_details'] = answer['answer_details'][:3000]
11 | answer.pop('method', None)
12 | return answer
13 |
14 |
15 | def process_tools(tools: List[Dict]):
16 | # print(len(tools))
17 | for tool in tools:
18 | # try:
19 |
20 | # print(tool)
21 | tool.pop('description', None)
22 | tool.pop('parameters', None)
23 | # except Exception as e:
24 | # print(tool)
25 | # raise e
26 | return tools
27 |
28 | @register_evaluator
29 | class BaseEvaluator:
30 | """Base class for evaluators.
31 |
32 | Attributes:
33 | ----------
34 | fn_completions : Callable[[Dict,List[Dict]],int]
35 | The completion function of the evaluator, used to get annotated results.
36 |             This function takes the task description (Dict), the candidate answers (List[Dict]), and the task/answer status arguments passed by `annotate_preference`, and returns an int standing for the index of the best answer.
37 |
38 | Functions:
39 | ---------
40 | annotate_preference : Callable
41 | Annotate and return the index of the preferred answer.
42 |
43 | """
44 | def __init__(self,
45 | fn_completions: Callable[[Dict,List[Dict]],int] = None,
46 | *args,
47 | **kwargs):
48 | self.fn_completions = fn_completions
49 | # print(fn_completions)
50 | def annotate_preference(self,
51 | query: str,
52 | available_tools: List[Dict[Any, Any]],
53 | answers:List[Dict],
54 | multisample=False,
55 | sample_n=4,
56 | task_status=None,
57 | answer_statuss=[None, None]) -> Union[List[int], int]:
58 | """Annotate and return the index of the preferred answer.
59 |
60 | For given query, available tools, and two answers, return the index of the preferred answer by calling function `fn_completions` of the evaluator.
61 |
62 | Parameters:
63 | ----------
64 | query : str
65 | The query of the task.
66 | available_tools : List[Dict[Any, Any]]
67 | The list of available tools for the task. The specific format of the tool is defined in `tooleval/evaluation/dataclass.py`
68 | answers : List[Dict]
69 | The list of answers for comparison.
70 | multisample : bool, optional
71 | Whether to use multisample to get the preference. If True, the function will return a list of preferences, otherwise return a single preference.
72 | sample_n : int, optional
73 | The number of samples to get the preference.
74 |
75 | Returns:
76 | -------
77 | preference : Union[List[int], int]
78 | The index of the preferred answer. If `multisample` is True, return a list of preferences, otherwise return a single preference.
79 |
80 | Raise:
81 | -----
82 |
83 | """
84 | answers_processed = [process_answer(ans) for ans in answers]
85 | # print("Available tools:", available_tools)
86 | if isinstance(available_tools, dict):
87 | available_tools = list(available_tools.values())
88 | available_tools = process_tools(available_tools)
89 |
90 | def shuffle_run() -> int:
91 | indexs = list(range(len(answers_processed)))
92 | random.shuffle(indexs)
93 |
94 | answers_projected = [answers[idx] for idx in indexs]
95 |
96 | try:
97 | preferred_index = self.fn_completions(
98 | {
99 | 'query':query,
100 | 'available_tools':available_tools,
101 | },
102 | answers_projected,
103 | task_status,
104 | answer_statuss
105 | )
106 | except openai.BadRequestError as e:
107 | print(f"Error: {e}, set reference model to win.")
108 | preferred_index = 0
109 |
110 | if preferred_index in indexs:
111 | return indexs.index(preferred_index)
112 | raise ValueError(f'Preferred index {preferred_index} is invalid!')
113 |
114 | if not multisample:
115 | return shuffle_run()
116 | else:
117 | prefers = [shuffle_run() for _ in range(sample_n)]
118 | return prefers
119 |
120 | @register_evaluator
121 | class ToolEvalEvaluator(BaseEvaluator):
122 | """ToolEval common evaluator class.
123 |
124 | Attributes:
125 | ----------
126 | cfg_path : str
127 | A path store the configuration of the evaluator.
128 |
129 |
130 | """
131 | def __init__(self,
132 | cfg_path: str = None,
133 | ):
134 | eval_config = yaml.load(open(os.path.join(cfg_path,'config.yaml')),Loader=yaml.FullLoader)
135 | template = open(os.path.join(cfg_path,eval_config['prompt_template'])).read()
136 |
137 | super().__init__(
138 | fn_completions=getattr(self,eval_config['fn_completions'])
139 | )
140 | self.eval_config = eval_config
141 | self.template = template
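To show the control flow of `annotate_preference` in isolation, here is a self-contained toy (not in the repository) that wires `BaseEvaluator` to a dummy completion function; the "prefer the shorter answer" heuristic is an assumption made purely for the demo:

    def dummy_completions(task_description, answers, task_status=None, answer_statuss=None) -> int:
        # Demo heuristic: prefer the answer with the shorter final_answer.
        return min(range(len(answers)), key=lambda i: len(answers[i]['final_answer']))

    evaluator = BaseEvaluator(fn_completions=dummy_completions)
    answers = [
        {'method': 'a', 'final_answer': 'Short.', 'answer_details': '...'},
        {'method': 'b', 'final_answer': 'A considerably longer answer.', 'answer_details': '...'},
    ]
    preference = evaluator.annotate_preference(
        query='Example query',
        available_tools=[{'name': 'example_tool', 'description': 'demo', 'parameters': {}}],
        answers=answers,
    )
    print(preference)  # 0: the shorter answer wins no matter how the shuffle lands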
--------------------------------------------------------------------------------
/evaluation/toolbench/tooleval/evaluators/registered_cls/utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | from typing import List,Dict
4 | import requests
5 | from tenacity import retry, wait_random_exponential, stop_after_attempt
6 |
7 | from openai import OpenAI, AzureOpenAI
8 | import random
9 |
10 | __registered_evaluators__ = {}
11 |
12 | def register_evaluator(cls):
13 | """
14 | Decorator function to register classes with the registered_evaluators list.
15 | """
16 | __registered_evaluators__[cls.__name__] = cls
17 | return cls
18 |
19 | def get_evaluator_cls(clsname):
20 | """
21 | Return the evaluator class with the given name.
22 | """
23 | try:
24 | return __registered_evaluators__.get(clsname)
25 | except:
26 | raise ModuleNotFoundError('Cannot find evaluator class {}'.format(clsname))
27 |
28 |
29 | class OpenaiPoolRequest:
30 | def __init__(self, pool_json_file=None):
31 | self.pool:List[Dict] = []
32 | __pool_file = pool_json_file
33 | if os.environ.get('API_POOL_FILE',None) is not None:
34 | __pool_file = os.environ.get('API_POOL_FILE')
35 | self.now_pos = random.randint(-1, len(self.pool))
36 |         if __pool_file is not None and os.path.exists(__pool_file):
37 | self.pool = json.load(open(__pool_file))
38 | self.now_pos = random.randint(-1, len(self.pool))
39 | # print(__pool_file)
40 | if os.environ.get('OPENAI_KEY',None) is not None:
41 | self.pool.append({
42 | 'api_key':os.environ.get('OPENAI_KEY'),
43 | 'organization':os.environ.get('OPENAI_ORG',None),
44 | 'api_type':os.environ.get('OPENAI_TYPE',None),
45 | 'api_version':os.environ.get('OPENAI_VER',None)
46 | })
47 |
48 | # @retry(wait=wait_random_exponential(multiplier=1, max=30), stop=stop_after_attempt(10),reraise=True)
49 | def request(self,messages,**kwargs):
50 | self.now_pos = (self.now_pos + 1) % len(self.pool)
51 | key_pos = self.now_pos
52 | item = self.pool[key_pos]
53 | api_key = item['api_key']
54 | api_version = item.get('api_version', None)
55 | api_base = item.get('api_base', None)
56 |
57 | # if kwargs.get('model') == 'gpt-4o':
58 | # client = AzureOpenAI(
59 | # api_key=api_key,
60 | # api_version=api_version,
61 | # azure_endpoint = api_base,
62 | # )
63 | # else:
64 | if api_base:
65 |             client = OpenAI(api_key=api_key, base_url=api_base)
66 | else:
67 | client = OpenAI(api_key=api_key)
68 |
69 | response = client.chat.completions.create(messages=messages,**kwargs)
70 | return response
71 |
72 | def __call__(self,messages,**kwargs):
73 | return self.request(messages,**kwargs)
74 |
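A small illustration (not part of the file) of the registry helpers; `MyEvaluator` is a hypothetical class:

    from evaluators.registered_cls import BaseEvaluator, register_evaluator, get_evaluator_cls

    @register_evaluator
    class MyEvaluator(BaseEvaluator):
        pass

    assert get_evaluator_cls('MyEvaluator') is MyEvaluator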
--------------------------------------------------------------------------------
/evaluation/toolbench/tooleval/evaluators/tooleval_gpt-3.5-turbo_default/config.yaml:
--------------------------------------------------------------------------------
1 | evaluator_name: "tooleval_gpt-3.5-turbo_default"
2 | registered_cls_name: "ReinforceToolLearningEvaluator"
3 | prompt_template: "template.txt"
4 | fn_completions: "normalized_openai_completions"
5 | apis_json: "your/path/to/api_pool.json"
6 | completions_kwargs:
7 | model: "gpt-3.5-turbo-16k"
8 | max_tokens: 1000
9 | temperature: 0.2
10 | timeout: 10
11 | functions:
12 | - name: "check_answer_status"
13 |     description: "Parse the json answer with layered nodes and return the answer_status about the answer"
14 | parameters:
15 | type: "object"
16 | properties:
17 | answer_status:
18 | type: "string"
19 | enum: ["Unsure","Unsolved","Solved"]
20 | required: ["answer_status"]
21 | - name: "parse_answer_status"
22 |     description: "Parse the json answer with layered nodes and return the answer_status about the answer"
23 | parameters:
24 | type: "object"
25 | properties:
26 | answer_status:
27 | type: "string"
28 | enum: ["Unsure","Unsolved","Solved"]
29 | required: ["answer_status"]
30 | - name: "check_task_solvable"
31 | description: "Parse the task description and return the task_status about the task"
32 | parameters:
33 | type: "object"
34 | properties:
35 | task_status:
36 | type: "string"
37 | enum: ["Unsure","Unsolvable","Solvable"]
38 | required: ["task_status"]
39 | - name: "select_better_answer"
40 | description: "Select the better answer with a comprehensive investigation on given aspects. You should ignore the impact of the order of candidate answers."
41 | parameters:
42 | type: "object"
43 | properties:
44 | index:
45 | type: "number"
46 | description: "The `index` value in the selected better answer."
47 | required: ["index"]
48 | fn_completion_parser: "index_parser"
49 | batch_size: 1
50 |
--------------------------------------------------------------------------------
/evaluation/toolbench/tooleval/evaluators/tooleval_gpt-3.5-turbo_default/template.txt:
--------------------------------------------------------------------------------
1 |
2 | check_answer_status
3 |
4 | Given the query and answer, you need to give the `answer_status` of the answer by following these rules:
5 | 1. If the answer is a sorry message or not a positive/straight response for the given query, return "Unsolved".
6 | 2. If the answer is a positive/straight response for the given query, you have to check further.
7 | 2.1 If the answer is not sufficient to determine whether it solves the query or not, return "Unsure".
8 | 2.2 If you are confident that the answer is sufficient to determine whether it solves the query or not, return "Solved" or "Unsolved".
9 |
10 | Query:
11 | {query}
12 | Answer:
13 | {answer}
14 |
15 | Now give your reason in "content" and `answer_status` of JSON to `check_answer_status`.
16 |
17 |
18 |
19 |
20 | parse_answer_status
21 |
22 | Given the query and the corresponding execution detail of an answer, you need to give the `answer_status` of the answer by following these rules:
23 | 1. If all 'tool' nodes' messages indicate that errors happened, return "Unsolved"
24 | 2. If you find the information in the "final_answer" is not true/valid according to the messages in 'tool' nodes, return "Unsolved"
25 | 3. If you are unable to verify the authenticity and validity of the information, return "Unsure"
26 | 4. If there are 'tool' nodes in the chain that contain successful function calls and those calls indeed solve the query, return "Solved"
27 |
28 | Query:
29 | {query}
30 | Answer:
31 | {answer}
32 |
33 | Now you are requested to give reason in "content" and `answer_status` of JSON to `parse_answer_status`.
34 |
35 |
36 |
37 |
38 | check_task_solvable
39 |
40 | Please check whether the given task is solvable with the following rules:
41 | 1. If the `query` provide invalid information (e.g. invalid email address or phone number), return "Unsolvable"
42 | 2. If the `query` needs more information to solve (e.g. the target restaurant name in a navigation task), return "Unsolvable"
43 | 3. If you are unable to draw a conclusion, return "Unsure"
44 | 4. If the currently `available_tools` are enough to solve the query, return "Solvable"
45 |
46 | Task:
47 | {task}
48 |
49 | Now give your reason in "content" and `task_status` of JSON to `check_task_solvable`.
50 |
51 |
52 |
53 |
54 |
55 |
56 | select_better_answer
57 |
58 | Query:
59 | {query}
60 |
61 | Answer_0:
62 | {answer_0}
63 |
64 | Answer_1:
65 | {answer_1}
66 |
67 | Given the above query and answers in JSON format, you must follow the rules to select the relatively better answer and give the index of the answer **(0 for Answer_0, 1 for Answer_1)**:
68 | 1. Compare the value of "final_answer" in the following aspects:
69 |     - Informative: whether it contains all the necessary information to reply to the query.
70 |     - Factuality: whether it accurately describes what has been done, and what failed in the end.
71 |     - Reasoning: if the answer does not solve the query, whether it gives a detailed and accurate reason for the failure.
72 | 2. If you cannot determine yet, compare the value of "answer_details" in the following aspects:
73 |     - Tool calling costs: calculate the percentage of failed and repeated tool calls.
74 |     - Running costs: calculate the total tokens T used in execution.
75 |     - Milestone: calculate the milestones (fixed subtasks) reached in execution.
76 |     - Exploration: whether it tries potentially useful tools in execution. Just count the number of successful tool calls with different tools/arguments in execution.
77 | 
78 | If you have made your decision, call `select_better_answer`; otherwise, if you cannot determine, select a random answer.
79 |
80 |
--------------------------------------------------------------------------------
/evaluation/toolbench/tooleval/evaluators/tooleval_gpt-3.5-turbo_fn/config.yaml:
--------------------------------------------------------------------------------
1 | evaluator_name: "tooleval_gpt-3.5-turbo_fn"
2 | registered_cls_name: "OpenAIEvaluator"
3 | prompt_template: "template.txt"
4 | fn_completions: "openai_completions"
5 | apis_json: "your/path/to/api_pool.json"
6 | completions_kwargs:
7 | model: "gpt-3.5-turbo-16k"
8 | max_tokens: 100
9 | temperature: 0
10 | timeout: 10
11 | function_call:
12 | name: "choose_preference"
13 | functions:
14 | - name: "choose_preference"
15 | description: "Choose the preferred answer for the query within all given answers."
16 | parameters:
17 | type: "object"
18 | properties:
19 | preference:
20 | type: "number"
21 | description: "The index of the preferred answer in all given answers."
22 | required: [ "preference" ]
23 | fn_completion_parser: "index_parser"
24 | batch_size: 1
25 |
--------------------------------------------------------------------------------
/evaluation/toolbench/tooleval/evaluators/tooleval_gpt-3.5-turbo_fn/template.txt:
--------------------------------------------------------------------------------
1 |
2 | system
3 | You are a helpful annotator that helps the user annotate data.
4 |
5 |
6 | user
7 | Given the task description and candidate answers, I want you to choose one preferred answer based on the rules. To do so, I will give you the task description that was given to the models, and the candidate answers in a list to choose from. To choose the preferred answer, you need to first analyse the answers based on the rules, then give the index number of the preferred answer of JSON to `choose_preference`.
8 |
9 | Here are the preference rules:
10 | 1. if both answers give a non-empty `final_answer`, check whether the given `final_answer` solves the given query.
11 |     1.1 if both answers solve the query, choose the one with the smaller `total_steps`.
12 |         1.1.1 if `total_steps` are the same, choose the answer with the better `final_answer` quality.
13 |     1.2 if one answer solves the query while the other does not, choose the answer that solves the query.
14 |     1.3 if both answers failed, check the `answer_details` and choose one considering the following preferences:
15 |         1.3.1 check `response` and prefer more successful tool calls.
16 |         1.3.2 check `name` and prefer more varied tool usage.
17 |         1.3.3 prefer smaller `total_steps`.
18 | 2. if one answer gives a non-empty `final_answer` while the other does not, choose the one that gives a `final_answer`.
19 | 3. if both fail to give a non-empty `final_answer`, follow 1.3 to choose the one with better `answer_details`.
20 |
21 | Here is the task description in JSON format:
22 | {task_description}
23 |
24 | Here are the candidate answers in JSON format:
25 | {answers}
26 |
27 | Now choose the preferred answer by analysing results and the rules given, return the index in range [0,1].
28 |
--------------------------------------------------------------------------------
/evaluation/toolbench/tooleval/evaluators/tooleval_gpt-3.5-turbo_normalized/config.yaml:
--------------------------------------------------------------------------------
1 | evaluator_name: "tooleval_gpt-3.5-turbo_normalized"
2 | registered_cls_name: "OpenAINormalizedEvaluator"
3 | prompt_template: "template.txt"
4 | fn_completions: "normalized_openai_completions"
5 | apis_json: "your/path/to/api_pool.json"
6 | completions_kwargs:
7 | model: "gpt-3.5-turbo-16k"
8 | max_tokens: 100
9 | temperature: 0
10 | timeout: 10
11 | functions:
12 | - name: "parse_answer_details"
13 |     description: "Parse the json answer with layered nodes and return the information about the answer"
14 | parameters:
15 | type: "object"
16 | properties:
17 | succeed_tool_calling:
18 | type: "number"
19 | description: "Give the number of times that the 'tool' nodes' message is called successfully without any errors in the response"
20 | used_tool_types:
21 | type: "number"
22 | description: "Give the number of different 'name' in 'tool' nodes' message"
23 | required: [ "succeed_tool_calling", "used_tool_types"]
24 | - name: "select_best_final_answer"
25 | description: "For given query, select the best answer in answers list and return the index of the best answer"
26 | parameters:
27 | type: "object"
28 | properties:
29 | best_answer_index:
30 | type: "number"
31 | description: "The index of the best answer in the answer list, start from 0"
32 | required: [ "best_answer_index"]
33 | - name: "check_solve_query"
34 | description: "Check whether the given answer solve the given query, return true or false"
35 | parameters:
36 | type: "object"
37 | properties:
38 | is_solved:
39 | type: "boolean"
40 | description: "true if solved and false if not"
41 | required: ["is_solved"]
42 | fn_completion_parser: "index_parser"
43 | batch_size: 1
44 |
--------------------------------------------------------------------------------
/evaluation/toolbench/tooleval/evaluators/tooleval_gpt-3.5-turbo_normalized/template.txt:
--------------------------------------------------------------------------------
1 |
2 | parse_answer_details
3 |
4 | Given answer details in the following JSON format:
5 | {answer_details}
6 |
7 | I want you to parse the answer details and give the information of JSON to `parse_answer_details`. Now parse the answer.
8 |
9 |
10 |
11 | select_best_final_answer
12 |
13 | For query {query}, you have the following answers in JSON format:
14 | {final_answers}
15 |
16 | I want you to select the best answer from the above answers and give the index of the answer of JSON to `select_best_final_answer`. Now select the best answer.
17 |
18 |
19 |
20 | check_solve_query
21 |
22 | Please check whether the answer solves the query or not.
23 | Query:
24 | {query}
25 |
26 | Answer:
27 | {final_answer}
28 |
29 | Now give your judgment of JSON to `check_solve_query`; remember not to be too strict.
30 |
31 |
--------------------------------------------------------------------------------
/evaluation/toolbench/tooleval/evaluators_comparison.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import json
3 | from concurrent.futures import ThreadPoolExecutor,as_completed
4 | from tqdm import tqdm
5 | from evaluators import load_registered_automatic_evaluator
6 | import os
7 | import numpy as np
8 | import copy
9 | from typing import List
10 | from scipy.stats import pearsonr,spearmanr
11 | import random
12 | random.seed(42)
13 |
14 | abs_dir = os.path.split(__file__)[0]
15 | annotated_data = json.load(open(os.path.join(abs_dir,'dataset/human_cross_annotated_data.json')))
16 | NUM_WORKERS=16
17 |
18 | def get_most_preferred(d:list)->np.ndarray:
19 | if np.iterable(d):
20 | d = np.asanyarray(d)
21 | bins = np.bincount(d)
22 | max_val = np.max(bins)
23 | argmax = np.where(max_val==bins)[0]
24 | return argmax
25 | else:
26 | return np.asarray([d])
27 |
28 | def agreement_score(x,ref:list)->float:
29 | majority_x = get_most_preferred(x)
30 | majority_ref = get_most_preferred(ref)
31 | score_unit = 1/len(majority_x)/len(majority_ref)
32 | score = 0.0
33 | for x in majority_x:
34 | if x in majority_ref:
35 | score += score_unit
36 | return score
37 | def get_correlation(x,y):
38 | x= np.asarray(x)
39 | y = np.asarray(y)
40 | x = x+1
41 | y = y+1
42 | if np.var(x)==0 or np.var(y)==0:
43 | return float(random.choice(get_most_preferred(x))==random.choice(get_most_preferred(y)))
44 | return pearsonr(x,y)[0]
45 |
46 | def test_on_annotated_data(evaluator_cfg)->List[List[int]]:
47 | evaluators = [load_registered_automatic_evaluator(evaluator_cfg) for _ in range(NUM_WORKERS)]
48 | def get_preference(idx):
49 | data = annotated_data[idx]
50 | def process_tools(tools:list):
51 | for tool in tools:
52 | tool.pop('description',None)
53 | tool.pop('parameters',None)
54 | return tools
55 |
56 | tools = process_tools(data['available_tools'])
57 | ret = evaluators[idx%NUM_WORKERS].annotate_preference(
58 | data['query'],
59 | tools,
60 | data['answers'],multisample=True)
61 | return idx,ret
62 | prefer_dict = {}
63 | with ThreadPoolExecutor(NUM_WORKERS) as pool:
64 | # future = [pool.submit(get_preference,idx) for idx in range(100)]
65 | future = [pool.submit(get_preference,idx) for idx in range(len(annotated_data))]
66 | for thd in tqdm(as_completed(future),total=len(future),ncols=100):
67 | if thd.exception() is not None:
68 | pool.shutdown(cancel_futures=True)
69 |                 raise thd.exception()
71 | idx,preference = thd.result()
72 | prefer_dict[idx] = preference
73 | prefer = [prefer_dict[idx] for idx in range(len(future))]
74 | return prefer
75 |
76 | def get_popped_and_rest(d:list,index:int):
77 | l = copy.deepcopy(d)
78 | popped = l.pop(index)
79 | return popped,l
80 |
81 | def calculate_human_performance():
82 | human_agreement = []
83 | variance = []
84 | for data in annotated_data:
85 | agreement_scores = [
86 | agreement_score(*get_popped_and_rest(data['preference'],idx))
87 | for idx in range(len(data['preference']))
88 | ]
89 | human_agreement.append(np.mean(agreement_scores))
90 | variance.append(np.var([1-agreement_scores[idx] for idx in range(len(agreement_scores))]))
91 |
92 |
93 | return {
94 | 'human_agreement':np.mean(human_agreement),
95 | 'bias':0,
96 | 'variance':np.mean(variance)
97 | }
98 |
99 |
100 |
101 | def calculate_evaluator_performance(evaluator_preference,human_preference):
102 | human_agreement = []
103 | bias = []
104 | variance = []
105 | assert len(evaluator_preference)==len(human_preference),'length of evaluator_preference and human_preference should be the same!'
106 | correlation = []
107 | for idx in range(len(evaluator_preference)):
108 | human_pref = human_preference[idx]
109 | evaluator_pref = evaluator_preference[idx]
110 |
111 | human_agreement.append([
112 | agreement_score(pref,human_pref) for pref in evaluator_pref
113 | ])
114 | bias.append(
115 | 1 - agreement_score(human_pref,evaluator_pref)
116 | )
117 | variance.append(
118 | np.var([1-score for score in human_agreement[-1]])
119 | )
120 | correlation.append(get_correlation(human_pref,evaluator_pref))
121 |
122 | return{
123 | 'correlation': np.mean(correlation),
124 | 'human_agreement':np.mean(np.mean(human_agreement,axis=1)),
125 | 'bias':np.mean(bias),
126 | 'variance':np.mean(variance)
127 | }
128 |
129 | if __name__=='__main__':
130 | evaluators = ['tooleval_gpt-3.5-turbo_normalized',]
131 |     human_preference = [
132 |         data['preference'] for data in annotated_data
133 |     ]
134 |
135 | evaluator_performance = [calculate_human_performance()]
136 | for evaluator in evaluators:
137 | if not os.path.exists(os.path.join(abs_dir,'dataset',f'performance_{evaluator}.npy')):
138 | evaluator_cfg = {
139 | 'evaluators_cfg_path':os.path.join(abs_dir,'evaluators'),
140 | 'evaluator':evaluator
141 | }
142 |             evaluator_preference = test_on_annotated_data(evaluator_cfg)
143 |             np.save(os.path.join(abs_dir,'dataset',f'performance_{evaluator}.npy'),evaluator_preference)
144 | 
145 |         evaluator_preference = np.load(os.path.join(abs_dir,'dataset',f'performance_{evaluator}.npy'),allow_pickle=True)
146 |         performance = calculate_evaluator_performance(evaluator_preference,human_preference)
147 | print(performance)
148 | evaluator_performance.append(performance)
149 |
150 | df = pd.DataFrame(evaluator_performance,index=['human']+evaluators)
151 | df.to_csv(os.path.join(abs_dir,'dataset','evaluator_performance.csv'))
152 | print(df)
--------------------------------------------------------------------------------
/evaluation/toolbench/tooleval/requirements.txt:
--------------------------------------------------------------------------------
1 | tqdm
2 | numpy
3 | pandas
4 | pydantic
5 | tenacity
6 | openai
7 | pyyaml
--------------------------------------------------------------------------------
/evaluation/toolbench/utils.py:
--------------------------------------------------------------------------------
1 | import json
2 | import re
3 | import torch
4 | import transformers
5 | import transformers.models.llama.modeling_llama
6 | from functools import partial
7 |
8 |
9 | def process_system_message(system_message, functions):
10 | assert "with a function call to actually excute your step." in system_message
11 | # we find that following ReACT format and merging the thought node and function call node is easier for model to learn to integrate the action input json string in its prediction than learn to predict a json string directly.
12 | system_message = system_message.replace("with a function call to actually excute your step.", "with a function call to actually excute your step. Your output should follow this format:\nThought:\nAction\nAction Input:\n")
13 | # add all the function dicts in the prompt.
14 | system_message = system_message + "\nSpecifically, you have access to the following APIs: " + str(functions)
15 | return system_message
16 |
17 | def get_gpu_memory(max_gpus=None):
18 | """Get available memory for each GPU."""
19 | gpu_memory = []
20 | num_gpus = (
21 | torch.cuda.device_count()
22 | if max_gpus is None
23 | else min(max_gpus, torch.cuda.device_count())
24 | )
25 |
26 | for gpu_id in range(num_gpus):
27 | with torch.cuda.device(gpu_id):
28 | device = torch.cuda.current_device()
29 | gpu_properties = torch.cuda.get_device_properties(device)
30 | total_memory = gpu_properties.total_memory / (1024**3)
31 | allocated_memory = torch.cuda.memory_allocated() / (1024**3)
32 | available_memory = total_memory - allocated_memory
33 | gpu_memory.append(available_memory)
34 | return gpu_memory
35 |
36 |
37 | def standardize_category(category):
38 | save_category = category.replace(" ", "_").replace(",", "_").replace("/", "_")
39 | while " " in save_category or "," in save_category:
40 | save_category = save_category.replace(" ", "_").replace(",", "_")
41 | save_category = save_category.replace("__", "_")
42 | return save_category
43 |
44 | def standardize(string):
45 | res = re.compile("[^\\u4e00-\\u9fa5^a-z^A-Z^0-9^_]")
46 | string = res.sub("_", string)
47 | string = re.sub(r"(_)\1+","_", string).lower()
48 | while True:
49 | if len(string) == 0:
50 | return string
51 | if string[0] == "_":
52 | string = string[1:]
53 | else:
54 | break
55 | while True:
56 | if len(string) == 0:
57 | return string
58 | if string[-1] == "_":
59 | string = string[:-1]
60 | else:
61 | break
62 | if string[0].isdigit():
63 | string = "get_" + string
64 | return string
65 |
66 | def change_name(name):
67 | change_list = ["from", "class", "return", "false", "true", "id", "and"]
68 | if name in change_list:
69 | name = "is_" + name
70 | return name
71 |
72 | # code adapted from https://huggingface.co/kaiokendev/superhot-13b-8k-no-rlhf-test/blob/main/llama_rope_scaled_monkey_patch.py
73 | class CondenseRotaryEmbedding(torch.nn.Module):
74 | def __init__(self, dim, ratio, max_position_embeddings=2048, base=10000, device=None):
75 | super().__init__()
76 | inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float().to(device) / dim))
77 | self.register_buffer("inv_freq", inv_freq)
78 |
79 | # Build here to make `torch.jit.trace` work.
80 | self.ratio = ratio
81 | max_position_embeddings *= ratio
82 | print(f"Condensing Positional embeddings from {max_position_embeddings} to {max_position_embeddings // ratio}")
83 | self.max_seq_len_cached = max_position_embeddings
84 | t = torch.arange(self.max_seq_len_cached, device=self.inv_freq.device, dtype=self.inv_freq.dtype) / ratio
85 | freqs = torch.einsum("i,j->ij", t, self.inv_freq)
86 | # Different from paper, but it uses a different permutation in order to obtain the same calculation
87 | emb = torch.cat((freqs, freqs), dim=-1)
88 | dtype = torch.get_default_dtype()
89 | self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
90 | self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)
91 |
92 | def forward(self, x, seq_len=None):
93 | # x: [bs, num_attention_heads, seq_len, head_size]
94 | # This `if` block is unlikely to be run after we build sin/cos in `__init__`. Keep the logic here just in case.
95 | if seq_len > self.max_seq_len_cached:
96 | self.max_seq_len_cached = seq_len
97 | t = torch.arange(self.max_seq_len_cached, device=x.device, dtype=self.inv_freq.dtype) / self.ratio
98 | freqs = torch.einsum("i,j->ij", t, self.inv_freq)
99 | # Different from paper, but it uses a different permutation in order to obtain the same calculation
100 | emb = torch.cat((freqs, freqs), dim=-1).to(x.device)
101 | self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(x.dtype), persistent=False)
102 | self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(x.dtype), persistent=False)
103 | return (
104 | self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
105 | self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
106 | )
107 |
108 | def replace_llama_with_condense(ratio):
109 | transformers.models.llama.modeling_llama.LlamaRotaryEmbedding = partial(CondenseRotaryEmbedding, ratio=ratio)
110 |
111 |
112 | def process_retrieval_ducoment(documents_df):
113 | ir_corpus = {}
114 | corpus2tool = {}
115 | for row in documents_df.itertuples():
116 | doc = json.loads(row.document_content)
117 | ir_corpus[row.docid] = (doc.get('category_name', '') or '') + ', ' + \
118 | (doc.get('tool_name', '') or '') + ', ' + \
119 | (doc.get('api_name', '') or '') + ', ' + \
120 | (doc.get('api_description', '') or '') + \
121 | ', required_params: ' + json.dumps(doc.get('required_parameters', '')) + \
122 | ', optional_params: ' + json.dumps(doc.get('optional_parameters', '')) + \
123 | ', return_schema: ' + json.dumps(doc.get('template_response', ''))
124 | corpus2tool[(doc.get('category_name', '') or '') + ', ' + \
125 | (doc.get('tool_name', '') or '') + ', ' + \
126 | (doc.get('api_name', '') or '') + ', ' + \
127 | (doc.get('api_description', '') or '') + \
128 | ', required_params: ' + json.dumps(doc.get('required_parameters', '')) + \
129 | ', optional_params: ' + json.dumps(doc.get('optional_parameters', '')) + \
130 | ', return_schema: ' + json.dumps(doc.get('template_response', ''))] = doc['category_name'] + '\t' + doc['tool_name'] + '\t' + doc['api_name']
131 | return ir_corpus, corpus2tool
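For reference, how the name-normalization helpers behave on made-up inputs (the strings are illustrative only):

    print(standardize("Get User's Info!"))  # -> get_user_s_info
    print(standardize("2021 Stats"))        # -> get_2021_stats (digit-leading names get a prefix)
    print(change_name("id"))                # -> is_id (avoids reserved-looking names)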
--------------------------------------------------------------------------------
/evaluation/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Reason-Wang/ToolGen/6839374a255810efe69deea4056eec5c55e25802/evaluation/utils/__init__.py
--------------------------------------------------------------------------------
/evaluation/utils/embedding.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 | from openai import OpenAI
4 | import tiktoken
5 | from tqdm import tqdm
6 |
7 |
8 | def truncate_text_tokens(text, max_tokens=4096):
9 | # Truncate texts to 4096 tokens
10 | encoding = tiktoken.get_encoding("cl100k_base")
11 | return encoding.encode(text)[:max_tokens]
12 |
13 |
14 | def get_openai_embeddings(texts, batch_size, model, api_key):
15 | client = OpenAI(api_key=api_key)
16 | texts = [text.replace("\n", " ") for text in texts]
17 | # Truncate texts to 4096 tokens
18 | truncated_text_tokens = [truncate_text_tokens(text) for text in texts]
19 |
20 | embeddings = []
21 | for i in tqdm(range(0, len(truncated_text_tokens), batch_size)):
22 | batch = truncated_text_tokens[i:i + batch_size]
23 | data = client.embeddings.create(input=batch, model=model).data
24 | embedding = [d.embedding for d in data]
25 | embeddings.extend(embedding)
26 |
27 | return np.array(embeddings)
28 | # return client.embeddings.create(input=texts, model=model).data[0].embedding
29 |
30 |
31 | def get_embeddings(model, device, texts, batch_size=16):
32 | model.eval()
33 | model.to(device)
34 | # tbar = tqdm(dataloader)
35 | embeddings = []
36 | with torch.no_grad():
37 | for i in range(0, len(texts), batch_size):
38 | batch = texts[i:i + batch_size]
39 | embeddings.append(model.encode(batch, device=device))
40 | return np.concatenate(embeddings)
41 |
42 |
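A usage sketch (not in the file); the model name is the standard OpenAI embedding model and the key is a placeholder:

    texts = ["weather api for current conditions", "stock price lookup"]
    vectors = get_openai_embeddings(texts, batch_size=16,
                                    model="text-embedding-ada-002", api_key="sk-...")
    print(vectors.shape)  # (2, embedding_dim)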
--------------------------------------------------------------------------------
/evaluation/utils/retrieval.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from rank_bm25 import BM25Okapi
3 | from nltk.tokenize import word_tokenize
4 | import faiss
5 |
6 | class BM25Indexer:
7 | def __init__(self, corpus, ids=None):
8 | self.corpus = corpus
9 | if ids is None:
10 |             self.ids = list(range(len(corpus)))
11 | else:
12 | self.ids = ids
13 | self.tokenized_corpus = [word_tokenize(document.lower()) for document in corpus]
14 | self.bm25 = BM25Okapi(self.tokenized_corpus)
15 |
16 | def search(self, queries, top_n=5):
17 | if isinstance(queries, str):
18 | queries = [queries]
19 | tokenized_queries = [word_tokenize(query.lower()) for query in queries]
20 | docs_scores = [self.bm25.get_scores(tokenized_query) for tokenized_query in tokenized_queries]
21 |
22 | docs_scores = [[(score, idx) for idx, score in enumerate(doc_scores)] for doc_scores in docs_scores]
23 | scores_ids = [sorted(doc_scores, reverse=True)[:top_n] for doc_scores in docs_scores]
24 |
25 | # For tests only
26 | # scores_ids = [(doc_scores)[:top_n] for doc_scores in docs_scores]
27 |
28 | new_scores_ids = []
29 | for score_ids in scores_ids:
30 | new_score_ids = []
31 | for score, idx in score_ids:
32 | new_score_ids.append((score, self.ids[idx]))
33 | new_scores_ids.append(new_score_ids)
34 |
35 | return new_scores_ids
36 |
37 |
38 |
39 |
40 | class Indexer:
41 | def __init__(self, embeddings, vector_size, ids=None, similarity="cosine"):
42 | self.index = faiss.IndexFlatIP(vector_size)
43 | self.similarity = similarity
44 | if similarity == "cosine":
45 | embeddings /= np.linalg.norm(embeddings, axis=1)[:, None]
46 | self.index.add(embeddings)
47 | if ids is None:
48 | self.ids = list(range(embeddings.shape[0]))
49 | else:
50 | self.ids = ids
51 |
52 | def search(self, queries: np.array, top_n: int):
53 | if len(queries.shape) == 1:
54 | queries = queries.reshape(1, -1)
55 | try:
56 | if self.similarity == "cosine":
57 | queries /= np.linalg.norm(queries, axis=1)[:, None]
58 | scores, indexes = self.index.search(queries, top_n)
59 | except AttributeError:
60 |             print(queries); raise  # re-raise so undefined scores/indexes are not used below
61 | scores_ids = []
62 | for top_n_score, top_n_idx in zip(scores, indexes):
63 | top_n_score_id = []
64 | for s, i in zip(top_n_score, top_n_idx):
65 | top_n_score_id.append((s, self.ids[i]))
66 | scores_ids.append(top_n_score_id)
67 |
68 | return scores_ids
69 |
70 |
71 | if __name__ == "__main__":
72 | texts = [
73 | "A man standing in front of a building",
74 | "Mooncake is a Chinese bakery product traditionally eaten during the Mid-Autumn Festival",
75 | "PCA is a statistical procedure that uses an orthogonal transformation to convert a set of observations of possibly correlated variables ...",
76 | "The quick brown fox jumps over the lazy dog",
77 | "Barack Obama was the 44th president of the United States",
78 | "The Great Wall of China is a series of fortifications made of stone, brick, tamped earth, wood, and other materials, generally built along an east-to-west line across the historical northern borders of China",
79 | "The domestic dog is a domesticated descendant of the wolf",
80 | "The original cat species to evolve into the domestic cat is the African wildcat",
81 | "Camels and llamas are common pack animals",
82 | ]
83 | query = ["Give me some facts about animals.", "What is the Great Wall of China?"]
84 | from sentence_transformers import SentenceTransformer
85 | model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
86 | embeddings = model.encode(texts)
87 | query_embedding = model.encode(query)
88 | indexer = Indexer(embeddings, embeddings.shape[1])
89 | scores_docids = indexer.search(query_embedding, top_n=3)
90 | print(scores_docids)
91 | top_ids = [[score_id[1] for score_id in score_ids] for score_ids in scores_docids]
92 | print(top_ids)
93 | best_docs = [texts[ids[0]] for ids in top_ids]
94 | print(best_docs)
95 |
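A companion sketch (not in the file) for `BM25Indexer`, reusing the toy `texts` list from the `__main__` block above; `word_tokenize` additionally needs the NLTK `punkt` data (`nltk.download('punkt')`):

    bm25 = BM25Indexer(texts, ids=list(range(len(texts))))
    print(bm25.search(["facts about animals"], top_n=3))  # [(score, doc_id), ...] per query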
--------------------------------------------------------------------------------
/evaluation/utils/utils.py:
--------------------------------------------------------------------------------
1 | import time
2 | from openai import AzureOpenAI, OpenAI
3 | from tqdm import tqdm
4 | from evaluation.toolbench.utils import change_name, standardize
5 | from transformers import LogitsProcessor
6 | from typing import List
7 | import torch
8 |
9 | def get_toolbench_name(tool_name, api_name):
10 | tool_name = standardize(tool_name)
11 | api_name = change_name(standardize(api_name))
12 | toolbench_name = api_name+f"_for_{tool_name}"
13 | toolbench_name = toolbench_name[-64:]
14 | return toolbench_name
15 |
16 | class DisjunctiveTrie:
17 | def __init__(self, nested_token_ids: List[List[int]], no_subsets=True):
18 | r"""
19 | A helper class that builds a trie with the words represented in `nested_token_ids`.
20 | """
21 | self.max_height = max([len(one) for one in nested_token_ids])
22 |
23 | root = {}
24 | for token_ids in nested_token_ids:
25 | level = root
26 | for tidx, token_id in enumerate(token_ids):
27 | if token_id not in level:
28 | level[token_id] = {}
29 |
30 | level = level[token_id]
31 |
32 | if no_subsets and self.has_subsets(root, nested_token_ids):
33 | raise ValueError(
34 | "Each list in `nested_token_ids` can't be a complete subset of another list, but is"
35 | f" {nested_token_ids}."
36 | )
37 |
38 | self.trie = root
39 |
40 | def next_tokens(self, current_seq):
41 | """
42 | The next possible tokens that will progress the trie, given the current sequence of tokens in `current_seq`.
43 | """
44 | start = self.trie
45 |
46 | for current_token in current_seq:
47 | start = start[current_token]
48 |
49 | next_tokens = list(start.keys())
50 |
51 | return next_tokens
52 |
53 | def reached_leaf(self, current_seq):
54 | next_tokens = self.next_tokens(current_seq)
55 |
56 | return len(next_tokens) == 0
57 |
58 | def count_leaves(self, root):
59 | next_nodes = list(root.values())
60 | if len(next_nodes) == 0:
61 | return 1
62 | else:
63 | return sum([self.count_leaves(nn) for nn in next_nodes])
64 |
65 | def has_subsets(self, trie, nested_token_ids):
66 | """
67 | Returns whether # of leaves == # of words. Otherwise some word is a subset of another.
68 | """
69 | leaf_count = self.count_leaves(trie)
70 | return len(nested_token_ids) != leaf_count
71 |
72 |
73 | class AllowTokenIdsProcessor(LogitsProcessor):
74 | def __init__(self, allowed_token_ids: List[int]):
75 | self.allowed_token_ids = allowed_token_ids
76 |
77 | def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor):
78 | mask = torch.ones_like(scores, dtype=torch.bool)
79 | mask[:, self.allowed_token_ids] = False
80 | scores = scores.masked_fill(mask, -1e10)
81 |
82 | return scores
83 |
84 |
85 | class AllowKeyWordsProcessor(LogitsProcessor):
86 | ''' renxi.wang@mbzuai.ac.ae
87 | A logits processor that limit output text to be in a set of predefined keywords.
88 | tokenizer: tokenizer used to encode the keywords
89 | trie: DisjunctiveTrie of predefined keywords
90 | input_ids: input_ids of the prompt that the model is generating from
91 | return:
92 | scores: scores of the logits, where impossible tokens are masked
93 | For beam search, scores are log-softmax of logits, others are logits
94 | '''
95 | def __init__(self, tokenizer, trie, input_ids):
96 | self.tokenizer = tokenizer
97 | self.trie = trie
98 | self.input_ids = input_ids
99 |
100 | def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor):
101 | input_length = self.input_ids.shape[1]
102 | generated_ids = input_ids[:, input_length:].tolist()
103 | new_token_ids = []
104 | for ids in generated_ids:
105 | try:
106 | next_token_ids = self.trie.next_tokens(ids)
107 | except KeyError as e:
108 | next_token_ids = [self.tokenizer.eos_token_id]
109 | if not next_token_ids:
110 | next_token_ids = [self.tokenizer.eos_token_id]
111 | new_token_ids.append(next_token_ids)
112 |
113 | for row, token_ids in enumerate(new_token_ids):
114 | mask = torch.ones_like(scores[row], dtype=torch.bool)
115 | mask[torch.tensor(token_ids)] = False
116 | scores[row, mask] = -1e10
117 |
118 | return scores
119 |
120 |
121 | def openai_client_request(client, model, messages, num_retries: int = 5, return_dict: bool = True, **kwargs):
122 | print(f"Arguments: {kwargs}")
123 | response = {}
124 | # retry request (handles connection errors, timeouts, and overloaded API)
125 | for i in range(num_retries):
126 | try:
127 | response = client.chat.completions.create(
128 | model=model,
129 | messages=messages,
130 | **kwargs
131 | )
132 | # response['success'] = True
133 | break
134 | except Exception as e:
135 | # response['success'] = False
136 | tqdm.write(str(e))
137 | tqdm.write("Retrying...")
138 | time.sleep(10)
139 | if return_dict:
140 | return response
141 | else:
142 | return response.choices[0].message.content
143 |
144 |
145 | class OpenAIChatModel:
146 | def __init__(self, model: str, api_key, api_base=None, api_version=None, azure_endpoint=None, temperature: float=None, stop: List[str]=None):
147 | self.model = model
148 | if api_base:
149 |             self.client = OpenAI(api_key=api_key, base_url=api_base)
150 | else:
151 | self.client = OpenAI(api_key=api_key)
152 | self.temperature = temperature
153 | self.stop = stop
154 |
155 | def generate(self, messages: List, temperature: float = None, stop: List[str] = None, print_prompt=False):
156 | if print_prompt:
157 | print(messages)
158 |
159 | kwargs = {}
160 | if self.temperature:
161 | kwargs['temperature'] = self.temperature
162 | elif temperature:
163 | kwargs['temperature'] = temperature
164 | if self.stop:
165 | kwargs['stop'] = self.stop
166 |
167 |
169 | response = openai_client_request(
170 | client=self.client,
171 | model=self.model,
172 | messages=messages,
173 | return_dict=False,
174 | **kwargs
175 | )
176 |
177 | return response
178 |
179 |
180 | def seed_everything(seed: int):
181 | import random, os
182 | import numpy as np
183 | import torch
184 |
185 | random.seed(seed)
186 | os.environ['PYTHONHASHSEED'] = str(seed)
187 | np.random.seed(seed)
188 | torch.manual_seed(seed)
189 | torch.cuda.manual_seed(seed)
190 | torch.backends.cudnn.deterministic = True
191 |     torch.backends.cudnn.benchmark = False  # keep cuDNN deterministic; benchmark autotuning would break reproducibility
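To make the constrained-decoding helpers concrete, here is a sketch (not in the file) that restricts generation to a fixed set of tool-name strings; the model, tokenizer, and keyword strings are placeholders:

    from transformers import AutoModelForCausalLM, AutoTokenizer, LogitsProcessorList

    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    model = AutoModelForCausalLM.from_pretrained("gpt2")

    # Each keyword becomes a token-id path in the trie, terminated by EOS.
    keywords = ["get_weather_for_weather_api", "search_flights_for_travel_api"]
    keyword_ids = [tokenizer.encode(k, add_special_tokens=False) + [tokenizer.eos_token_id]
                   for k in keywords]
    trie = DisjunctiveTrie(keyword_ids)

    inputs = tokenizer("The tool to call is: ", return_tensors="pt")
    processors = LogitsProcessorList([AllowKeyWordsProcessor(tokenizer, trie, inputs["input_ids"])])
    output = model.generate(**inputs, logits_processor=processors,
                            max_new_tokens=16, do_sample=False)
    print(tokenizer.decode(output[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True))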
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | backoff==2.2.1
2 | click==8.1.7
3 | faiss_cpu
4 | Flask==3.0.3
5 | flask_cors==5.0.0
6 | fschat==0.2.36
7 | httpx==0.27.2
8 | huggingface_hub==0.24.6
9 | nltk==3.9.1
10 | numpy
11 | openai
12 | pandas==2.2.3
13 | peft
14 | psutil
15 | pydantic==2.9.2
16 | PyYAML
17 | rank_bm25==0.2.2
18 | Requests==2.32.3
19 | scikit_learn==1.5.2
20 | scipy==1.14.1
21 | sentence_transformers==3.1.0
22 | tenacity==8.5.0
23 | termcolor==2.5.0
24 | tiktoken==0.7.0
25 | torch==2.4.1
26 | tqdm
27 | transformers
28 | Unidecode
29 |
--------------------------------------------------------------------------------
/scripts/convert_answer/run_convert_answer.sh:
--------------------------------------------------------------------------------
1 | export RAW_ANSWER_PATH=data/answer
2 | export CONVERTED_ANSWER_PATH=data/model_predictions_converted
3 | export MODEL_NAME=test
4 | export test_set=G2_instruction
5 | method="CoT@1"
6 |
7 | mkdir -p ${CONVERTED_ANSWER_PATH}/${MODEL_NAME}
8 | answer_dir=${RAW_ANSWER_PATH}/${MODEL_NAME}/${test_set}
9 | output_file=${CONVERTED_ANSWER_PATH}/${MODEL_NAME}/${test_set}.json
10 | echo ${output_file}
11 | python -m evaluation.toolbench.tooleval.convert_to_answer_format\
12 | --answer_dir ${answer_dir} \
13 | --method ${method} \
14 | --output ${output_file}
--------------------------------------------------------------------------------
/scripts/eval_full_pipeline.sh:
--------------------------------------------------------------------------------
1 | export TOOLBENCH_KEY="Put your ToolBench key here"
2 | export OPENAI_KEY="Put your OpenAI key here"
3 | export PYTHONPATH=./
4 | export GPT_MODEL="gpt-3.5-turbo-16k"
5 | export SERVICE_URL="http://localhost:8080/virtual"
6 |
7 | # MODEL_NAME=virtual-gpt35-16k-step16-cot
8 | # model_path="virtual-gpt35-16k-step16-cot"
9 | # backbone_model="chatgpt_function"
10 | # function_provider="truth"
11 |
12 | # MODEL_NAME="ToolLlama-v2-t0.0-cot"
13 | # model_path="ToolBench/ToolLLaMA-2-7b-v2"
14 | # indexing="None"
15 | # function_provider="truth"
16 | # backbone_model="toolllama"
17 |
18 | # MODEL_NAME="ToolLlama-Llama-3-8B-cot"
19 | # model_path="reasonwang/ToolLlama-Llama-3-8B"
20 | # function_provider="truth"
21 | # backbone_model="toolchat"
22 |
23 |
24 | # MODEL_NAME="ToolGen-Semantic-Llama-3-8B-cot"
25 | # model_path="reasonwang/ToolGen-Semantic-Llama-3-8B"
26 | # indexing="Semantic"
27 |
28 |
29 | # model_path="reasonwang/ToolGen-Llama-3-8B-Instruct"
30 | # indexing="Atomic"
31 | # template="llama-3"
32 | # MODEL_NAME="ToolGen-Llama-3-8B-Instruct"
33 |
34 |
35 | # model_code="Qwen2.5-14B"
36 | # model_path="reasonwang/ToolGen-${model_code}"
37 | # indexing="Atomic"
38 | # template="qwen-7b-chat"
39 | # MODEL_NAME="ToolGen-${model_code}-WoRetry"
40 | # function_provider="all"
41 |
42 | if [ "$indexing" == "Atomic" ]; then
43 | backbone_model="toolgen_atomic"
44 | else
45 | list=("Semantic" "Numeric" "Hierarchical")
46 | for item in "${list[@]}"; do
47 | if [ "$item" = "$indexing" ]; then
48 | backbone_model="toolgen"
49 | break
50 | fi
51 | done
52 | fi
53 |
54 |
55 | export CUDA_VISIBLE_DEVICES=4
56 | OUTPUT_DIR="data/answer/${MODEL_NAME}"
57 | stage="G2"
58 | group="instruction"
59 | method="CoT@1"
60 |
61 | mkdir -p $OUTPUT_DIR; mkdir -p $OUTPUT_DIR/${stage}_${group}
62 | cmd_inference="python evaluation/toolbench/inference/qa_pipeline_multithread.py \
63 | --model_path ${model_path} \
64 | --template ${template} \
65 | --indexing ${indexing} \
66 | --chatgpt_model ${GPT_MODEL} \
67 | --tool_root_dir data/toolenv/tools \
68 | --backbone_model ${backbone_model} \
69 | --openai_key ${OPENAI_KEY} \
70 | --max_observation_length 1024 \
71 | --method ${method} \
72 | --input_query_file data/solvable_queries/test_instruction/${stage}_${group}.json \
73 | --output_answer_file $OUTPUT_DIR/${stage}_${group} \
74 | --toolbench_key ${TOOLBENCH_KEY} \
75 | --num_thread 1 \
76 | --function_provider ${function_provider}"
77 |
78 | echo $cmd_inference
79 | eval $cmd_inference
80 |
81 |
82 | RAW_ANSWER_PATH="data/answer"
83 | CONVERTED_ANSWER_PATH="data/model_predictions_converted"
84 |
85 | mkdir -p ${CONVERTED_ANSWER_PATH}/${MODEL_NAME}
86 | answer_dir="${RAW_ANSWER_PATH}/${MODEL_NAME}/${stage}_${group}"
87 | output_file="${CONVERTED_ANSWER_PATH}/${MODEL_NAME}/${stage}_${group}.json"
88 | echo ${output_file}
89 | cmd_convert="python -m evaluation.toolbench.tooleval.convert_to_answer_format\
90 | --answer_dir ${answer_dir} \
91 | --method ${method} \
92 | --output ${output_file}"
93 |
94 | echo $cmd_convert
95 | eval $cmd_convert
96 |
97 | export API_POOL_FILE=openai_keys.json
98 | SAVE_PATH="data/results/pass_rate"
99 | mkdir -p ${SAVE_PATH}
100 | export EVAL_MODEL=gpt-4-turbo-2024-04-09
101 | mkdir -p ${SAVE_PATH}/${MODEL_NAME}
102 |
103 | cmd_pass="python -m evaluation.toolbench.tooleval.eval_pass_rate \
104 | --converted_answer_path ${CONVERTED_ANSWER_PATH} \
105 | --save_path ${SAVE_PATH}/${MODEL_NAME} \
106 | --reference_model ${MODEL_NAME} \
107 | --test_ids data/solvable_queries/test_query_ids \
108 | --max_eval_threads 3 \
109 | --evaluate_times 3 \
110 | --test_set ${stage}_${group}"
111 |
112 | echo $cmd_pass
113 | eval $cmd_pass
114 |
115 | export API_POOL_FILE=openai_keys.json
116 | SAVE_PATH="data/results/preference_rate"
117 | PASS_RATE_PATH="data/results/pass_rate"
118 | REFERENCE_MODEL=virtual-gpt35-16k-step16-cot
119 | export EVAL_MODEL=gpt-4o-2024-05-13
120 | mkdir -p ${SAVE_PATH}
121 |
122 | cmd_preference="python -m evaluation.toolbench.tooleval.eval_preference \
123 | --converted_answer_path ${CONVERTED_ANSWER_PATH} \
124 | --reference_model ${REFERENCE_MODEL} \
125 | --output_model ${MODEL_NAME} \
126 | --test_ids data/solvable_queries/test_query_ids/ \
127 | --save_path ${SAVE_PATH}/${MODEL_NAME} \
128 | --pass_rate_result_path ${PASS_RATE_PATH} \
129 | --max_eval_threads 3 \
130 | --use_pass_rate true \
131 | --evaluate_times 3 \
132 | --test_set ${stage}_${group}"
133 |
134 | echo $cmd_preference
135 | eval $cmd_preference
--------------------------------------------------------------------------------
/scripts/eval_opendomain_full_pipeline.sh:
--------------------------------------------------------------------------------
1 | TOOLBENCH_KEY=""
2 | OPENAI_KEY=""
3 | export PYTHONPATH=./
4 | export GPT_MODEL="gpt-3.5-turbo-16k"
5 | export SERVICE_URL="http://localhost:8080/virtual"
6 |
7 | # MODEL_NAME="virtual-gpt35-16k-step16-cot-opendomain"
8 | # model_path="virtual-gpt35-16k-step16-cot"
9 | # backbone_model="chatgpt_function"
10 | # function_provider="retriever"
11 |
12 | MODEL_NAME="ToolLlama-Llama-3-8B-t0.0-cot-toolretriever"
13 | model_path="reasonwang/ToolLlama-Llama-3-8B"
14 | function_provider="retriever"
15 | backbone_model="toolchat"
16 |
17 |
18 |
19 | # MODEL_NAME="ToolLlama-Llama-3-8B-t0.0-cot-toolretriever"
20 | # model_path="reasonwang/ToolLlama-Llama-3-8B"
21 | # function_provider="retriever"
22 | # backbone_model="toolchat"
23 |
24 |
25 | # MODEL_NAME="ToolLlama-v2-t0.0-cot-opendomain-toolretriever-retry-finish"
26 | # model_path="ToolBench/ToolLLaMA-2-7b-v2"
27 | # indexing="None"
28 | # replace_file="None"
29 | # function_provider="retriever"
30 | # backbone_model="toolllama"
31 |
32 |
33 |
34 | OUTPUT_DIR="data/answer/${MODEL_NAME}"
35 | export CUDA_VISIBLE_DEVICES=0
36 | stage="G2"
37 | group="instruction"
38 | method="CoT@1"
39 |
40 |
41 | # Open domain setting
42 | # corpus_tsv_path="data/retrieval/${stage}/corpus.tsv"
43 | corpus_tsv_path="data/retrieval/corpus_G123.tsv"
44 | # retrieval_model_path="reasonwang/BERT-${stage}"
45 | retrieval_model_path="ToolBench/ToolBench_IR_bert_based_uncased"
46 | mkdir -p $OUTPUT_DIR; mkdir -p $OUTPUT_DIR/${stage}_${group}
47 |
48 | cmd="python evaluation/toolbench/inference/qa_pipeline_multithread.py \
49 | --replace_file ${replace_file} \
50 | --model_path ${model_path} \
51 | --chatgpt_model ${GPT_MODEL} \
52 | --tool_root_dir data/toolenv/tools \
53 | --corpus_tsv_path ${corpus_tsv_path} \
54 | --retrieval_model_path ${retrieval_model_path} \
55 | --backbone_model ${backbone_model} \
56 | --openai_key ${OPENAI_KEY} \
57 | --max_observation_length 1024 \
58 | --method ${method} \
59 | --input_query_file data/solvable_queries/test_instruction/${stage}_${group}.json \
60 | --output_answer_file $OUTPUT_DIR/${stage}_${group} \
61 | --toolbench_key $TOOLBENCH_KEY \
62 | --num_thread 1 \
63 | --function_provider ${function_provider}"
64 | echo $cmd
65 | eval $cmd
66 |
67 |
68 | RAW_ANSWER_PATH="data/answer"
69 | CONVERTED_ANSWER_PATH="data/model_predictions_converted"
70 |
71 | mkdir -p ${CONVERTED_ANSWER_PATH}/${MODEL_NAME}
72 | answer_dir="${RAW_ANSWER_PATH}/${MODEL_NAME}/${stage}_${group}"
73 | output_file="${CONVERTED_ANSWER_PATH}/${MODEL_NAME}/${stage}_${group}.json"
74 | echo ${output_file}
75 | cmd="python -m evaluation.toolbench.tooleval.convert_to_answer_format \
76 | --answer_dir ${answer_dir} \
77 | --method ${method} \
78 | --output ${output_file}"
79 | echo $cmd
80 | eval $cmd
81 |
82 |
83 | export API_POOL_FILE=openai_key_mbz.json
84 | SAVE_PATH="data/results/pass_rate"
85 | mkdir -p ${SAVE_PATH}
86 | export EVAL_MODEL=gpt-4-turbo-2024-04-09
87 | mkdir -p ${SAVE_PATH}/${MODEL_NAME}
88 |
89 | cmd="python -m toolbench.tooleval.eval_pass_rate \
90 | --converted_answer_path ${CONVERTED_ANSWER_PATH} \
91 | --save_path ${SAVE_PATH}/${MODEL_NAME} \
92 | --reference_model ${MODEL_NAME} \
93 | --test_ids data/solvable_queries/test_query_ids \
94 | --max_eval_threads 3 \
95 | --evaluate_times 3 \
96 | --test_set ${stage}_${group} --overwrite"
97 | echo $cmd
98 | eval $cmd
99 |
100 | export API_POOL_FILE=openai_key.json
101 | SAVE_PATH="data/results/preference_rate"
102 | PASS_RATE_PATH="data/results/pass_rate"
103 | REFERENCE_MODEL=virtual-gpt35-16k-step16-cot
104 | export EVAL_MODEL=gpt-4o
105 | mkdir -p ${SAVE_PATH}
106 |
107 | cmd="python -m toolbench.tooleval.eval_preference \
108 | --converted_answer_path ${CONVERTED_ANSWER_PATH} \
109 | --reference_model ${REFERENCE_MODEL} \
110 | --output_model ${MODEL_NAME} \
111 | --test_ids data/solvable_queries/test_query_ids/ \
112 | --save_path ${SAVE_PATH}/${MODEL_NAME} \
113 | --pass_rate_result_path ${PASS_RATE_PATH} \
114 | --max_eval_threads 3 \
115 | --use_pass_rate true \
116 | --evaluate_times 3 \
117 | --test_set ${stage}_${group} --overwrite"
118 | echo $cmd
119 | eval $cmd
--------------------------------------------------------------------------------
/scripts/inference/inference_gpt_pipeline_virtual.sh:
--------------------------------------------------------------------------------
1 | export TOOLBENCH_KEY="Set your toolbench key here"
2 | export OPENAI_KEY="Set your openai api key here"
3 | export PYTHONPATH=./
4 | export SERVICE_URL="http://localhost:8080/virtual"
5 |
6 |
7 | export GPT_MODEL="gpt-3.5-turbo-16k"
8 | export OUTPUT_DIR="data/answer/test"
9 | group="G2_instruction"
10 |
11 | mkdir -p $OUTPUT_DIR; mkdir -p $OUTPUT_DIR/$group
12 | python evaluation/toolbench/inference/qa_pipeline_multithread.py \
13 | --tool_root_dir data/toolenv/tools \
14 | --backbone_model chatgpt_function \
15 | --chatgpt_model $GPT_MODEL \
16 | --openai_key $OPENAI_KEY \
17 | --max_observation_length 1024 \
18 | --method CoT@1 \
19 | --input_query_file data/solvable_queries/test_instruction/${group}.json \
20 | --output_answer_file $OUTPUT_DIR/$group \
21 | --toolbench_key $TOOLBENCH_KEY \
22 | --num_thread 1 --function_provider "truth"
--------------------------------------------------------------------------------
/scripts/inference/inference_opendomain_toolllama_pipeline_virtual.sh:
--------------------------------------------------------------------------------
1 | export TOOLBENCH_KEY="Set your toolbench key here"
2 | export OPENAI_KEY="Set you openai api key here"
3 | export PYTHONPATH=./
4 | chatgpt_model="gpt-4o"
5 | export SERVICE_URL="http://localhost:8080/virtual"
6 | export OUTPUT_DIR="data/answer/test"
7 | export CUDA_VISIBLE_DEVICES=0
8 |
9 | model_path="reasonwang/ToolLlama-Llama-3-8B"
10 | stage="G2"
11 | group="instruction"
12 |
13 |
14 | # Open domain setting
15 | corpus_tsv_path="data/retrieval/${stage}/corpus.tsv"
16 | # retrieval_model_path="../models/ToolLlama/retriever/bert_${stage}"
17 | retrieval_model_path="ToolBench/ToolBench_IR_bert_based_uncased"
18 |
19 |
20 | mkdir -p $OUTPUT_DIR; mkdir -p $OUTPUT_DIR/$group
21 |
22 | # Open Domain
23 | cmd="python evaluation/toolbench/inference/qa_pipeline_multithread.py \
24 | --model_path ${model_path} \
25 | --tool_root_dir data/toolenv/tools \
26 | --chatgpt_model ${chatgpt_model} \
27 | --corpus_tsv_path ${corpus_tsv_path} \
28 | --retrieval_model_path ${retrieval_model_path} \
29 | --backbone_model toolchat \
30 | --openai_key $OPENAI_KEY \
31 | --max_observation_length 1024 \
32 | --method CoT@1 \
33 | --input_query_file data/solvable_queries/test_instruction/${stage}_${group}.json \
34 | --output_answer_file $OUTPUT_DIR/${stage}_${group} \
35 | --toolbench_key $TOOLBENCH_KEY \
36 | --num_thread 1 \
37 | --function_provider retriever"
38 |
39 | echo $cmd
40 | eval $cmd
--------------------------------------------------------------------------------
/scripts/inference/inference_toolgen_pipeline_virtual.sh:
--------------------------------------------------------------------------------
1 | export TOOLBENCH_KEY="Set your ToolBench key here"
2 | export OPENAI_KEY="Set your OpenAI key here"
3 | export PYTHONPATH=./
4 | export SERVICE_URL="http://localhost:8080/virtual"
5 | export CUDA_VISIBLE_DEVICES=0
6 |
7 | model_path="reasonwang/ToolGen-Llama-3-8B"
8 | indexing="Atomic"
9 | template="llama-3"
10 |
11 | # model_name="Qwen2.5-3B"
12 | # model_path="reasonwang/ToolGen-${model_name}"
13 | # indexing="Atomic"
14 | # template="qwen-7b-chat"
15 |
16 | export OUTPUT_DIR="data/answer/${model_name}/"
17 | stage="G3"
18 | group="instruction"
19 |
20 | if [ "$indexing" == "Atomic" ]; then
21 | backbone_model="toolgen_atomic"
22 | else
23 | backbone_model="toolgen"
24 | fi
25 |
26 | mkdir -p $OUTPUT_DIR; mkdir -p $OUTPUT_DIR/${stage}_${group}
27 | python evaluation/toolbench/inference/qa_pipeline_multithread.py \
28 | --chatgpt_model gpt-4o \
29 | --model_path ${model_path} \
30 | --template ${template} \
31 | --indexing ${indexing} \
32 | --tool_root_dir data/toolenv/tools \
33 | --backbone_model ${backbone_model} \
34 | --openai_key $OPENAI_KEY \
35 | --max_observation_length 1024 \
36 | --method CoT@1 \
37 | --input_query_file data/solvable_queries/test_instruction/${stage}_${group}.json \
38 | --output_answer_file $OUTPUT_DIR/${stage}_${group} \
39 | --toolbench_key $TOOLBENCH_KEY \
40 | --num_thread 1 \
41 | --function_provider all
--------------------------------------------------------------------------------
/scripts/inference/inference_toolllama_pipeline_virtual.sh:
--------------------------------------------------------------------------------
1 | export TOOLBENCH_KEY="Set your toolbench key here"
2 | export OPENAI_KEY="Set your openai api key here"
3 | export PYTHONPATH=./
4 | export SERVICE_URL="http://localhost:8080/virtual"
5 | chatgpt_model="gpt-4o"
6 | export OUTPUT_DIR="data/answer/test"
7 |
8 | model_path="reasonwang/ToolLlama-Llama-3-8B"
9 | stage="G2"
10 | group="instruction"
11 |
12 |
13 | mkdir -p $OUTPUT_DIR; mkdir -p $OUTPUT_DIR/$group
14 | cmd="python evaluation/toolbench/inference/qa_pipeline_multithread.py \
15 | --model_path ${model_path} \
16 | --tool_root_dir data/toolenv/tools \
17 | --chatgpt_model ${chatgpt_model} \
18 | --backbone_model toolchat \
19 | --openai_key $OPENAI_KEY \
20 | --max_observation_length 1024 \
21 | --method CoT@1 \
22 | --input_query_file data/solvable_queries/test_instruction/${stage}_${group}.json \
23 | --output_answer_file $OUTPUT_DIR/${stage}_${group} \
24 | --toolbench_key $TOOLBENCH_KEY \
25 | --num_thread 1 --function_provider truth"
26 |
27 | echo $cmd
28 | eval $cmd
--------------------------------------------------------------------------------
/scripts/pass_rate/run_pass_rate.sh:
--------------------------------------------------------------------------------
1 | export OPENAI_KEY="Set your openai api key here"
2 | export API_POOL_FILE=openai_keys.json
3 | # export OPENAI_API_BASE="https://api.openai.com/v1"
4 | export CONVERTED_ANSWER_PATH=data/model_predictions_converted
5 | export SAVE_PATH=data/results/pass_rate
6 | mkdir -p ${SAVE_PATH}
7 | export CANDIDATE_MODEL=test
8 | TEST_SET="G2_instruction"
9 | export EVAL_MODEL=gpt-4-turbo-2024-04-09
10 | mkdir -p ${SAVE_PATH}/${CANDIDATE_MODEL}
11 |
12 |
13 | python -m evaluation.toolbench.tooleval.eval_pass_rate \
14 | --converted_answer_path ${CONVERTED_ANSWER_PATH} \
15 | --save_path ${SAVE_PATH}/${CANDIDATE_MODEL} \
16 | --reference_model ${CANDIDATE_MODEL} \
17 | --test_ids data/solvable_queries/test_query_ids \
18 | --max_eval_threads 3 \
19 | --evaluate_times 3 \
20 | --test_set ${TEST_SET}
--------------------------------------------------------------------------------
/scripts/preference/run_preference.sh:
--------------------------------------------------------------------------------
1 | export OPENAI_KEY="Set your openai api key here"
2 | export API_POOL_FILE=openai_keys.json
3 | export CONVERTED_ANSWER_PATH=data/model_predictions_converted
4 | export SAVE_PATH=data/results/preference_results
5 | export PASS_RATE_PATH=data/results/pass_rate
6 | export REFERENCE_MODEL=virtual-gpt35-16k-step16-cot
7 | export CANDIDATE_MODEL=test
8 | export EVAL_MODEL=gpt-4o-2024-05-13
9 | test_set="G2_instruction"
10 | mkdir -p ${SAVE_PATH}
11 |
12 |
13 | cmd="python -m evaluation.toolbench.tooleval.eval_preference \
14 | --converted_answer_path ${CONVERTED_ANSWER_PATH} \
15 | --reference_model ${REFERENCE_MODEL} \
16 | --output_model ${CANDIDATE_MODEL} \
17 | --test_ids data/solvable_queries/test_query_ids/ \
18 | --save_path ${SAVE_PATH} \
19 | --pass_rate_result_path ${PASS_RATE_PATH} \
20 | --max_eval_threads 3 \
21 | --use_pass_rate true \
22 | --evaluate_times 3 \
23 | --test_set ${test_set}"
24 | echo $cmd
25 | eval $cmd
--------------------------------------------------------------------------------
/scripts/retrieval/eval_bm25.sh:
--------------------------------------------------------------------------------
1 | model="bm25"
2 | stage="G1"
3 | split="instruction" # instruction, tool, category
4 | result_path="bm25"
5 | corpus="G123" # G123, G1, G2, G3. G123 is the multi-domain setting
6 |
7 | cmd="python -m evaluation.retrieval.eval_bm25 \
8 | --model_name_or_path ${model} \
9 | --stage ${stage} \
10 | --split ${split} \
11 | --result_path \"data/results/retrieval/${result_path}_${stage}_${split}.json\" \
12 | --corpus ${corpus}"
13 |
14 | echo $cmd
15 | eval $cmd
--------------------------------------------------------------------------------
/scripts/retrieval/eval_encoder.sh:
--------------------------------------------------------------------------------
1 | # model="reasonwang/BERT-G3"
2 | model="ToolBench/ToolBench_IR_bert_based_uncased"
3 | stage="G1" # G1, G2, G3
4 | split="instruction"
5 | corpus="G123" # G123, G1, G2, G3. G123 is the multi-domain setting
6 | result_path="BERT-G1-full-tools"
7 | cmd="python -m evaluation.retrieval.eval_encoder \
8 | --model_name_or_path ${model} \
9 | --stage ${stage} \
10 | --split ${split} \
11 | --result_path \"data/results/retrieval/${result_path}_${stage}_${split}.json\" \
12 | --corpus ${corpus}"
13 |
14 | echo $cmd
15 | eval $cmd
--------------------------------------------------------------------------------
/scripts/retrieval/eval_longcontext.sh:
--------------------------------------------------------------------------------
1 | export OPENAI_API_KEY=""
2 | model="gpt-4o"
3 | stage="G3" # G1, G2, G3
4 | split="test"
5 | corpus="G3" # G123, G1, G2, G3. G123 is the multi-domain setting
6 | result_path="GPT-4o"
7 | cmd="python -m evaluation.retrieval.eval_longcontext \
8 | --model_name_or_path ${model} \
9 | --stage ${stage} \
10 | --split ${split} \
11 | --result_path \"data/results/retrieval/${result_path}_${stage}_${split}.json\" \
12 | --corpus ${corpus}"
13 |
14 | echo $cmd
15 | eval $cmd
--------------------------------------------------------------------------------
/scripts/retrieval/eval_openai_embedding.sh:
--------------------------------------------------------------------------------
1 | model_name_or_path="text_embedding_large"
2 | stage="G1" # G1, G2, G3
3 | split="instruction"
4 | result_path="openai"
5 | api_key="Set your openai api key here"
6 | corpus="G123" # G123, G1, G2, G3. G123 is the multi-domain setting
7 |
8 | cmd="python -m evaluation.retrieval.eval_openai_embedding \
9 | --model_name_or_path ${model_name_or_path} \
10 | --api_key ${api_key} \
11 | --stage ${stage} \
12 | --split ${split} \
13 | --result_path \"data/results/retrieval/${result_path}_${stage}_${split}.json\" \
14 | --corpus ${corpus}"
15 |
16 | echo $cmd
17 | eval $cmd
--------------------------------------------------------------------------------
/scripts/retrieval/eval_toolgen.sh:
--------------------------------------------------------------------------------
1 | model_name_or_path="reasonwang/ToolGen-Llama-3-8B-Tool-Retriever"
2 | indexing="Atomic"
3 | constrain="True"
4 | limit_to_stage_space="False"
5 | template="llama-3"
6 |
7 | stage="G1" # G1, G2, G3
8 | split="instruction" # instruction, tool, category
9 |
10 | cmd="python -m evaluation.retrieval.eval_toolgen \
11 | --model_name_or_path ${model_name_or_path} \
12 | --indexing ${indexing} \
13 | --stage ${stage} \
14 | --split ${split} \
15 | --result_path data/results/retrieval/ \
16 | --constrain ${constrain} \
17 | --limit_to_stage_space ${limit_to_stage_space} \
18 | --template ${template}"
19 | echo $cmd
20 | eval $cmd
--------------------------------------------------------------------------------
/training/README.md:
--------------------------------------------------------------------------------
1 | ## Training
2 |
3 | Training requires DeepSpeed as a dependency:
4 | ```
5 | pip install deepspeed
6 | ```
7 |
8 |
9 | ### Tool Memorization
10 | In the first stage, we use the following command to train ToolGen. Tool tokens are first added to the LLM (Llama-3-8B in this case) and its token embeddings are expanded accordingly; this step is controlled by the `add_virtual_tokens` argument (a sketch of what it does follows the command).
11 |
12 | ```bash
13 | deepspeed --include=localhost:0,1,2,3,4,5,6,7 --master_port 25024 train.py \
14 | --model_name_or_path meta-llama/Meta-Llama-3-8B \
15 | --add_virtual_tokens True \
16 | --flash_attention True \
17 | --deepspeed src/configs/ds_z2_config.json \
18 | --chat True \
19 | --template llama-3 \
20 | --architecture causal \
21 | --output_dir checkpoints/ToolGen-Llama-3-8B-Tool-Memorization \
22 | --save_strategy steps \
23 | --save_steps 1000 \
24 | --gather_weights True \
25 | --learning_rate 2e-5 \
26 | --warmup_ratio 0.03 \
27 | --datasets toolgen_atomic_memorization.json \
28 | --dataset_nums 10000000 \
29 | --per_device_train_batch_size 2 \
30 | --gradient_accumulation_steps 64 \
31 | --max_length 1024 \
32 | --num_train_epochs 8 \
33 | --gradient_checkpointing False \
34 | --bf16 True \
35 | --logging_steps 1 \
36 | --report_to wandb \
37 | --run_name llama-3-8b-tool-memorization
38 | ```
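For reference, the token-expansion step triggered by `--add_virtual_tokens True` corresponds to the logic in `training/models/loading.py`. The sketch below is illustrative rather than a drop-in replacement: the `src/configs/virtual_tokens.txt` path and the parsing of `&&`-separated names mirror that file, while the exact token spelling shown in the comment is an assumption.

```python
import torch
import transformers
from unidecode import unidecode

tokenizer = transformers.AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")
model = transformers.AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3-8B", torch_dtype=torch.bfloat16
)

# One atomic tool token per line, e.g. something like "<<ToolName&&APIName>>"
# (the exact delimiters come from virtual_tokens.txt).
with open("src/configs/virtual_tokens.txt") as f:
    virtual_tokens = [unidecode(line.strip()) for line in f]

tokenizer.add_tokens(new_tokens=virtual_tokens, special_tokens=False)
model.resize_token_embeddings(len(tokenizer))

# Initialize each new token's embedding with the mean embedding of its constituent names.
embeddings = model.model.embed_tokens.weight.data
for vt in virtual_tokens:
    name_ids = tokenizer(" ".join(vt[2:-2].split("&&")), add_special_tokens=False).input_ids
    vt_id = tokenizer(vt, add_special_tokens=False).input_ids[0]
    embeddings[vt_id] = embeddings[name_ids].mean(dim=0)
```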
39 |
40 | ### Tool Retrieval
41 | In the second stage, we train the ToolGen model on queries paired with tool tokens, initialized from the model obtained in the first stage. Since the tool tokens have already been added and the embeddings expanded, we set `add_virtual_tokens` to `False` (an illustration of the expected chat-format data follows the command).
42 | ```bash
43 | deepspeed --include=localhost:0,1,2,3,4,5,6,7 --master_port 25024 train.py \
44 | --model_name_or_path checkpoints/ToolGen-Llama-3-8B-Tool-Memorization \
45 | --add_virtual_tokens False \
46 | --flash_attention True \
47 | --deepspeed src/configs/ds_z2_config.json \
48 | --chat True \
49 | --template llama-3 \
50 | --architecture causal \
51 | --output_dir checkpoints/ToolGen-Llama-3-8B-Tool-Retriever \
52 | --save_strategy steps \
53 | --save_steps 1000 \
54 | --gather_weights True \
55 | --learning_rate 2e-5 \
56 | --warmup_ratio 0.03 \
57 | --datasets toolgen_atomic_retrieval_G123.json \
58 | --dataset_nums 1000000 \
59 | --per_device_train_batch_size 2 \
60 | --gradient_accumulation_steps 64 \
61 | --max_length 1024 \
62 | --num_train_epochs 1 \
63 | --gradient_checkpointing False \
64 | --bf16 True \
65 | --logging_steps 1 \
66 | --report_to wandb \
67 | --run_name llama-3-8b-tool-retrieval
68 | ```
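The chat-format loaders (`load_chat_data` in `training/data/utils.py` and `format_conversation` in `training/prompts/utils.py`, both included later in this repo) expect each training file to be a JSON list of records with a `conversations` field; messages carry either `role`/`content` or `from`/`value`, plus an optional `loss` flag marking the turns that are trained on. The concrete strings below are invented purely to illustrate that shape:

```python
# Illustrative record shape for the retrieval-stage data; the query and the tool token
# shown here are made up, only the field names follow the loaders in this repo.
record = {
    "conversations": [
        {"role": "user", "content": "Find an API that can search for cheap flights.", "loss": False},
        {"role": "assistant", "content": "<<SkyScanner&&SearchFlights>>", "loss": True},
    ]
}
```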
69 |
70 | ### End-to-End Training
71 | In the last stage, we train the ToolGen agent model on end-to-end trajectories. We set the maximum length to 6144, which generally requires large GPU memory. In our experiments, 4 GPUs with 80GB of memory each are sufficient for this stage (DeepSpeed ZeRO-3 with offloading is used).
72 | ```bash
73 | deepspeed --include=localhost:0,1,2,3,4,5,6,7 --master_port 25024 train.py \
74 | --model_name_or_path checkpoints/ToolGen-Llama-3-8B-Tool-Retriever \
75 | --add_virtual_tokens False \
76 | --flash_attention True \
77 | --deepspeed src/configs/ds_z3_offload_config.json \
78 | --chat True \
79 | --template llama-3 \
80 | --architecture causal \
81 | --output_dir checkpoints/ToolGen-Llama-3-8B \
82 | --save_strategy steps \
83 | --save_steps 1000 \
84 | --gather_weights True \
85 | --learning_rate 2e-5 \
86 | --warmup_ratio 0.03 \
87 | --datasets toolgen_atomic_G123_dfs.json \
88 | --dataset_nums 10000000 \
89 | --per_device_train_batch_size 1 \
90 | --gradient_accumulation_steps 64 \
91 | --max_length 6144 \
92 | --num_train_epochs 1 \
93 | --gradient_checkpointing False \
94 | --bf16 True \
95 | --logging_steps 1 \
96 | --report_to wandb \
97 | --run_name llama-3-8b-end2end
98 | ```
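For reference, the effective batch size under these settings is `per_device_train_batch_size` × `gradient_accumulation_steps` × number of GPUs = 1 × 64 × 8 = 512 sequences per optimizer step; with the 4-GPU setup mentioned above and the same per-device settings it would be 256.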
99 |
--------------------------------------------------------------------------------
/training/data/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Reason-Wang/ToolGen/6839374a255810efe69deea4056eec5c55e25802/training/data/__init__.py
--------------------------------------------------------------------------------
/training/data/loading.py:
--------------------------------------------------------------------------------
1 |
2 | from data.dataset import CausalLMDataset, CausalLMCollator, CausalLMChatDataset, Seq2SeqDataset, Seq2SeqCollator
3 | from data.utils import load_chat_data, load_instruction_data
4 |
5 |
6 | def load_datasets(chat, architecture, datasets, dataset_nums, tokenizer, max_length, template):
7 | if chat:
8 | # assert args.architecture == 'causal' # Only causal is supported for chat
9 | messages_list = load_chat_data(
10 | datasets,
11 | dataset_nums,
12 | )
13 | dataset = CausalLMChatDataset(tokenizer, messages_list, max_length=max_length, template=template)
14 | collator = CausalLMCollator(tokenizer, max_length=max_length)
15 | else:
16 | instructions, responses = load_instruction_data(
17 | datasets,
18 | dataset_nums,
19 | )
20 | # TODO: Support better template system
21 | if architecture == 'causal':
22 | dataset = CausalLMDataset(
23 | tokenizer,
24 | instructions,
25 | responses,
26 | max_length=max_length,
27 | template=template
28 | )
29 | # Currently max_length is not used in the collator
30 | collator = CausalLMCollator(tokenizer, max_length=max_length)
31 | elif architecture == 'seq2seq':
32 | dataset = Seq2SeqDataset(
33 | tokenizer,
34 | instructions,
35 | responses,
36 | max_length=max_length,
37 | template=template
38 | )
39 | collator = Seq2SeqCollator(tokenizer, max_length=max_length)
40 | else:
41 | raise ValueError(f"Architecture {architecture} not supported")
42 |
43 | return dataset, collator
--------------------------------------------------------------------------------
/training/data/utils.py:
--------------------------------------------------------------------------------
1 | import json
2 | import random
3 |
4 | import pandas as pd
5 |
6 |
7 | def read_jsonl_to_list(file_path):
8 | data_list = []
9 | with open(file_path, 'r') as file:
10 | for line in file:
11 | data_list.append(json.loads(line))
12 | return data_list
13 |
14 |
15 | def load_instruction_data(datasets, nums):
16 | instructions = []
17 | responses = []
18 | for (d, n) in zip(datasets, nums):
19 | data_path = f'{d}'
20 | with open(data_path, 'r') as f:
21 | data_list = json.load(f)[:n]
22 |
23 | for sample in data_list:
24 | instruction = sample['instruction']
25 | if 'input' in sample:
26 | instruction = instruction + ' ' + sample['input']
27 | instruction = instruction.strip()
28 | response = sample['output']
29 |
30 | instructions.append(instruction)
31 | responses.append(response)
32 |
33 | return instructions, responses
34 |
35 |
36 | def load_chat_data(datasets, nums):
37 | assert len(datasets) == len(nums)
38 | messages_list = []
39 | for (d, n) in zip(datasets, nums):
40 | data_path = f'{d}'
41 | with open(data_path, 'r') as f:
42 | data_list = json.load(f)
43 | if n <= len(data_list):
44 | # randomly sample n conversations
45 | data_list = random.sample(data_list, n)
46 | messages_list.extend([data['conversations'] for data in data_list])
47 |
48 | return messages_list
49 |
--------------------------------------------------------------------------------
/training/models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Reason-Wang/ToolGen/6839374a255810efe69deea4056eec5c55e25802/training/models/__init__.py
--------------------------------------------------------------------------------
/training/models/causallm.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 | from prompts.conversations import get_conv_template
3 | from transformers import AutoModelForCausalLM, AutoTokenizer, StoppingCriteria, StoppingCriteriaList
4 | import huggingface_hub
5 | import torch
6 | from prompts.utils import k2_tokenize
7 | from models.utils import KeywordsStoppingCriteria, TextStoppingCriteria
8 |
9 |
10 | class ChatCausalLM:
11 | def __init__(
12 | self,
13 | model_name,
14 | max_new_tokens=512,
15 | temperature=0.7,
16 | device="auto",
17 | system_prompt=None,
18 | cache_dir=None,
19 | conversation_template=None,
20 | ):
21 | self.max_new_tokens = max_new_tokens
22 | self.temperature = temperature
23 | self.device = "cuda" if device=="auto" else device
24 | self.tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
25 |
26 | self.model = AutoModelForCausalLM.from_pretrained(
27 | model_name,
28 | torch_dtype=torch.bfloat16,
29 | device_map=device,
30 | cache_dir=cache_dir
31 | )
32 |
33 | self.system_prompt = system_prompt
34 | self.conversation_history = []
35 | self.conversation_template = conversation_template
36 |
37 | def generate(self, messages, stop=None, print_prompt=False):
38 | human_role_set = {"user", "human"}
39 | ai_role_set = {"bot", "ai", "gpt", "assistant"}
40 | conv = get_conv_template(self.conversation_template)
41 | for message in messages:
42 | if message['role'] == 'system':
43 | conv.set_system_message(message['content'])
44 | else:
45 | conv.append_message(
46 | conv.roles[0] if message['role'] in human_role_set else conv.roles[1],
47 | message["content"]
48 | )
49 | conv.append_message(conv.roles[1], None)
50 | prompt = conv.get_prompt()
51 | if print_prompt:
52 | print(prompt)
53 | # inputs = self.tokenizer(prompt, return_tensors="pt")
54 | if self.conversation_template == 'k2':
55 | inputs = k2_tokenize(self.tokenizer, prompt, return_tensors="pt")
56 | else:
57 | inputs = self.tokenizer(prompt, return_tensors="pt")
58 | for k, v in inputs.items():
59 | inputs[k] = v.to(self.device)
60 |
61 | if self.conversation_template == 'k2':
62 | stop_criteria = StoppingCriteriaList([TextStoppingCriteria(stop, self.tokenizer, self.device)]) if stop else None
63 | else:
64 | stop_criteria = StoppingCriteriaList([KeywordsStoppingCriteria(stop, self.tokenizer, self.device)]) if stop else None
65 |
66 | outputs = self.model.generate(
67 | **inputs,
68 | max_new_tokens=self.max_new_tokens,
69 | do_sample=True,
70 | temperature=self.temperature,
71 | stopping_criteria=stop_criteria,
72 | eos_token_id=self.tokenizer.eos_token_id,
73 | )
74 | inputs_token_length = len(inputs['input_ids'][0])
75 | new_tokens = outputs[0][inputs_token_length:]
76 | text = self.tokenizer.decode(new_tokens, skip_special_tokens=True)
77 |
78 | if stop:
79 | for ending in stop:
80 | if text.endswith(ending):
81 | text = text[:-len(ending)]
82 | break
83 |
84 | return text.strip()
85 |
86 | def chat(self, text, stop=None, print_prompt=False):
87 |
88 | self.conversation_history.append({"role": "user", "content": text})
89 | messages = [{"role": "system", "content": self.system_prompt}] if self.system_prompt else []
90 | messages.extend(self.conversation_history)
91 | response = self.generate(messages, stop=stop, print_prompt=print_prompt)
92 | self.conversation_history.append({"role": "assistant", "content": response})
93 |
94 | return response
95 |
96 | def clear_history(self):
97 | self.conversation_history = []
98 |
99 |
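# Example usage (illustrative; the model name, template, and stop string below are assumptions,
# not prescribed by this file):
#   llm = ChatCausalLM("meta-llama/Meta-Llama-3-8B", conversation_template="llama-3", max_new_tokens=256)
#   print(llm.chat("Which tool would you call to look up the weather?", stop=["\nObservation:"]))
#   llm.clear_history()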
--------------------------------------------------------------------------------
/training/models/loading.py:
--------------------------------------------------------------------------------
1 | from utils.distributed import is_main_process
2 | import transformers
3 | import torch
4 | from unidecode import unidecode
5 |
6 |
7 | def load_tokenizer(model_name_or_path, cache_dir=None, virtual_tokens=False):
8 | tokenizer = transformers.AutoTokenizer.from_pretrained(
9 | model_name_or_path,
10 | cache_dir=cache_dir,
11 | )
12 | if virtual_tokens:
13 | # if "llama-3" in model_name_or_path.lower():
14 | # tokenizer = transformers.AutoTokenizer.from_pretrained(
15 | # "meta-llama/Meta-Llama-3-8B",
16 | # cache_dir=cache_dir,
17 | # )
18 | # else:
19 | # raise ValueError(f"Virtual tokens not supported for tokenizer {model_name_or_path}")
20 | with open('src/configs/virtual_tokens.txt', 'r') as f:
21 | virtual_tokens = f.readlines()
22 | virtual_tokens = [unidecode(vt.strip()) for vt in virtual_tokens]
23 | tokenizer.add_tokens(new_tokens=virtual_tokens, special_tokens=False)
24 | if is_main_process():
25 | print(f"Added {len(virtual_tokens)} virtual tokens")
26 |
27 | return tokenizer
28 |
29 |
30 | def load_model(model_name_or_path, architecture, tokenizer=None, flash_attention=False, cache_dir=None, virtual_tokens=False):
31 | if architecture == 'causal':
32 | # Check hf_home
33 | # rank = get_rank()
34 | # print(f"Rank {rank}: {os.environ['HF_HOME']}")
35 | # print(f"Rank {rank}: cache dir: {args.cache_dir}")
36 | if flash_attention:
37 | model = transformers.AutoModelForCausalLM.from_pretrained(
38 | model_name_or_path,
39 | cache_dir=cache_dir,
40 | torch_dtype=torch.bfloat16,
41 | attn_implementation='flash_attention_2'
42 | )
43 | else:
44 | model = transformers.AutoModelForCausalLM.from_pretrained(
45 | model_name_or_path,
46 | cache_dir=cache_dir,
47 | torch_dtype=torch.bfloat16,
48 | )
49 | elif architecture == 'seq2seq':
50 | if flash_attention:
51 | model = transformers.AutoModelForSeq2SeqLM.from_pretrained(
52 | model_name_or_path,
53 | cache_dir=cache_dir,
54 | attn_implementation='flash_attention_2'
55 | )
56 | else:
57 | model = transformers.AutoModelForSeq2SeqLM.from_pretrained(
58 | model_name_or_path,
59 | cache_dir=cache_dir,
60 | )
61 | else:
62 | raise ValueError(f"Architecture {architecture} not supported")
63 |
64 | if virtual_tokens:
65 | model.resize_token_embeddings(len(tokenizer))
66 | if is_main_process():
67 | print(f"Model resized token embeddings to {len(tokenizer)}")
68 |
69 | with open('src/configs/virtual_tokens.txt', 'r') as f:
70 | virtual_tokens = f.readlines()
71 | virtual_tokens = [unidecode(vt).strip() for vt in virtual_tokens]
72 | combined_tokens = []
73 | for vt in virtual_tokens:
74 | combined_token = vt[2:-2].split("&&")
75 | combined_tokens.append(combined_token)
76 |
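# Each virtual token combines several names separated by "&&" between 2-character delimiters;
# initialize its embedding as the mean of the embeddings of those constituent names.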
77 | for combined_token, virtual_token in zip(combined_tokens, virtual_tokens):
78 | combined_token_ids = tokenizer(" ".join(combined_token), add_special_tokens=False).input_ids
79 | virtual_token_id = tokenizer(virtual_token, add_special_tokens=False).input_ids
80 | # print(combined_token_ids)
81 | # print(virtual_token_id)
82 | assert len(virtual_token_id) == 1
83 | # print(model.device)
84 | combined_token_embeddings = model.model.embed_tokens(torch.tensor(combined_token_ids).to(model.device))
85 | # print(combined_token_embeddings.shape)
86 | embedding = torch.mean(combined_token_embeddings, dim=0)
87 | # print(embedding.shape)
88 | model.model.embed_tokens.weight.data[virtual_token_id[0]] = embedding
89 | else:
90 | if is_main_process():
91 | print(f"Initialized from {model_name_or_path} without adding embeddings.")
92 |
93 | return model
94 |
--------------------------------------------------------------------------------
/training/prompts/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Reason-Wang/ToolGen/6839374a255810efe69deea4056eec5c55e25802/training/prompts/__init__.py
--------------------------------------------------------------------------------
/training/prompts/templates.py:
--------------------------------------------------------------------------------
1 | null_template = {
2 | "full_template": '''{instruction} {response}''',
3 | "user_template": '''{instruction}''',
4 | }
5 |
6 | damsa_template = '''Translate the Arabic dialects to Modern Standard Arabic (MSA): {instruction} Response: {response}'''
--------------------------------------------------------------------------------
/training/prompts/utils.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import transformers
3 | from prompts.conversations import Conversation, get_conv_template
4 |
5 | def k2_tokenize(tokenizer, text, add_special_tokens=True, return_tensors=None):
6 | """
7 | Tokenize the text and return the input_ids and attention_mask.
8 | To tokenize correctly, we split the text by "" and tokenize each part separately.
9 | :param tokenizer:
10 | :param text:
11 | :param return_tensors:
12 | :return: input_ids, attention_mask
13 | """
14 | if add_special_tokens:
15 | input_ids = [tokenizer.bos_token_id]
16 | attention_mask = [1]
17 | else:
18 | input_ids = []
19 | attention_mask = []
20 |
21 | splited_texts = text.split("")
22 | inputs = tokenizer(splited_texts[0], add_special_tokens=False)
23 | input_ids.extend(inputs['input_ids'])
24 | attention_mask.extend(inputs['attention_mask'])
25 | if len(splited_texts) > 1:
26 | for text in splited_texts[1:]:
27 | current_inputs = tokenizer(text, add_special_tokens=False)
28 | input_ids += [tokenizer.eos_token_id] + current_inputs['input_ids']
29 | attention_mask += [1] + current_inputs['attention_mask']
30 | if return_tensors == 'pt':
31 | input_ids = torch.tensor([input_ids])
32 | attention_mask = torch.tensor([attention_mask])
33 |
34 | return dict(
35 | input_ids=input_ids,
36 | attention_mask=attention_mask
37 | )
38 |
39 |
40 | def format_conversation(messages, conv_template):
41 | # When there is no 'loss', we set it to False
42 | for message in messages:
43 | if 'loss' not in message:
44 | message['loss'] = False
45 |
46 | human_role_set = {"human", "user"}
47 | ai_role_set = {"ai", "gpt", "assistant"}
48 | conv = get_conv_template(conv_template)
49 | if 'from' in messages[0]:
50 | role_label, content_label = "from", "value"
51 | elif 'role' in messages[0]:
52 | role_label, content_label = "role", "content"
53 | else:
54 | raise ValueError("Cannot find role label and content label in the data.")
55 |
56 | for message in messages:
57 | if message[role_label] == 'system':
58 | conv.set_system_message(message[content_label])
59 | else:
60 | conv.append_message(conv.roles[0] if message[role_label] in human_role_set else conv.roles[1], message[content_label], message['loss'])
61 |
62 | # conv.append_message(conv.roles[1], None)
63 | return conv
64 |
65 |
66 | def tokenize_conversation(
67 | messages,
68 | tokenizer,
69 | conv_template,
70 | max_length,
71 | ):
72 | """
73 | We want to tokenize the whole conversation, but we cannot simply use
74 | get_prompt to get the string prompt and tokenize it, because the loss
75 | should only be computed on the model's responses. We want:
76 | input_ids
77 | attention_mask
78 | labels: should be -100 for user prompt and input id for model's response
79 | action_mask: should be 0 for user prompt and 1 for model's response
80 | :param messages:
81 | :param tokenizer:
82 | :param conv_template:
83 | :param max_length:
84 | :return: input_ids, attention_mask, labels, action_mask
85 | """
86 | conv = format_conversation(messages, conv_template)
87 | separate_prompts = conv.get_separate_prompt_with_to_loss()
88 | # print(separate_prompts)
89 | input_ids = []
90 | attention_mask = []
91 | labels = []
92 | action_mask = []
93 | for i, (prompt, to_loss) in enumerate(separate_prompts):
94 | if i == 0:
95 | if tokenizer.bos_token:
96 | prompt = tokenizer.bos_token + prompt
97 |
98 | if conv_template == 'k2':
99 | tmp_input_ids = k2_tokenize(tokenizer, prompt, add_special_tokens=False)['input_ids']
100 | else:
101 | tmp_input_ids = tokenizer(prompt, add_special_tokens=False)['input_ids']
102 | if to_loss:
103 | tmp_target = tmp_input_ids.copy()
104 | tmp_action_mask = [1] * len(tmp_input_ids)
105 | else:
106 | tmp_target = [-100] * len(tmp_input_ids)
107 | tmp_action_mask = [0] * len(tmp_input_ids)
108 | # print(tmp_input_ids)
109 | input_ids.extend(tmp_input_ids)
110 | attention_mask.extend([1] * len(tmp_input_ids))
111 | labels.extend(tmp_target)
112 | action_mask.extend(tmp_action_mask)
113 |
114 | input_ids = input_ids[:max_length]
115 | attention_mask = attention_mask[:max_length]
116 | labels = labels[:max_length]
117 | action_mask = action_mask[:max_length]
118 |
119 | # TODO: remove this check if everything is correct
120 | assert len(input_ids) == len(attention_mask) == len(labels) == len(action_mask)
121 |
122 | return dict(
123 | input_ids=torch.tensor([input_ids]),
124 | attention_mask=torch.tensor([attention_mask]),
125 | labels=torch.tensor([labels]),
126 | # action_mask=torch.tensor([action_mask])
127 | )
128 |
129 |
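# Example usage (illustrative; the tokenizer and template are assumptions):
#   from transformers import AutoTokenizer
#   tok = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")
#   messages = [
#       {"role": "user", "content": "Hi", "loss": False},
#       {"role": "assistant", "content": "Hello!", "loss": True},
#   ]
#   batch = tokenize_conversation(messages, tok, conv_template="llama-3", max_length=1024)
#   # batch["labels"] is -100 on user-turn tokens and mirrors input_ids on assistant-turn tokens.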
--------------------------------------------------------------------------------
/training/scripts/train_toolgen.sh:
--------------------------------------------------------------------------------
1 | # Train tool memorization
2 | pretrain_dir="meta-llama/Meta-Llama-3-8B"
3 | checkpoint_dir="checkpoints/ToolGen-Llama-3-8B-Tool-Memorization"
4 | flash_attention="True"
5 | run_name="llama-3-8b-tool-memorization"
6 | datasets="toolgen_atomic_memorization.json"
7 | dataset_nums="10000000"
8 | max_length="1024"
9 | batch_size="2"
10 | lr="2e-5"
11 | accumulation_steps="64"
12 | epochs="8"
13 | add_virtual_tokens="True"
14 | template="llama-3"
15 | save_strategy="steps"
16 | save_steps="1000"
17 | zero="z2"
18 |
19 | # Train tool retrieval
20 | # pretrain_dir="checkpoints/ToolGen-Llama-3-8B-Tool-Memorization"
21 | # checkpoint_dir="checkpoints/ToolGen-Llama-3-8B-Tool-Retriever"
22 | # flash_attention="True"
23 | # run_name="llama-3-8b-tool-retrieval"
24 | # datasets="toolgen_atomic_retrieval_G123.json"
25 | # dataset_nums="1000000"
26 | # max_length="1024"
27 | # batch_size="2"
28 | # lr="2e-5"
29 | # accumulation_steps="64"
30 | # epochs="1"
31 | # add_virtual_tokens="False"
32 | # template="llama-3"
33 | # save_strategy="steps"
34 | # save_steps="1000"
35 | # zero="z2"
36 |
37 | # End2End
38 | # pretrain_dir="checkpoints/ToolGen-Llama-3-8B-Tool-Retriever"
39 | # checkpoint_dir="checkpoints/ToolGen-Llama-3-8B"
40 | # flash_attention="True"
41 | # run_name="llama-3-8b-end2end"
42 | # datasets="toolgen_atomic_G123_dfs.json"
43 | # dataset_nums="10000000"
44 | # max_length="6144"
45 | # batch_size="1"
46 | # lr="2e-5"
47 | # accumulation_steps="64"
48 | # epochs="1"
49 | # add_virtual_tokens="False"
50 | # template="llama-3"
51 | # save_strategy="steps"
52 | # save_steps="1000"
53 | # zero="z3_offload"
54 |
55 | chat="True"
56 |
57 | cmd="deepspeed --include=localhost:0,1,2,3,4,5,6,7 --master_port 25024 train.py \
58 | --model_name_or_path ${pretrain_dir} \
59 | --add_virtual_tokens ${add_virtual_tokens} \
60 | --flash_attention ${flash_attention} \
61 | --deepspeed src/configs/ds_${zero}_config.json \
62 | --chat ${chat} \
63 | --template ${template} \
64 | --architecture causal \
65 | --output_dir ${checkpoint_dir} \
66 | --save_strategy ${save_strategy} \
67 | --save_steps ${save_steps} \
68 | --gather_weights True \
69 | --learning_rate ${lr} \
70 | --warmup_ratio 0.03 \
71 | --datasets ${datasets} \
72 | --dataset_nums ${dataset_nums} \
73 | --per_device_train_batch_size ${batch_size} \
74 | --gradient_accumulation_steps ${accumulation_steps} \
75 | --max_length ${max_length} \
76 | --num_train_epochs ${epochs} \
77 | --gradient_checkpointing False \
78 | --bf16 True \
79 | --logging_steps 1 \
80 | --report_to wandb \
81 | --run_name ${run_name}"
82 |
83 | echo $cmd
84 | eval $cmd
--------------------------------------------------------------------------------
/training/src/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Reason-Wang/ToolGen/6839374a255810efe69deea4056eec5c55e25802/training/src/__init__.py
--------------------------------------------------------------------------------
/training/src/configs/ds_z2_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "train_batch_size": "auto",
3 | "train_micro_batch_size_per_gpu": "auto",
4 | "gradient_accumulation_steps": "auto",
5 | "zero_optimization": {
6 | "stage": 2,
7 | "reduce_bucket_size": "auto"
8 | },
9 | "bf16": {
10 | "enabled": "auto"
11 | }
12 | }
--------------------------------------------------------------------------------
/training/src/configs/ds_z3_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "train_batch_size": "auto",
3 | "train_micro_batch_size_per_gpu": "auto",
4 | "gradient_accumulation_steps": "auto",
5 | "optimizer": {
6 | "type": "AdamW",
7 | "params": {
8 | "lr": "auto",
9 | "betas": "auto",
10 | "eps": "auto",
11 | "weight_decay": "auto"
12 | }
13 | },
14 | "scheduler": {
15 | "type": "WarmupCosineLR",
16 | "params": {
17 | "warmup_num_steps": "auto",
18 | "total_num_steps": "auto"
19 | }
20 | },
21 | "zero_optimization": {
22 | "stage": 3,
23 | "overlap_comm": true,
24 | "reduce_bucket_size": "auto",
25 | "stage3_prefetch_bucket_size": "auto",
26 | "stage3_param_persistence_threshold": "auto",
27 | "stage3_max_live_parameters": 1e9,
28 | "stage3_max_reuse_distance": 1e9,
29 | "stage3_gather_16bit_weights_on_model_save": true
30 | },
31 | "checkpoint": {
32 | "tag_validation": "Warn",
33 | "load_universal": false,
34 | "use_node_local_storage": true,
35 | "parallel_write":{
36 | "pipeline_stage": true
37 | }
38 | },
39 | "bf16": {
40 | "enabled": "auto"
41 | }
42 | }
--------------------------------------------------------------------------------
/training/src/configs/ds_z3_offload_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "train_batch_size": "auto",
3 | "train_micro_batch_size_per_gpu": "auto",
4 | "gradient_accumulation_steps": "auto",
5 | "optimizer": {
6 | "type": "AdamW",
7 | "params": {
8 | "lr": "auto",
9 | "betas": "auto",
10 | "eps": "auto",
11 | "weight_decay": "auto"
12 | }
13 | },
14 | "scheduler": {
15 | "type": "WarmupCosineLR",
16 | "params": {
17 | "warmup_num_steps": "auto",
18 | "total_num_steps": "auto"
19 | }
20 | },
21 | "zero_optimization": {
22 | "stage": 3,
23 | "offload_optimizer": {
24 | "device": "cpu",
25 | "pin_memory": false
26 | },
27 | "offload_param": {
28 | "device": "cpu",
29 | "pin_memory": false
30 | },
31 | "memory_efficient_linear": true,
32 | "overlap_comm": true,
33 | "reduce_bucket_size": "auto",
34 | "stage3_prefetch_bucket_size": "auto",
35 | "stage3_param_persistence_threshold": "auto",
36 | "stage3_max_live_parameters": 1e9,
37 | "stage3_max_reuse_distance": 1e9,
38 | "stage3_gather_16bit_weights_on_model_save": true
39 | },
40 | "checkpoint": {
41 | "tag_validation": "Warn",
42 | "load_universal": false,
43 | "use_node_local_storage": true,
44 | "parallel_write":{
45 | "pipeline_stage": true
46 | }
47 | },
48 | "bf16": {
49 | "enabled": "auto"
50 | }
51 | }
--------------------------------------------------------------------------------
/training/src/configs/project_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "WANDB_PROJECT": "ToolGen"
3 | }
--------------------------------------------------------------------------------
/training/train.py:
--------------------------------------------------------------------------------
1 | from data.loading import load_datasets
2 | from models.loading import load_model, load_tokenizer
3 | from utils.setting import set_project, set_system, set_args, set_distributed_logging
4 | from dataclasses import field, dataclass
5 | from typing import Optional, Any
6 | import torch
7 | from data.loading import load_datasets
8 | import os
9 | import transformers
10 | from transformers import Trainer
11 | from typing import List
12 | from prompts.templates import null_template
13 | from utils.logging import get_logger
14 | from utils.distributed import get_rank, is_main_process
15 |
16 |
17 | @dataclass
18 | class TrainingArguments(transformers.TrainingArguments):
19 | model_name_or_path: str = field(default="")
20 | chat: bool = False
21 | architecture: str = field(default='causal')
22 | flash_attention: bool = False
23 | data_path: str = field(default="")
24 | cache_dir: Optional[str] = field(default=None)
25 | optim: str = field(default="adamw_torch")
26 | resume_training: bool = False
27 | per_device_train_batch_size: int = 8
28 | max_length: int = 2048
29 | learning_rate: float = 5e-5
30 | num_train_epochs: int = 3
31 | gather_weights: bool = True
32 | datasets: List[str] = field(default_factory=list)
33 | dataset_nums: List[int] = field(default_factory=list)
34 | template: str = field(default="llama-3")
35 | add_virtual_tokens: bool = False
36 |
37 |
38 | def train():
39 | # set_system("src/configs/project_config.json")
40 | # set_distributed_logging(strict=True)
41 | parser = transformers.HfArgumentParser(TrainingArguments)
42 | args = parser.parse_args_into_dataclasses()[0]
43 | set_args(args)
44 | set_project(args)
45 |
46 | # Get rank
47 | rank = get_rank()
48 | Logger = get_logger("logs", level="INFO", rank=rank)
49 |
50 | # Load tokenizer (and add virtual tool tokens if enabled)
51 | tokenizer = load_tokenizer(
52 | args.model_name_or_path,
53 | cache_dir=args.cache_dir,
54 | virtual_tokens=args.add_virtual_tokens,
55 | )
56 |
57 | Logger.info("---- Loading Datasets ----")
58 | dataset, collator = load_datasets(
59 | chat=args.chat,
60 | architecture=args.architecture,
61 | datasets=args.datasets,
62 | dataset_nums=args.dataset_nums,
63 | tokenizer=tokenizer,
64 | max_length=args.max_length,
65 | template=args.template,
66 | )
67 | Logger.info(f"Data length: {len(dataset)}")
68 |
69 | Logger.info("---- Loading Model ----")
70 | model = load_model(
71 | args.model_name_or_path,
72 | architecture=args.architecture,
73 | tokenizer=tokenizer,
74 | flash_attention=args.flash_attention,
75 | cache_dir=args.cache_dir,
76 | virtual_tokens=args.add_virtual_tokens,
77 | )
78 |
79 | trainer = Trainer(
80 | model,
81 | args=args,
82 | data_collator=collator,
83 | train_dataset=dataset,
84 | )
85 |
86 | trainer.train(resume_from_checkpoint=args.resume_training)
87 | if is_main_process():
88 | tokenizer.save_pretrained(args.output_dir)
89 |
90 | # Whether to gather weights before saving
91 | # This is preferred for small models
92 | if args.gather_weights:
93 | trainer.save_model(args.output_dir)
94 | else:
95 | trainer.deepspeed.save_checkpoint(args.output_dir)
96 |
97 |
98 | if __name__ == "__main__":
99 | train()
100 |
--------------------------------------------------------------------------------
/training/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Reason-Wang/ToolGen/6839374a255810efe69deea4056eec5c55e25802/training/utils/__init__.py
--------------------------------------------------------------------------------
/training/utils/distributed.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 |
4 | def is_main_process():
5 | # Check if the current process is the main process
6 | rank = int(os.environ.get('RANK', -1))
7 | return rank == 0 or rank == -1
8 |
9 |
10 | def get_rank():
11 | # When using this function, make sure to call it after deepspeed is initialized
12 | # Using launcher or deepspeed.initialize()
13 | # Get the current rank
14 | rank = int(os.environ.get('RANK', -1))
15 | return rank
16 |
--------------------------------------------------------------------------------
/training/utils/huggingface.py:
--------------------------------------------------------------------------------
1 | from typing import Optional
2 | from transformers import AutoTokenizer, AutoModelForCausalLM
3 | from huggingface_hub import HfApi, repo_info, create_repo
4 | from huggingface_hub.utils import RepositoryNotFoundError
5 | import torch
6 |
7 |
8 | def repo_exists(repo_id, repo_type: Optional[str]=None, token: Optional[str]=None):
9 | """
10 | Check if a repository exists on the Hugging Face Hub
11 |
12 | Args:
13 | repo_id (str): The repository ID to check
14 | repo_type (str): The type of repository to check
15 | token (str): The Hugging Face API token
16 |
17 | Returns:
18 | bool: Whether the repository exists
19 | """
20 | try:
21 | repo_info(repo_id, repo_type=repo_type, token=token)
22 | return True
23 | except RepositoryNotFoundError:
24 | return False
25 |
26 |
27 | def upload_model(model_name_or_path, repo_id, private=False, token=""):
28 | """
29 | Upload a model to the Hugging Face Hub
30 |
31 | Args:
32 | model_name_or_path (str): The model name or path to upload
33 | repo_id (str): The repository ID to upload the model to
34 | """
35 | # Load the model
36 | # tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
37 | model = AutoModelForCausalLM.from_pretrained(
38 | model_name_or_path,
39 | torch_dtype=torch.bfloat16,
40 | device_map="cpu",
41 | )
42 |
43 | if not repo_exists(repo_id, token=token):
44 | print(f"Repo {repo_id} does not exist, creating repo...")
45 | create_repo(repo_id, private=private, token=token)
46 |
47 | model.push_to_hub(repo_id, token=token)
--------------------------------------------------------------------------------
/training/utils/logging.py:
--------------------------------------------------------------------------------
1 | import os
2 | from typing import Union
3 | from logging import WARNING, getLogger, INFO, StreamHandler, FileHandler, Formatter, DEBUG
4 |
5 |
6 | class Logger:
7 | def __init__(self, logger, rank: int):
8 | self.logger = logger
9 | self.rank = rank
10 |
11 | def info(self, msg):
12 | if self.rank in [-1, 0]:
13 | self.logger.info(msg)
14 |
15 | def debug(self, msg):
16 | if self.rank in [-1, 0]:
17 | self.logger.debug(msg)
18 |
19 | def warning(self, msg):
20 | if self.rank in [-1, 0]:
21 | self.logger.warning(msg)
22 |
23 |
24 | def get_logger(directory, level="INFO", rank: int=-1):
25 | # print(f"Local rank: {local_rank}")
26 | os.makedirs(directory, exist_ok=True)
27 | filename = directory + '/train'
28 | logger = getLogger(__name__)
29 | logger.propagate = False
30 | logger.handlers.clear()
31 | handler1 = StreamHandler()
32 | handler1.setFormatter(Formatter("%(message)s"))
33 | handler2 = FileHandler(filename=f"{filename}.log")
34 | handler2.setFormatter(Formatter("%(message)s"))
35 | logger.addHandler(handler1)
36 | logger.addHandler(handler2)
37 | # if level == "INFO":
38 | # handler2 = FileHandler(filename=f"{filename}.log")
39 | # handler2.setFormatter(Formatter("%(message)s"))
40 | # logger.addHandler(handler2)
41 | # logger.setLevel(INFO)
42 | # elif level == "DEBUG":
43 | # handler1 = StreamHandler()
44 | # handler1.setFormatter(Formatter("%(message)s"))
45 | # handler2 = FileHandler(filename=f"{filename}.log")
46 | # handler2.setFormatter(Formatter("%(message)s"))
47 | # logger.addHandler(handler1)
48 | # logger.addHandler(handler2)
49 | if level == "WARNING":
50 | logger.setLevel(WARNING)
51 | elif level == "INFO":
52 | logger.setLevel(INFO)
53 | elif level == "DEBUG":
54 | logger.setLevel(DEBUG)
55 | else:
56 | raise ValueError(f"Unknown level: {level}")
57 |
58 | logger = Logger(logger, rank)
59 |
60 | return logger
61 |
62 |
63 | if __name__=="__main__":
64 | logger = get_logger("test", level="INFO")
65 | logger.info("test")
--------------------------------------------------------------------------------
/training/utils/setting.py:
--------------------------------------------------------------------------------
1 | import json
2 | import logging
3 | import os
4 |
5 | import wandb
6 | from utils.distributed import get_rank, is_main_process
7 | import warnings
8 |
9 |
10 | def set_system(config_path):
11 | with open(config_path, "r") as f:
12 | config = json.load(f)
13 |
14 | if "NinjaPath" in config:
15 | os.environ["PATH"] = config["NinjaPath"] + ":" + os.environ["PATH"]
16 |
17 | if "Environment" in config:
18 | for key, value in config["Environment"].items():
19 | os.environ[key] = value
20 |
21 | return None
22 |
23 |
24 | def set_distributed_logging(strict: bool = False):
25 | '''
26 | By default, only the main process logs at INFO level.
27 | Currently this function only controls logs from the logging and warnings modules.
28 | Some other libraries implement their own logging systems:
29 | - DeepSpeed: prints yellow-colored warnings
30 | Use with care, since some important logs might still be missed.
31 | '''
32 | rank = get_rank()
33 | if is_main_process():
34 | print(f"Rank {rank}: Setting logging level to INFO")
35 | logging.basicConfig(level=logging.INFO)
36 | else:
37 | if strict:
38 | print(f"Rank {rank}: Setting logging level to ERROR")
39 | logging.basicConfig(level=logging.ERROR)
40 | warnings.filterwarnings("ignore")
41 | else:
42 | print(f"Rank {rank}: Setting logging level to WARNING")
43 | logging.basicConfig(level=logging.WARNING)
44 |
45 |
46 | def set_args(args):
47 | if args.cache_dir is not None:
48 | # User has specified a cache directory
49 | pass
50 | else:
51 | # System-set cache directory
52 | if "HF_HUB_CACHE" in os.environ:
53 | args.cache_dir = os.environ["HF_HUB_CACHE"]
54 | # Use HF default cache directory
55 | else:
56 | args.cache_dir = None
57 |
58 | return None
59 |
60 | def set_project(args):
61 | with open("src/configs/project_config.json", "r") as f:
62 | project_config = json.load(f)
63 |
64 | if "WANDB_PROJECT" in project_config:
65 | os.environ["WANDB_PROJECT"] = project_config["WANDB_PROJECT"]
66 | if "WANDB_ENTITY" in project_config:
67 | os.environ["WANDB_ENTITY"] = project_config["WANDB_ENTITY"]
68 |
69 | # Detect if file exists
70 | keys_file = "src/configs/keys.json"
71 | if os.path.exists(keys_file):
72 | with open(keys_file, "r") as f:
73 | keys = json.load(f)
74 |
75 | if "WANDB_KEY" in keys:
76 | wandb.login(key=keys["WANDB_KEY"])
77 |
78 |
--------------------------------------------------------------------------------