├── .gitignore ├── OpenAgent ├── __init__.py ├── agents │ ├── __init__.py │ ├── base.py │ ├── function_calling.py │ ├── toolgen │ │ ├── __init__.py │ │ ├── inference.py │ │ ├── toolgen.py │ │ ├── toolgen_service.py │ │ └── utils.py │ └── tree │ │ ├── __init__.py │ │ └── tree.py └── tools │ ├── __init__.py │ ├── base.py │ ├── retrieval │ ├── __init__.py │ ├── embeddings.py │ └── indexers.py │ └── src │ ├── __init__.py │ ├── basic_tools.py │ └── rapidapi │ ├── __init__.py │ ├── rapidapi.py │ ├── server.py │ └── utils.py ├── README.md ├── assets └── banner.png ├── evaluation ├── retrieval │ ├── __init__.py │ ├── eval_bm25.py │ ├── eval_encoder.py │ ├── eval_longcontext.py │ ├── eval_openai_embedding.py │ ├── eval_toolgen.py │ ├── eval_toolgen_atomic.py │ └── metrics.py ├── toolbench │ ├── __init__.py │ ├── inference │ │ ├── Algorithms │ │ │ ├── DFS.py │ │ │ ├── __init__.py │ │ │ ├── base_search.py │ │ │ └── single_chain.py │ │ ├── Downstream_tasks │ │ │ ├── __init__.py │ │ │ ├── base_env.py │ │ │ ├── rapidapi.py │ │ │ └── rapidapi_multithread.py │ │ ├── LLM │ │ │ ├── __init__.py │ │ │ ├── base_io.py │ │ │ ├── chatgpt_function_model.py │ │ │ ├── davinci_model.py │ │ │ ├── llama_model.py │ │ │ ├── retriever.py │ │ │ ├── tool_chat_model.py │ │ │ ├── tool_llama_lora_model.py │ │ │ ├── tool_llama_model.py │ │ │ ├── toolgen.py │ │ │ └── toolgen_atomic.py │ │ ├── LLM_rank │ │ │ ├── __init__.py │ │ │ └── rank_candidate.py │ │ ├── Prompts │ │ │ ├── ReAct_prompts.py │ │ │ ├── Tree_search_prompts.py │ │ │ ├── __init__.py │ │ │ └── rank_prompts.py │ │ ├── Tree │ │ │ ├── Tree.py │ │ │ └── __init__.py │ │ ├── callbacks │ │ │ └── ServerEventCallback.py │ │ ├── qa_pipeline.py │ │ ├── qa_pipeline_multithread.py │ │ ├── qa_pipeline_open_domain.py │ │ ├── server.py │ │ ├── toolbench_server.py │ │ └── utils.py │ ├── model │ │ ├── __init__.py │ │ ├── apply_delta.py │ │ ├── compression.py │ │ ├── make_delta.py │ │ └── model_adapter.py │ ├── retrieval │ │ ├── api_evaluator.py │ │ ├── inference_example.py │ │ └── train.py │ ├── tool_conversation.py │ ├── tooleval │ │ ├── README.md │ │ ├── README_ZH.md │ │ ├── ToolBench.code-workspace │ │ ├── __init__.py │ │ ├── automatic_eval_sample.py │ │ ├── convert_answers.py │ │ ├── convert_to_answer_format.py │ │ ├── eval_and_update_leaderboard.py │ │ ├── eval_pass_rate.py │ │ ├── eval_preference.py │ │ ├── evaluation │ │ │ ├── __init__.py │ │ │ ├── dataclass.py │ │ │ ├── methodcls.py │ │ │ └── usereval.py │ │ ├── evaluators │ │ │ ├── __init__.py │ │ │ ├── registered_cls │ │ │ │ ├── __init__.py │ │ │ │ ├── base.py │ │ │ │ ├── rtl.py │ │ │ │ ├── tooleval.py │ │ │ │ └── utils.py │ │ │ ├── tooleval_gpt-3.5-turbo_default │ │ │ │ ├── config.yaml │ │ │ │ └── template.txt │ │ │ ├── tooleval_gpt-3.5-turbo_fn │ │ │ │ ├── config.yaml │ │ │ │ └── template.txt │ │ │ └── tooleval_gpt-3.5-turbo_normalized │ │ │ │ ├── config.yaml │ │ │ │ └── template.txt │ │ ├── evaluators_comparison.py │ │ ├── requirements.txt │ │ └── utils.py │ └── utils.py └── utils │ ├── __init__.py │ ├── embedding.py │ ├── retrieval.py │ └── utils.py ├── requirements.txt ├── scripts ├── convert_answer │ └── run_convert_answer.sh ├── eval_full_pipeline.sh ├── eval_opendomain_full_pipeline.sh ├── inference │ ├── inference_gpt_pipeline_virtual.sh │ ├── inference_opendomain_toolllama_pipeline_virtual.sh │ ├── inference_toolgen_pipeline_virtual.sh │ └── inference_toolllama_pipeline_virtual.sh ├── pass_rate │ └── run_pass_rate.sh ├── preference │ └── run_preference.sh └── retrieval │ ├── eval_bm25.sh │ ├── eval_encoder.sh │ 
├── eval_longcontext.sh │ ├── eval_openai_embedding.sh │ └── eval_toolgen.sh └── training ├── README.md ├── data ├── __init__.py ├── dataset.py ├── loading.py └── utils.py ├── models ├── __init__.py ├── causallm.py ├── loading.py └── utils.py ├── prompts ├── __init__.py ├── conversations.py ├── templates.py └── utils.py ├── scripts └── train_toolgen.sh ├── src ├── __init__.py ├── configs │ ├── ds_z2_config.json │ ├── ds_z3_config.json │ ├── ds_z3_offload_config.json │ ├── project_config.json │ └── virtual_tokens.txt ├── convert_deepspeed_to_huggingface.py └── zero_to_fp32.py ├── train.py └── utils ├── __init__.py ├── distributed.py ├── huggingface.py ├── logging.py └── setting.py /.gitignore: -------------------------------------------------------------------------------- 1 | *__pycache__* 2 | *.pyc 3 | *.ipynb_checkpoints 4 | *.ipynb 5 | data/ 6 | keys.json 7 | log_file.txt 8 | *tar.gz 9 | .vscode 10 | scripts/retrieval/efficiency 11 | evaluation/retrieval/efficiency 12 | training/wandb 13 | training/logs 14 | training/checkpoints 15 | !training/data -------------------------------------------------------------------------------- /OpenAgent/__init__.py: -------------------------------------------------------------------------------- 1 | from .agents import ToolGen 2 | from .tools import RapidAPIWrapper -------------------------------------------------------------------------------- /OpenAgent/agents/__init__.py: -------------------------------------------------------------------------------- 1 | from .toolgen.toolgen import ToolGen -------------------------------------------------------------------------------- /OpenAgent/agents/toolgen/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Reason-Wang/ToolGen/6839374a255810efe69deea4056eec5c55e25802/OpenAgent/agents/toolgen/__init__.py -------------------------------------------------------------------------------- /OpenAgent/agents/toolgen/inference.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | import torch 3 | from transformers import LogitsProcessor 4 | 5 | class DisjunctiveTrie: 6 | def __init__(self, nested_token_ids: List[List[int]], no_subsets=True): 7 | r""" 8 | A helper class that builds a trie with the words represented in `nested_token_ids`. 9 | """ 10 | self.max_height = max([len(one) for one in nested_token_ids]) 11 | 12 | root = {} 13 | for token_ids in nested_token_ids: 14 | level = root 15 | for tidx, token_id in enumerate(token_ids): 16 | if token_id not in level: 17 | level[token_id] = {} 18 | 19 | level = level[token_id] 20 | 21 | if no_subsets and self.has_subsets(root, nested_token_ids): 22 | raise ValueError( 23 | "Each list in `nested_token_ids` can't be a complete subset of another list, but is" 24 | f" {nested_token_ids}." 25 | ) 26 | 27 | self.trie = root 28 | 29 | def next_tokens(self, current_seq): 30 | """ 31 | The next possible tokens that will progress the trie, given the current sequence of tokens in `current_seq`. 
32 | """ 33 | start = self.trie 34 | 35 | for current_token in current_seq: 36 | start = start[current_token] 37 | 38 | next_tokens = list(start.keys()) 39 | 40 | return next_tokens 41 | 42 | def reached_leaf(self, current_seq): 43 | next_tokens = self.next_tokens(current_seq) 44 | 45 | return len(next_tokens) == 0 46 | 47 | def count_leaves(self, root): 48 | next_nodes = list(root.values()) 49 | if len(next_nodes) == 0: 50 | return 1 51 | else: 52 | return sum([self.count_leaves(nn) for nn in next_nodes]) 53 | 54 | def has_subsets(self, trie, nested_token_ids): 55 | """ 56 | Returns whether # of leaves == # of words. Otherwise some word is a subset of another. 57 | """ 58 | leaf_count = self.count_leaves(trie) 59 | return len(nested_token_ids) != leaf_count 60 | 61 | 62 | class AllowKeyWordsProcessor(LogitsProcessor): 63 | ''' renxi.wang@mbzuai.ac.ae 64 | A logits processor that limit output text to be in a set of predefined keywords. 65 | tokenizer: tokenizer used to encode the keywords 66 | trie: DisjunctiveTrie of predefined keywords 67 | input_ids: input_ids of the prompt that the model is generating from 68 | return: 69 | scores: scores of the logits, where impossible tokens are masked 70 | For beam search, scores are log-softmax of logits, others are logits 71 | ''' 72 | def __init__(self, tokenizer, trie, input_ids): 73 | self.tokenizer = tokenizer 74 | self.trie = trie 75 | self.input_ids = input_ids 76 | 77 | def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor): 78 | input_length = self.input_ids.shape[1] 79 | generated_ids = input_ids[:, input_length:].tolist() 80 | new_token_ids = [] 81 | for ids in generated_ids: 82 | try: 83 | next_token_ids = self.trie.next_tokens(ids) 84 | except KeyError as e: 85 | next_token_ids = [self.tokenizer.eos_token_id] 86 | if not next_token_ids: 87 | next_token_ids = [self.tokenizer.eos_token_id] 88 | new_token_ids.append(next_token_ids) 89 | 90 | for row, token_ids in enumerate(new_token_ids): 91 | mask = torch.ones_like(scores[row], dtype=torch.bool) 92 | mask[torch.tensor(token_ids)] = False 93 | scores[row, mask] = -1e10 94 | 95 | return scores 96 | 97 | 98 | class AllowTokenIdsProcessor(LogitsProcessor): 99 | def __init__(self, allowed_token_ids: List[int]): 100 | self.allowed_token_ids = allowed_token_ids 101 | 102 | def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor): 103 | mask = torch.ones_like(scores, dtype=torch.bool) 104 | mask[:, self.allowed_token_ids] = False 105 | scores = scores.masked_fill(mask, -1e10) 106 | 107 | return scores 108 | -------------------------------------------------------------------------------- /OpenAgent/agents/toolgen/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | 4 | import requests 5 | 6 | def standardize(string): 7 | res = re.compile("[^\\u4e00-\\u9fa5^a-z^A-Z^0-9^_]") 8 | string = res.sub("_", string) 9 | string = re.sub(r"(_)\1+","_", string).lower() 10 | while True: 11 | if len(string) == 0: 12 | return string 13 | if string[0] == "_": 14 | string = string[1:] 15 | else: 16 | break 17 | while True: 18 | if len(string) == 0: 19 | return string 20 | if string[-1] == "_": 21 | string = string[:-1] 22 | else: 23 | break 24 | if string[0].isdigit(): 25 | string = "get_" + string 26 | return string 27 | 28 | def change_name(name): 29 | change_list = ["from", "class", "return", "false", "true", "id", "and"] 30 | if name in change_list: 31 | name = "is_" + name 32 | return name 33 | 34 | 
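# Examples (behavior follows from the rules above):
#   standardize("Get User-Info!!") -> "get_user_info"   (runs of non-alphanumeric characters collapse into "_")
#   standardize("2fa_code")        -> "get_2fa_code"    (a leading digit triggers the "get_" prefix)
#   change_name("id")              -> "is_id"           (reserved-looking names are prefixed with "is_")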
def get_toolbench_name(tool_name, api_name): 35 | tool_name = standardize(tool_name) 36 | api_name = change_name(standardize(api_name)) 37 | toolbench_name = api_name+f"_for_{tool_name}" 38 | toolbench_name = toolbench_name[-64:] 39 | return toolbench_name 40 | 41 | 42 | def toolgen_request(endpoint_url, query, system_prompt=None): 43 | payload = { 44 | "query": query, 45 | "system_prompt": system_prompt 46 | } 47 | 48 | try: 49 | response = requests.post(endpoint_url, json=payload, stream=True) # Enable streaming 50 | response.raise_for_status() # Raise an error for HTTP errors 51 | for line in response.iter_lines(decode_unicode=True): 52 | if line: # Filter out keep-alive new lines 53 | yield json.loads(line) # Parse each line as JSON 54 | except requests.exceptions.RequestException as e: 55 | print(f"Error calling ToolGen model: {e}") 56 | yield {"error": str(e)} -------------------------------------------------------------------------------- /OpenAgent/agents/tree/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Reason-Wang/ToolGen/6839374a255810efe69deea4056eec5c55e25802/OpenAgent/agents/tree/__init__.py -------------------------------------------------------------------------------- /OpenAgent/agents/tree/tree.py: -------------------------------------------------------------------------------- 1 | from termcolor import colored 2 | import numpy as np 3 | from copy import deepcopy 4 | import math 5 | 6 | class Tree: 7 | def __init__(self): 8 | self.root = TreeNode() 9 | self.now_deal_node = self.root 10 | 11 | 12 | def to_json_recursive(self,use_messages=False): 13 | tree_structure = self.root.to_json_recursive(use_messages=use_messages) 14 | js_obj = { 15 | "size": self.root.get_size(), 16 | "max_length":self.root.get_max_depth(), 17 | "tree": tree_structure, 18 | } 19 | return js_obj 20 | 21 | 22 | class TreeNode: 23 | 24 | def __init__(self): 25 | self.is_terminal = False 26 | self.pruned = False 27 | # self.finished = False 28 | self.node_type = None 29 | self.description = "" 30 | self.observation = "" 31 | self.observation_code = None 32 | self.father = None 33 | self.children = [] 34 | # self.io_state = None 35 | 36 | # openai-messages of this node 37 | self.messages = [] 38 | 39 | 40 | def get_depth(self): 41 | if self.father == None: 42 | return 0 43 | return self.father.get_depth() + 1 44 | 45 | def print(self,process_id = 0): 46 | if process_id != 0: 47 | return 48 | color_converter = {"Thought":"red", "Action": "blue", "Action Input": "cyan","Final Answer": "green","Reflection":"blue"} 49 | print(colored(f"{self.node_type}: {self.description}",color = color_converter[self.node_type])) 50 | if self.observation != "": 51 | if len(self.observation) < 1536: 52 | print(colored(f"Observation: {self.observation}",color="yellow")) 53 | else: 54 | print(colored(f"Observation: {self.observation[:1536]}......(len={len(self.observation)})",color="yellow")) 55 | 56 | 57 | def to_json_recursive(self,use_messages=False): 58 | js_obj = self.to_json(use_messages=use_messages) 59 | js_obj["children"] = [] 60 | for child in self.children: 61 | js_obj["children"].append(child.to_json_recursive()) 62 | return js_obj 63 | 64 | 65 | def get_chain_result_from_this_node(self,use_messages=False): 66 | ''' 67 | Returns chained results, starting from this node up to the root node 68 | ''' 69 | now_node = self 70 | result = [] 71 | while now_node.father != None: 72 | result = [now_node.to_json(use_messages=use_messages)] + 
result 73 | now_node = now_node.father 74 | return result 75 | 76 | def to_json(self, use_messages=False): 77 | 78 | json_obj = {} 79 | json_obj["is_terminal"] = self.is_terminal 80 | json_obj["pruned"] = self.pruned 81 | 82 | json_obj["depth"] = self.get_depth() 83 | json_obj["node_type"] = self.node_type 84 | json_obj["description"] = self.description 85 | if self.observation != "": 86 | json_obj["observation"] = self.observation 87 | if self.observation_code != None: 88 | json_obj["observation_code"] = self.observation_code 89 | json_obj["child_count"] = len(self.children) 90 | 91 | # if self.io_state != None and self.node_type == "Action Input": 92 | # json_obj["io_state"] = self.io_state.to_json() 93 | 94 | 95 | if use_messages: 96 | json_obj["messages"] = [] 97 | for message in self.messages: 98 | if not ("valid" in message.keys() and message["valid"] == False): 99 | json_obj["messages"].append(message["role"]) 100 | else: 101 | json_obj["messages"].append(message["role"] + "_invalid") 102 | 103 | return json_obj -------------------------------------------------------------------------------- /OpenAgent/tools/__init__.py: -------------------------------------------------------------------------------- 1 | from .src.rapidapi.rapidapi import RapidAPIWrapper -------------------------------------------------------------------------------- /OpenAgent/tools/base.py: -------------------------------------------------------------------------------- 1 | import json 2 | from termcolor import colored 3 | 4 | class BaseTool(): 5 | def __init__(self, tools, tools_map, max_observation_length=1024): 6 | self.tools = {} 7 | for tool in tools: 8 | self.tools[tool['function']['name']] = tool 9 | # self.tools = tools 10 | self.tools_map = tools_map 11 | self.max_observation_length = max_observation_length 12 | self.success = 0 13 | 14 | def call(self, action_name, action_input): 15 | # print(f"Calling {action_name} with input: {action_input}") 16 | if action_name in self.tools: 17 | obs, code = self._call(action_name, action_input) 18 | if len(obs) > self.max_observation_length: 19 | obs = obs[:self.max_observation_length] + "..." 20 | return obs, code 21 | else: 22 | return {"error": f"No such tool name: {action_name}"}, 0 23 | 24 | def check_success(self): 25 | return self.success 26 | 27 | def _call(self, action_name, action_input): 28 | """Need to return an observation string and a status code: 29 | 0 means normal response 30 | 1 means there is no corresponding api name 31 | 2 means there is an error in the input 32 | 3 represents the end of the generation and the final answer appears 33 | 4 means that the model decides to prune by itself 34 | 5 represents api call timeout 35 | 6 for 404 36 | 7 means not subscribed 37 | 8 represents unauthorized 38 | 9 represents too many requests 39 | 10 stands for rate limit 40 | 11 message contains "error" field 41 | 12 error sending request 42 | """ 43 | json_data = json.loads(action_input) 44 | if action_name in self.tools: 45 | function = self.tools_map[action_name] 46 | # print(function) 47 | print(colored(f"Querying: {action_name}", color="yellow")) 48 | 49 | response = function(**json_data) 50 | else: 51 | response = { 52 | "error": "invalid hallucination of function name."
53 | } 54 | status_code = 0 55 | return json.dumps(response), status_code 56 | 57 | if isinstance(response, dict) and "status_code" in response: 58 | status_code = response['status_code'] 59 | del response['status_code'] 60 | # whether generated the final answer 61 | if status_code == 3: 62 | self.success = 1 63 | else: 64 | status_code = 0 65 | 66 | return json.dumps(response), status_code 67 | 68 | def to_json(self): 69 | return {} 70 | -------------------------------------------------------------------------------- /OpenAgent/tools/retrieval/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /OpenAgent/tools/retrieval/embeddings.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | def get_embeddings(model, device, texts, batch_size=16): 5 | model.eval() 6 | model.to(device) 7 | # tbar = tqdm(dataloader) 8 | embeddings = [] 9 | with torch.no_grad(): 10 | for i in range(0, len(texts), batch_size): 11 | batch = texts[i:i + batch_size] 12 | embeddings.append(model.encode(batch, device=device)) 13 | return np.concatenate(embeddings) -------------------------------------------------------------------------------- /OpenAgent/tools/retrieval/indexers.py: -------------------------------------------------------------------------------- 1 | import faiss 2 | import numpy as np 3 | 4 | class Indexer: 5 | def __init__(self, embeddings, vector_size, ids=None, similarity="cosine"): 6 | self.index = faiss.IndexFlatIP(vector_size) 7 | self.similarity = similarity 8 | if similarity == "cosine": 9 | embeddings /= np.linalg.norm(embeddings, axis=1)[:, None] 10 | self.index.add(embeddings) 11 | if ids is None: 12 | self.ids = list(range(embeddings.shape[0])) 13 | else: 14 | self.ids = ids 15 | 16 | 17 | def add(self, embeddings, ids=None): 18 | if len(embeddings.shape) == 1: 19 | embeddings = embeddings.reshape(1, -1) 20 | if self.similarity == "cosine": 21 | embeddings /= np.linalg.norm(embeddings, axis=1)[:, None] 22 | self.index.add(embeddings) 23 | if ids is None: 24 | self.ids.extend(list(range(self.ids[-1] + 1, self.ids[-1] + 1 + embeddings.shape[0]))) 25 | else: 26 | self.ids.extend(ids) 27 | 28 | def search(self, queries: np.array, top_n: int): 29 | if len(queries.shape) == 1: 30 | queries = queries.reshape(1, -1) 31 | try: 32 | if self.similarity == "cosine": 33 | queries /= np.linalg.norm(queries, axis=1)[:, None] 34 | scores, indexes = self.index.search(queries, top_n) 35 | except AttributeError: 36 | print(queries) 37 | raise 38 | scores_ids = [] 39 | for top_n_score, top_n_idx in zip(scores, indexes): 40 | top_n_score_id = [] 41 | for s, i in zip(top_n_score, top_n_idx): 42 | top_n_score_id.append((s, self.ids[i])) 43 | scores_ids.append(top_n_score_id) 44 | 45 | return scores_ids -------------------------------------------------------------------------------- /OpenAgent/tools/src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Reason-Wang/ToolGen/6839374a255810efe69deea4056eec5c55e25802/OpenAgent/tools/src/__init__.py -------------------------------------------------------------------------------- /OpenAgent/tools/src/basic_tools.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import json 4 | 5 | 6 | BasicTools = [ 7 | # Finish function 8 | { 9 | "type": "function", 10 | 
"function": { 11 | "name": "Finish", 12 | "description": "If you believe that you have obtained a result that can answer the task, please call this function to provide the final answer. Alternatively, if you recognize that you are unable to proceed with the task in the current state, call this function to restart. Remember: you must ALWAYS call this function at the end of your attempt, and the only part that will be shown to the user is the final answer, so it should contain sufficient information.", 13 | "parameters": { 14 | "type": "object", 15 | "properties": { 16 | "return_type": { 17 | "type": "string", 18 | "enum": ["give_answer","give_up_and_restart"], 19 | }, 20 | "final_answer": { 21 | "type": "string", 22 | "description": "The final answer you want to give the user. You should have this field if \"return_type\"==\"give_answer\"", 23 | } 24 | }, 25 | "required": ["return_type"], 26 | }, 27 | } 28 | } 29 | ] 30 | 31 | 32 | TestTools = [ 33 | { 34 | "type": "function", 35 | "function": { 36 | "name": "get_current_temperature", 37 | "description": "Get the current temperature for a specific location", 38 | "parameters": { 39 | "type": "object", 40 | "properties": { 41 | "location": { 42 | "type": "string", 43 | "description": "The city and state, e.g., San Francisco, CA" 44 | }, 45 | "unit": { 46 | "type": "string", 47 | "enum": ["Celsius", "Fahrenheit"], 48 | "description": "The temperature unit to use. Infer this from the user's location." 49 | } 50 | }, 51 | "required": ["location", "unit"] 52 | } 53 | } 54 | }, 55 | { 56 | "type": "function", 57 | "function": { 58 | "name": "get_rain_probability", 59 | "description": "Get the probability of rain for a specific location", 60 | "parameters": { 61 | "type": "object", 62 | "properties": { 63 | "location": { 64 | "type": "string", 65 | "description": "The city and state, e.g., San Francisco, CA" 66 | } 67 | }, 68 | "required": ["location"] 69 | } 70 | } 71 | }, 72 | { 73 | "type": "function", 74 | "function": { 75 | "name": "Finish", 76 | "description": "If you believe that you have obtained a result that can answer the task, please call this function to provide the final answer. Alternatively, if you recognize that you are unable to proceed with the task in the current state, call this function to restart. Remember: you must ALWAYS call this function at the end of your attempt, and the only part that will be shown to the user is the final answer, so it should contain sufficient information.", 77 | "parameters": { 78 | "type": "object", 79 | "properties": { 80 | "return_type": { 81 | "type": "string", 82 | "enum": ["give_answer","give_up_and_restart"], 83 | }, 84 | "final_answer": { 85 | "type": "string", 86 | "description": "The final answer you want to give the user. 
You should have this field if \"return_type\"==\"give_answer\"", 87 | } 88 | }, 89 | "required": ["return_type"], 90 | }, 91 | } 92 | } 93 | ] 94 | 95 | def finish(return_type=None, final_answer=None): 96 | 97 | if return_type is None: 98 | response = {"error": "must have \"return_type\""} 99 | status = 2 100 | if return_type == "give_up_and_restart": 101 | response = {"response": "chose to give up and restart"} 102 | status = 4 103 | elif return_type == "give_answer": 104 | if final_answer is None: 105 | response = {"error": "must have \"final_answer\""} 106 | status = 2 107 | else: 108 | response = {"response": "successfully giving the final answer."} 109 | status = 3 110 | else: 111 | response = {"error": "\"return_type\" is not a valid choice\""} 112 | status = 2 113 | 114 | response['status_code'] = status 115 | return response 116 | 117 | 118 | def get_temperature(location, unit): 119 | return 75 120 | 121 | def get_rain_probability(location): 122 | return 0.2 123 | 124 | TestToolsMap = { 125 | "get_current_temperature": get_temperature, 126 | "get_rain_probability": get_rain_probability, 127 | "Finish": finish 128 | } 129 | -------------------------------------------------------------------------------- /OpenAgent/tools/src/rapidapi/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Reason-Wang/ToolGen/6839374a255810efe69deea4056eec5c55e25802/OpenAgent/tools/src/rapidapi/__init__.py -------------------------------------------------------------------------------- /OpenAgent/tools/src/rapidapi/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | 4 | def standardize(string): 5 | res = re.compile("[^\\u4e00-\\u9fa5^a-z^A-Z^0-9^_]") 6 | string = res.sub("_", string) 7 | string = re.sub(r"(_)\1+","_", string).lower() 8 | while True: 9 | if len(string) == 0: 10 | return string 11 | if string[0] == "_": 12 | string = string[1:] 13 | else: 14 | break 15 | while True: 16 | if len(string) == 0: 17 | return string 18 | if string[-1] == "_": 19 | string = string[:-1] 20 | else: 21 | break 22 | if string[0].isdigit(): 23 | string = "get_" + string 24 | return string 25 | 26 | def change_name(name): 27 | change_list = ["from", "class", "return", "false", "true", "id", "and"] 28 | if name in change_list: 29 | name = "is_" + name 30 | return name 31 | 32 | 33 | def finish(action_input): 34 | try: 35 | json_data = json.loads(action_input, strict=False) 36 | except: 37 | json_data = {} 38 | if '"return_type": "' in action_input: 39 | if '"return_type": "give_answer"' in action_input: 40 | return_type = "give_answer" 41 | elif '"return_type": "give_up_and_restart"' in action_input: 42 | return_type = "give_up_and_restart" 43 | else: 44 | return_type = action_input[action_input.find('"return_type": "')+len('"return_type": "'):action_input.find('",')] 45 | json_data["return_type"] = return_type 46 | if '"final_answer": "' in action_input: 47 | final_answer = action_input[action_input.find('"final_answer": "')+len('"final_answer": "'):] 48 | json_data["final_answer"] = final_answer 49 | if "return_type" not in json_data.keys(): 50 | return "{error:\"must have \"return_type\"\"}", 2 51 | if json_data["return_type"] == "give_up_and_restart": 52 | return "{\"response\":\"chose to give up and restart\"}",4 53 | elif json_data["return_type"] == "give_answer": 54 | if "final_answer" not in json_data.keys(): 55 | return "{error:\"must have \"final_answer\"\"}", 2 56 
| 57 | return "{\"response\":\"successfully giving the final answer.\"}", 3 58 | else: 59 | return "{error:\"\"return_type\" is not a valid choice\"}", 2 60 | 61 | -------------------------------------------------------------------------------- /assets/banner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Reason-Wang/ToolGen/6839374a255810efe69deea4056eec5c55e25802/assets/banner.png -------------------------------------------------------------------------------- /evaluation/retrieval/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Reason-Wang/ToolGen/6839374a255810efe69deea4056eec5c55e25802/evaluation/retrieval/__init__.py -------------------------------------------------------------------------------- /evaluation/retrieval/metrics.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | from tqdm import trange 4 | import sklearn 5 | from utils.retrieval import Indexer 6 | from utils.embedding import get_embeddings 7 | 8 | 9 | def ndcg_score(model, tokenizer, queries, corpus, relevant_docs, batch_size=32, corpus_chunk_size=32): 10 | query_embeddings = get_embeddings( 11 | model, 12 | tokenizer, 13 | device="cuda", 14 | texts=queries, 15 | ) 16 | doc_embeddings = get_embeddings( 17 | model, 18 | tokenizer, 19 | device="cuda", 20 | texts=list(corpus.values()), 21 | ) 22 | corpus_ids = list(corpus.keys()) 23 | indexer = Indexer(doc_embeddings, doc_embeddings.shape[1], ids=corpus_ids) 24 | scores_docids = indexer.search(query_embeddings, 5) 25 | queries_ids = list(relevant_docs.keys())  # assumes relevant_docs is keyed by query id, in the same order as queries 26 | ndcg_scores = compute_ndcg_scores(queries_ids, scores_docids, relevant_docs, corpus_ids) 27 | 28 | return ndcg_scores 29 | 30 | def compute_ndcg(relevant_docs_ids, score_docid, corpus_ids, k): 31 | # Build the ground truth relevance scores and the model's predicted scores 32 | length = len(corpus_ids) 33 | true_relevance = np.zeros(length) 34 | predicted_scores = np.zeros(length) 35 | top_hits = score_docid 36 | for hit in top_hits: 37 | predicted_scores[corpus_ids.index(hit[1])] = hit[0] 38 | if hit[1] in relevant_docs_ids: 39 | true_relevance[corpus_ids.index(hit[1])] = 1 40 | 41 | return sklearn.metrics.ndcg_score([true_relevance], [predicted_scores], k=k) 42 | 43 | 44 | def compute_ndcg_scores(queries_ids, scores_docids, relevant_docs, corpus_ids): 45 | ndcg_scores = [] 46 | for query_id, scores_docid in zip(queries_ids, scores_docids): 47 | relevant_docs_ids = relevant_docs[query_id] 48 | ndcg_score = compute_ndcg(relevant_docs_ids, scores_docid, corpus_ids, k=5) 49 | ndcg_scores.append(ndcg_score) 50 | return np.mean(ndcg_scores) -------------------------------------------------------------------------------- /evaluation/toolbench/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Reason-Wang/ToolGen/6839374a255810efe69deea4056eec5c55e25802/evaluation/toolbench/__init__.py -------------------------------------------------------------------------------- /evaluation/toolbench/inference/Algorithms/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Reason-Wang/ToolGen/6839374a255810efe69deea4056eec5c55e25802/evaluation/toolbench/inference/Algorithms/__init__.py -------------------------------------------------------------------------------- /evaluation/toolbench/inference/Algorithms/base_search.py: 
-------------------------------------------------------------------------------- 1 | from Downstream_tasks.base_env import base_env 2 | 3 | class base_search_method: 4 | """For the base tree-search method, you need to support the following functions.""" 5 | 6 | def __init__(self,llm,io_func: base_env, process_id=0, callbacks = None): 7 | """Args: 8 | llm: The interface of the LLM 9 | io_func (base_env): Interface to the environment. 10 | process_id (int, optional): The process id used in multiprocessing annotation. Defaults to 0. 11 | callbacks (_type_, optional): _description_. Defaults to None. 12 | """ 13 | pass 14 | 15 | def to_json(self,answer=False,process=True): 16 | ''' 17 | Return a json object. 18 | If "answer" is True, it must contain the following field for answer annotation. 19 | If "process" is True, you need to provide the full information of the tree-search process. 20 | 21 | "answer_generation": { 22 | "valid_data": bool, 23 | "final_answer": string, 24 | "finish_type": enum["give_up","give_answer"] 25 | "train_messages": [ [openAI-message] ], 26 | } 27 | ''' 28 | raise NotImplementedError 29 | 30 | def start(self, **args): 31 | """This is the entry point of the search process""" 32 | raise NotImplementedError 33 | 34 | -------------------------------------------------------------------------------- /evaluation/toolbench/inference/Downstream_tasks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Reason-Wang/ToolGen/6839374a255810efe69deea4056eec5c55e25802/evaluation/toolbench/inference/Downstream_tasks/__init__.py -------------------------------------------------------------------------------- /evaluation/toolbench/inference/Downstream_tasks/base_env.py: -------------------------------------------------------------------------------- 1 | class base_env: 2 | 3 | def __init__(self): 4 | self.task_description = "" 5 | self.input_description = "" 6 | self.tool_names = [] 7 | self.functions = [] 8 | 9 | def restart(self): 10 | ''' 11 | Restart the environment 12 | ''' 13 | raise NotImplementedError 14 | 15 | def get_score(self): 16 | ''' 17 | Get the value of the current state. 18 | A fake function used for searching in oracle mode; it is not actually used (and is impossible to obtain) 19 | ''' 20 | raise NotImplementedError 21 | 22 | def step(self, action, input_str): 23 | ''' 24 | Perform an interaction in natural language mode. 25 | Returns (output str, status code) 26 | ''' 27 | raise NotImplementedError 28 | 29 | def check_success(self): 30 | ''' 31 | Returns 1 if successful, otherwise returns 0 32 | ''' 33 | raise NotImplementedError 34 | 35 | def to_json(self): 36 | raise NotImplementedError -------------------------------------------------------------------------------- /evaluation/toolbench/inference/LLM/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Reason-Wang/ToolGen/6839374a255810efe69deea4056eec5c55e25802/evaluation/toolbench/inference/LLM/__init__.py -------------------------------------------------------------------------------- /evaluation/toolbench/inference/LLM/base_io.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | def base_io(input_str): 4 | pass -------------------------------------------------------------------------------- /evaluation/toolbench/inference/LLM/davinci_model.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | from typing import Optional, List, Mapping, Any 4 | from termcolor import colored 5 | import json 6 | import random 7 | import openai 8 | from typing import Optional 9 | from evaluation.toolbench.model.model_adapter import get_conversation_template 10 | from evaluation.toolbench.inference.utils import SimpleChatIO, react_parser 11 | from evaluation.toolbench.inference.Prompts.ReAct_prompts import FORMAT_INSTRUCTIONS_SYSTEM_FUNCTION_ZEROSHOT 12 | 13 | 14 | class Davinci: 15 | def __init__(self, model="text-davinci-003", openai_key="") -> None: 16 | super().__init__() 17 | self.model = model 18 | self.openai_key = openai_key 19 | self.chatio = SimpleChatIO() 20 | 21 | def prediction(self, prompt: str, stop: Optional[List[str]] = None) -> str: 22 | max_try = 10 23 | while True: 24 | openai.api_key = self.openai_key 25 | try: 26 | response = openai.Completion.create( 27 | engine=self.model, 28 | prompt=prompt, 29 | temperature=0.5, 30 | max_tokens=512, 31 | top_p=1, 32 | frequency_penalty=0, 33 | presence_penalty=0, 34 | stop="End Action" 35 | ) 36 | result = response['choices'][0]['text'].strip() 37 | break 38 | except Exception as e: 39 | print(e) 40 | max_try -= 1 41 | if max_try < 0: 42 | result = "Exceed max retry times. Please check your davinci api calling." 43 | break 44 | return result, response["usage"] 45 | 46 | def add_message(self, message): 47 | self.conversation_history.append(message) 48 | 49 | def change_messages(self,messages): 50 | self.conversation_history = messages 51 | 52 | def display_conversation(self, detailed=False): 53 | role_to_color = { 54 | "system": "red", 55 | "user": "green", 56 | "assistant": "blue", 57 | "function": "magenta", 58 | } 59 | print("before_print"+"*"*50) 60 | for message in self.conversation_history: 61 | print_obj = f"{message['role']}: {message['content']} " 62 | if "function_call" in message.keys(): 63 | print_obj = print_obj + f"function_call: {message['function_call']}" 64 | print_obj += "" 65 | print( 66 | colored( 67 | print_obj, 68 | role_to_color[message["role"]], 69 | ) 70 | ) 71 | print("end_print"+"*"*50) 72 | 73 | def parse(self,functions,process_id,**args): 74 | conv = get_conversation_template("tool-llama-single-round") 75 | roles = {"system": conv.roles[0], "user": conv.roles[1], "function": conv.roles[2], "assistant": conv.roles[3]} 76 | conversation_history = self.conversation_history 77 | question = '' 78 | for message in conversation_history: 79 | role = roles[message['role']] 80 | content = message['content'] 81 | if role == "User": 82 | question = content 83 | break 84 | func_str = "" 85 | func_list = [] 86 | for function_dict in functions: 87 | param_str = "" 88 | api_name = function_dict["name"] 89 | func_list.append(api_name) 90 | if "Finish" in api_name: 91 | param_str = f'"return_type": string, "final_answer": string, ' 92 | api_desc = "If you believe that you have obtained a result that can answer the task, please call this function to provide the final answer. ALWAYS call this function at the end of your attempt to answer the question finally." 93 | func_str += f"{api_name}: {api_desc}. Your input should be a json (args json schema): {param_str} The Action to trigger this API should be {api_name} and the input parameters should be a json dict string. 
Pay attention to the type of parameters.\n\n" 94 | else: 95 | api_desc = function_dict["description"][function_dict["description"].find("The description of this function is: ")+len("The description of this function is: "):] 96 | for param_name in function_dict["parameters"]["properties"]: 97 | data_type = function_dict["parameters"]["properties"][param_name]["type"] 98 | param_str += f'"{param_name}": {data_type}, ' 99 | param_str = "{{" + param_str + "}}" 100 | func_str += f"{api_name}: {api_desc}. Your input should be a json (args json schema): {param_str} The Action to trigger this API should be {api_name} and the input parameters should be a json dict string. Pay attention to the type of parameters.\n\n" 101 | func_list = str(func_list) 102 | prompt = FORMAT_INSTRUCTIONS_SYSTEM_FUNCTION_ZEROSHOT.replace("{func_str}", func_str).replace("{func_list}", func_list).replace("{func_list}", func_list).replace("{question}", question) 103 | prompt = prompt.replace("{{", "{").replace("}}", "}") 104 | for message in conversation_history: 105 | role = roles[message['role']] 106 | content = message['content'] 107 | if role == "Assistant": 108 | prompt += f"\n{content}\n" 109 | elif role == "Function": 110 | prompt += f"Observation: {content}\n" 111 | if functions != []: 112 | predictions, usage = self.prediction(prompt) 113 | else: 114 | predictions, usage = self.prediction(prompt) 115 | 116 | # react format prediction 117 | thought, action, action_input = react_parser(predictions) 118 | message = { 119 | "role": "assistant", 120 | "content": thought, 121 | "function_call": { 122 | "name": action, 123 | "arguments": action_input 124 | } 125 | } 126 | return message, 0, usage["total_tokens"] 127 | 128 | 129 | if __name__ == "__main__": 130 | llm = Davinci() 131 | result = llm.prediction("How old are you?") 132 | print(result) -------------------------------------------------------------------------------- /evaluation/toolbench/inference/LLM/llama_model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | from typing import Optional, List, Mapping, Any 4 | from transformers import AutoTokenizer, AutoModelForCausalLM 5 | from termcolor import colored 6 | import time 7 | from typing import Optional 8 | from transformers import ( 9 | AutoTokenizer, 10 | AutoModelForCausalLM 11 | ) 12 | from toolbench.utils import process_system_message 13 | from toolbench.model.model_adapter import get_conversation_template 14 | from toolbench.inference.utils import SimpleChatIO, generate_stream, react_parser 15 | 16 | 17 | class LlamaModel: 18 | def __init__(self, model_name_or_path: str, template:str="tool-llama-single-round", device: str="cuda", cpu_offloading: bool=False, max_sequence_length: int=2048) -> None: 19 | super().__init__() 20 | self.model_name = model_name_or_path 21 | self.template = template 22 | self.max_sequence_length = max_sequence_length 23 | self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=False, model_max_length=self.max_sequence_length) 24 | self.model = AutoModelForCausalLM.from_pretrained( 25 | model_name_or_path, low_cpu_mem_usage=True 26 | ) 27 | if self.tokenizer.pad_token_id == None: 28 | self.tokenizer.add_special_tokens({"bos_token": "", "eos_token": "", "pad_token": ""}) 29 | self.model.resize_token_embeddings(len(self.tokenizer)) 30 | self.use_gpu = (True if device == "cuda" else False) 31 | if (device == "cuda" and not cpu_offloading) or device == "mps": 32 | 
self.model.to(device) 33 | self.chatio = SimpleChatIO() 34 | 35 | def prediction(self, prompt: str, stop: Optional[List[str]] = None) -> str: 36 | gen_params = { 37 | "model": "", 38 | "prompt": prompt, 39 | "temperature": 0.5, 40 | "max_new_tokens": 512, 41 | "stop": "", 42 | "stop_token_ids": None, 43 | "echo": False 44 | } 45 | generate_stream_func = generate_stream 46 | output_stream = generate_stream_func(self.model, self.tokenizer, gen_params, "cuda", self.max_sequence_length, force_generate=True) 47 | outputs = self.chatio.return_output(output_stream) 48 | prediction = outputs.strip() 49 | return prediction 50 | 51 | def add_message(self, message): 52 | self.conversation_history.append(message) 53 | 54 | def change_messages(self,messages): 55 | self.conversation_history = messages 56 | 57 | def display_conversation(self, detailed=False): 58 | role_to_color = { 59 | "system": "red", 60 | "user": "green", 61 | "assistant": "blue", 62 | "function": "magenta", 63 | } 64 | print("before_print"+"*"*50) 65 | for message in self.conversation_history: 66 | print_obj = f"{message['role']}: {message['content']} " 67 | if "function_call" in message.keys(): 68 | print_obj = print_obj + f"function_call: {message['function_call']}" 69 | print_obj += "" 70 | print( 71 | colored( 72 | print_obj, 73 | role_to_color[message["role"]], 74 | ) 75 | ) 76 | print("end_print"+"*"*50) 77 | 78 | def parse(self,functions,process_id,**args): 79 | conv = get_conversation_template(self.template) 80 | if self.template == "tool-llama": 81 | roles = {"human": conv.roles[0], "gpt": conv.roles[1]} 82 | elif self.template == "tool-llama-single-round" or self.template == "tool-llama-multi-rounds": 83 | roles = {"system": conv.roles[0], "user": conv.roles[1], "function": conv.roles[2], "assistant": conv.roles[3]} 84 | 85 | self.time = time.time() 86 | conversation_history = self.conversation_history 87 | prompt = '' 88 | for message in conversation_history: 89 | role = roles[message['role']] 90 | content = message['content'] 91 | if role == "System" and functions != []: 92 | content = process_system_message(content, functions) 93 | prompt += f"{role}: {content}\n" 94 | prompt += "Assistant:\n" 95 | if functions != []: 96 | predictions = self.prediction(prompt) 97 | else: 98 | predictions = self.prediction(prompt) 99 | 100 | decoded_token_len = len(self.tokenizer(predictions)) 101 | if process_id == 0: 102 | print(f"[process({process_id})]total tokens: {decoded_token_len}") 103 | 104 | thought, action, action_input = react_parser(predictions) 105 | if len(thought.strip()) > 1: 106 | print(thought) 107 | # input() 108 | message = { 109 | "role": "assistant", 110 | "content": thought, 111 | "function_call": { 112 | "name": action, 113 | "arguments": action_input 114 | } 115 | } 116 | return message, 0, decoded_token_len 117 | 118 | 119 | if __name__ == "__main__": 120 | # can accept all huggingface LlamaModel family 121 | llm = LlamaModel("decapoda-research/llama-7b-hf") 122 | messages = [ 123 | {'role': 'system', 'content': '''You are AutoGPT, you can use many tools(functions) to do 124 | the following task.\nFirst I will give you the task description, and your task start.\nAt each step, you need to give your thought to analyze the status now and what to do next, with a function call to actually excute your step.\nAfter the call, you will get the call result, and you are now in a new state.\nThen you will analyze your status now, then decide what to do next...\nAfter many (Thought-call) pairs, you finally perform the task, 
then you can give your finial answer.\nRemember: \n1.the state change is , you can\'t go 125 | back to the former state, if you want to restart the task, say "I give up and restart".\n2.All the thought is short, at most in 5 sentence.\nLet\'s Begin!\nTask description: Use numbers and basic arithmetic operations (+ - * /) to obtain exactly one number=24. Each 126 | step, you are only allowed to choose two of the left numbers to obtain a new number. For example, you can combine [3,13,9,7] as 7*9 - 3*13 = 24.\nRemember:\n1.all of the number must be used , and must be used ONCE. So Only when left numbers is exact 24, you will win. So you don\'t succeed when left number = [24, 5]. You succeed when left number = [24]. \n2.all the try takes exactly 3 steps, look 127 | at the input format'''}, 128 | {'role': 'user', 'content': '\nThe real task input is: [1, 2, 4, 7]\nBegin!\n'} 129 | ] 130 | functions = [{'name': 'play_24', 'description': '''make your current conbine with the format "x operation y = z (left: aaa) " like "1+2=3, (left: 3 5 7)", then I will tell you whether you win. This is the ONLY way 131 | to interact with the game, and the total process of a input use 3 steps of call, each step you can only combine 2 of the left numbers, so the count of left numbers decrease from 4 to 1''','parameters':{'type': 'object', 'properties':{}}}]#, 'parameters': {'type': 'object', 'properties': {'input': {'type': 'string', 'description': 'describe what number you want to conbine, and how to conbine.'}}, 'required': ['input']}}] 132 | 133 | llm.change_messages(messages) 134 | output = llm.parse(functions=functions) 135 | print(output) -------------------------------------------------------------------------------- /evaluation/toolbench/inference/LLM/retriever.py: -------------------------------------------------------------------------------- 1 | import time 2 | import pandas as pd 3 | from sentence_transformers import SentenceTransformer, util 4 | import json 5 | import re 6 | from evaluation.toolbench.utils import standardize, standardize_category, change_name, process_retrieval_ducoment 7 | 8 | 9 | class ToolRetriever: 10 | def __init__(self, corpus_tsv_path = "", model_path=""): 11 | self.corpus_tsv_path = corpus_tsv_path 12 | self.model_path = model_path 13 | self.corpus, self.corpus2tool = self.build_retrieval_corpus() 14 | self.embedder = self.build_retrieval_embedder() 15 | self.corpus_embeddings = self.build_corpus_embeddings() 16 | 17 | def build_retrieval_corpus(self): 18 | print("Building corpus...") 19 | documents_df = pd.read_csv(self.corpus_tsv_path, sep='\t') 20 | corpus, corpus2tool = process_retrieval_ducoment(documents_df) 21 | corpus_ids = list(corpus.keys()) 22 | corpus = [corpus[cid] for cid in corpus_ids] 23 | return corpus, corpus2tool 24 | 25 | def build_retrieval_embedder(self): 26 | print("Building embedder...") 27 | embedder = SentenceTransformer(self.model_path) 28 | return embedder 29 | 30 | def build_corpus_embeddings(self): 31 | print("Building corpus embeddings with embedder...") 32 | corpus_embeddings = self.embedder.encode(self.corpus, convert_to_tensor=True) 33 | return corpus_embeddings 34 | 35 | def retrieving(self, query, top_k=5, excluded_tools={}): 36 | print("Retrieving...") 37 | start = time.time() 38 | query_embedding = self.embedder.encode(query, convert_to_tensor=True) 39 | hits = util.semantic_search(query_embedding, self.corpus_embeddings, top_k=10*top_k, score_function=util.cos_sim) 40 | retrieved_tools = [] 41 | for rank, hit in enumerate(hits[0]): 42 | 
try: 43 | category, tool_name, api_name = self.corpus2tool[self.corpus[hit['corpus_id']]].split('\t') 44 | except ValueError as e: 45 | print(len(self.corpus2tool[self.corpus[hit['corpus_id']]])) 46 | print(self.corpus2tool[self.corpus[hit['corpus_id']]][0]) 47 | print(self.corpus2tool[self.corpus[hit['corpus_id']]][1]) 48 | 49 | category = standardize_category(category) 50 | tool_name = standardize(tool_name) # standardizing 51 | api_name = change_name(standardize(api_name)) # standardizing 52 | if category in excluded_tools: 53 | if tool_name in excluded_tools[category]: 54 | top_k += 1 55 | continue 56 | tmp_dict = { 57 | "category": category, 58 | "tool_name": tool_name, 59 | "api_name": api_name 60 | } 61 | retrieved_tools.append(tmp_dict) 62 | return retrieved_tools -------------------------------------------------------------------------------- /evaluation/toolbench/inference/LLM_rank/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Reason-Wang/ToolGen/6839374a255810efe69deea4056eec5c55e25802/evaluation/toolbench/inference/LLM_rank/__init__.py -------------------------------------------------------------------------------- /evaluation/toolbench/inference/LLM_rank/rank_candidate.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Evaluate the score of a query corresponding to different candidates 3 | ''' 4 | 5 | from Prompts.rank_prompts import LLM_PAIRWISE_RANK_SUBFIX_SYSTEM_PROMPT, LLM_PAIRWISE_RANK_USER_PROMPT 6 | import random 7 | from Tree.Tree import tree_node 8 | 9 | 10 | def rank2symmetry(llm_interface, LLM_rank_args, cand1,cand2): 11 | ''' 12 | Use llm to compare the height, due to the sequence, you need to compare each of the two in the front 13 | ''' 14 | single_rank_func = LLM_rank_args["rank_func"] 15 | score = [0,0] 16 | bigger1,query_count1, total_tokens1 = single_rank_func(llm_interface, LLM_rank_args, cand1,cand2) 17 | score[1 - bigger1] += 1 18 | bigger2,query_count2, total_tokens2 = single_rank_func(llm_interface, LLM_rank_args, cand2,cand1) 19 | score[bigger2] += 1 20 | if score[0] > score[1]: 21 | return 1 , query_count1 + query_count2, total_tokens1 + total_tokens2 22 | elif score[0] < score[1]: 23 | return -1, query_count1 + query_count2, total_tokens1 + total_tokens2 24 | else: 25 | return 0, query_count1 + query_count2, total_tokens1 + total_tokens2 26 | 27 | 28 | 29 | def rank2_subfix(llm_interface,LLM_rank_args, cand1,cand2): 30 | ''' 31 | Assumed that the two candidates have a long common prefix 32 | ''' 33 | anscestor_interesction = tree_node.find_ancestor_intersection(cand1,cand2) 34 | assert anscestor_interesction != None 35 | intersect_trice = anscestor_interesction.get_former_trice_from_this_node(end_node=None) 36 | trice_1 = cand1.get_former_trice_from_this_node(end_node=anscestor_interesction) 37 | trice_2 = cand2.get_former_trice_from_this_node(end_node=anscestor_interesction) 38 | 39 | system_message = LLM_PAIRWISE_RANK_SUBFIX_SYSTEM_PROMPT 40 | system_message = system_message.replace("{task_description}", LLM_rank_args["task_description"]) 41 | system_message = system_message.replace("{intersect_trice}", intersect_trice) 42 | system_message = system_message.replace("{candidate_A}",trice_1) 43 | system_message = system_message.replace("{candidate_B}",trice_2) 44 | llm_interface.change_messages([{"role":"system","content":system_message}, 45 | {"role":"user","content":LLM_PAIRWISE_RANK_USER_PROMPT}, 46 | ]) 47 | 
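# The ranker is asked to reply with a single word ("A" or "B"); a reply whose last character is "a"/"A" counts as a preference for candidate A, and anything else counts as a preference for B.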
output,error_code, total_tokens = llm_interface.parse(functions=LLM_rank_args["functions"],function_call="none",process_id=LLM_rank_args["process_id"]) 48 | if output["content"].strip().lower()[-1] == "a": 49 | return 1, 1, total_tokens 50 | else: 51 | return 0, 1, total_tokens 52 | 53 | def sum_based_rankn(llm_interface,LLM_rank_args, candidates): 54 | ''' 55 | All pairs are sorted pairwise, sum the total points, and choose the best 56 | ''' 57 | total_querys = 0 58 | total_tokens = 0 59 | scores = [0]*len(candidates) 60 | for i in range(len(candidates)-1): 61 | for j in range(i+1,len(candidates)): 62 | pairwise_rank,query_count,rank2_tokens = rank2symmetry(llm_interface,LLM_rank_args, candidates[i],candidates[j]) 63 | total_querys += query_count 64 | total_tokens += rank2_tokens 65 | if pairwise_rank > 0: 66 | scores[i] += 1 67 | elif pairwise_rank < 0: 68 | scores[j] += 1 69 | else: 70 | scores[i] += 0.5 71 | scores[j] += 0.5 72 | return scores, total_querys, total_tokens 73 | 74 | 75 | 76 | if __name__ == "__main__": 77 | random.seed(42) 78 | # candidates = [ 79 | # "234", 80 | # "66.5", 81 | # "77.1", 82 | # "88.967", 83 | # "pi", 84 | # # "e", 85 | # # "ln(2)" 86 | # ] 87 | candidates = [ 88 | "77.1", 89 | "88.967", 90 | "pi", 91 | "66.5", 92 | "234", 93 | "ln(2)" 94 | ] 95 | ''' 96 | starting_delta: 97 | 50 -> 42.85% 98 | 100 -> 35.99% 99 | 150 -> 29.66% 100 | 200 -> 24.03% 101 | ''' 102 | -------------------------------------------------------------------------------- /evaluation/toolbench/inference/Prompts/ReAct_prompts.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | FORMAT_INSTRUCTIONS_SYSTEM_FUNCTION = """You are AutoGPT, you can use many tools(functions) to do the following task. 5 | First I will give you the task description, and your task start. 6 | At each step, you need to give your thought to analyze the status now and what to do next, with a function call to actually excute your step. 7 | After the call, you will get the call result, and you are now in a new state. 8 | Then you will analyze your status now, then decide what to do next... 9 | After many (Thought-call) pairs, you finally perform the task, then you can give your finial answer. 10 | Remember: 11 | 1.the state change is irreversible, you can't go back to one of the former state, if you want to restart the task, say "I give up and restart". 12 | 2.All the thought is short, at most in 5 sentence. 13 | 3.You can do more then one trys, so if your plan is to continusly try some conditions, you can do one of the conditions per try. 14 | Let's Begin! 15 | Task description: {task_description}""" 16 | 17 | FORMAT_INSTRUCTIONS_USER_FUNCTION = """ 18 | {input_description} 19 | Begin! 20 | """ 21 | 22 | FORMAT_INSTRUCTIONS_SYSTEM_FUNCTION_ZEROSHOT = """Answer the following questions as best you can. Specifically, you have access to the following APIs: 23 | 24 | {func_str} 25 | 26 | Use the following format: 27 | Thought: you should always think about what to do 28 | Action: the action to take, should be one of {func_list} 29 | Action Input: the input to the action 30 | End Action 31 | 32 | Begin! 
Remember: (1) Follow the format, i.e, 33 | Thought: 34 | Action: 35 | Action Input: 36 | End Action 37 | (2)The Action: MUST be one of the following:{func_list} 38 | (3)If you believe that you have obtained enough information (which can be judge from the history observations) that can answer the task, please call: 39 | Action: Finish 40 | Action Input: {{"return_type": "give_answer", "final_answer": your answer string}}. 41 | Question: {question} 42 | 43 | Here are the history actions and observations: 44 | """ 45 | -------------------------------------------------------------------------------- /evaluation/toolbench/inference/Prompts/Tree_search_prompts.py: -------------------------------------------------------------------------------- 1 | DIVERSITY_PROMPT='''This is not the first time you try this task, all previous trails failed. 2 | Before you generate my thought for this state, I will first show you your previous actions for this state, and then you must generate actions that is different from all of them. Here are some previous actions candidates: 3 | {previous_candidate} 4 | Remember you are now in the intermediate state of a trail, you will first analyze the now state and previous action candidates, then make actions that is different from all the previous.''' 5 | 6 | 7 | -------------------------------------------------------------------------------- /evaluation/toolbench/inference/Prompts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Reason-Wang/ToolGen/6839374a255810efe69deea4056eec5c55e25802/evaluation/toolbench/inference/Prompts/__init__.py -------------------------------------------------------------------------------- /evaluation/toolbench/inference/Prompts/rank_prompts.py: -------------------------------------------------------------------------------- 1 | 2 | LLM_PAIRWISE_RANK_SUBFIX_SYSTEM_PROMPT = ''' 3 | You are value-GPT, which is an expert of defining which trail is better, which trail is more close to solving the task. 4 | All candidate tries to solve this task with some funciton calls: 5 | ******************************* 6 | {{TASK_DESCRIPTION}} 7 | {task_description} 8 | {{END_TASK_DESCRIPTION}} 9 | ******************************* 10 | First, all candidate do the following things: 11 | {intersect_trice} 12 | After that, there are two candidates A and B, they do different things: 13 | ******************************* 14 | {{CANDIDATE_A_START}} 15 | {candidate_A} 16 | {{CANDIDATE_A_END}} 17 | ******************************* 18 | {{CANDIDATE_B_START}} 19 | {candidate_B} 20 | {{CANDIDATE_B_END}} 21 | Which try do you think is more helpful to solving the task? 
22 | ''' 23 | 24 | 25 | 26 | 27 | LLM_PAIRWISE_RANK_USER_PROMPT = ''' 28 | Tell me which candidate is better in ONE Word: "A" or "B":''' -------------------------------------------------------------------------------- /evaluation/toolbench/inference/Tree/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Reason-Wang/ToolGen/6839374a255810efe69deea4056eec5c55e25802/evaluation/toolbench/inference/Tree/__init__.py -------------------------------------------------------------------------------- /evaluation/toolbench/inference/callbacks/ServerEventCallback.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict, List, Union 2 | import queue 3 | class ServerEventCallback(): 4 | """Base callback handler""" 5 | 6 | def __init__(self, queue: queue.Queue, *args, **kwargs): 7 | super().__init__(*args, **kwargs) 8 | self.queue = queue 9 | self.llm_block_id = 0 10 | self.tool_block_id = 0 11 | self.tool_descriptions = {} 12 | 13 | def add_to_queue(self, method_name: str, block_id, **kwargs: Any): 14 | data = { 15 | "method_name": method_name, 16 | "block_id": block_id, 17 | } 18 | data.update(kwargs) 19 | self.queue.put(data) 20 | 21 | def on_tool_retrieval_start(self): 22 | # tools should be of the form 23 | # {tool_name, tool_desc} 24 | self.add_to_queue( 25 | "on_tool_retrieval_start", 26 | "recommendation-1", 27 | ) 28 | print("on_tool_retrieval_start method called") 29 | 30 | def on_tool_retrieval_end(self, tools): 31 | # tool should be of the form 32 | # {tool_name, tool_desc} 33 | self.add_to_queue( 34 | "on_tool_retrieval_end", 35 | "recommendation-1", 36 | recommendations=tools 37 | ) 38 | self.tool_descriptions = { 39 | tool["name"]: tool for tool in tools 40 | } 41 | print("on_tool_retrieval_end method called") 42 | def on_request_start(self, user_input: str, method: str) -> Any: 43 | self.tool_block_id = 0 44 | self.llm_block_id = 0 45 | self.add_to_queue( 46 | "on_request_start", 47 | block_id="start", 48 | user_input=user_input, 49 | method=method 50 | ) 51 | def on_request_end(self, outputs: str, chain: List[Any]): 52 | self.add_to_queue( 53 | "on_request_end", 54 | block_id="end", 55 | output=outputs, 56 | chain=chain 57 | ) 58 | def on_request_error(self, error: str): 59 | self.add_to_queue( 60 | "on_request_error", 61 | block_id="error", 62 | error=error 63 | ) 64 | 65 | # keep 66 | def on_chain_start(self, inputs: str, depth: int) -> Any: 67 | """Run when chain starts running.""" 68 | print("on_chain_start method called") 69 | self.llm_block_id += 1 70 | block_id = "llm-" + str(self.llm_block_id) 71 | self.add_to_queue( 72 | "on_chain_start", 73 | block_id=block_id, 74 | messages=inputs, 75 | depth=depth 76 | ) 77 | return block_id 78 | 79 | # this one needs the block_id memorized 80 | def on_chain_end(self, block_id: str, depth: int) -> Any: 81 | self.add_to_queue( 82 | "on_chain_end", 83 | block_id=block_id, 84 | # output=output, 85 | depth=depth 86 | ) 87 | print("on_chain_end method called") 88 | 89 | def on_chain_error(self, error: Union[Exception, KeyboardInterrupt], **kwargs: Any) -> Any: 90 | method_name = "on_chain_error" 91 | self.add_to_queue(method_name, error=error, **kwargs) 92 | print("on_chain_error method called") 93 | 94 | def on_llm_start( 95 | self, messages: str, depth: int 96 | ) -> Any: 97 | """Run when LLM starts running.""" 98 | self.add_to_queue( 99 | "on_llm_start", 100 | block_id="llm-" + str(self.llm_block_id), 101 | 
messages=messages, 102 | depth=depth 103 | ) 104 | print("on_llm_start method called") 105 | 106 | def on_llm_new_token(self, token: str, **kwargs: Any) -> Any: 107 | """Run on new LLM token. Only available when streaming is enabled.""" 108 | method_name = "on_llm_new_token" 109 | self.add_to_queue(method_name, token=token, **kwargs) 110 | print("on_llm_new_token method called") 111 | 112 | def on_llm_end(self, response: str, depth: int) -> Any: 113 | """Run when LLM ends running.""" 114 | self.add_to_queue( 115 | "on_llm_end", 116 | block_id="llm-" + str(self.llm_block_id), 117 | response=response, 118 | depth=depth 119 | ) 120 | print("on_llm_end method called") 121 | 122 | def on_llm_error(self, error: Union[Exception, KeyboardInterrupt]) -> Any: 123 | """Run when LLM errors.""" 124 | self.add_to_queue( 125 | "on_llm_error", 126 | block_id="llm-" + str(self.llm_block_id), 127 | message=str(error), 128 | error=error 129 | ) 130 | print("on_llm_error method called") 131 | 132 | def on_agent_action(self, action, action_input, depth: int) -> str: 133 | self.tool_block_id += 1 134 | block_id="tool-" + str(self.tool_block_id) 135 | self.add_to_queue( 136 | "on_agent_action", 137 | block_id=block_id, 138 | action=action, 139 | action_input = action_input, 140 | depth=depth 141 | ) 142 | print("on_agent_action method called") 143 | return block_id 144 | 145 | def on_tool_start(self, tool_name: str, tool_input: str, depth: int) -> Any: 146 | method_name = "on_tool_start" 147 | tool_description = "Tool not found in tool descriptions" 148 | if tool_name in self.tool_descriptions: 149 | tool_description = self.tool_descriptions[tool_name] 150 | else: 151 | print(self.tool_descriptions) 152 | print("Key", tool_name, "not found in tool descriptions") 153 | self.add_to_queue( 154 | method_name, 155 | block_id="tool-" + str(self.tool_block_id), 156 | tool_name=tool_name, 157 | tool_description=tool_description, 158 | tool_input=tool_input, 159 | depth=depth 160 | ) 161 | print("on_tool_start method called") 162 | 163 | def on_tool_end(self, output: str, status:int, depth: int) -> Any: 164 | method_name = "on_tool_end" 165 | self.add_to_queue( 166 | method_name, 167 | block_id="tool-" + str(self.tool_block_id), 168 | output=output, 169 | status= status, 170 | depth=depth 171 | ) 172 | print("on_tool_end method called") 173 | 174 | def on_tool_error(self, error: Union[Exception, KeyboardInterrupt]) -> Any: 175 | method_name = "on_tool_error" 176 | self.add_to_queue( 177 | method_name, 178 | error=error 179 | ) 180 | print("on_tool_error method called") 181 | 182 | def on_agent_end(self, block_id:str, depth: int): 183 | self.add_to_queue( 184 | "on_agent_end", 185 | block_id=block_id, 186 | depth=depth 187 | ) 188 | print("on_agent_end method called") -------------------------------------------------------------------------------- /evaluation/toolbench/inference/qa_pipeline.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Close-domain QA Pipeline 3 | ''' 4 | 5 | import argparse 6 | from toolbench.inference.Downstream_tasks.rapidapi import pipeline_runner 7 | 8 | 9 | if __name__ == "__main__": 10 | 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument('--backbone_model', type=str, default="toolllama", required=False, help='chatgpt_function or davinci or toolllama') 13 | parser.add_argument('--chatgpt_model', type=str, default=None, required=True, help='gpt-3.5-turbo or gpt-4') 14 | parser.add_argument('--base_url', type=str, 
default="https://api.openai.com/v1", required=False, help='openai api url') 15 | parser.add_argument('--openai_key', type=str, default="", required=False, help='openai key for chatgpt_function or davinci model') 16 | parser.add_argument('--model_path', type=str, default="your_model_path/", required=False, help='') 17 | parser.add_argument('--tool_root_dir', type=str, default="your_tools_path/", required=True, help='') 18 | parser.add_argument("--lora", action="store_true", help="Load lora model or not.") 19 | parser.add_argument('--lora_path', type=str, default="your_lora_path if lora", required=False, help='') 20 | parser.add_argument('--max_observation_length', type=int, default=1024, required=False, help='maximum observation length') 21 | parser.add_argument('--max_source_sequence_length', type=int, default=4096, required=False, help='original maximum model sequence length') 22 | parser.add_argument('--max_sequence_length', type=int, default=8192, required=False, help='maximum model sequence length') 23 | parser.add_argument('--observ_compress_method', type=str, default="truncate", choices=["truncate", "filter", "random"], required=False, help='observation compress method') 24 | parser.add_argument('--method', type=str, default="CoT@1", required=False, help='method for answer generation: CoT@n,Reflexion@n,BFS,DFS,UCT_vote') 25 | parser.add_argument('--input_query_file', type=str, default="", required=False, help='input path') 26 | parser.add_argument('--output_answer_file', type=str, default="",required=False, help='output path') 27 | parser.add_argument('--toolbench_key', type=str, default="",required=False, help='your toolbench key to request rapidapi service') 28 | parser.add_argument('--rapidapi_key', type=str, default="",required=False, help='your rapidapi key to request rapidapi service') 29 | parser.add_argument('--use_rapidapi_key', action="store_true", help="To use customized rapidapi service or not.") 30 | parser.add_argument('--api_customization', action="store_true", help="To use customized api or not.") 31 | 32 | args = parser.parse_args() 33 | 34 | pipeline_runner = pipeline_runner(args) 35 | pipeline_runner.run() 36 | 37 | -------------------------------------------------------------------------------- /evaluation/toolbench/inference/qa_pipeline_multithread.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Close-domain QA Pipeline 3 | ''' 4 | 5 | import argparse, os 6 | from evaluation.toolbench.inference.Downstream_tasks.rapidapi_multithread import pipeline_runner 7 | 8 | 9 | if __name__ == "__main__": 10 | 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument('--corpus_tsv_path', type=str, default=None, required=False, help="your_retrival_corpus_path/") 13 | parser.add_argument('--retrieval_model_path', type=str, default=None, required=False, help="your_model_path/") 14 | parser.add_argument('--retrieved_api_nums', type=int, default=5, required=False, help='') 15 | parser.add_argument('--backbone_model', type=str, default="toolllama", required=False, help='chatgpt_function or davinci or toolllama') 16 | parser.add_argument('--chatgpt_model', type=str, default=None, required=True, help='gpt-3.5-turbo or gpt-4') 17 | parser.add_argument('--base_url', type=str, default="https://api.openai.com/v1", required=False, help='openai api url') 18 | parser.add_argument('--openai_key', type=str, default="", required=False, help='openai key for chatgpt_function or davinci model') 19 | parser.add_argument('--model_path', 
type=str, default="your_model_path/", required=False, help='') 20 | parser.add_argument('--tool_root_dir', type=str, default="your_tools_path/", required=True, help='') 21 | parser.add_argument("--lora", action="store_true", help="Load lora model or not.") 22 | parser.add_argument('--lora_path', type=str, default="your_lora_path if lora", required=False, help='') 23 | parser.add_argument('--max_observation_length', type=int, default=1024, required=False, help='maximum observation length') 24 | parser.add_argument('--max_source_sequence_length', type=int, default=4096, required=False, help='original maximum model sequence length') 25 | parser.add_argument('--max_sequence_length', type=int, default=8192, required=False, help='maximum model sequence length') 26 | parser.add_argument('--single_chain_max_step', type=int, default=16, required=False, help='maximum step for single chain') 27 | parser.add_argument('--max_query_count', type=int, default=200, required=False, help='maximum query count') 28 | parser.add_argument('--observ_compress_method', type=str, default="truncate", choices=["truncate", "filter", "random"], required=False, help='observation compress method') 29 | parser.add_argument('--method', type=str, default="CoT@1", required=False, help='method for answer generation: CoT@n,Reflexion@n,BFS,DFS,UCT_vote') 30 | parser.add_argument('--input_query_file', type=str, default="", required=False, help='input path') 31 | parser.add_argument('--output_answer_file', type=str, default="",required=False, help='output path') 32 | parser.add_argument('--toolbench_key', type=str, default="",required=False, help='your toolbench key to request rapidapi service') 33 | parser.add_argument('--rapidapi_key', type=str, default="",required=False, help='your rapidapi key to request rapidapi service') 34 | parser.add_argument('--use_rapidapi_key', action="store_true", help="To use customized rapidapi service or not.") 35 | parser.add_argument('--api_customization', action="store_true", help="To use customized api or not.") 36 | parser.add_argument('--num_thread', type=int, default=1, required=False, help='number of threads') 37 | parser.add_argument('--disable_tqdm', action="store_true", help="disable tqdm or not.") 38 | parser.add_argument('--overwrite', action='store_true', help='overwrite existing runs') 39 | parser.add_argument('--function_provider', type=str, default="truth", required=False, help='') 40 | parser.add_argument('--replace_file', type=str, default="", required=False, help='') 41 | parser.add_argument('--indexing', type=str, default="") 42 | parser.add_argument('--template', type=str, default="") 43 | args = parser.parse_args() 44 | if args.overwrite: 45 | os.system(f"rm -rf {args.output_answer_file}") 46 | 47 | from evaluation.utils.utils import seed_everything 48 | seed_everything(42) 49 | 50 | pipeline_runner = pipeline_runner(args, add_retrieval=True if args.retrieval_model_path else False) 51 | pipeline_runner.run() 52 | 53 | -------------------------------------------------------------------------------- /evaluation/toolbench/inference/qa_pipeline_open_domain.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Open-domain QA Pipeline 3 | ''' 4 | import argparse 5 | from toolbench.inference.Downstream_tasks.rapidapi import pipeline_runner 6 | 7 | 8 | if __name__ == "__main__": 9 | 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--corpus_tsv_path', type=str, default="your_retrival_corpus_path/", required=False, help='') 12 
| parser.add_argument('--retrieval_model_path', type=str, default="your_model_path/", required=False, help='') 13 | parser.add_argument('--retrieved_api_nums', type=int, default=5, required=False, help='') 14 | parser.add_argument('--backbone_model', type=str, default="toolllama", required=False, help='chatgpt_function or davinci or toolllama') 15 | parser.add_argument('--chatgpt_model', type=str, default=None, required=True, help='gpt-3.5-turbo or gpt-4') 16 | parser.add_argument('--base_url', type=str, default="https://api.openai.com/v1", required=False, help='openai api url') 17 | parser.add_argument('--openai_key', type=str, default="", required=False, help='openai key for chatgpt_function or davinci model') 18 | parser.add_argument('--model_path', type=str, default="your_model_path/", required=False, help='') 19 | parser.add_argument('--tool_root_dir', type=str, default="your_tools_path/", required=True, help='') 20 | parser.add_argument("--lora", action="store_true", help="Load lora model or not.") 21 | parser.add_argument('--lora_path', type=str, default="your_lora_path if lora", required=False, help='') 22 | parser.add_argument('--max_observation_length', type=int, default=1024, required=False, help='maximum observation length') 23 | parser.add_argument('--max_source_sequence_length', type=int, default=4096, required=False, help='original maximum model sequence length') 24 | parser.add_argument('--max_sequence_length', type=int, default=8192, required=False, help='maximum model sequence length') 25 | parser.add_argument('--observ_compress_method', type=str, default="truncate", choices=["truncate", "filter", "random"], required=False, help='observation compress method') 26 | parser.add_argument('--method', type=str, default="CoT@1", required=False, help='method for answer generation: CoT@n,Reflexion@n,BFS,DFS,UCT_vote') 27 | parser.add_argument('--input_query_file', type=str, default="", required=False, help='input path') 28 | parser.add_argument('--output_answer_file', type=str, default="",required=False, help='output path') 29 | parser.add_argument('--toolbench_key', type=str, default="",required=False, help='your toolbench key to request rapidapi service') 30 | parser.add_argument('--rapidapi_key', type=str, default="",required=False, help='your rapidapi key to request rapidapi service') 31 | parser.add_argument('--use_rapidapi_key', action="store_true", help="To use customized rapidapi service or not.") 32 | parser.add_argument('--api_customization', action="store_true", help="To use customized api or not. NOT SUPPORTED currently under open domain setting.") 33 | 34 | args = parser.parse_args() 35 | 36 | pipeline_runner = pipeline_runner(args, add_retrieval=True) 37 | pipeline_runner.run() 38 | -------------------------------------------------------------------------------- /evaluation/toolbench/model/__init__.py: -------------------------------------------------------------------------------- 1 | from evaluation.toolbench.model.model_adapter import ( 2 | load_model, 3 | get_conversation_template, 4 | add_model_args, 5 | ) 6 | -------------------------------------------------------------------------------- /evaluation/toolbench/model/apply_delta.py: -------------------------------------------------------------------------------- 1 | """ 2 | Apply the delta weights on top of a base model.
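A minimal programmatic sketch (illustrative and not part of the original file; it assumes the
repository root is on PYTHONPATH and that the placeholder paths are replaced with real checkpoints —
`apply_delta` is the function defined below in this module):

    from evaluation.toolbench.model.apply_delta import apply_delta

    apply_delta("path/to/base_model", "path/to/merged_output", "path/to/delta_weights")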
3 | 4 | Usage: 5 | python3 -m fastchat.model.apply_delta --base ~/model_weights/llama-7b --target ~/model_weights/vicuna-7b --delta lmsys/vicuna-7b-delta-v1.1 6 | """ 7 | import argparse 8 | import gc 9 | import glob 10 | import json 11 | import os 12 | import shutil 13 | import tempfile 14 | 15 | from huggingface_hub import snapshot_download 16 | import torch 17 | from torch import nn 18 | from tqdm import tqdm 19 | from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig 20 | 21 | 22 | GB = 1 << 30 23 | 24 | 25 | def split_files(model_path, tmp_path, split_size): 26 | if not os.path.exists(model_path): 27 | model_path = snapshot_download(repo_id=model_path) 28 | if not os.path.exists(tmp_path): 29 | os.makedirs(tmp_path) 30 | 31 | file_pattern = os.path.join(model_path, "pytorch_model-*.bin") 32 | files = glob.glob(file_pattern) 33 | 34 | part = 0 35 | try: 36 | for file_path in tqdm(files): 37 | state_dict = torch.load(file_path) 38 | new_state_dict = {} 39 | 40 | current_size = 0 41 | for name, param in state_dict.items(): 42 | param_size = param.numel() * param.element_size() 43 | 44 | if current_size + param_size > split_size: 45 | new_file_name = f"pytorch_model-{part}.bin" 46 | new_file_path = os.path.join(tmp_path, new_file_name) 47 | torch.save(new_state_dict, new_file_path) 48 | current_size = 0 49 | new_state_dict = None 50 | gc.collect() 51 | new_state_dict = {} 52 | part += 1 53 | 54 | new_state_dict[name] = param 55 | current_size += param_size 56 | 57 | new_file_name = f"pytorch_model-{part}.bin" 58 | new_file_path = os.path.join(tmp_path, new_file_name) 59 | torch.save(new_state_dict, new_file_path) 60 | new_state_dict = None 61 | gc.collect() 62 | new_state_dict = {} 63 | part += 1 64 | except Exception as e: 65 | print(f"An error occurred during split_files: {e}") 66 | shutil.rmtree(tmp_path) 67 | raise 68 | 69 | 70 | def apply_delta_low_cpu_mem(base_model_path, target_model_path, delta_path): 71 | delta_tokenizer = AutoTokenizer.from_pretrained(delta_path, use_fast=False) 72 | delta_config = AutoConfig.from_pretrained(delta_path) 73 | 74 | if os.path.exists(target_model_path): 75 | shutil.rmtree(target_model_path) 76 | os.makedirs(target_model_path) 77 | 78 | split_size = 4 * GB 79 | 80 | with tempfile.TemporaryDirectory() as tmp_base_path, tempfile.TemporaryDirectory() as tmp_delta_path: 81 | print(f"Split files for the base model to {tmp_base_path}") 82 | split_files(base_model_path, tmp_base_path, split_size) 83 | print(f"Split files for the delta weights to {tmp_delta_path}") 84 | split_files(delta_path, tmp_delta_path, split_size) 85 | 86 | base_pattern = os.path.join(tmp_base_path, "pytorch_model-*.bin") 87 | base_files = glob.glob(base_pattern) 88 | delta_pattern = os.path.join(tmp_delta_path, "pytorch_model-*.bin") 89 | delta_files = glob.glob(delta_pattern) 90 | delta_state_dict = torch.load(delta_files[0]) 91 | 92 | print("Applying the delta") 93 | weight_map = {} 94 | total_size = 0 95 | 96 | for i, base_file in tqdm(enumerate(base_files)): 97 | state_dict = torch.load(base_file) 98 | file_name = f"pytorch_model-{i}.bin" 99 | for name, param in state_dict.items(): 100 | if name not in delta_state_dict: 101 | for delta_file in delta_files: 102 | delta_state_dict = torch.load(delta_file) 103 | gc.collect() 104 | if name in delta_state_dict: 105 | break 106 | 107 | state_dict[name] += delta_state_dict[name] 108 | weight_map[name] = file_name 109 | total_size += param.numel() * param.element_size() 110 | gc.collect() 111 | torch.save(state_dict, 
os.path.join(target_model_path, file_name)) 112 | 113 | with open( 114 | os.path.join(target_model_path, "pytorch_model.bin.index.json"), "w" 115 | ) as f: 116 | json.dump( 117 | {"weight_map": weight_map, "metadata": {"total_size": total_size}}, f 118 | ) 119 | 120 | print(f"Saving the target model to {target_model_path}") 121 | delta_tokenizer.save_pretrained(target_model_path) 122 | delta_config.save_pretrained(target_model_path) 123 | 124 | 125 | def apply_delta(base_model_path, target_model_path, delta_path): 126 | print(f"Loading the delta weights from {delta_path}") 127 | delta_tokenizer = AutoTokenizer.from_pretrained(delta_path, use_fast=False) 128 | delta = AutoModelForCausalLM.from_pretrained( 129 | delta_path, torch_dtype=torch.float16, low_cpu_mem_usage=True 130 | ) 131 | 132 | print(f"Loading the base model from {base_model_path}") 133 | base = AutoModelForCausalLM.from_pretrained( 134 | base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True 135 | ) 136 | 137 | print("Applying the delta") 138 | for name, param in tqdm(base.state_dict().items(), desc="Applying delta"): 139 | assert name in delta.state_dict() 140 | param.data += delta.state_dict()[name] 141 | 142 | print(f"Saving the target model to {target_model_path}") 143 | base.save_pretrained(target_model_path) 144 | delta_tokenizer.save_pretrained(target_model_path) 145 | 146 | 147 | if __name__ == "__main__": 148 | parser = argparse.ArgumentParser() 149 | parser.add_argument("--base-model-path", type=str, required=True) 150 | parser.add_argument("--target-model-path", type=str, required=True) 151 | parser.add_argument("--delta-path", type=str, required=True) 152 | parser.add_argument( 153 | "--low-cpu-mem", 154 | action="store_true", 155 | help="Lower the cpu memory usage. 
This will split large files and use " 156 | "disk as swap to reduce the memory usage below 10GB.", 157 | ) 158 | args = parser.parse_args() 159 | 160 | if args.low_cpu_mem: 161 | apply_delta_low_cpu_mem( 162 | args.base_model_path, args.target_model_path, args.delta_path 163 | ) 164 | else: 165 | apply_delta(args.base_model_path, args.target_model_path, args.delta_path) 166 | -------------------------------------------------------------------------------- /evaluation/toolbench/model/compression.py: -------------------------------------------------------------------------------- 1 | import dataclasses 2 | import glob 3 | import os 4 | import torch 5 | import torch.nn as nn 6 | from torch.nn import functional as F 7 | from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig 8 | 9 | 10 | @dataclasses.dataclass 11 | class CompressionConfig: 12 | """Group-wise quantization.""" 13 | 14 | num_bits: int 15 | group_size: int 16 | group_dim: int 17 | symmetric: bool 18 | enabled: bool = True 19 | 20 | 21 | default_compression_config = CompressionConfig( 22 | num_bits=8, group_size=256, group_dim=1, symmetric=True, enabled=True 23 | ) 24 | 25 | 26 | class CLinear(nn.Module): 27 | """Compressed Linear Layer.""" 28 | 29 | def __init__(self, weight=None, bias=None, device=None): 30 | super().__init__() 31 | self.weight = weight 32 | self.bias = bias 33 | 34 | def forward(self, input): 35 | return F.linear(input.to(self.weight.dtype), self.weight, self.bias) 36 | 37 | 38 | def compress_module(module, target_device): 39 | for name, child in module.named_children(): 40 | if isinstance(child, nn.Linear): 41 | setattr( 42 | module, 43 | name, 44 | CLinear(child.weight, child.bias, target_device), 45 | ) 46 | compress_module(child, target_device) 47 | 48 | 49 | def get_compressed_list(module, prefix=""): 50 | compressed_list = [] 51 | for name, child in module.named_children(): 52 | if isinstance(child, nn.Linear): 53 | full_name = f"{prefix}.{name}.weight" if prefix else f"{name}.weight" 54 | compressed_list.append(full_name) 55 | compressed_list.extend( 56 | get_compressed_list(child, full_name) 57 | ) 58 | return compressed_list 59 | 60 | 61 | def apply_compressed_weight(module, compressed_state_dict, target_device, prefix=""): 62 | for name, child in module.named_children(): 63 | if isinstance(child, nn.Linear): 64 | full_name = f"{prefix}.{name}.weight" if prefix else f"{name}.weight" 65 | setattr( 66 | module, 67 | name, 68 | CLinear( 69 | compressed_state_dict[full_name], child.bias, target_device 70 | ), 71 | ) 72 | apply_compressed_weight(child, compressed_state_dict, target_device, full_name) 73 | 74 | 75 | def load_compress_model(model_path, device, torch_dtype): 76 | # partially load model 77 | tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False) 78 | base_pattern = os.path.join(model_path, "pytorch_model-*.bin") 79 | files = glob.glob(base_pattern) 80 | 81 | config = AutoConfig.from_pretrained( 82 | model_path, low_cpu_mem_usage=True, torch_dtype=torch_dtype 83 | ) 84 | model = AutoModelForCausalLM.from_config(config) 85 | linear_weights = get_compressed_list(model) 86 | 87 | compressed_state_dict = {} 88 | 89 | for filename in files: 90 | tmp_state_dict = torch.load(filename) 91 | for name in tmp_state_dict: 92 | if name in linear_weights: 93 | tensor = tmp_state_dict[name].to(device).data.to(torch_dtype) 94 | compressed_state_dict[name] = compress( 95 | tensor, default_compression_config 96 | ) 97 | else: 98 | compressed_state_dict[name] = tmp_state_dict[name].to(device) 99
| tmp_state_dict[name] = None 100 | tensor = None 101 | torch.cuda.empty_cache() 102 | 103 | for name, param in model.named_parameters(): 104 | if name not in linear_weights: 105 | param.data = compressed_state_dict[name] 106 | apply_compressed_weight(model, compressed_state_dict, device) 107 | 108 | model.to(device) 109 | 110 | return model, tokenizer 111 | 112 | 113 | def compress(tensor, config): 114 | """Simulate group-wise quantization.""" 115 | if not config.enabled: 116 | return tensor 117 | 118 | group_size, num_bits, group_dim, symmetric = ( 119 | config.group_size, 120 | config.num_bits, 121 | config.group_dim, 122 | config.symmetric, 123 | ) 124 | assert num_bits <= 8 125 | 126 | original_shape = tensor.shape 127 | num_groups = (original_shape[group_dim] + group_size - 1) // group_size 128 | new_shape = ( 129 | original_shape[:group_dim] 130 | + (num_groups, group_size) 131 | + original_shape[group_dim + 1 :] 132 | ) 133 | 134 | # Pad 135 | pad_len = group_size - original_shape[group_dim] % group_size 136 | if pad_len != 0: 137 | pad_shape = ( 138 | original_shape[:group_dim] + (pad_len,) + original_shape[group_dim + 1 :] 139 | ) 140 | tensor = torch.cat( 141 | [tensor, torch.zeros(pad_shape, dtype=tensor.dtype, device=tensor.device)], 142 | dim=group_dim, 143 | ) 144 | data = tensor.view(new_shape) 145 | 146 | # Quantize 147 | if symmetric: 148 | B = 2 ** (num_bits - 1) - 1 149 | scale = B / torch.max(data.abs(), dim=group_dim + 1, keepdim=True)[0] 150 | data = data * scale 151 | data = data.clamp_(-B, B).round_().to(torch.int8) 152 | return data, scale, original_shape 153 | else: 154 | B = 2**num_bits - 1 155 | mn = torch.min(data, dim=group_dim + 1, keepdim=True)[0] 156 | mx = torch.max(data, dim=group_dim + 1, keepdim=True)[0] 157 | 158 | scale = B / (mx - mn) 159 | data = data - mn 160 | data *= scale 161 | 162 | data = data.clamp_(0, B).round_().to(torch.uint8) 163 | return data, mn, scale, original_shape 164 | 165 | 166 | def decompress(packed_data, config): 167 | """Simulate group-wise dequantization.""" 168 | if not config.enabled: 169 | return packed_data 170 | 171 | group_size, num_bits, group_dim, symmetric = ( 172 | config.group_size, 173 | config.num_bits, 174 | config.group_dim, 175 | config.symmetric, 176 | ) 177 | 178 | # Dequantize 179 | if symmetric: 180 | data, scale, original_shape = packed_data 181 | data = data / scale 182 | else: 183 | data, mn, scale, original_shape = packed_data 184 | data = data / scale 185 | data += mn 186 | 187 | # Unpad 188 | pad_len = group_size - original_shape[group_dim] % group_size 189 | if pad_len: 190 | padded_original_shape = ( 191 | original_shape[:group_dim] 192 | + (original_shape[group_dim] + pad_len,) 193 | + original_shape[group_dim + 1 :] 194 | ) 195 | data = data.reshape(padded_original_shape) 196 | indices = [slice(0, x) for x in original_shape] 197 | return data[indices].contiguous() 198 | else: 199 | return data.view(original_shape) 200 | -------------------------------------------------------------------------------- /evaluation/toolbench/model/make_delta.py: -------------------------------------------------------------------------------- 1 | """ 2 | Make the delta weights by subtracting base weights. 
3 | 4 | Usage: 5 | python3 -m fastchat.model.make_delta --base ~/model_weights/llama-13b --target ~/model_weights/vicuna-13b --delta ~/model_weights/vicuna-13b-delta --hub-repo-id lmsys/vicuna-13b-delta-v1.1 6 | """ 7 | import argparse 8 | 9 | import torch 10 | from tqdm import tqdm 11 | from transformers import AutoTokenizer, AutoModelForCausalLM 12 | 13 | 14 | def make_delta(base_model_path, target_model_path, delta_path): 15 | print(f"Loading the base model from {base_model_path}") 16 | base = AutoModelForCausalLM.from_pretrained( 17 | base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True 18 | ) 19 | 20 | print(f"Loading the target model from {target_model_path}") 21 | target = AutoModelForCausalLM.from_pretrained( 22 | target_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True 23 | ) 24 | target_tokenizer = AutoTokenizer.from_pretrained(target_model_path, use_fast=False) 25 | 26 | print("Calculating the delta") 27 | for name, param in tqdm(target.state_dict().items(), desc="Calculating delta"): 28 | assert name in base.state_dict() 29 | param.data -= base.state_dict()[name] 30 | 31 | print(f"Saving the delta to {delta_path}") 32 | if args.hub_repo_id: 33 | kwargs = {"push_to_hub": True, "repo_id": args.hub_repo_id} 34 | else: 35 | kwargs = {} 36 | target.save_pretrained(delta_path, **kwargs) 37 | target_tokenizer.save_pretrained(delta_path, **kwargs) 38 | 39 | 40 | if __name__ == "__main__": 41 | parser = argparse.ArgumentParser() 42 | parser.add_argument("--base-model-path", type=str, required=True) 43 | parser.add_argument("--target-model-path", type=str, required=True) 44 | parser.add_argument("--delta-path", type=str, required=True) 45 | parser.add_argument("--hub-repo-id", type=str) 46 | args = parser.parse_args() 47 | 48 | make_delta(args.base_model_path, args.target_model_path, args.delta_path) 49 | -------------------------------------------------------------------------------- /evaluation/toolbench/retrieval/inference_example.py: -------------------------------------------------------------------------------- 1 | from sentence_transformers import SentenceTransformer, util 2 | import json 3 | import pandas as pd 4 | from collections import defaultdict 5 | import torch 6 | from tqdm import tqdm 7 | import argparse 8 | import os 9 | 10 | # Create the argument parser and add the positional arguments (positionals are required by default, so `required=True` must not be passed here) 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument('model_path', type=str, help='Your trained model path') 13 | parser.add_argument('dataset_path', help='The processed dataset files path') 14 | 15 | # Parse the command-line arguments 16 | args = parser.parse_args() 17 | 18 | # Check if a GPU is available and if not, use a CPU 19 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 20 | 21 | model_path = args.model_path 22 | 23 | # Load the trained model 24 | model = SentenceTransformer(model_path).to(device) 25 | 26 | # Load test data 27 | documents_df = pd.read_csv(os.path.join(args.dataset_path, 'corpus.tsv'), sep='\t') 28 | test_queries_df = pd.read_csv(os.path.join(args.dataset_path, 'test.query.txt'), sep='\t', names=['qid', 'query_text']) 29 | test_labels_df = pd.read_csv(os.path.join(args.dataset_path, 'qrels.test.tsv'), sep='\t', names=['qid', 'useless', 'docid', 'label']) 30 | 31 | # Create mappings, get 'tool_name' and 'api_name' from the document_content 32 | ir_corpus = {row.docid: (json.loads(row.document_content)['tool_name'], json.loads(row.document_content)['api_name']) for _, row in documents_df.iterrows()} 33 | ir_test_queries = {row.qid: row.query_text for _, row in
test_queries_df.iterrows()} 34 | 35 | # Create query-doc mapping from the test set 36 | ir_relevant_docs = defaultdict(list) 37 | for _, row in test_labels_df.iterrows(): 38 | ir_relevant_docs[row.qid].append(row.docid) 39 | 40 | # Convert queries and documents to embeddings 41 | test_query_embeddings = model.encode(list(ir_test_queries.values()), convert_to_tensor=True).to(device) 42 | corpus_embeddings = model.encode(list(map(' '.join, ir_corpus.values())), convert_to_tensor=True).to(device) 43 | 44 | # Compute cosine similarity between queries and documents 45 | cos_scores = util.pytorch_cos_sim(test_query_embeddings, corpus_embeddings) 46 | 47 | # Get the top_k most similar documents for each query 48 | top_k = 5 49 | top_results = {} 50 | for query_index, (query_id, query) in enumerate(ir_test_queries.items()): 51 | relevant_docs_indices = cos_scores[query_index].topk(top_k).indices 52 | relevant_docs_scores = cos_scores[query_index].topk(top_k).values 53 | relevant_docs = [(list(ir_corpus.keys())[index], list(ir_corpus.values())[index]) for index in relevant_docs_indices] 54 | relevant_docs_with_scores = {str((doc_id, tool_name_api_name)): {'score': float(score)} for (doc_id, tool_name_api_name), score in zip(relevant_docs, relevant_docs_scores)} 55 | 56 | # Count the number of successful matches 57 | matches = len(set([doc_id for doc_id, _ in relevant_docs]) & set(ir_relevant_docs[query_id])) 58 | 59 | # Save query, original docs, top 5 docs with scores, and successful match count 60 | top_results[query] = { 61 | 'original_docs': [' '.join(ir_corpus[doc_id]) for doc_id in ir_relevant_docs[query_id]], 62 | 'top_docs': relevant_docs_with_scores, 63 | 'successful_matches': matches 64 | } 65 | 66 | # Save the results to a json file 67 | with open('top5_results_with_matches.json', 'w') as f: 68 | json.dump(top_results, f, indent=4) -------------------------------------------------------------------------------- /evaluation/toolbench/retrieval/train.py: -------------------------------------------------------------------------------- 1 | import sys 2 | # Add the path to the sys.path 3 | sys.path.append('.') 4 | import logging 5 | import os 6 | import json 7 | import pandas as pd 8 | from datetime import datetime 9 | import torch 10 | import torch.nn as nn 11 | from sentence_transformers import SentenceTransformer, models, InputExample, losses, LoggingHandler 12 | from torch.utils.data import DataLoader 13 | from torch.utils.tensorboard import SummaryWriter 14 | from api_evaluator import APIEvaluator 15 | import argparse 16 | import os 17 | from toolbench.utils import process_retrieval_ducoment 18 | 19 | import os 20 | 21 | parser = argparse.ArgumentParser() 22 | parser.add_argument("--data_path", default=None, type=str, required=True, 23 | help="The input data dir. 
Should contain the .tsv files for the task.") 24 | parser.add_argument("--model_name", default=None, type=str, required=True, 25 | help="The base model name.") 26 | parser.add_argument("--output_path", default=None, type=str, required=True, 27 | help="The base path where the model output will be saved.") 28 | parser.add_argument("--num_epochs", default=5, type=int, required=True, 29 | help="Train epochs.") 30 | parser.add_argument("--train_batch_size", default=32, type=int, required=True, 31 | help="Train batch size.") 32 | parser.add_argument("--learning_rate", default=2e-5, type=float, required=True, 33 | help="Learning rate.") 34 | parser.add_argument("--warmup_steps", default=500, type=float, required=True, 35 | help="Warmup steps.") 36 | parser.add_argument("--max_seq_length", default=256, type=int, required=True, 37 | help="Max sequence length.") 38 | args = parser.parse_args() 39 | 40 | logging.basicConfig(format='%(asctime)s - %(message)s', 41 | datefmt='%Y-%m-%d %H:%M:%S', 42 | level=logging.INFO, 43 | handlers=[LoggingHandler()]) 44 | logger = logging.getLogger(__name__) 45 | 46 | torch.manual_seed(42) 47 | torch.cuda.manual_seed(42) 48 | 49 | num_epochs = args.num_epochs 50 | train_batch_size = args.train_batch_size 51 | lr = args.learning_rate 52 | warmup_steps = args.warmup_steps 53 | data_path = args.data_path 54 | output_path = args.output_path 55 | os.makedirs(output_path, exist_ok=True) 56 | 57 | model_save_path = os.path.join(output_path, datetime.now().strftime("%Y-%m-%d_%H-%M-%S")) 58 | os.makedirs(model_save_path, exist_ok=True) 59 | 60 | tensorboard_name = 'name_desc' 61 | logs_writer = SummaryWriter(os.path.join(output_path, 'tensorboard', tensorboard_name)) 62 | 63 | 64 | def log_callback_st(train_ix, global_step, training_steps, current_lr, loss_value): 65 | logs_writer.add_scalar('train_loss', loss_value, global_step) 66 | logs_writer.add_scalar('lr', current_lr[0], global_step) 67 | 68 | 69 | # Model definition 70 | model = SentenceTransformer(args.model_name) 71 | # word_embedding_model = models.Transformer(args.model_name, max_seq_length=args.max_seq_length) 72 | # pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension()) 73 | # model = SentenceTransformer(modules=[word_embedding_model, pooling_model]) 74 | 75 | ir_train_queries = {} 76 | ir_test_queries = {} 77 | ir_relevant_docs = {} 78 | train_samples = [] 79 | 80 | documents_df = pd.read_csv(os.path.join(data_path, 'corpus.tsv'), sep='\t') 81 | ir_corpus, _ = process_retrieval_ducoment(documents_df) 82 | 83 | train_queries_df = pd.read_csv(os.path.join(data_path, 'train.query.txt'), sep='\t', names=['qid', 'query']) 84 | for row in train_queries_df.itertuples(): 85 | ir_train_queries[row.qid] = row.query 86 | train_queries_df = pd.read_csv(os.path.join(data_path, 'test.query.txt'), sep='\t', names=['qid', 'query']) 87 | for row in train_queries_df.itertuples(): 88 | ir_test_queries[row.qid] = row.query 89 | 90 | labels_df = pd.read_csv(os.path.join(data_path, 'qrels.train.tsv'), sep='\t', names=['qid', 'useless', 'docid', 'label']) 91 | for row in labels_df.itertuples(): 92 | sample = InputExample(texts=[ir_train_queries[row.qid], ir_corpus[row.docid]], label=row.label) 93 | train_samples.append(sample) 94 | labels_df = pd.read_csv(os.path.join(data_path, 'qrels.test.tsv'), sep='\t', names=['qid', 'useless', 'docid', 'label']) 95 | for row in labels_df.itertuples(): 96 | ir_relevant_docs.setdefault(row.qid, set()).add(row.docid) 97 | 98 | train_dataloader = 
DataLoader(train_samples, shuffle=True, batch_size=train_batch_size, pin_memory=True) 99 | train_loss = losses.MultipleNegativesRankingLoss(model) 100 | ir_evaluator = APIEvaluator(ir_test_queries, ir_corpus, ir_relevant_docs) 101 | 102 | # You may need to modify the .fit() method to ensure all data is moved to the correct device during parallel computations 103 | 104 | # model.fit(train_objectives=[(train_dataloader, train_loss)], 105 | # evaluator=ir_evaluator, 106 | # epochs=num_epochs, 107 | # warmup_steps=warmup_steps, 108 | # optimizer_params={'lr': lr}, 109 | # output_path=model_save_path 110 | # ) 111 | 112 | # evaluate 113 | ir_evaluator(model, output_path=model_save_path) 114 | 115 | -------------------------------------------------------------------------------- /evaluation/toolbench/tooleval/README_ZH.md: -------------------------------------------------------------------------------- 1 |
2 |

🛠️Tool Eval🤖

3 |
4 | 5 | 通过在ToolBench上对LLaMA进行微调,我们得到了**ToolLLaMA**。考虑到人工评估非常耗时,我们借鉴[AlpacaEval](https://tatsu-lab.github.io/alpaca_eval/)开发了一个高效的机器自动评估**ToolEval**,其中包含两个评估指标: 6 | 7 | - **通过率**:计算在有限的OpenAI API调用次数内成功完成指令的比例。 8 | 9 | - **偏好**:通过比较给定指令的两个答案(动作序列)来衡量。我们预先定义了一组更好答案的标准,这些标准被组织成ChatGPT的提示。我们向评估器提供测试指令和两个候选答案,并获得其偏好。我们对每个答案对进行多次评估以提高系统的可靠性。然后,我们计算**优胜率**(被评估器选择为更优的百分比。有关详细信息,请参阅我们的论文。 10 | 11 | 为了验证ChatGPT评估器在通过率和胜率方面的可靠性,我们从四种不同的方法(ChatGPT+ReACT,ChatGPT+DFSDT,ToolLLaMA+DFSDT和GPT4+DFSDT)中进行采样,为每种方法的300个测试指令获取解决方案对。然后,我们请人类标注ChatGPT+DFSDT,ToolLLaMA+DFSDT和GPT4+DFSDT的通过率,以及ChatGPT+ReACT和ChatGPT+DFSDT之间的胜率。 12 | 13 | 我们的ChatGPT评估器在通过率方面与人类标注者具有高达**87.1%**的一致性,在胜率方面具有**80.3%**的一致性。这个结果表明,我们的评估器生成的评估结果与人类非常相似,并且可以视为在通过率和胜率上模拟人类评估的可靠评估器。 14 | 有关ToolEval的更多细节,请参阅我们的论文。 15 | 16 | ## 🚀用法 17 | 18 | ### Install 19 | Install Package (python>=3.9) 20 | ```bash 21 | pip install -r requirements.txt 22 | ``` 23 | 24 | ### Evaluation 25 | *若要复现结果,直接通过[Google Drive](https://drive.google.com/drive/folders/1yBUQ732mPu-KclJnuQELEhtKakdXFc3J)下载我们的`reproduction_data.zip`,解压后置`reproduction_data`于`ToolBench/data/`下即可,可以跳过数据准备流程。* 26 | - 数据准备。若要使用 ToolEval 评估您自己的模型和方法,首先需要为六个测试子集准备所有的模型预测。创建一个以您的模型和方法命名的目录,例如 `chatgpt_cot`,然后将每个测试集的预测放在该目录下。目录的文件结构应如下: 27 | ``` 28 | ├── /chatgpt_cot/ 29 | │ ├── /G1_instruction/ 30 | │ │ ├── /10160_CoT@1.json 31 | │ │ └── ... 32 | │ ├── /G1_tool/ 33 | │ │ ├── /10221_CoT@1.json 34 | │ │ └── ... 35 | │ ├── ... 36 | │ ├── /G3_instruction/ 37 | │ │ ├── /10221_CoT@1.json 38 | │ │ └── ... 39 | ``` 40 | 41 | 然后对模型预测进行预处理: 42 | 43 | ```bash 44 | export RAW_ANSWER_PATH=../../data/reproduction_data/model_predictions/ 45 | export CONVERTED_ANSWER_PATH=../../data/reproduction_data/model_predictions_converted/ 46 | export MODEL_NAME=chatgpt_cot 47 | export METHOD=CoT 48 | mkdir ${CONVERTED_ANSWER_PATH}/${MODEL_NAME} 49 | for test_set in G1_instruction G1_category G1_tool G2_category G2_instruction G3_instruction 50 | do 51 | answer_dir=${RAW_ANSWER_PATH}/${MODEL_NAME}/${test_set} 52 | output_file=${CONVERTED_ANSWER_PATH}/${MODEL_NAME}/${test_set}.json 53 | python convert_to_answer_format.py\ 54 | --answer_dir ${answer_dir} \ 55 | --method ${METHOD} \ 56 | --output ${output_file} 57 | done 58 | ``` 59 | 之后,检查`${CONVERTED_ANSWER_PATH}/${MODEL_NAME}`下是否有测试集的预处理JSON文件。如果有,你就可以准备运行以下评估过程了。如果没有,请检查模型的预测是否有问题。 60 | 61 | - OpenAI Key 62 | 准备您的OpenAI Key来搭建我们的evaluator。Key需要被存储到一个json file中,如`path/to/your/openai_key_json_file.json`: 63 | ```bash 64 | [ 65 | { 66 | "username": "your_user_name", 67 | "passwd": "your_password", 68 | "api_key": "your_openai_key", 69 | "organization": "your_organization" 70 | }, 71 | ... 72 | ] 73 | ``` 74 | - Pass rate. 75 | ```bash 76 | export CONVERTED_ANSWER_PATH=../../data/reproduction_data/model_predictions_converted/ 77 | export SAVE_PATH=pass_rate_results 78 | export CANDIDATE_MODEL=chatgpt_cot 79 | export API_POOL_FILE=path/to/your/openai_key_json_file.json 80 | 81 | python eval_pass_rate.py \ 82 | --converted_answer_path ${CONVERTED_ANSWER_PATH} \ 83 | --save_path ${SAVE_PATH} \ 84 | --reference_model ${CANDIDATE_MODEL} \ 85 | --test_ids ../../data/test_query_ids/ \ 86 | --max_eval_threads 20 \ 87 | --evaluate_times 4 88 | 89 | ``` 90 | 91 | 结果文件会被存储至${SAVE_PATH}中。 92 | 93 | - Win rate. 
以下示例以ChatGPT-ReACT作为参考模型,GPT4-ReACT作为候选模型。请注意,您首先需要获取两个模型的pass rate结果,然后运行以下命令来评估GPT4-ReACT的win rate结果: 94 | ```bash 95 | export CONVERTED_ANSWER_PATH=../../data/reproduction_data/model_predictions_converted/ 96 | export SAVE_PATH=preference_results 97 | export PASS_TARE_PATH=pass_rate_results 98 | export REFERENCE_MODEL=chatgpt_cot 99 | export CANDIDATE_MODEL=gpt-4-0613_cot 100 | export API_POOL_FILE=path/to/your/openai_key_json_file.json 101 | 102 | python eval_preference.py \ 103 | --converted_answer_path ${CONVERTED_ANSWER_PATH} \ 104 | --reference_model ${REFERENCE_MODEL} \ 105 | --output_model ${CANDIDATE_MODEL} \ 106 | --test_ids ../../data/test_query_ids/ \ 107 | --save_path ${SAVE_PATH} \ 108 | --pass_rate_result_path ${PASS_TARE_PATH} \ 109 | --max_eval_threads 20 \ 110 | --use_pass_rate true \ 111 | --evaluate_times 4 112 | ``` 113 | 114 | 结果文件会被存储至${SAVE_PATH}中。 115 | 116 | ### 评估新方法 117 | 要评估除了ReACT和DFSDT之外的方法,您需要遵循以上Data preparation的步骤准备您的预处理好的answer数据。预处理好的answer数据需遵循以下json格式: 118 | 119 | ```json 120 | [ 121 | { 122 | "method":"method name", 123 | "total_steps": int, // a integer count total steps in answer details 124 | "final_answer": "final answer from the method", 125 | "answer_details":[{ 126 | "role":"node role, can be system, user, assistant and tool", 127 | "message":"message for the node", 128 | "next":[//next steps, can have multiple elements if the node have multiple candidates. 129 | { 130 | "role":"", 131 | "message":"", 132 | "next":[...] 133 | }, 134 | ...//more candidates 135 | ] 136 | }] 137 | } 138 | ... // more answers for the give query in the testdata 139 | ] 140 | ``` 141 | 142 | 143 | ### 更新排行榜 144 | 145 | 如果您想将您的模型的结果上传到[ToolEval Leaderboard](https://openbmb.github.io/ToolBench/),请您将您的结果文件整理成上述格式发送给我们(urtoolbench@gmail.com)或者开一个pull request。 146 | 我们将运行评测脚本更新结果并将您的模型添加到排行榜中。 147 | 148 | 149 | ### 创建新的自动评估器 150 | 如果您想创建新的自动评估器,您需要按下列步骤进行: 151 | 1. 在路径`toolbench/tooleval/evaluators`下创建一个评测器配置文件目录,命名与你的评测器名一致。在其中添加`config.yaml`文件与`template.txt`文件。具体配置方式可参考`toolbench/tooleval/evaluators/tooleval_gpt-3.5-turbo_normalized`中的实现。 152 | 2. 创建你的evaluator类并实现`fn_completions`函数在文件夹`toolbench/tooleval/evaluators/registered_cls`中,或者你可以使用我们预先定义好的类例如`OpenAINormalizedEvaluator`。 153 | 完成后将配置文件中`registered_cls_name`字段填写为该类的名称。 154 | 这里给出一个例子: 155 | ```Python 156 | from evaluators import register_evaluator,BaseEvaluator 157 | from typing import Dict,List 158 | 159 | @register_evaluator 160 | class MyEvaluator(BaseEvaluator): 161 | def __init__(self,config): 162 | super().__init__( 163 | fn_completions=self.fn_completions, 164 | ) 165 | # set your configures here 166 | 167 | def fn_completions(self,query:Dict,answers:List[Dict])->int: 168 | # implement your evaluator here 169 | # return the index of the preferred answer 170 | return 0 171 | ``` 172 | 其中register_evaluator是一个装饰器,用于注册评估器,BaseEvaluator是一个基类,用于实现评估器的基本功能。 173 | 3. 测试评估器的性能,运行脚本`evaluators_comparison.py`。 174 | -------------------------------------------------------------------------------- /evaluation/toolbench/tooleval/ToolBench.code-workspace: -------------------------------------------------------------------------------- 1 | { 2 | "folders": [ 3 | { 4 | "path": "../../../ToolBench" 5 | }, 6 | { 7 | "path": "../.." 
8 | }, 9 | { 10 | "path": "../../../STC/RapidAPI-Server" 11 | } 12 | ], 13 | "settings": { 14 | "git.ignoreLimitWarning": true 15 | } 16 | } -------------------------------------------------------------------------------- /evaluation/toolbench/tooleval/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Reason-Wang/ToolGen/6839374a255810efe69deea4056eec5c55e25802/evaluation/toolbench/tooleval/__init__.py -------------------------------------------------------------------------------- /evaluation/toolbench/tooleval/automatic_eval_sample.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import time 4 | from concurrent.futures import ThreadPoolExecutor,as_completed 5 | from tqdm import tqdm 6 | import numpy as np 7 | import argparse 8 | import random 9 | from evaluation import UserEvaluation,BaseToolMethod 10 | from evaluators import load_registered_automatic_evaluator 11 | from typing import List,Dict,Callable 12 | import pandas as pd 13 | 14 | abs_dir = os.path.split(__file__)[0] 15 | 16 | 17 | def parse_args(): 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument('--output',default=os.path.join(abs_dir,'dataset','test.json'),help='where to store the method output.') 20 | parser.add_argument('--method',default='unknown',help='what the name of the method.') 21 | parser.add_argument('--ref_method',default='gpt-3.5-turbo_CoT',help='what the reference method is') 22 | parser.add_argument('--ref_output',default=os.path.join(abs_dir,'dataset','ref_sample.json'),help='where the reference answer stored') 23 | parser.add_argument('--evaluators_cfg_path',default=os.path.join(abs_dir,'evaluators'),help='where the evaluators config files are stored') 24 | parser.add_argument('--evaluator',default='tooleval_gpt-3.5-turbo_normalized',help='which evaluator to use') 25 | parser.add_argument('--max_eval_threads',default=16,type=int,help='how many threads to use for evaluation') 26 | parser.add_argument('--evalset',default='default_evalset',help='which the evaluation dataset to use') 27 | parser.add_argument('--eval_server_address',default='http://localhost:8000',help='the address of the evaluation server') 28 | parser.add_argument('--use_existed_output',default=False,action='store_true',help='whether to use the existed output') 29 | 30 | return parser.parse_args() 31 | 32 | 33 | ## !!define your method here !! 34 | class SampleMethod(BaseToolMethod): 35 | def __init__(self): 36 | super().__init__() 37 | def forward(self,query:str,tools:List[Dict],tool_func:Callable)->Dict: 38 | return {} 39 | def convert_result_to_dict(self,result): 40 | return { 41 | 'method': 'sample', 42 | 'total_steps': 0, 43 | 'final_answer': '', 44 | 'answer_details': [] 45 | } 46 | 47 | if __name__=='__main__': 48 | args = parse_args() 49 | 50 | exec_generating_method_outputs = True 51 | if os.path.exists(args.output): 52 | print('Output file {} already exists!'.format(args.output)) 53 | if args.use_existed_output: 54 | exec_generating_method_outputs = False 55 | else: 56 | print('Overwrite? 
(y/n)') 57 | exec_generating_method_outputs = input()=='y' 58 | 59 | if exec_generating_method_outputs: 60 | ## change the SampleMethod to your method 61 | usereval = UserEvaluation(SampleMethod(),args.eval_server_address,args.evalset) 62 | print('Generating method outputs...') 63 | results = usereval.run() 64 | print('Saving method outputs...') 65 | with open(args.output,'w') as f: 66 | json.dump(results,f) 67 | else: 68 | print('Use existed output.') 69 | results = json.load(open(args.output)) 70 | 71 | print('Loading reference answer for evaluation...') 72 | try: 73 | ref_output = json.load(open(args.ref_output)) 74 | except: 75 | raise Exception('Cannot load reference answer from {}\n Please Download before evaluation!'.format(args.ref_output)) 76 | 77 | print('Loading automatic evaluators...') 78 | evaluators = [load_registered_automatic_evaluator(vars(args)) for _ in range(args.max_eval_threads)] 79 | 80 | def get_preference(qid,query,tools,ref_ans,ans,): 81 | global evaluators 82 | evaluator = random.choice(evaluators) 83 | ret = evaluator.annotate_preference( 84 | query, 85 | tools, 86 | [ref_ans,ans]) 87 | return qid,ret 88 | def get_most_preferred(d:list)->np.ndarray: 89 | if np.iterable(d): 90 | d = np.asanyarray(d) 91 | bins = np.bincount(d) 92 | max_val = np.max(bins) 93 | argmax = np.where(max_val==bins)[0] 94 | return argmax 95 | else: 96 | return np.asarray([d]) 97 | 98 | print('Evaluating...') 99 | prefer_dict = {} 100 | with ThreadPoolExecutor(args.max_eval_threads) as pool: 101 | future = [] 102 | for qid in ref_output.keys(): 103 | try: 104 | future.append(pool.submit( 105 | get_preference, 106 | qid, 107 | ref_output[qid]['query'], 108 | ref_output[qid]['available_tools'], 109 | ref_output[qid]['answer'], 110 | results[qid]['answer'] 111 | )) 112 | except KeyError as e: 113 | print('Warning : Missing answer for query {} in answer file! 
'.format(e)) 114 | 115 | for thd in tqdm(as_completed(future),total=len(future),ncols=100): 116 | qid,preference = thd.result() 117 | prefer_dict[qid] = get_most_preferred(preference)[0] 118 | 119 | prefer = list(prefer_dict.values()) 120 | 121 | prefer = np.array(prefer) 122 | df = pd.DataFrame.from_dict([{ 123 | 'Method':args.method, 124 | 'Win Rate':prefer.mean(), 125 | 'Std Error':np.std(prefer)/np.sqrt(len(prefer)) 126 | }]) 127 | print('###### Leaderboard vs {} ######'.format(args.ref_method)) 128 | print(df) 129 | save_file = os.path.join(abs_dir,'results',args.evalset,args.method) 130 | os.makedirs(save_file,exist_ok=True) 131 | df.to_csv(os.path.join(save_file,'win.csv')) 132 | -------------------------------------------------------------------------------- /evaluation/toolbench/tooleval/convert_answers.py: -------------------------------------------------------------------------------- 1 | from convert_to_answer_format import process_invalid_data,process_valid_data 2 | import json 3 | from glob import glob 4 | import os 5 | 6 | save_dir = 'path/to/save/dir' 7 | 8 | groups_dirs = ['path/to/dataset/eval/result/folders'] 9 | 10 | for groups_dir in groups_dirs: 11 | method = os.path.split(groups_dir)[1] 12 | print(method) 13 | groups_save_dir = os.path.join(save_dir,method) 14 | os.makedirs(groups_save_dir,exist_ok=True) 15 | groups = [os.path.split(g)[1] for g in glob(groups_dir+'/*')] 16 | full_answer = {} 17 | for g in groups: 18 | print(g) 19 | answer_dict = {} 20 | files = glob(os.path.join(groups_dir,g,'*.json')) 21 | for file in files: 22 | qid = os.path.split(file)[1].split('_')[0] 23 | try: 24 | data = json.load(open(file)) 25 | except: 26 | print('Read error: ',file) 27 | continue 28 | if not data['answer_generation']['valid_data']: 29 | answer_dict[qid] = process_invalid_data(method,data) 30 | else: 31 | answer_dict[qid] = process_valid_data(method,data['answer_generation']) 32 | json.dump(answer_dict,open(os.path.join(groups_save_dir,f'{g}.json'),'w')) 33 | full_answer.update(answer_dict) 34 | # json.dump(full_answer,open(os.path.join(groups_save_dir,f'fullanswer.json'),'w')) -------------------------------------------------------------------------------- /evaluation/toolbench/tooleval/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | from .usereval import UserEvaluation 2 | from .methodcls import BaseToolMethod 3 | from .dataclass import ExecutionGraph,ExecutionNode,DirectedEdge -------------------------------------------------------------------------------- /evaluation/toolbench/tooleval/evaluation/methodcls.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List,Callable 2 | 3 | class BaseToolMethod: 4 | def __init__(self): 5 | pass 6 | def convert_result_to_dict(self,result): 7 | '''Return Format 8 | -------- 9 | { 10 | 'method': 'method name', 11 | 'total_steps': int, 12 | 'final_answer': 'answer', 13 | 'answer_details': [{ 14 | "role": "system", 15 | "message": "", 16 | "next": [ 17 | { 18 | "role": "user", 19 | "message": "I am planning ...", 20 | "next": [ 21 | { 22 | "role": "tool", 23 | "message": "{'name': 'Finish', 'arguments': '{\\n \"return_type\": \"give_answer\",\\n \"final_answer\": \"I encountere...", 24 | "next": [] 25 | } 26 | ] 27 | } 28 | ] 29 | }] 30 | } 31 | 32 | ''' 33 | pass 34 | def forward(self,query:str,tools:List[Dict],tool_func:Callable)->Dict: 35 | pass 36 | 37 | def 
__call__(self,query:str,tools:List[Dict],tool_func:Callable)->Dict: 38 | result = self.forward(query,tools,tool_func) 39 | return self.convert_result_to_dict(result) 40 | 41 | -------------------------------------------------------------------------------- /evaluation/toolbench/tooleval/evaluation/usereval.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from tqdm import tqdm 3 | from typing import Union, Dict, List, Optional,Tuple 4 | from .methodcls import BaseToolMethod 5 | from .dataclass import * 6 | import json 7 | 8 | class UserEvaluation: 9 | def __init__(self, 10 | method:BaseToolMethod, 11 | eval_server_addr='http://localhost:8000', 12 | evalset='eval20230718'): 13 | self.eval_server_addr = eval_server_addr 14 | self.evalset = evalset 15 | self.method = method 16 | res = requests.post(self.eval_server_addr+'/neweval',json=self.evalset) 17 | if res.status_code != 200: 18 | raise Exception('Failed to obtain new evaluation id! Error: '+res.text) 19 | ret = res.json() 20 | self.eval_id = ret['evaluation_id'] 21 | self.len = ret['len'] 22 | 23 | def get_new_question(self)->Tuple[str,List]: 24 | res = requests.post(self.eval_server_addr+'/next_question',json=self.eval_id) 25 | if res.status_code == 204: 26 | raise EvalCompleted() 27 | if res.status_code != 200: 28 | raise Exception('Failed to obtain new question!') 29 | 30 | self.question = Question(**res.json()) 31 | self.tool_name_to_id = {} 32 | tools = [tool.model_dump() for tool in self.question.available_tools] 33 | for tool in tools: 34 | self.tool_name_to_id[tool['name']] = tool.pop('tid') 35 | 36 | 37 | return self.question.query,tools 38 | def tool_func(self,tool_name:str,tool_args:str)->requests.Response: 39 | tid = self.tool_name_to_id[tool_name] 40 | # res = requests.post(self.eval_server_addr+'/api',json={ 41 | # 'evaluation_id':self.eval_id, 42 | # 'tool_id':tid, 43 | # 'tool_args':tool_args 44 | # }) 45 | res = requests.post(self.eval_server_addr+'/rapidapi',json={ 46 | 'evaluation_id':self.eval_id, 47 | 'tool_id':tid, 48 | 'tool_args':tool_args 49 | }) 50 | 51 | return res 52 | def _forward(self,query:str,tools:List[Dict])->Dict: 53 | method_ret = self.method(query,tools,self.tool_func) 54 | 55 | return self.question.qid,{ 56 | 'query':query, 57 | 'available_tools':tools, 58 | 'answer':method_ret 59 | } 60 | 61 | 62 | def run(self)->Dict: 63 | results = {} 64 | for _ in tqdm(range(self.len),ncols=100): 65 | try: 66 | qid,ret = self._forward(*self.get_new_question()) 67 | except EvalCompleted: 68 | return results 69 | results[qid] = ret 70 | return results 71 | -------------------------------------------------------------------------------- /evaluation/toolbench/tooleval/evaluators/__init__.py: -------------------------------------------------------------------------------- 1 | from .registered_cls import BaseEvaluator,register_evaluator,get_evaluator_cls 2 | 3 | __all__=['register_evaluator','get_evaluator_cls','BaseEvaluator','load_registered_automatic_evaluator'] 4 | 5 | 6 | 7 | def load_registered_automatic_evaluator(config:dict={},evaluator_name=None,evaluators_cfg_path=None)->BaseEvaluator: 8 | import os 9 | import yaml 10 | 11 | evaluator_name = config['evaluator'] if evaluator_name is None else evaluator_name 12 | cfg_path = config['evaluators_cfg_path'] if evaluators_cfg_path is None else evaluators_cfg_path 13 | cfg_path = os.path.join(cfg_path,evaluator_name) 14 | 15 | cls_name = 
yaml.load(open(os.path.join(cfg_path,'config.yaml')),Loader=yaml.FullLoader)['registered_cls_name'] 16 | # print(evaluator_name) 17 | # print(cfg_path) 18 | # print(cls_name) 19 | 20 | evaluator:BaseEvaluator = get_evaluator_cls(cls_name)(cfg_path) 21 | # print(type(evaluator)) 22 | return evaluator -------------------------------------------------------------------------------- /evaluation/toolbench/tooleval/evaluators/registered_cls/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import BaseEvaluator 2 | from .utils import register_evaluator,get_evaluator_cls 3 | 4 | __all__ = ['register_evaluator','get_evaluator_cls','BaseEvaluator'] 5 | 6 | import os 7 | import importlib 8 | current_dir = os.path.dirname(__file__) 9 | 10 | for item in os.listdir(current_dir): 11 | item_path = os.path.join(current_dir, item) 12 | 13 | if os.path.isfile(item_path) and item != '__init__.py' and item.endswith('.py'): 14 | module_name = item[:-3] 15 | 16 | full_module_path = f"{__name__}.{module_name}" 17 | 18 | imported_module = importlib.import_module(full_module_path) 19 | 20 | globals()[module_name] = imported_module 21 | -------------------------------------------------------------------------------- /evaluation/toolbench/tooleval/evaluators/registered_cls/base.py: -------------------------------------------------------------------------------- 1 | import random 2 | from typing import List, Union, Dict, Any, Callable 3 | import os 4 | import openai 5 | import yaml 6 | from .utils import register_evaluator 7 | 8 | def process_answer(answer: Dict): 9 | answer['final_answer'] = answer['final_answer'][:1000] 10 | answer['answer_details'] = answer['answer_details'][:3000] 11 | answer.pop('method', None) 12 | return answer 13 | 14 | 15 | def process_tools(tools: List[Dict]): 16 | # print(len(tools)) 17 | for tool in tools: 18 | # try: 19 | 20 | # print(tool) 21 | tool.pop('description', None) 22 | tool.pop('parameters', None) 23 | # except Exception as e: 24 | # print(tool) 25 | # raise e 26 | return tools 27 | 28 | @register_evaluator 29 | class BaseEvaluator: 30 | """Base class for evaluators. 31 | 32 | Attributes: 33 | ---------- 34 | fn_completions : Callable[[Dict,List[Dict]],int] 35 | The completion function of the evaluator, used to get annotated results. 36 | This function should take two arguments: `task_description`:Dict and `answers`:List[Dict], return a int stand for the index of best answer. 37 | 38 | Functions: 39 | --------- 40 | annotate_preference : Callable 41 | Annotate and return the index of the preferred answer. 42 | 43 | """ 44 | def __init__(self, 45 | fn_completions: Callable[[Dict,List[Dict]],int] = None, 46 | *args, 47 | **kwargs): 48 | self.fn_completions = fn_completions 49 | # print(fn_completions) 50 | def annotate_preference(self, 51 | query: str, 52 | available_tools: List[Dict[Any, Any]], 53 | answers:List[Dict], 54 | multisample=False, 55 | sample_n=4, 56 | task_status=None, 57 | answer_statuss=[None, None]) -> Union[List[int], int]: 58 | """Annotate and return the index of the preferred answer. 59 | 60 | For given query, available tools, and two answers, return the index of the preferred answer by calling function `fn_completions` of the evaluator. 61 | 62 | Parameters: 63 | ---------- 64 | query : str 65 | The query of the task. 66 | available_tools : List[Dict[Any, Any]] 67 | The list of available tools for the task. 
The specific format of the tool is defined in `tooleval/evaluation/dataclass.py` 68 | answers : List[Dict] 69 | The list of answers for comparison. 70 | multisample : bool, optional 71 | Whether to use multisample to get the preference. If True, the function will return a list of preferences, otherwise return a single preference. 72 | sample_n : int, optional 73 | The number of samples to get the preference. 74 | 75 | Returns: 76 | ------- 77 | preference : Union[List[int], int] 78 | The index of the preferred answer. If `multisample` is True, return a list of preferences, otherwise return a single preference. 79 | 80 | Raise: 81 | ----- 82 | 83 | """ 84 | answers_processed = [process_answer(ans) for ans in answers] 85 | # print("Available tools:", available_tools) 86 | if isinstance(available_tools, dict): 87 | available_tools = list(available_tools.values()) 88 | available_tools = process_tools(available_tools) 89 | 90 | def shuffle_run() -> int: 91 | indexs = list(range(len(answers_processed))) 92 | random.shuffle(indexs) 93 | 94 | answers_projected = [answers[idx] for idx in indexs] 95 | 96 | try: 97 | preferred_index = self.fn_completions( 98 | { 99 | 'query':query, 100 | 'available_tools':available_tools, 101 | }, 102 | answers_projected, 103 | task_status, 104 | answer_statuss 105 | ) 106 | except openai.BadRequestError as e: 107 | print(f"Error: {e}, set reference model to win.") 108 | preferred_index = 0 109 | 110 | if preferred_index in indexs: 111 | return indexs.index(preferred_index) 112 | raise ValueError(f'Preferred index {preferred_index} is invalid!') 113 | 114 | if not multisample: 115 | return shuffle_run() 116 | else: 117 | prefers = [shuffle_run() for _ in range(sample_n)] 118 | return prefers 119 | 120 | @register_evaluator 121 | class ToolEvalEvaluator(BaseEvaluator): 122 | """ToolEval common evaluator class. 123 | 124 | Attributes: 125 | ---------- 126 | cfg_path : str 127 | A path store the configuration of the evaluator. 128 | 129 | 130 | """ 131 | def __init__(self, 132 | cfg_path: str = None, 133 | ): 134 | eval_config = yaml.load(open(os.path.join(cfg_path,'config.yaml')),Loader=yaml.FullLoader) 135 | template = open(os.path.join(cfg_path,eval_config['prompt_template'])).read() 136 | 137 | super().__init__( 138 | fn_completions=getattr(self,eval_config['fn_completions']) 139 | ) 140 | self.eval_config = eval_config 141 | self.template = template -------------------------------------------------------------------------------- /evaluation/toolbench/tooleval/evaluators/registered_cls/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | from typing import List,Dict 4 | import requests 5 | from tenacity import retry, wait_random_exponential, stop_after_attempt 6 | 7 | from openai import OpenAI, AzureOpenAI 8 | import random 9 | 10 | __registered_evaluators__ = {} 11 | 12 | def register_evaluator(cls): 13 | """ 14 | Decorator function to register classes with the registered_evaluators list. 15 | """ 16 | __registered_evaluators__[cls.__name__] = cls 17 | return cls 18 | 19 | def get_evaluator_cls(clsname): 20 | """ 21 | Return the evaluator class with the given name. 
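    Example (illustrative usage; 'OpenAINormalizedEvaluator' is one of the class
    names registered via @register_evaluator in this package, and the cfg_path
    shown is only a placeholder):
        cls = get_evaluator_cls('OpenAINormalizedEvaluator')
        evaluator = cls(cfg_path='path/to/tooleval_gpt-3.5-turbo_normalized')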
22 | """ 23 | try: 24 | return __registered_evaluators__.get(clsname) 25 | except: 26 | raise ModuleNotFoundError('Cannot find evaluator class {}'.format(clsname)) 27 | 28 | 29 | class OpenaiPoolRequest: 30 | def __init__(self, pool_json_file=None): 31 | self.pool:List[Dict] = [] 32 | __pool_file = pool_json_file 33 | if os.environ.get('API_POOL_FILE',None) is not None: 34 | __pool_file = os.environ.get('API_POOL_FILE') 35 | self.now_pos = random.randint(-1, len(self.pool)) 36 | if os.path.exists(__pool_file): 37 | self.pool = json.load(open(__pool_file)) 38 | self.now_pos = random.randint(-1, len(self.pool)) 39 | # print(__pool_file) 40 | if os.environ.get('OPENAI_KEY',None) is not None: 41 | self.pool.append({ 42 | 'api_key':os.environ.get('OPENAI_KEY'), 43 | 'organization':os.environ.get('OPENAI_ORG',None), 44 | 'api_type':os.environ.get('OPENAI_TYPE',None), 45 | 'api_version':os.environ.get('OPENAI_VER',None) 46 | }) 47 | 48 | # @retry(wait=wait_random_exponential(multiplier=1, max=30), stop=stop_after_attempt(10),reraise=True) 49 | def request(self,messages,**kwargs): 50 | self.now_pos = (self.now_pos + 1) % len(self.pool) 51 | key_pos = self.now_pos 52 | item = self.pool[key_pos] 53 | api_key = item['api_key'] 54 | api_version = item.get('api_version', None) 55 | api_base = item.get('api_base', None) 56 | 57 | # if kwargs.get('model') == 'gpt-4o': 58 | # client = AzureOpenAI( 59 | # api_key=api_key, 60 | # api_version=api_version, 61 | # azure_endpoint = api_base, 62 | # ) 63 | # else: 64 | if api_base: 65 | client = OpenAI(api_key=api_key, api_base=api_base) 66 | else: 67 | client = OpenAI(api_key=api_key) 68 | 69 | response = client.chat.completions.create(messages=messages,**kwargs) 70 | return response 71 | 72 | def __call__(self,messages,**kwargs): 73 | return self.request(messages,**kwargs) 74 | -------------------------------------------------------------------------------- /evaluation/toolbench/tooleval/evaluators/tooleval_gpt-3.5-turbo_default/config.yaml: -------------------------------------------------------------------------------- 1 | evaluator_name: "tooleval_gpt-3.5-turbo_default" 2 | registered_cls_name: "ReinforceToolLearningEvaluator" 3 | prompt_template: "template.txt" 4 | fn_completions: "normalized_openai_completions" 5 | apis_json: "your/path/to/api_pool.json" 6 | completions_kwargs: 7 | model: "gpt-3.5-turbo-16k" 8 | max_tokens: 1000 9 | temperature: 0.2 10 | timeout: 10 11 | functions: 12 | - name: "check_answer_status" 13 | description: "Parse the json answer with layerd nodes and return the answer_status about the answer" 14 | parameters: 15 | type: "object" 16 | properties: 17 | answer_status: 18 | type: "string" 19 | enum: ["Unsure","Unsolved","Solved"] 20 | required: ["answer_status"] 21 | - name: "parse_answer_status" 22 | description: "Parse the json answer with layerd nodes and return the answer_status about the answer" 23 | parameters: 24 | type: "object" 25 | properties: 26 | answer_status: 27 | type: "string" 28 | enum: ["Unsure","Unsolved","Solved"] 29 | required: ["answer_status"] 30 | - name: "check_task_solvable" 31 | description: "Parse the task description and return the task_status about the task" 32 | parameters: 33 | type: "object" 34 | properties: 35 | task_status: 36 | type: "string" 37 | enum: ["Unsure","Unsolvable","Solvable"] 38 | required: ["task_status"] 39 | - name: "select_better_answer" 40 | description: "Select the better answer with a comprehensive investigation on given aspects. 
You should ignore the impact of the order of candidate answers." 41 | parameters: 42 | type: "object" 43 | properties: 44 | index: 45 | type: "number" 46 | description: "The `index` value in the selected better answer." 47 | required: ["index"] 48 | fn_completion_parser: "index_parser" 49 | batch_size: 1 50 | -------------------------------------------------------------------------------- /evaluation/toolbench/tooleval/evaluators/tooleval_gpt-3.5-turbo_default/template.txt: -------------------------------------------------------------------------------- 1 | 2 | check_answer_status 3 | 4 | Giving the query and answer, you need give `answer_status` of the answer by following rules: 5 | 1. If the answer is a sorry message or not a positive/straight response for the given query, return "Unsolved". 6 | 2. If the answer is a positive/straight response for the given query, you have to further check. 7 | 2.1 If the answer is not sufficient to determine whether the solve the query or not, return "Unsure". 8 | 2.2 If you are confident that the answer is sufficient to determine whether the solve the query or not, return "Solved" or "Unsolved". 9 | 10 | Query: 11 | {query} 12 | Answer: 13 | {answer} 14 | 15 | Now give your reason in "content" and `answer_status` of JSON to `check_answer_status`. 16 | 17 | 18 | 19 | 20 | parse_answer_status 21 | 22 | Giving the query and the correspond execution detail of an answer, you need give `answer_status` of the answer by following rules: 23 | 1. If all 'tool' nodes' message indicate that there are errors happened, return "Unsolved" 24 | 2. If you find the information in the "final_answer" is not true/valid according to the messages in 'tool' nodes, return "Unsolved" 25 | 3. If you are unable to verify the authenticity and validity of the information, return "Unsure" 26 | 4. If there are 'tool' node in the chain contains successful func calling and those calling indeed solve the query, return "Solved" 27 | 28 | Query: 29 | {query} 30 | Answer: 31 | {answer} 32 | 33 | Now you are requested to give reason in "content" and `answer_status` of JSON to `parse_answer_status`. 34 | 35 | 36 | 37 | 38 | check_task_solvable 39 | 40 | Please check whether the given task solvable with following rules: 41 | 1. If the `query` provide invalid information (e.g. invalid email address or phone number), return "Unsolvable" 42 | 2. If the `query` needs more information to solve (e.g. the target restaurant name in a navigation task), return "Unsolvable" 43 | 3. If you are unable to draw a conclusion, return "Unsure" 44 | 4. If the currently `available_tools` are enough to solve the query, return "Solvable" 45 | 46 | Task: 47 | {task} 48 | 49 | Now give your reason in "content" and `task_status` of JSON to `check_task_solvable`. 50 | 51 | 52 | 53 | 54 | 55 | 56 | select_better_answer 57 | 58 | Query: 59 | {query} 60 | 61 | Answer_0: 62 | {answer_0} 63 | 64 | Answer_1: 65 | {answer_1} 66 | 67 | Given above query and answers in JSON format, you must follow the rules to select the relatively better answer and give the index of the answer **(0 for Answer_0, 1 for Answer_1)**: 68 | 1. Compare the value of "final_answer" in following aspects: 69 | - Informative: whether it contains all necessary information to reply to the query. 70 | - Factuality: whether it accurately describes what has been done, and what failed in the end. 71 | - Reasoning: If answer does not solve the query, whether gives a detailed and accurate reason for failure. 72 | 2. 
If you cannot determine yet, compare the value of "answer_details" in following aspects: 73 | - Tool calling costs: calculating the percentage of failed and replicated tools calling. 74 | - Running costs: calculating the total tokens T used in execution. 75 | - Milestone: calculating the milestone(fixed subtasks) reached in execution. 76 | - Exploration: whether tries potential useful tools in execution. Just count times of successful tool calling with different tools/arguments in execution. 77 | 78 | If you have made your decision, calling `select_better_answer`, else if you cannot determine, select a random answer. 79 | 80 | -------------------------------------------------------------------------------- /evaluation/toolbench/tooleval/evaluators/tooleval_gpt-3.5-turbo_fn/config.yaml: -------------------------------------------------------------------------------- 1 | evaluator_name: "tooleval_gpt-3.5-turbo_fn" 2 | registered_cls_name: "OpenAIEvaluator" 3 | prompt_template: "template.txt" 4 | fn_completions: "openai_completions" 5 | apis_json: "your/path/to/api_pool.json" 6 | completions_kwargs: 7 | model: "gpt-3.5-turbo-16k" 8 | max_tokens: 100 9 | temperature: 0 10 | timeout: 10 11 | function_call: 12 | name: "choose_preference" 13 | functions: 14 | - name: "choose_preference" 15 | description: "Choose the preferred answer for the query within all given answers." 16 | parameters: 17 | type: "object" 18 | properties: 19 | preference: 20 | type: "number" 21 | description: "The index of the preferred answer in all given answers." 22 | required: [ "preference" ] 23 | fn_completion_parser: "index_parser" 24 | batch_size: 1 25 | -------------------------------------------------------------------------------- /evaluation/toolbench/tooleval/evaluators/tooleval_gpt-3.5-turbo_fn/template.txt: -------------------------------------------------------------------------------- 1 | 2 | system 3 | You are a helpful annotator, that help user to annotate data. 4 | 5 | 6 | user 7 | Giving task description and candidate answers, I want you to choose one preferred answer based on the rules. To do so, I will give you the task description that given to the models, and the candidate answers in a list for chosen. To choose the one preferred answer, you need to first analyse answers based on rules, then give the index number of the preferred answer of JSON to `choose_preference`. 8 | 9 | Here are the preference rules: 10 | 1. if both answers give the none empty `final_answer`, check whether the given `final_answer` solves the given query. 11 | 1.1 if both answers solve the query, choose one with smaller `total_steps`. 12 | 1.1.1 if `total_steps` are same, choose one answer with better `final_answer` quality. 13 | 1.2 if one answer solve while the other not, chose the answer that solve query. 14 | 1.3 if both answers failed, check the `answer_details` to choose one with considering following preference: 15 | 1.3.1 check `response` and prefer more successful tool calling. 16 | 1.3.2 check `name` and prefer using more various tool usage. 17 | 1.3.3 prefer smaller `total_steps`. 18 | 2. if one give none empty `final_answer` while other not, choose the one give `final_answer`. 19 | 3. if both failed to give none empty `final_answer`, following 1.3 to choose one with better `answer_details`. 
20 | 21 | Here is the task description in JSON format: 22 | {task_description} 23 | 24 | Here are the candidate answers in JSON format: 25 | {answers} 26 | 27 | Now choose the preferred answer by analysing results and the rules given, return the index in range [0,1]. 28 | -------------------------------------------------------------------------------- /evaluation/toolbench/tooleval/evaluators/tooleval_gpt-3.5-turbo_normalized/config.yaml: -------------------------------------------------------------------------------- 1 | evaluator_name: "tooleval_gpt-3.5-turbo_normalized" 2 | registered_cls_name: "OpenAINormalizedEvaluator" 3 | prompt_template: "template.txt" 4 | fn_completions: "normalized_openai_completions" 5 | apis_json: "your/path/to/api_pool.json" 6 | completions_kwargs: 7 | model: "gpt-3.5-turbo-16k" 8 | max_tokens: 100 9 | temperature: 0 10 | timeout: 10 11 | functions: 12 | - name: "parse_answer_details" 13 | description: "Parse the json answer with layerd nodes and return the informations about the answer" 14 | parameters: 15 | type: "object" 16 | properties: 17 | succeed_tool_calling: 18 | type: "number" 19 | description: "Give the number of times that the 'tool' nodes' message is called successfully without any errors in the response" 20 | used_tool_types: 21 | type: "number" 22 | description: "Give the number of different 'name' in 'tool' nodes' message" 23 | required: [ "succeed_tool_calling", "used_tool_types"] 24 | - name: "select_best_final_answer" 25 | description: "For given query, select the best answer in answers list and return the index of the best answer" 26 | parameters: 27 | type: "object" 28 | properties: 29 | best_answer_index: 30 | type: "number" 31 | description: "The index of the best answer in the answer list, start from 0" 32 | required: [ "best_answer_index"] 33 | - name: "check_solve_query" 34 | description: "Check whether the given answer solve the given query, return true or false" 35 | parameters: 36 | type: "object" 37 | properties: 38 | is_solved: 39 | type: "boolean" 40 | description: "true if solved and false if not" 41 | required: ["is_solved"] 42 | fn_completion_parser: "index_parser" 43 | batch_size: 1 44 | -------------------------------------------------------------------------------- /evaluation/toolbench/tooleval/evaluators/tooleval_gpt-3.5-turbo_normalized/template.txt: -------------------------------------------------------------------------------- 1 | 2 | parse_answer_details 3 | 4 | Giving answer details in the following JSON format: 5 | {answer_details} 6 | 7 | I want you to parse the answer details and give the information of JSON to `parse_answer_details`. Now parse the answer. 8 | 9 | 10 | 11 | select_best_final_answer 12 | 13 | For query {query}, you have the following answers in JSON format: 14 | {final_answers} 15 | 16 | I want you to select the best answer from the above answers and give the index of the answer of JSON to `select_best_final_answer`. Now select the best answer. 17 | 18 | 19 | 20 | check_solve_query 21 | 22 | Please check whether the answer solve the query or not. 23 | Query: 24 | {query} 25 | 26 | Answer: 27 | {final_answer} 28 | 29 | Now give your judgment of JSON to `check_solve_query`, remember do not be too strict. 
30 | 31 | -------------------------------------------------------------------------------- /evaluation/toolbench/tooleval/evaluators_comparison.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import json 3 | from concurrent.futures import ThreadPoolExecutor,as_completed 4 | from tqdm import tqdm 5 | from evaluators import load_registered_automatic_evaluator 6 | import os 7 | import numpy as np 8 | import copy 9 | from typing import List 10 | from scipy.stats import pearsonr,spearmanr 11 | import random 12 | random.seed(42) 13 | 14 | abs_dir = os.path.split(__file__)[0] 15 | annotated_data = json.load(open(os.path.join(abs_dir,'dataset/human_cross_annotated_data.json'))) 16 | NUM_WORKERS=16 17 | 18 | def get_most_preferred(d:list)->np.ndarray: 19 | if np.iterable(d): 20 | d = np.asanyarray(d) 21 | bins = np.bincount(d) 22 | max_val = np.max(bins) 23 | argmax = np.where(max_val==bins)[0] 24 | return argmax 25 | else: 26 | return np.asarray([d]) 27 | 28 | def agreement_score(x,ref:list)->float: 29 | majority_x = get_most_preferred(x) 30 | majority_ref = get_most_preferred(ref) 31 | score_unit = 1/len(majority_x)/len(majority_ref) 32 | score = 0.0 33 | for x in majority_x: 34 | if x in majority_ref: 35 | score += score_unit 36 | return score 37 | def get_correlation(x,y): 38 | x= np.asarray(x) 39 | y = np.asarray(y) 40 | x = x+1 41 | y = y+1 42 | if np.var(x)==0 or np.var(y)==0: 43 | return float(random.choice(get_most_preferred(x))==random.choice(get_most_preferred(y))) 44 | return pearsonr(x,y)[0] 45 | 46 | def test_on_annotated_data(evaluator_cfg)->List[List[int]]: 47 | evaluators = [load_registered_automatic_evaluator(evaluator_cfg) for _ in range(NUM_WORKERS)] 48 | def get_preference(idx): 49 | data = annotated_data[idx] 50 | def process_tools(tools:list): 51 | for tool in tools: 52 | tool.pop('description',None) 53 | tool.pop('parameters',None) 54 | return tools 55 | 56 | tools = process_tools(data['available_tools']) 57 | ret = evaluators[idx%NUM_WORKERS].annotate_preference( 58 | data['query'], 59 | tools, 60 | data['answers'],multisample=True) 61 | return idx,ret 62 | prefer_dict = {} 63 | with ThreadPoolExecutor(NUM_WORKERS) as pool: 64 | # future = [pool.submit(get_preference,idx) for idx in range(100)] 65 | future = [pool.submit(get_preference,idx) for idx in range(len(annotated_data))] 66 | for thd in tqdm(as_completed(future),total=len(future),ncols=100): 67 | if thd.exception() is not None: 68 | pool.shutdown(cancel_futures=True) 69 | raise thd.exception() 70 | exit(-1) 71 | idx,preference = thd.result() 72 | prefer_dict[idx] = preference 73 | prefer = [prefer_dict[idx] for idx in range(len(future))] 74 | return prefer 75 | 76 | def get_popped_and_rest(d:list,index:int): 77 | l = copy.deepcopy(d) 78 | popped = l.pop(index) 79 | return popped,l 80 | 81 | def calculate_human_performance(): 82 | human_agreement = [] 83 | variance = [] 84 | for data in annotated_data: 85 | agreement_scores = [ 86 | agreement_score(*get_popped_and_rest(data['preference'],idx)) 87 | for idx in range(len(data['preference'])) 88 | ] 89 | human_agreement.append(np.mean(agreement_scores)) 90 | variance.append(np.var([1-agreement_scores[idx] for idx in range(len(agreement_scores))])) 91 | 92 | 93 | return { 94 | 'human_agreement':np.mean(human_agreement), 95 | 'bias':0, 96 | 'variance':np.mean(variance) 97 | } 98 | 99 | 100 | 101 | def calculate_evaluator_performance(evaluator_preference,human_preference): 102 | human_agreement = [] 103 | 
bias = [] 104 | variance = [] 105 | assert len(evaluator_preference)==len(human_preference),'length of evaluator_preference and human_preference should be the same!' 106 | correlation = [] 107 | for idx in range(len(evaluator_preference)): 108 | human_pref = human_preference[idx] 109 | evaluator_pref = evaluator_preference[idx] 110 | 111 | human_agreement.append([ 112 | agreement_score(pref,human_pref) for pref in evaluator_pref 113 | ]) 114 | bias.append( 115 | 1 - agreement_score(human_pref,evaluator_pref) 116 | ) 117 | variance.append( 118 | np.var([1-score for score in human_agreement[-1]]) 119 | ) 120 | correlation.append(get_correlation(human_pref,evaluator_pref)) 121 | 122 | return{ 123 | 'correlation': np.mean(correlation), 124 | 'human_agreement':np.mean(np.mean(human_agreement,axis=1)), 125 | 'bias':np.mean(bias), 126 | 'variance':np.mean(variance) 127 | } 128 | 129 | if __name__=='__main__': 130 | evaluators = ['tooleval_gpt-3.5-turbo_normalized',] 131 | human_perference = [ 132 | data['preference'] for data in annotated_data 133 | ] 134 | 135 | evaluator_performance = [calculate_human_performance()] 136 | for evaluator in evaluators: 137 | if not os.path.exists(os.path.join(abs_dir,'dataset',f'performance_{evaluator}.npy')): 138 | evaluator_cfg = { 139 | 'evaluators_cfg_path':os.path.join(abs_dir,'evaluators'), 140 | 'evaluator':evaluator 141 | } 142 | evaluator_perference = test_on_annotated_data(evaluator_cfg) 143 | np.save(os.path.join(abs_dir,'dataset',f'performance_{evaluator}.npy'),evaluator_perference) 144 | 145 | evaluator_perference = np.load(os.path.join(abs_dir,'dataset',f'performance_{evaluator}.npy'),allow_pickle=True) 146 | performance = calculate_evaluator_performance(evaluator_perference,human_perference) 147 | print(performance) 148 | evaluator_performance.append(performance) 149 | 150 | df = pd.DataFrame(evaluator_performance,index=['human']+evaluators) 151 | df.to_csv(os.path.join(abs_dir,'dataset','evaluator_performance.csv')) 152 | print(df) -------------------------------------------------------------------------------- /evaluation/toolbench/tooleval/requirements.txt: -------------------------------------------------------------------------------- 1 | tqdm 2 | numpy 3 | pandas 4 | pydantic 5 | tenacity 6 | openai 7 | pyyaml -------------------------------------------------------------------------------- /evaluation/toolbench/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | import torch 4 | import transformers 5 | import transformers.models.llama.modeling_llama 6 | from functools import partial 7 | 8 | 9 | def process_system_message(system_message, functions): 10 | assert "with a function call to actually excute your step." in system_message 11 | # we find that following ReACT format and merging the thought node and function call node is easier for model to learn to integrate the action input json string in its prediction than learn to predict a json string directly. 12 | system_message = system_message.replace("with a function call to actually excute your step.", "with a function call to actually excute your step. Your output should follow this format:\nThought:\nAction\nAction Input:\n") 13 | # add all the function dicts in the prompt. 
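    # Illustrative sketch (not produced verbatim by this function): after the two
    # concatenations below, the system message ends roughly like
    #   "... Your output should follow this format:\nThought:\nAction\nAction Input:\n"
    #   + "\nSpecifically, you have access to the following APIs: [{'name': ..., 'description': ..., 'parameters': {...}}, ...]"
    # where the trailing list is simply str(functions).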
14 | system_message = system_message + "\nSpecifically, you have access to the following APIs: " + str(functions) 15 | return system_message 16 | 17 | def get_gpu_memory(max_gpus=None): 18 | """Get available memory for each GPU.""" 19 | gpu_memory = [] 20 | num_gpus = ( 21 | torch.cuda.device_count() 22 | if max_gpus is None 23 | else min(max_gpus, torch.cuda.device_count()) 24 | ) 25 | 26 | for gpu_id in range(num_gpus): 27 | with torch.cuda.device(gpu_id): 28 | device = torch.cuda.current_device() 29 | gpu_properties = torch.cuda.get_device_properties(device) 30 | total_memory = gpu_properties.total_memory / (1024**3) 31 | allocated_memory = torch.cuda.memory_allocated() / (1024**3) 32 | available_memory = total_memory - allocated_memory 33 | gpu_memory.append(available_memory) 34 | return gpu_memory 35 | 36 | 37 | def standardize_category(category): 38 | save_category = category.replace(" ", "_").replace(",", "_").replace("/", "_") 39 | while " " in save_category or "," in save_category: 40 | save_category = save_category.replace(" ", "_").replace(",", "_") 41 | save_category = save_category.replace("__", "_") 42 | return save_category 43 | 44 | def standardize(string): 45 | res = re.compile("[^\\u4e00-\\u9fa5^a-z^A-Z^0-9^_]") 46 | string = res.sub("_", string) 47 | string = re.sub(r"(_)\1+","_", string).lower() 48 | while True: 49 | if len(string) == 0: 50 | return string 51 | if string[0] == "_": 52 | string = string[1:] 53 | else: 54 | break 55 | while True: 56 | if len(string) == 0: 57 | return string 58 | if string[-1] == "_": 59 | string = string[:-1] 60 | else: 61 | break 62 | if string[0].isdigit(): 63 | string = "get_" + string 64 | return string 65 | 66 | def change_name(name): 67 | change_list = ["from", "class", "return", "false", "true", "id", "and"] 68 | if name in change_list: 69 | name = "is_" + name 70 | return name 71 | 72 | # code adapted from https://huggingface.co/kaiokendev/superhot-13b-8k-no-rlhf-test/blob/main/llama_rope_scaled_monkey_patch.py 73 | class CondenseRotaryEmbedding(torch.nn.Module): 74 | def __init__(self, dim, ratio, max_position_embeddings=2048, base=10000, device=None): 75 | super().__init__() 76 | inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float().to(device) / dim)) 77 | self.register_buffer("inv_freq", inv_freq) 78 | 79 | # Build here to make `torch.jit.trace` work. 80 | self.ratio = ratio 81 | max_position_embeddings *= ratio 82 | print(f"Condensing Positional embeddings from {max_position_embeddings} to {max_position_embeddings // ratio}") 83 | self.max_seq_len_cached = max_position_embeddings 84 | t = torch.arange(self.max_seq_len_cached, device=self.inv_freq.device, dtype=self.inv_freq.dtype) / ratio 85 | freqs = torch.einsum("i,j->ij", t, self.inv_freq) 86 | # Different from paper, but it uses a different permutation in order to obtain the same calculation 87 | emb = torch.cat((freqs, freqs), dim=-1) 88 | dtype = torch.get_default_dtype() 89 | self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False) 90 | self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False) 91 | 92 | def forward(self, x, seq_len=None): 93 | # x: [bs, num_attention_heads, seq_len, head_size] 94 | # This `if` block is unlikely to be run after we build sin/cos in `__init__`. Keep the logic here just in case. 
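        # Worked example of the condensation (comment only): with ratio=4 and an
        # original max_position_embeddings of 2048, __init__ builds the sin/cos
        # cache for 8192 positions but divides every position index by 4
        # (t = [0, 0.25, 0.5, ...]), so the rotary angles stay in the range seen
        # during pretraining while the usable context window grows 4x.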
95 | if seq_len > self.max_seq_len_cached: 96 | self.max_seq_len_cached = seq_len 97 | t = torch.arange(self.max_seq_len_cached, device=x.device, dtype=self.inv_freq.dtype) / self.ratio 98 | freqs = torch.einsum("i,j->ij", t, self.inv_freq) 99 | # Different from paper, but it uses a different permutation in order to obtain the same calculation 100 | emb = torch.cat((freqs, freqs), dim=-1).to(x.device) 101 | self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(x.dtype), persistent=False) 102 | self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(x.dtype), persistent=False) 103 | return ( 104 | self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype), 105 | self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype), 106 | ) 107 | 108 | def replace_llama_with_condense(ratio): 109 | transformers.models.llama.modeling_llama.LlamaRotaryEmbedding = partial(CondenseRotaryEmbedding, ratio=ratio) 110 | 111 | 112 | def process_retrieval_ducoment(documents_df): 113 | ir_corpus = {} 114 | corpus2tool = {} 115 | for row in documents_df.itertuples(): 116 | doc = json.loads(row.document_content) 117 | ir_corpus[row.docid] = (doc.get('category_name', '') or '') + ', ' + \ 118 | (doc.get('tool_name', '') or '') + ', ' + \ 119 | (doc.get('api_name', '') or '') + ', ' + \ 120 | (doc.get('api_description', '') or '') + \ 121 | ', required_params: ' + json.dumps(doc.get('required_parameters', '')) + \ 122 | ', optional_params: ' + json.dumps(doc.get('optional_parameters', '')) + \ 123 | ', return_schema: ' + json.dumps(doc.get('template_response', '')) 124 | corpus2tool[(doc.get('category_name', '') or '') + ', ' + \ 125 | (doc.get('tool_name', '') or '') + ', ' + \ 126 | (doc.get('api_name', '') or '') + ', ' + \ 127 | (doc.get('api_description', '') or '') + \ 128 | ', required_params: ' + json.dumps(doc.get('required_parameters', '')) + \ 129 | ', optional_params: ' + json.dumps(doc.get('optional_parameters', '')) + \ 130 | ', return_schema: ' + json.dumps(doc.get('template_response', ''))] = doc['category_name'] + '\t' + doc['tool_name'] + '\t' + doc['api_name'] 131 | return ir_corpus, corpus2tool -------------------------------------------------------------------------------- /evaluation/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Reason-Wang/ToolGen/6839374a255810efe69deea4056eec5c55e25802/evaluation/utils/__init__.py -------------------------------------------------------------------------------- /evaluation/utils/embedding.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from openai import OpenAI 4 | import tiktoken 5 | from tqdm import tqdm 6 | 7 | 8 | def truncate_text_tokens(text, max_tokens=4096): 9 | # Truncate texts to 4096 tokens 10 | encoding = tiktoken.get_encoding("cl100k_base") 11 | return encoding.encode(text)[:max_tokens] 12 | 13 | 14 | def get_openai_embeddings(texts, batch_size, model, api_key): 15 | client = OpenAI(api_key=api_key) 16 | texts = [text.replace("\n", " ") for text in texts] 17 | # Truncate texts to 4096 tokens 18 | truncated_text_tokens = [truncate_text_tokens(text) for text in texts] 19 | 20 | embeddings = [] 21 | for i in tqdm(range(0, len(truncated_text_tokens), batch_size)): 22 | batch = truncated_text_tokens[i:i + batch_size] 23 | data = client.embeddings.create(input=batch, model=model).data 24 | embedding = [d.embedding for d in data] 25 | embeddings.extend(embedding) 26 | 27 | 
return np.array(embeddings) 28 | # return client.embeddings.create(input=texts, model=model).data[0].embedding 29 | 30 | 31 | def get_embeddings(model, device, texts, batch_size=16): 32 | model.eval() 33 | model.to(device) 34 | # tbar = tqdm(dataloader) 35 | embeddings = [] 36 | with torch.no_grad(): 37 | for i in range(0, len(texts), batch_size): 38 | batch = texts[i:i + batch_size] 39 | embeddings.append(model.encode(batch, device=device)) 40 | return np.concatenate(embeddings) 41 | 42 | -------------------------------------------------------------------------------- /evaluation/utils/retrieval.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from rank_bm25 import BM25Okapi 3 | from nltk.tokenize import word_tokenize 4 | import faiss 5 | 6 | class BM25Indexer: 7 | def __init__(self, corpus, ids=None): 8 | self.corpus = corpus 9 | if ids is None: 10 | self.ids = list(range(len(corpus))) 11 | else: 12 | self.ids = ids 13 | self.tokenized_corpus = [word_tokenize(document.lower()) for document in corpus] 14 | self.bm25 = BM25Okapi(self.tokenized_corpus) 15 | 16 | def search(self, queries, top_n=5): 17 | if isinstance(queries, str): 18 | queries = [queries] 19 | tokenized_queries = [word_tokenize(query.lower()) for query in queries] 20 | docs_scores = [self.bm25.get_scores(tokenized_query) for tokenized_query in tokenized_queries] 21 | 22 | docs_scores = [[(score, idx) for idx, score in enumerate(doc_scores)] for doc_scores in docs_scores] 23 | scores_ids = [sorted(doc_scores, reverse=True)[:top_n] for doc_scores in docs_scores] 24 | 25 | # For tests only 26 | # scores_ids = [(doc_scores)[:top_n] for doc_scores in docs_scores] 27 | 28 | new_scores_ids = [] 29 | for score_ids in scores_ids: 30 | new_score_ids = [] 31 | for score, idx in score_ids: 32 | new_score_ids.append((score, self.ids[idx])) 33 | new_scores_ids.append(new_score_ids) 34 | 35 | return new_scores_ids 36 | 37 | 38 | 39 | 40 | class Indexer: 41 | def __init__(self, embeddings, vector_size, ids=None, similarity="cosine"): 42 | self.index = faiss.IndexFlatIP(vector_size) 43 | self.similarity = similarity 44 | if similarity == "cosine": 45 | embeddings /= np.linalg.norm(embeddings, axis=1)[:, None] 46 | self.index.add(embeddings) 47 | if ids is None: 48 | self.ids = list(range(embeddings.shape[0])) 49 | else: 50 | self.ids = ids 51 | 52 | def search(self, queries: np.ndarray, top_n: int): 53 | if len(queries.shape) == 1: 54 | queries = queries.reshape(1, -1) 55 | try: 56 | if self.similarity == "cosine": 57 | queries /= np.linalg.norm(queries, axis=1)[:, None] 58 | scores, indexes = self.index.search(queries, top_n) 59 | except AttributeError: 60 | print(queries) 61 | scores_ids = [] 62 | for top_n_score, top_n_idx in zip(scores, indexes): 63 | top_n_score_id = [] 64 | for s, i in zip(top_n_score, top_n_idx): 65 | top_n_score_id.append((s, self.ids[i])) 66 | scores_ids.append(top_n_score_id) 67 | 68 | return scores_ids 69 | 70 | 71 | if __name__ == "__main__": 72 | texts = [ 73 | "A man standing in front of a building", 74 | "Mooncake is a Chinese bakery product traditionally eaten during the Mid-Autumn Festival", 75 | "PCA is a statistical procedure that uses an orthogonal transformation to convert a set of observations of possibly correlated variables ...", 76 | "The quick brown fox jumps over the lazy dog", 77 | "Barack Obama was the 44th president of the United States", 78 | "The Great Wall of China is a series of fortifications made of stone, brick, tamped
earth, wood, and other materials, generally built along an east-to-west line across the historical northern borders of China", 79 | "The domestic dog is a domesticated descendant of the wolf", 80 | "The original cat species to evolve into the domestic cat is the African wildcat", 81 | "Camels and llamas are common pack animals", 82 | ] 83 | query = ["Give me some facts about animals.", "What is the Great Wall of China?"] 84 | from sentence_transformers import SentenceTransformer 85 | model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2") 86 | embeddings = model.encode(texts) 87 | query_embedding = model.encode(query) 88 | indexer = Indexer(embeddings, embeddings.shape[1]) 89 | scores_docids = indexer.search(query_embedding, top_n=3) 90 | print(scores_docids) 91 | top_ids = [[score_id[1] for score_id in score_ids] for score_ids in scores_docids] 92 | print(top_ids) 93 | best_docs = [texts[ids[0]] for ids in top_ids] 94 | print(best_docs) 95 | -------------------------------------------------------------------------------- /evaluation/utils/utils.py: -------------------------------------------------------------------------------- 1 | import time 2 | from openai import AzureOpenAI, OpenAI 3 | from tqdm import tqdm 4 | from evaluation.toolbench.utils import change_name, standardize 5 | from transformers import LogitsProcessor 6 | from typing import List 7 | import torch 8 | 9 | def get_toolbench_name(tool_name, api_name): 10 | tool_name = standardize(tool_name) 11 | api_name = change_name(standardize(api_name)) 12 | toolbench_name = api_name+f"_for_{tool_name}" 13 | toolbench_name = toolbench_name[-64:] 14 | return toolbench_name 15 | 16 | class DisjunctiveTrie: 17 | def __init__(self, nested_token_ids: List[List[int]], no_subsets=True): 18 | r""" 19 | A helper class that builds a trie with the words represented in `nested_token_ids`. 20 | """ 21 | self.max_height = max([len(one) for one in nested_token_ids]) 22 | 23 | root = {} 24 | for token_ids in nested_token_ids: 25 | level = root 26 | for tidx, token_id in enumerate(token_ids): 27 | if token_id not in level: 28 | level[token_id] = {} 29 | 30 | level = level[token_id] 31 | 32 | if no_subsets and self.has_subsets(root, nested_token_ids): 33 | raise ValueError( 34 | "Each list in `nested_token_ids` can't be a complete subset of another list, but is" 35 | f" {nested_token_ids}." 36 | ) 37 | 38 | self.trie = root 39 | 40 | def next_tokens(self, current_seq): 41 | """ 42 | The next possible tokens that will progress the trie, given the current sequence of tokens in `current_seq`. 43 | """ 44 | start = self.trie 45 | 46 | for current_token in current_seq: 47 | start = start[current_token] 48 | 49 | next_tokens = list(start.keys()) 50 | 51 | return next_tokens 52 | 53 | def reached_leaf(self, current_seq): 54 | next_tokens = self.next_tokens(current_seq) 55 | 56 | return len(next_tokens) == 0 57 | 58 | def count_leaves(self, root): 59 | next_nodes = list(root.values()) 60 | if len(next_nodes) == 0: 61 | return 1 62 | else: 63 | return sum([self.count_leaves(nn) for nn in next_nodes]) 64 | 65 | def has_subsets(self, trie, nested_token_ids): 66 | """ 67 | Returns whether # of leaves == # of words. Otherwise some word is a subset of another. 
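        Example: for nested_token_ids = [[1, 2], [1, 2, 3]] the trie shares the
        [1, 2] prefix and has a single leaf while there are two words, so this
        returns True ([1, 2] is a subset of [1, 2, 3]); for [[1, 2], [1, 3]]
        there are two leaves and two words, so it returns False.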
68 | """ 69 | leaf_count = self.count_leaves(trie) 70 | return len(nested_token_ids) != leaf_count 71 | 72 | 73 | class AllowTokenIdsProcessor(LogitsProcessor): 74 | def __init__(self, allowed_token_ids: List[int]): 75 | self.allowed_token_ids = allowed_token_ids 76 | 77 | def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor): 78 | mask = torch.ones_like(scores, dtype=torch.bool) 79 | mask[:, self.allowed_token_ids] = False 80 | scores = scores.masked_fill(mask, -1e10) 81 | 82 | return scores 83 | 84 | 85 | class AllowKeyWordsProcessor(LogitsProcessor): 86 | ''' renxi.wang@mbzuai.ac.ae 87 | A logits processor that limit output text to be in a set of predefined keywords. 88 | tokenizer: tokenizer used to encode the keywords 89 | trie: DisjunctiveTrie of predefined keywords 90 | input_ids: input_ids of the prompt that the model is generating from 91 | return: 92 | scores: scores of the logits, where impossible tokens are masked 93 | For beam search, scores are log-softmax of logits, others are logits 94 | ''' 95 | def __init__(self, tokenizer, trie, input_ids): 96 | self.tokenizer = tokenizer 97 | self.trie = trie 98 | self.input_ids = input_ids 99 | 100 | def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor): 101 | input_length = self.input_ids.shape[1] 102 | generated_ids = input_ids[:, input_length:].tolist() 103 | new_token_ids = [] 104 | for ids in generated_ids: 105 | try: 106 | next_token_ids = self.trie.next_tokens(ids) 107 | except KeyError as e: 108 | next_token_ids = [self.tokenizer.eos_token_id] 109 | if not next_token_ids: 110 | next_token_ids = [self.tokenizer.eos_token_id] 111 | new_token_ids.append(next_token_ids) 112 | 113 | for row, token_ids in enumerate(new_token_ids): 114 | mask = torch.ones_like(scores[row], dtype=torch.bool) 115 | mask[torch.tensor(token_ids)] = False 116 | scores[row, mask] = -1e10 117 | 118 | return scores 119 | 120 | 121 | def openai_client_request(client, model, messages, num_retries: int = 5, return_dict: bool = True, **kwargs): 122 | print(f"Arguments: {kwargs}") 123 | response = {} 124 | # retry request (handles connection errors, timeouts, and overloaded API) 125 | for i in range(num_retries): 126 | try: 127 | response = client.chat.completions.create( 128 | model=model, 129 | messages=messages, 130 | **kwargs 131 | ) 132 | # response['success'] = True 133 | break 134 | except Exception as e: 135 | # response['success'] = False 136 | tqdm.write(str(e)) 137 | tqdm.write("Retrying...") 138 | time.sleep(10) 139 | if return_dict: 140 | return response 141 | else: 142 | return response.choices[0].message.content 143 | 144 | 145 | class OpenAIChatModel: 146 | def __init__(self, model: str, api_key, api_base=None, api_version=None, azure_endpoint=None, temperature: float=None, stop: List[str]=None): 147 | self.model = model 148 | if api_base: 149 | self.client = OpenAI(api_key=api_key, api_base=api_base) 150 | else: 151 | self.client = OpenAI(api_key=api_key) 152 | self.temperature = temperature 153 | self.stop = stop 154 | 155 | def generate(self, messages: List, temperature: float = None, stop: List[str] = None, print_prompt=False): 156 | if print_prompt: 157 | print(messages) 158 | 159 | kwargs = {} 160 | if self.temperature: 161 | kwargs['temperature'] = self.temperature 162 | elif temperature: 163 | kwargs['temperature'] = temperature 164 | if self.stop: 165 | kwargs['stop'] = self.stop 166 | 167 | 168 | temperature=self.temperature if self.temperature else temperature, 169 | response = 
openai_client_request( 170 | client=self.client, 171 | model=self.model, 172 | messages=messages, 173 | return_dict=False, 174 | **kwargs 175 | ) 176 | 177 | return response 178 | 179 | 180 | def seed_everything(seed: int): 181 | import random, os 182 | import numpy as np 183 | import torch 184 | 185 | random.seed(seed) 186 | os.environ['PYTHONHASHSEED'] = str(seed) 187 | np.random.seed(seed) 188 | torch.manual_seed(seed) 189 | torch.cuda.manual_seed(seed) 190 | torch.backends.cudnn.deterministic = True 191 | torch.backends.cudnn.benchmark = True -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | backoff==2.2.1 2 | click==8.1.7 3 | faiss_cpu 4 | Flask==3.0.3 5 | flask_cors==5.0.0 6 | fschat==0.2.36 7 | httpx==0.27.2 8 | huggingface_hub==0.24.6 9 | nltk==3.9.1 10 | numpy 11 | openai 12 | pandas==2.2.3 13 | peft 14 | psutil 15 | pydantic==2.9.2 16 | PyYAML 17 | rank_bm25==0.2.2 18 | Requests==2.32.3 19 | scikit_learn==1.5.2 20 | scipy==1.14.1 21 | sentence_transformers==3.1.0 22 | tenacity==8.5.0 23 | termcolor==2.5.0 24 | tiktoken==0.7.0 25 | torch==2.4.1 26 | tqdm 27 | transformers 28 | Unidecode 29 | -------------------------------------------------------------------------------- /scripts/convert_answer/run_convert_answer.sh: -------------------------------------------------------------------------------- 1 | export RAW_ANSWER_PATH=data/answer 2 | export CONVERTED_ANSWER_PATH=data/model_predictions_converted 3 | export MODEL_NAME=test 4 | export test_set=G2_instruction 5 | method="CoT@1" 6 | 7 | mkdir -p ${CONVERTED_ANSWER_PATH}/${MODEL_NAME} 8 | answer_dir=${RAW_ANSWER_PATH}/${MODEL_NAME}/${test_set} 9 | output_file=${CONVERTED_ANSWER_PATH}/${MODEL_NAME}/${test_set}.json 10 | echo ${output_file} 11 | python -m evaluation.toolbench.tooleval.convert_to_answer_format\ 12 | --answer_dir ${answer_dir} \ 13 | --method ${method} \ 14 | --output ${output_file} -------------------------------------------------------------------------------- /scripts/eval_full_pipeline.sh: -------------------------------------------------------------------------------- 1 | export TOOLBENCH_KEY="Put your ToolBench key here" 2 | export OPENAI_KEY="Put your OpenAI key here" 3 | export PYTHONPATH=./ 4 | export GPT_MODEL="gpt-3.5-turbo-16k" 5 | export SERVICE_URL="http://localhost:8080/virtual" 6 | 7 | # MODEL_NAME=virtual-gpt35-16k-step16-cot 8 | # model_path="virtual-gpt35-16k-step16-cot" 9 | # backbone_model="chatgpt_function" 10 | # function_provider="truth" 11 | 12 | # MODEL_NAME="ToolLlama-v2-t0.0-cot" 13 | # model_path="ToolBench/ToolLLaMA-2-7b-v2" 14 | # indexing="None" 15 | # function_provider="truth" 16 | # backbone_model="toolllama" 17 | 18 | # MODEL_NAME="ToolLlama-Llama-3-8B-cot" 19 | # model_path="reasonwang/ToolLlama-Llama-3-8B" 20 | # function_provider="truth" 21 | # backbone_model="toolchat" 22 | 23 | 24 | # MODEL_NAME="ToolGen-Semantic-Llama-3-8B-cot" 25 | # model_path="reasonwang/ToolGen-Semantic-Llama-3-8B" 26 | # indexing="Semantic" 27 | 28 | 29 | # model_path="reasonwang/ToolGen-Llama-3-8B-Instruct" 30 | # indexing="Atomic" 31 | # template="llama-3" 32 | # MODEL_NAME="ToolGen-Llama-3-8B-Instruct" 33 | 34 | 35 | # model_code="Qwen2.5-14B" 36 | # model_path="reasonwang/ToolGen-${model_code}" 37 | # indexing="Atomic" 38 | # template="qwen-7b-chat" 39 | # MODEL_NAME="ToolGen-${model_code}-WoRetry" 40 | # function_provider="all" 41 | 42 | if [ $indexing == "Atomic" ]; 
then 43 | backbone_model="toolgen_atomic" 44 | else 45 | list=("Semantic" "Numeric" "Hierarchical") 46 | for item in "${list[@]}"; do 47 | if [ "$item" = "$indexing" ]; then 48 | backbone_model="toolgen" 49 | break 50 | fi 51 | done 52 | fi 53 | 54 | 55 | export CUDA_VISIBLE_DEVICES=4 56 | OUTPUT_DIR="data/answer/${MODEL_NAME}" 57 | stage="G2" 58 | group="instruction" 59 | method="CoT@1" 60 | 61 | mkdir -p $OUTPUT_DIR; mkdir -p $OUTPUT_DIR/${stage}_${group} 62 | cmd_inference="python evaluation/toolbench/inference/qa_pipeline_multithread.py \ 63 | --model_path ${model_path} \ 64 | --template ${template} \ 65 | --indexing ${indexing} \ 66 | --chatgpt_model ${GPT_MODEL} \ 67 | --tool_root_dir data/toolenv/tools \ 68 | --backbone_model ${backbone_model} \ 69 | --openai_key ${OPENAI_KEY} \ 70 | --max_observation_length 1024 \ 71 | --method ${method} \ 72 | --input_query_file data/solvable_queries/test_instruction/${stage}_${group}.json \ 73 | --output_answer_file $OUTPUT_DIR/${stage}_${group} \ 74 | --toolbench_key ${TOOLBENCH_KEY} \ 75 | --num_thread 1 \ 76 | --function_provider ${function_provider}" 77 | 78 | echo $cmd_inference 79 | eval $cmd_inference 80 | 81 | 82 | RAW_ANSWER_PATH="data/answer" 83 | CONVERTED_ANSWER_PATH="data/model_predictions_converted" 84 | 85 | mkdir -p ${CONVERTED_ANSWER_PATH}/${MODEL_NAME} 86 | answer_dir="${RAW_ANSWER_PATH}/${MODEL_NAME}/${stage}_${group}" 87 | output_file="${CONVERTED_ANSWER_PATH}/${MODEL_NAME}/${stage}_${group}.json" 88 | echo ${output_file} 89 | cmd_convert="python -m evaluation.toolbench.tooleval.convert_to_answer_format\ 90 | --answer_dir ${answer_dir} \ 91 | --method ${method} \ 92 | --output ${output_file}" 93 | 94 | echo $cmd_convert 95 | eval $cmd_convert 96 | 97 | export API_POOL_FILE=openai_keys.json 98 | SAVE_PATH="data/results/pass_rate" 99 | mkdir -p ${SAVE_PATH} 100 | export EVAL_MODEL=gpt-4-turbo-2024-04-09 101 | mkdir -p ${SAVE_PATH}/${MODEL_NAME} 102 | 103 | cmd_pass="python -m evaluation.toolbench.tooleval.eval_pass_rate \ 104 | --converted_answer_path ${CONVERTED_ANSWER_PATH} \ 105 | --save_path ${SAVE_PATH}/${MODEL_NAME} \ 106 | --reference_model ${MODEL_NAME} \ 107 | --test_ids data/solvable_queries/test_query_ids \ 108 | --max_eval_threads 3 \ 109 | --evaluate_times 3 \ 110 | --test_set ${stage}_${group}" 111 | 112 | echo $cmd_pass 113 | eval $cmd_pass 114 | 115 | export API_POOL_FILE=openai_keys.json 116 | SAVE_PATH="data/results/preference_rate" 117 | PASS_RATE_PATH="data/results/pass_rate" 118 | REFERENCE_MODEL=virtual-gpt35-16k-step16-cot 119 | export EVAL_MODEL=gpt-4o-2024-05-13 120 | mkdir -p ${SAVE_PATH} 121 | 122 | cmd_preference="python -m evaluation.toolbench.tooleval.eval_preference \ 123 | --converted_answer_path ${CONVERTED_ANSWER_PATH} \ 124 | --reference_model ${REFERENCE_MODEL} \ 125 | --output_model ${MODEL_NAME} \ 126 | --test_ids data/solvable_queries/test_query_ids/ \ 127 | --save_path ${SAVE_PATH}/${MODEL_NAME} \ 128 | --pass_rate_result_path ${PASS_RATE_PATH} \ 129 | --max_eval_threads 3 \ 130 | --use_pass_rate true \ 131 | --evaluate_times 3 \ 132 | --test_set ${stage}_${group}" 133 | 134 | echo $cmd_preference 135 | eval $cmd_preference -------------------------------------------------------------------------------- /scripts/eval_opendomain_full_pipeline.sh: -------------------------------------------------------------------------------- 1 | TOOLBENCH_KEY="" 2 | OPENAI_KEY="" 3 | export PYTHONPATH=./ 4 | export GPT_MODEL="gpt-3.5-turbo-16k" 5 | export SERVICE_URL="http://localhost:8080/virtual" 6 | 7 | 
# MODEL_NAME="virtual-gpt35-16k-step16-cot-opendomain" 8 | # model_path="virtual-gpt35-16k-step16-cot" 9 | # backbone_model="chatgpt_function" 10 | # function_provider="retriever" 11 | 12 | MODEL_NAME="ToolLlama-Llama-3-8B-t0.0-cot-toolretriever" 13 | model_path="reasonwang/ToolLlama-Llama-3-8B" 14 | function_provider="retriever" 15 | backbone_model="toolchat" 16 | 17 | 18 | 19 | # MODEL_NAME="ToolLlama-Llama-3-8B-t0.0-cot-toolretriever" 20 | # model_path="reasonwang/ToolLlama-Llama-3-8B" 21 | # function_provider="retriever" 22 | # backbone_model="toolchat" 23 | 24 | 25 | # MODEL_NAME="ToolLlama-v2-t0.0-cot-opendomain-toolretriever-retry-finish" 26 | # model_path="ToolBench/ToolLLaMA-2-7b-v2" 27 | # indexing="None" 28 | # replace_file="None" 29 | # function_provider="retriever" 30 | # backbone_model="toolllama" 31 | 32 | 33 | 34 | OUTPUT_DIR="data/answer/${MODEL_NAME}" 35 | export CUDA_VISIBLE_DEVICES=0 36 | stage="G2" 37 | group="instruction" 38 | method="CoT@1" 39 | 40 | 41 | # Open domain setting 42 | # corpus_tsv_path="data/retrieval/${stage}/corpus.tsv" 43 | corpus_tsv_path="data/retrieval/corpus_G123.tsv" 44 | # retrieval_model_path="reasonwang/BERT-${stage}" 45 | retrieval_model_path="ToolBench/ToolBench_IR_bert_based_uncased" 46 | mkdir -p $OUTPUT_DIR; mkdir -p $OUTPUT_DIR/${stage}_${group} 47 | 48 | cmd="python toolbench/inference/qa_pipeline_multithread.py \ 49 | --replace_file ${replace_file} \ 50 | --model_path ${model_path} \ 51 | --chatgpt_model ${GPT_MODEL} \ 52 | --tool_root_dir data/toolenv/tools \ 53 | --corpus_tsv_path ${corpus_tsv_path} \ 54 | --retrieval_model_path ${retrieval_model_path} \ 55 | --backbone_model ${backbone_model} \ 56 | --openai_key ${OPENAI_KEY} \ 57 | --max_observation_length 1024 \ 58 | --method ${method} \ 59 | --input_query_file data/solvable_queries/test_instruction/${stage}_${group}.json \ 60 | --output_answer_file $OUTPUT_DIR/${stage}_${group} \ 61 | --toolbench_key $TOOLBENCH_KEY \ 62 | --num_thread 1 \ 63 | --function_provider ${function_provider}" 64 | echo $cmd 65 | eval $cmd 66 | 67 | 68 | RAW_ANSWER_PATH="data/answer" 69 | CONVERTED_ANSWER_PATH="data/model_predictions_converted" 70 | 71 | mkdir -p ${CONVERTED_ANSWER_PATH}/${MODEL_NAME} 72 | answer_dir="${RAW_ANSWER_PATH}/${MODEL_NAME}/${stage}_${group}" 73 | output_file="${CONVERTED_ANSWER_PATH}/${MODEL_NAME}/${stage}_${group}.json" 74 | echo ${output_file} 75 | cmd="python -m toolbench.tooleval.convert_to_answer_format\ 76 | --answer_dir ${answer_dir} \ 77 | --method ${method} \ 78 | --output ${output_file}" 79 | echo $cmd 80 | eval $cmd 81 | 82 | 83 | export API_POOL_FILE=openai_key_mbz.json 84 | SAVE_PATH="data/results/pass_rate" 85 | mkdir -p ${SAVE_PATH} 86 | export EVAL_MODEL=gpt-4-turbo-2024-04-09 87 | mkdir -p ${SAVE_PATH}/${MODEL_NAME} 88 | 89 | cmd="python -m toolbench.tooleval.eval_pass_rate \ 90 | --converted_answer_path ${CONVERTED_ANSWER_PATH} \ 91 | --save_path ${SAVE_PATH}/${MODEL_NAME} \ 92 | --reference_model ${MODEL_NAME} \ 93 | --test_ids data/solvable_queries/test_query_ids \ 94 | --max_eval_threads 3 \ 95 | --evaluate_times 3 \ 96 | --test_set ${stage}_${group} --overwrite" 97 | echo $cmd 98 | eval $cmd 99 | 100 | export API_POOL_FILE=openai_key.json 101 | SAVE_PATH="data/results/preference_rate" 102 | PASS_RATE_PATH="data/results/pass_rate" 103 | REFERENCE_MODEL=virtual-gpt35-16k-step16-cot 104 | export EVAL_MODEL=gpt-4o 105 | mkdir -p ${SAVE_PATH} 106 | 107 | cmd="python -m toolbench.tooleval.eval_preference \ 108 | --converted_answer_path ${CONVERTED_ANSWER_PATH} 
\ 109 | --reference_model ${REFERENCE_MODEL} \ 110 | --output_model ${MODEL_NAME} \ 111 | --test_ids data/solvable_queries/test_query_ids/ \ 112 | --save_path ${SAVE_PATH}/${MODEL_NAME} \ 113 | --pass_rate_result_path ${PASS_RATE_PATH} \ 114 | --max_eval_threads 3 \ 115 | --use_pass_rate true \ 116 | --evaluate_times 3 \ 117 | --test_set ${stage}_${group} --overwrite" 118 | echo $cmd 119 | eval $cmd -------------------------------------------------------------------------------- /scripts/inference/inference_gpt_pipeline_virtual.sh: -------------------------------------------------------------------------------- 1 | export TOOLBENCH_KEY="Set your toolbench key here" 2 | export OPENAI_KEY="Set your openai api key here" 3 | export PYTHONPATH=./ 4 | export SERVICE_URL="http://localhost:8080/virtual" 5 | 6 | 7 | export GPT_MODEL="gpt-3.5-turbo-16k" 8 | export OUTPUT_DIR="data/answer/test" 9 | group="G2_instruction" 10 | 11 | mkdir -p $OUTPUT_DIR; mkdir -p $OUTPUT_DIR/$group 12 | python evaluation/toolbench/inference/qa_pipeline_multithread.py \ 13 | --tool_root_dir data/toolenv/tools \ 14 | --backbone_model chatgpt_function \ 15 | --chatgpt_model $GPT_MODEL \ 16 | --openai_key $OPENAI_KEY \ 17 | --max_observation_length 1024 \ 18 | --method CoT@1 \ 19 | --input_query_file data/solvable_queries/test_instruction/${group}.json \ 20 | --output_answer_file $OUTPUT_DIR/$group \ 21 | --toolbench_key $TOOLBENCH_KEY \ 22 | --num_thread 1 --function_provider "truth" -------------------------------------------------------------------------------- /scripts/inference/inference_opendomain_toolllama_pipeline_virtual.sh: -------------------------------------------------------------------------------- 1 | export TOOLBENCH_KEY="Set your toolbench key here" 2 | export OPENAI_KEY="Set you openai api key here" 3 | export PYTHONPATH=./ 4 | chatgpt_model="gpt-4o" 5 | export SERVICE_URL="http://localhost:8080/virtual" 6 | export OUTPUT_DIR="data/answer/test" 7 | export CUDA_VISIBLE_DEVICES=0 8 | 9 | model_path="reasonwang/ToolLlama-Llama-3-8B" 10 | stage="G2" 11 | group="instruction" 12 | 13 | 14 | # Open domain setting 15 | corpus_tsv_path="data/retrieval/${stage}/corpus.tsv" 16 | # retrieval_model_path="../models/ToolLlama/retriever/bert_${stage}" 17 | retrieval_model_path="ToolBench/ToolBench_IR_bert_based_uncased" 18 | 19 | 20 | mkdir -p $OUTPUT_DIR; mkdir -p $OUTPUT_DIR/$group 21 | 22 | # Open Domain 23 | cmd="python evaluation/toolbench/inference/qa_pipeline_multithread.py \ 24 | --model_path ${model_path} \ 25 | --tool_root_dir data/toolenv/tools \ 26 | --chatgpt_model ${chatgpt_model} \ 27 | --corpus_tsv_path ${corpus_tsv_path} \ 28 | --retrieval_model_path ${retrieval_model_path} \ 29 | --backbone_model toolchat \ 30 | --openai_key $OPENAI_KEY \ 31 | --max_observation_length 1024 \ 32 | --method CoT@1 \ 33 | --input_query_file data/solvable_queries/test_instruction/${stage}_${group}.json \ 34 | --output_answer_file $OUTPUT_DIR/${stage}_${group} \ 35 | --toolbench_key $TOOLBENCH_KEY \ 36 | --num_thread 1 \ 37 | --function_provider retriever" 38 | 39 | echo $cmd 40 | eval $cmd -------------------------------------------------------------------------------- /scripts/inference/inference_toolgen_pipeline_virtual.sh: -------------------------------------------------------------------------------- 1 | export TOOLBENCH_KEY="Set your ToolBench key here" 2 | export OPENAI_KEY="Set your OpenAI key here" 3 | export PYTHONPATH=./ 4 | export SERVICE_URL="http://localhost:8080/virtual" 5 | export CUDA_VISIBLE_DEVICES=0 6 | 
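# NOTE: ${model_name} is referenced below for OUTPUT_DIR but is only assigned in the
# commented-out Qwen block; when using the Llama-3 settings, set it explicitly
# (e.g. model_name="ToolGen-Llama-3-8B") so answers are not written to "data/answer//".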
7 | model_path="reasonwang/ToolGen-Llama-3-8B" 8 | indexing="Atomic" 9 | template="llama-3" 10 | 11 | # model_name="Qwen2.5-3B" 12 | # model_path="reasonwang/ToolGen-${model_name}" 13 | # indexing="Atomic" 14 | # template="qwen-7b-chat" 15 | 16 | export OUTPUT_DIR="data/answer/${model_name}/" 17 | stage="G3" 18 | group="instruction" 19 | 20 | if [ $indexing == "Atomic" ]; then 21 | backbone_model="toolgen_atomic" 22 | else 23 | backbone_model="toolgen" 24 | fi 25 | 26 | mkdir -p $OUTPUT_DIR; mkdir -p $OUTPUT_DIR/${stage}_${group} 27 | python evaluation/toolbench/inference/qa_pipeline_multithread.py \ 28 | --chatgpt_model gpt-4o \ 29 | --model_path ${model_path} \ 30 | --template ${template} \ 31 | --indexing ${indexing} \ 32 | --tool_root_dir data/toolenv/tools \ 33 | --backbone_model ${backbone_model} \ 34 | --openai_key $OPENAI_KEY \ 35 | --max_observation_length 1024 \ 36 | --method CoT@1 \ 37 | --input_query_file data/solvable_queries/test_instruction/${stage}_${group}.json \ 38 | --output_answer_file $OUTPUT_DIR/${stage}_${group} \ 39 | --toolbench_key $TOOLBENCH_KEY \ 40 | --num_thread 1 \ 41 | --function_provider all -------------------------------------------------------------------------------- /scripts/inference/inference_toolllama_pipeline_virtual.sh: -------------------------------------------------------------------------------- 1 | export TOOLBENCH_KEY="Set your toolbench key here" 2 | export OPENAI_KEY="Set your openai api key here" 3 | export PYTHONPATH=./ 4 | export SERVICE_URL="http://localhost:8080/virtual" 5 | chatgpt_model="gpt-4o" 6 | export OUTPUT_DIR="data/answer/test" 7 | 8 | model_path="reasonwang/ToolLlama-Llama-3-8B" 9 | stage="G2" 10 | group="instruction" 11 | 12 | 13 | mkdir -p $OUTPUT_DIR; mkdir -p $OUTPUT_DIR/$group 14 | cmd="python evaluation/toolbench/inference/qa_pipeline_multithread.py \ 15 | --model_path ${model_path} \ 16 | --tool_root_dir data/toolenv/tools \ 17 | --chatgpt_model ${chatgpt_model} \ 18 | --backbone_model toolchat \ 19 | --openai_key $OPENAI_KEY \ 20 | --max_observation_length 1024 \ 21 | --method CoT@1 \ 22 | --input_query_file data/solvable_queries/test_instruction/${stage}_${group}.json \ 23 | --output_answer_file $OUTPUT_DIR/${stage}_${group} \ 24 | --toolbench_key $TOOLBENCH_KEY \ 25 | --num_thread 1 --function_provider truth" 26 | 27 | echo $cmd 28 | eval $cmd -------------------------------------------------------------------------------- /scripts/pass_rate/run_pass_rate.sh: -------------------------------------------------------------------------------- 1 | export OPENAI_KEY="Set your openai api key here" 2 | export API_POOL_FILE=openai_keys.json 3 | # export OPENAI_API_BASE="https://api.openai.com/v1" 4 | export CONVERTED_ANSWER_PATH=data/model_predictions_converted 5 | export SAVE_PATH=data/results/pass_rate 6 | mkdir -p ${SAVE_PATH} 7 | export CANDIDATE_MODEL=test 8 | TEST_SET="G2_instruction" 9 | export EVAL_MODEL=gpt-4-turbo-2024-04-09 10 | mkdir -p ${SAVE_PATH}/${CANDIDATE_MODEL} 11 | 12 | 13 | python -m evaluation.toolbench.tooleval.eval_pass_rate \ 14 | --converted_answer_path ${CONVERTED_ANSWER_PATH} \ 15 | --save_path ${SAVE_PATH}/${CANDIDATE_MODEL} \ 16 | --reference_model ${CANDIDATE_MODEL} \ 17 | --test_ids data/solvable_queries/test_query_ids \ 18 | --max_eval_threads 3 \ 19 | --evaluate_times 3 \ 20 | --test_set ${TEST_SET} -------------------------------------------------------------------------------- /scripts/preference/run_preference.sh: 
-------------------------------------------------------------------------------- 1 | export OPENAI_KEY="Set your openai api key here" 2 | export API_POOL_FILE=openai_keys.json 3 | export CONVERTED_ANSWER_PATH=data/model_predictions_converted 4 | export SAVE_PATH=data/results/preference_results 5 | export PASS_RATE_PATH=data/results/pass_rate 6 | export REFERENCE_MODEL=virtual-gpt35-16k-step16-cot 7 | export CANDIDATE_MODEL=test 8 | export EVAL_MODEL=gpt-4o-2024-05-13 9 | test_set="G2_instruction" 10 | mkdir -p ${SAVE_PATH} 11 | 12 | 13 | cmd="python -m evaluation.toolbench.tooleval.eval_preference \ 14 | --converted_answer_path ${CONVERTED_ANSWER_PATH} \ 15 | --reference_model ${REFERENCE_MODEL} \ 16 | --output_model ${CANDIDATE_MODEL} \ 17 | --test_ids data/solvable_queries/test_query_ids/ \ 18 | --save_path ${SAVE_PATH} \ 19 | --pass_rate_result_path ${PASS_RATE_PATH} \ 20 | --max_eval_threads 3 \ 21 | --use_pass_rate true \ 22 | --evaluate_times 3 \ 23 | --test_set ${test_set}" 24 | echo $cmd 25 | eval $cmd -------------------------------------------------------------------------------- /scripts/retrieval/eval_bm25.sh: -------------------------------------------------------------------------------- 1 | model="bm25" 2 | stage="G1" 3 | split="instruction" # instruction, tool, category 4 | result_path="bm25" 5 | corpus="G123" # G123, G1, G2, G3. G123 is the multi-domain setting 6 | 7 | cmd="python -m evaluation.retrieval.eval_bm25 \ 8 | --model_name_or_path ${model} \ 9 | --stage ${stage} \ 10 | --split ${split} \ 11 | --result_path \"data/results/retrieval/${result_path}_${stage}_${split}.json\" \ 12 | --corpus ${corpus}" 13 | 14 | echo $cmd 15 | eval $cmd -------------------------------------------------------------------------------- /scripts/retrieval/eval_encoder.sh: -------------------------------------------------------------------------------- 1 | # model="reasonwang/BERT-G3" 2 | model="ToolBench/ToolBench_IR_bert_based_uncased" 3 | stage="G1" # G1, G2, G3 4 | split="instruction" 5 | corpus="G123" # G123, G1, G2, G3. G123 is the multi-domain setting 6 | result_path="BERT-G1-full-tools" 7 | cmd="python -m evaluation.retrieval.eval_encoder \ 8 | --model_name_or_path ${model} \ 9 | --stage ${stage} \ 10 | --split ${split} \ 11 | --result_path \"data/results/retrieval/${result_path}_${stage}_${split}.json\" 12 | --corpus ${corpus}" 13 | 14 | echo $cmd 15 | eval $cmd -------------------------------------------------------------------------------- /scripts/retrieval/eval_longcontext.sh: -------------------------------------------------------------------------------- 1 | export OPENAI_API_KEY="" 2 | model="gpt-4o" 3 | stage="G3" # G1, G2, G3 4 | split="test" 5 | corpus="G3" # G123, G1, G2, G3. G123 is the multi-domain setting 6 | result_path="GPT-4o" 7 | cmd="python -m evaluation.retrieval.eval_longcontext \ 8 | --model_name_or_path ${model} \ 9 | --stage ${stage} \ 10 | --split ${split} \ 11 | --result_path \"data/results/retrieval/${result_path}_${stage}_${split}.json\" 12 | --corpus ${corpus}" 13 | 14 | echo $cmd 15 | eval $cmd -------------------------------------------------------------------------------- /scripts/retrieval/eval_openai_embedding.sh: -------------------------------------------------------------------------------- 1 | model_name_or_path="text_embedding_large" 2 | stage="G1" # G1, G2, G3 3 | split="instruction" 4 | result_path="openai" 5 | api_key="Set your openai api key here" 6 | corpus="G123" # G123, G1, G2, G3. 
G123 is the multi-domain setting 7 | 8 | cmd="python -m evaluation.retrieval.eval_openai_embedding \ 9 | --model_name_or_path ${model_name_or_path} \ 10 | --api_key ${api_key} \ 11 | --stage ${stage} \ 12 | --split ${split} \ 13 | --result_path \"data/results/retrieval/${result_path}_${stage}_${split}.json\" \ 14 | --corpus ${corpus}" 15 | 16 | echo $cmd 17 | eval $cmd -------------------------------------------------------------------------------- /scripts/retrieval/eval_toolgen.sh: -------------------------------------------------------------------------------- 1 | model_name_or_path="reasonwang/ToolGen-Llama-3-8B-Tool-Retriever" 2 | indexing="Atomic" 3 | constrain="True" 4 | limit_to_stage_space="False" 5 | template="llama-3" 6 | 7 | stage="G1" # G1, G2, G3 8 | split="instruction" # instruction, tool, category 9 | 10 | cmd="python -m evaluation.retrieval.eval_toolgen \ 11 | --model_name_or_path ${model_name_or_path} \ 12 | --indexing ${indexing} \ 13 | --stage ${stage} \ 14 | --split ${split} \ 15 | --result_path data/results/retrieval/ \ 16 | --constrain ${constrain} \ 17 | --limit_to_stage_space ${limit_to_stage_space} \ 18 | --template ${template}" 19 | echo $cmd 20 | eval $cmd -------------------------------------------------------------------------------- /training/README.md: -------------------------------------------------------------------------------- 1 | ## Training 2 | 3 | Training requires DeepSpeed as a dependency: 4 | ``` 5 | pip install deepspeed 6 | ``` 7 | 8 | 9 | ### Tool Memorization 10 | In the first stage, we use the following command to train ToolGen. Tool tokens are first added to the LLM (Llama-3-8B in this case) and its embeddings are expanded accordingly; this is controlled by the `add_virtual_tokens` argument. 11 | 12 | ```bash 13 | deepspeed --include=localhost:0,1,2,3,4,5,6,7 --master_port 25024 train.py \ 14 | --model_name_or_path meta-llama/Meta-Llama-3-8B \ 15 | --add_virtual_tokens True \ 16 | --flash_attention True \ 17 | --deepspeed src/configs/ds_z2_config.json \ 18 | --chat True \ 19 | --template llama-3 \ 20 | --architecture causal \ 21 | --output_dir checkpoints/ToolGen-Llama-3-8B-Tool-Memorization \ 22 | --save_strategy steps \ 23 | --save_steps 1000 \ 24 | --gather_weights True \ 25 | --learning_rate 2e-5 \ 26 | --warmup_ratio 0.03 \ 27 | --datasets toolgen_atomic_memorization.json \ 28 | --dataset_nums 10000000 \ 29 | --per_device_train_batch_size 2 \ 30 | --gradient_accumulation_steps 64 \ 31 | --max_length 1024 \ 32 | --num_train_epochs 8 \ 33 | --gradient_checkpointing False \ 34 | --bf16 True \ 35 | --logging_steps 1 \ 36 | --report_to wandb \ 37 | --run_name llama-3-8b-tool-memorization 38 | ``` 39 | 40 | ### Tool Retrieval 41 | In the second stage, we train the ToolGen model on queries paired with tool tokens, initializing it from the model obtained in the first stage. Since the tool tokens have already been added and the embeddings expanded, we set `add_virtual_tokens` to `False`.
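For reference, when `add_virtual_tokens` is `True` (as in the first stage), the training code reads the tool tokens from `src/configs/virtual_tokens.txt`, adds them to the tokenizer, resizes the embedding matrix, and initializes each new token embedding as the mean of the embeddings of the tokens that make up its name. The snippet below is a minimal, simplified sketch of that logic, adapted from `training/models/loading.py`; it is illustrative only and assumes tool tokens of the form `<<tool name&&api name>>`.

```python
import torch
from unidecode import unidecode
from transformers import AutoModelForCausalLM, AutoTokenizer

base = "meta-llama/Meta-Llama-3-8B"
tokenizer = AutoTokenizer.from_pretrained(base)
model = AutoModelForCausalLM.from_pretrained(base, torch_dtype=torch.bfloat16)

# One tool token per line, e.g. "<<SomeTool&&some_api>>"
with open("src/configs/virtual_tokens.txt") as f:
    virtual_tokens = [unidecode(line.strip()) for line in f]

tokenizer.add_tokens(new_tokens=virtual_tokens, special_tokens=False)
model.resize_token_embeddings(len(tokenizer))

# Initialize each new embedding with the mean of the embeddings of its parts
embeddings = model.get_input_embeddings()
with torch.no_grad():
    for vt in virtual_tokens:
        parts = vt[2:-2].split("&&")  # "<<Tool&&API>>" -> ["Tool", "API"]
        part_ids = tokenizer(" ".join(parts), add_special_tokens=False).input_ids
        vt_id = tokenizer(vt, add_special_tokens=False).input_ids[0]
        embeddings.weight[vt_id] = embeddings(torch.tensor(part_ids)).mean(dim=0)
```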
42 | ```bash 43 | deepspeed --include=localhost:0,1,2,3,4,5,6,7 --master_port 25024 train.py \ 44 | --model_name_or_path checkpoints/ToolGen-Llama-3-8B-Tool-Memorization \ 45 | --add_virtual_tokens False \ 46 | --flash_attention True \ 47 | --deepspeed src/configs/ds_z2_config.json \ 48 | --chat True \ 49 | --template llama-3 \ 50 | --architecture causal \ 51 | --output_dir checkpoints/ToolGen-Llama-3-8B-Tool-Retriever \ 52 | --save_strategy steps \ 53 | --save_steps 1000 \ 54 | --gather_weights True \ 55 | --learning_rate 2e-5 \ 56 | --warmup_ratio 0.03 \ 57 | --datasets toolgen_atomic_retrieval_G123.json \ 58 | --dataset_nums 1000000 \ 59 | --per_device_train_batch_size 2 \ 60 | --gradient_accumulation_steps 64 \ 61 | --max_length 1024 \ 62 | --num_train_epochs 1 \ 63 | --gradient_checkpointing False \ 64 | --bf16 True \ 65 | --logging_steps 1 \ 66 | --report_to wandb \ 67 | --run_name llama-3-8b-tool-retrieval 68 | ``` 69 | 70 | ### End-to-End Training 71 | In the last stage, we train the ToolGen agent model with end-to-end trajectories. We set the maximum length to 6144, which generally needs large GPU memory. Based on our experiments, 4 GPUs each with 80GB memory are enough for this stage (Deepspeed zero 3 with offloading is used). 72 | ```bash 73 | deepspeed --include=localhost:0,1,2,3,4,5,6,7 --master_port 25024 train.py \ 74 | --model_name_or_path checkpoints/ToolGen-Llama-3-8B-Tool-Retriever \ 75 | --add_virtual_tokens False \ 76 | --flash_attention True \ 77 | --deepspeed src/configs/ds_z3_offload_config.json \ 78 | --chat True \ 79 | --template llama-3 \ 80 | --architecture causal \ 81 | --output_dir checkpoints/ToolGen-Llama-3-8B \ 82 | --save_strategy steps \ 83 | --save_steps 1000 \ 84 | --gather_weights True \ 85 | --learning_rate 2e-5 \ 86 | --warmup_ratio 0.03 \ 87 | --datasets toolgen_atomic_G123_dfs.json \ 88 | --dataset_nums 10000000 \ 89 | --per_device_train_batch_size 1 \ 90 | --gradient_accumulation_steps 64 \ 91 | --max_length 6144 \ 92 | --num_train_epochs 1 \ 93 | --gradient_checkpointing False \ 94 | --bf16 True \ 95 | --logging_steps 1 \ 96 | --report_to wandb \ 97 | --run_name llama-3-8b-end2end 98 | ``` 99 | -------------------------------------------------------------------------------- /training/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Reason-Wang/ToolGen/6839374a255810efe69deea4056eec5c55e25802/training/data/__init__.py -------------------------------------------------------------------------------- /training/data/loading.py: -------------------------------------------------------------------------------- 1 | 2 | from data.dataset import CausalLMDataset, CausalLMCollator, CausalLMChatDataset, Seq2SeqDataset, Seq2SeqCollator 3 | from data.utils import load_chat_data, load_instruction_data 4 | 5 | 6 | def load_datasets(chat, architecture, datasets, dataset_nums, tokenizer, max_length, template): 7 | if chat: 8 | # assert args.architecture == 'causal' # Only causal is supported for chat 9 | messages_list = load_chat_data( 10 | datasets, 11 | dataset_nums, 12 | ) 13 | dataset = CausalLMChatDataset(tokenizer, messages_list, max_length=max_length, template=template) 14 | collator = CausalLMCollator(tokenizer, max_length=max_length) 15 | else: 16 | instructions, responses = load_instruction_data( 17 | datasets, 18 | dataset_nums, 19 | ) 20 | # TODO: Support better template system 21 | if architecture == 'causal': 22 | dataset = CausalLMDataset( 23 | tokenizer, 24 | 
instructions, 25 | responses, 26 | max_length=max_length, 27 | template=template 28 | ) 29 | # Currently max_length is not used in the collator 30 | collator = CausalLMCollator(tokenizer, max_length=max_length) 31 | elif architecture == 'seq2seq': 32 | dataset = Seq2SeqDataset( 33 | tokenizer, 34 | instructions, 35 | responses, 36 | max_length=max_length, 37 | template=template 38 | ) 39 | collator = Seq2SeqCollator(tokenizer, max_length=max_length) 40 | else: 41 | raise ValueError(f"Architecture {architecture} not supported") 42 | 43 | return dataset, collator -------------------------------------------------------------------------------- /training/data/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import random 3 | 4 | import pandas as pd 5 | 6 | 7 | def read_jsonl_to_list(file_path): 8 | data_list = [] 9 | with open(file_path, 'r') as file: 10 | for line in file: 11 | data_list.append(json.loads(line)) 12 | return data_list 13 | 14 | 15 | def load_instruction_data(datasets, nums): 16 | instructions = [] 17 | responses = [] 18 | for (d, n) in zip(datasets, nums): 19 | data_path = f'{d}' 20 | with open(data_path, 'r') as f: 21 | data_list = json.load(f)[:n] 22 | 23 | for sample in data_list: 24 | instruction = sample['instruction'] 25 | if 'input' in sample: 26 | instruction = instruction + ' ' + sample['input'] 27 | instruction = instruction.strip() 28 | response = sample['output'] 29 | 30 | instructions.append(instruction) 31 | responses.append(response) 32 | 33 | return instructions, responses 34 | 35 | 36 | def load_chat_data(datasets, nums): 37 | assert len(datasets) == len(nums) 38 | messages_list = [] 39 | for (d, n) in zip(datasets, nums): 40 | data_path = f'{d}' 41 | with open(data_path, 'r') as f: 42 | data_list = json.load(f) 43 | if n <= len(data_list): 44 | # randomly sample n conversations 45 | data_list = random.sample(data_list, n) 46 | messages_list.extend([data['conversations'] for data in data_list]) 47 | 48 | return messages_list 49 | -------------------------------------------------------------------------------- /training/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Reason-Wang/ToolGen/6839374a255810efe69deea4056eec5c55e25802/training/models/__init__.py -------------------------------------------------------------------------------- /training/models/causallm.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from prompts.conversations import get_conv_template 3 | from transformers import AutoModelForCausalLM, AutoTokenizer, StoppingCriteria, StoppingCriteriaList 4 | import huggingface_hub 5 | import torch 6 | from prompts.utils import k2_tokenize 7 | from models.utils import KeywordsStoppingCriteria, TextStoppingCriteria 8 | 9 | 10 | class ChatCausalLM: 11 | def __init__( 12 | self, 13 | model_name, 14 | max_new_tokens=512, 15 | temperature=0.7, 16 | device="auto", 17 | system_prompt=None, 18 | cache_dir=None, 19 | conversation_template=None, 20 | ): 21 | self.max_new_tokens = max_new_tokens 22 | self.temperature = temperature 23 | self.device = "cuda" if device=="auto" else device 24 | self.tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir) 25 | 26 | self.model = AutoModelForCausalLM.from_pretrained( 27 | model_name, 28 | torch_dtype=torch.bfloat16, 29 | device_map=device, 30 | cache_dir=cache_dir 31 | ) 32 | 33 | 
self.system_prompt = system_prompt 34 | self.conversation_history = [] 35 | self.conversation_template = conversation_template 36 | 37 | def generate(self, messages, stop=None, print_prompt=False): 38 | human_role_set = {"user", "human"} 39 | ai_role_set = {"bot", "ai", "gpt", "assistant"} 40 | conv = get_conv_template(self.conversation_template) 41 | for message in messages: 42 | if message['role'] == 'system': 43 | conv.set_system_message(message['content']) 44 | else: 45 | conv.append_message( 46 | conv.roles[0] if message['role'] in human_role_set else conv.roles[1], 47 | message["content"] 48 | ) 49 | conv.append_message(conv.roles[1], None) 50 | prompt = conv.get_prompt() 51 | if print_prompt: 52 | print(prompt) 53 | # inputs = self.tokenizer(prompt, return_tensors="pt") 54 | if self.conversation_template == 'k2': 55 | inputs = k2_tokenize(self.tokenizer, prompt, return_tensors="pt") 56 | else: 57 | inputs = self.tokenizer(prompt, return_tensors="pt") 58 | for k, v in inputs.items(): 59 | inputs[k] = v.to(self.device) 60 | 61 | if self.conversation_template == 'k2': 62 | stop_criteria = StoppingCriteriaList([TextStoppingCriteria(stop, self.tokenizer, self.device)]) if stop else None 63 | else: 64 | stop_criteria = StoppingCriteriaList([KeywordsStoppingCriteria(stop, self.tokenizer, self.device)]) if stop else None 65 | 66 | outputs = self.model.generate( 67 | **inputs, 68 | max_new_tokens=self.max_new_tokens, 69 | do_sample=True, 70 | temperature=self.temperature, 71 | stopping_criteria=stop_criteria, 72 | eos_token_id=self.tokenizer.eos_token_id, 73 | ) 74 | inputs_token_length = len(inputs['input_ids'][0]) 75 | new_tokens = outputs[0][inputs_token_length:] 76 | text = self.tokenizer.decode(new_tokens, skip_special_tokens=True) 77 | 78 | if stop: 79 | for ending in stop: 80 | if text.endswith(ending): 81 | text = text[:-len(ending)] 82 | break 83 | 84 | return text.strip() 85 | 86 | def chat(self, text, stop=None, print_prompt=False): 87 | 88 | self.conversation_history.append({"role": "user", "content": text}) 89 | messages = [{"role": "system", "content": self.system_prompt}] if self.system_prompt else [] 90 | messages.extend(self.conversation_history) 91 | response = self.generate(messages, stop=stop, print_prompt=print_prompt) 92 | self.conversation_history.append({"role": "assistant", "content": response}) 93 | 94 | return response 95 | 96 | def clear_history(self): 97 | self.conversation_history = [] 98 | 99 | -------------------------------------------------------------------------------- /training/models/loading.py: -------------------------------------------------------------------------------- 1 | from utils.distributed import is_main_process 2 | import transformers 3 | import torch 4 | from unidecode import unidecode 5 | 6 | 7 | def load_tokenizer(model_name_or_path, cache_dir=None, virtual_tokens=False): 8 | tokenizer = transformers.AutoTokenizer.from_pretrained( 9 | model_name_or_path, 10 | cache_dir=cache_dir, 11 | ) 12 | if virtual_tokens: 13 | # if "llama-3" in model_name_or_path.lower(): 14 | # tokenizer = transformers.AutoTokenizer.from_pretrained( 15 | # "meta-llama/Meta-Llama-3-8B", 16 | # cache_dir=cache_dir, 17 | # ) 18 | # else: 19 | # raise ValueError(f"Virtual tokens not supported for tokenizer {model_name_or_path}") 20 | with open('src/configs/virtual_tokens.txt', 'r') as f: 21 | virtual_tokens = f.readlines() 22 | virtual_tokens = [unidecode(vt.strip()) for vt in virtual_tokens] 23 | tokenizer.add_tokens(new_tokens=virtual_tokens, special_tokens=False) 24 
| if is_main_process(): 25 | print(f"Added {len(virtual_tokens)} virtual tokens") 26 | 27 | return tokenizer 28 | 29 | 30 | def load_model(model_name_or_path, architecture, tokenizer=None, flash_attention=False, cache_dir=None, virtual_tokens=False): 31 | if architecture == 'causal': 32 | # Check hf_home 33 | # rank = get_rank() 34 | # print(f"Rank {rank}: {os.environ['HF_HOME']}") 35 | # print(f"Rank {rank}: cache dir: {args.cache_dir}") 36 | if flash_attention: 37 | model = transformers.AutoModelForCausalLM.from_pretrained( 38 | model_name_or_path, 39 | cache_dir=cache_dir, 40 | torch_dtype=torch.bfloat16, 41 | attn_implementation='flash_attention_2' 42 | ) 43 | else: 44 | model = transformers.AutoModelForCausalLM.from_pretrained( 45 | model_name_or_path, 46 | cache_dir=cache_dir, 47 | torch_dtype=torch.bfloat16, 48 | ) 49 | elif architecture == 'seq2seq': 50 | if flash_attention: 51 | model = transformers.AutoModelForSeq2SeqLM.from_pretrained( 52 | model_name_or_path, 53 | cache_dir=cache_dir, 54 | attn_implementation='flash_attention_2' 55 | ) 56 | else: 57 | model = transformers.AutoModelForSeq2SeqLM.from_pretrained( 58 | model_name_or_path, 59 | cache_dir=cache_dir, 60 | ) 61 | else: 62 | raise ValueError(f"Architecture {architecture} not supported") 63 | 64 | if virtual_tokens: 65 | model.resize_token_embeddings(len(tokenizer)) 66 | if is_main_process(): 67 | print(f"Model resized token embeddings to {len(tokenizer)}") 68 | 69 | with open('src/configs/virtual_tokens.txt', 'r') as f: 70 | virtual_tokens = f.readlines() 71 | virtual_tokens = [unidecode(vt).strip() for vt in virtual_tokens] 72 | combined_tokens = [] 73 | for vt in virtual_tokens: 74 | combined_token = vt[2:-2].split("&&") 75 | combined_tokens.append(combined_token) 76 | 77 | for combined_token, virtual_token in zip(combined_tokens, virtual_tokens): 78 | combined_token_ids = tokenizer(" ".join(combined_token), add_special_tokens=False).input_ids 79 | virtual_token_id = tokenizer(virtual_token, add_special_tokens=False).input_ids 80 | # print(combined_token_ids) 81 | # print(virtual_token_id) 82 | assert len(virtual_token_id) == 1 83 | # print(model.device) 84 | combined_token_embeddings = model.model.embed_tokens(torch.tensor(combined_token_ids).to(model.device)) 85 | # print(combined_token_embeddings.shape) 86 | embedding = torch.mean(combined_token_embeddings, dim=0) 87 | # print(embedding.shape) 88 | model.model.embed_tokens.weight.data[virtual_token_id[0]] = embedding 89 | else: 90 | if is_main_process(): 91 | print(f"Initialized from {model_name_or_path} without adding embeddings.") 92 | 93 | return model 94 | -------------------------------------------------------------------------------- /training/prompts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Reason-Wang/ToolGen/6839374a255810efe69deea4056eec5c55e25802/training/prompts/__init__.py -------------------------------------------------------------------------------- /training/prompts/templates.py: -------------------------------------------------------------------------------- 1 | null_template = { 2 | "full_template": '''{instruction} {response}''', 3 | "user_template": '''{instruction}''', 4 | } 5 | 6 | damsa_template = '''Translate the Arabic dialects to Modern Standard Arabic (MSA): {instruction} Response: {response}''' -------------------------------------------------------------------------------- /training/prompts/utils.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | import transformers 3 | from prompts.conversations import Conversation, get_conv_template 4 | 5 | def k2_tokenize(tokenizer, text, add_special_tokens=True, return_tensors=None): 6 | """ 7 | Tokenize the prompt and return the input_ids and attention_mask 8 | To make tokenized correctly, we split the text by "" and tokenize each part separately. 9 | :param tokenizer: 10 | :param prompt: 11 | :param max_length: 12 | :return: input_ids, attention_mask 13 | """ 14 | if add_special_tokens: 15 | input_ids = [tokenizer.bos_token_id] 16 | attention_mask = [1] 17 | else: 18 | input_ids = [] 19 | attention_mask = [] 20 | 21 | splited_texts = text.split("") 22 | inputs = tokenizer(splited_texts[0], add_special_tokens=False) 23 | input_ids.extend(inputs['input_ids']) 24 | attention_mask.extend(inputs['attention_mask']) 25 | if len(splited_texts) > 1: 26 | for text in splited_texts[1:]: 27 | current_inputs = tokenizer(text, add_special_tokens=False) 28 | input_ids += [tokenizer.eos_token_id] + current_inputs['input_ids'] 29 | attention_mask += [1] + current_inputs['attention_mask'] 30 | if return_tensors == 'pt': 31 | input_ids = torch.tensor([input_ids]) 32 | attention_mask = torch.tensor([attention_mask]) 33 | 34 | return dict( 35 | input_ids=input_ids, 36 | attention_mask=attention_mask 37 | ) 38 | 39 | 40 | def format_conversation(messages, conv_template): 41 | # When there is no 'loss', we set it to False 42 | for message in messages: 43 | if 'loss' not in message: 44 | message['loss'] = False 45 | 46 | human_role_set = {"human", "user"} 47 | ai_role_set = {"ai", "gpt", "assistant"} 48 | conv = get_conv_template(conv_template) 49 | if 'from' in messages[0]: 50 | role_label, content_label = "from", "value" 51 | elif 'role' in messages[0]: 52 | role_label, content_label = "role", "content" 53 | else: 54 | raise ValueError("Cannot find role label and content label in the data.") 55 | 56 | for message in messages: 57 | if message[role_label] == 'system': 58 | conv.set_system_message(message[content_label]) 59 | else: 60 | conv.append_message(conv.roles[0] if message[role_label] in human_role_set else conv.roles[1], message[content_label], message['loss']) 61 | 62 | # conv.append_message(conv.roles[1], None) 63 | return conv 64 | 65 | 66 | def tokenize_conversation( 67 | messages, 68 | tokenizer, 69 | conv_template, 70 | max_length, 71 | ): 72 | """ 73 | We want to tokenize the whole conversation. But we can't just simply 74 | use get_prompt to get string prompt and tokenize it. Because the loss 75 | can only be computed on model's response. 
We want: 76 | input_ids 77 | attention_mask 78 | labels: should be -100 for user prompt and input id for model's response 79 | action_mask: should be 0 for user prompt and 1 for model's response 80 | :param messages: 81 | :param tokenizer: 82 | :param conv_template: 83 | :param max_length: 84 | :return: input_ids, attention_mask, labels, action_mask 85 | """ 86 | conv = format_conversation(messages, conv_template) 87 | separate_prompts = conv.get_separate_prompt_with_to_loss() 88 | # print(separate_prompts) 89 | input_ids = [] 90 | attention_mask = [] 91 | labels = [] 92 | action_mask = [] 93 | for i, (prompt, to_loss) in enumerate(separate_prompts): 94 | if i == 0: 95 | if tokenizer.bos_token: 96 | prompt = tokenizer.bos_token + prompt 97 | 98 | if conv_template == 'k2': 99 | tmp_input_ids = k2_tokenize(tokenizer, prompt, add_special_tokens=False)['input_ids'] 100 | else: 101 | tmp_input_ids = tokenizer(prompt, add_special_tokens=False)['input_ids'] 102 | if to_loss: 103 | tmp_target = tmp_input_ids.copy() 104 | tmp_action_mask = [1] * len(tmp_input_ids) 105 | else: 106 | tmp_target = [-100] * len(tmp_input_ids) 107 | tmp_action_mask = [0] * len(tmp_input_ids) 108 | # print(tmp_input_ids) 109 | input_ids.extend(tmp_input_ids) 110 | attention_mask.extend([1] * len(tmp_input_ids)) 111 | labels.extend(tmp_target) 112 | action_mask.extend(tmp_action_mask) 113 | 114 | input_ids = input_ids[:max_length] 115 | attention_mask = attention_mask[:max_length] 116 | labels = labels[:max_length] 117 | action_mask = action_mask[:max_length] 118 | 119 | # TODO: remove this check if everything is correct 120 | assert len(input_ids) == len(attention_mask) == len(labels) == len(action_mask) 121 | 122 | return dict( 123 | input_ids=torch.tensor([input_ids]), 124 | attention_mask=torch.tensor([attention_mask]), 125 | labels=torch.tensor([labels]), 126 | # action_mask=torch.tensor([action_mask]) 127 | ) 128 | 129 | -------------------------------------------------------------------------------- /training/scripts/train_toolgen.sh: -------------------------------------------------------------------------------- 1 | # Train tool memorization 2 | pretrain_dir="meta-llama/Meta-Llama-3-8B" 3 | checkpoint_dir="checkpoints/ToolGen-Llama-3-8B-Tool-Memorization" 4 | flash_attention="True" 5 | run_name="llama-3-8b-tool-memorization" 6 | datasets="toolgen_atomic_memorization.json" 7 | dataset_nums="10000000" 8 | max_length="1024" 9 | batch_size="2" 10 | lr="2e-5" 11 | accumulation_steps="64" 12 | epochs="8" 13 | add_virtual_tokens="True" 14 | template="llama-3" 15 | save_strategy="steps" 16 | save_steps="1000" 17 | zero="z2" 18 | 19 | # Train tool retrieval 20 | # pretrain_dir="checkpoints/ToolGen-Llama-3-8B-Tool-Memorization" 21 | # checkpoint_dir="checkpoints/ToolGen-Llama-3-8B-Tool-Retriever" 22 | # flash_attention="True" 23 | # run_name="llama-3-8b-tool-retrieval" 24 | # datasets="toolgen_atomic_retrieval_G123.json" 25 | # dataset_nums="1000000" 26 | # max_length="1024" 27 | # batch_size="2" 28 | # lr="2e-5" 29 | # accumulation_steps="64" 30 | # epochs="1" 31 | # add_virtual_tokens="False" 32 | # template="llama-3" 33 | # save_strategy="steps" 34 | # save_steps="1000" 35 | # zero="z2" 36 | 37 | # End2End 38 | # pretrain_dir="checkpoints/ToolGen-Llama-3-8B-Tool-Retriever" 39 | # checkpoint_dir="checkpoints/ToolGen-Llama-3-8B" 40 | # flash_attention="True" 41 | # run_name="llama-3-8b-end2end" 42 | # datasets="toolgen_atomic_G123_dfs.json" 43 | # dataset_nums="10000000" 44 | # max_length="6144" 45 | # batch_size="1" 46 | 
# lr="2e-5" 47 | # accumulation_steps="64" 48 | # epochs="1" 49 | # add_virtual_tokens="False" 50 | # template="llama-3" 51 | # save_strategy="steps" 52 | # save_steps="1000" 53 | # zero="z3_offload" 54 | 55 | chat="True" 56 | 57 | cmd="deepspeed --include=localhost:0,1,2,3,4,5,6,7 --master_port 25024 train.py \ 58 | --model_name_or_path ${pretrain_dir} \ 59 | --add_virtual_tokens ${add_virtual_tokens} \ 60 | --flash_attention ${flash_attention} \ 61 | --deepspeed src/configs/ds_${zero}_config.json \ 62 | --chat ${chat} \ 63 | --template ${template} \ 64 | --architecture causal \ 65 | --output_dir ${checkpoint_dir} \ 66 | --save_strategy ${save_strategy} \ 67 | --save_steps ${save_steps} \ 68 | --gather_weights True \ 69 | --learning_rate ${lr} \ 70 | --warmup_ratio 0.03 \ 71 | --datasets ${datasets} \ 72 | --dataset_nums ${dataset_nums} \ 73 | --per_device_train_batch_size ${batch_size} \ 74 | --gradient_accumulation_steps ${accumulation_steps} \ 75 | --max_length ${max_length} \ 76 | --num_train_epochs ${epochs} \ 77 | --gradient_checkpointing False \ 78 | --bf16 True \ 79 | --logging_steps 1 \ 80 | --report_to wandb \ 81 | --run_name ${run_name}" 82 | 83 | echo $cmd 84 | eval $cmd -------------------------------------------------------------------------------- /training/src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Reason-Wang/ToolGen/6839374a255810efe69deea4056eec5c55e25802/training/src/__init__.py -------------------------------------------------------------------------------- /training/src/configs/ds_z2_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": "auto", 3 | "train_micro_batch_size_per_gpu": "auto", 4 | "gradient_accumulation_steps": "auto", 5 | "zero_optimization": { 6 | "stage": 2, 7 | "reduce_bucket_size": "auto" 8 | }, 9 | "bf16": { 10 | "enabled": "auto" 11 | } 12 | } -------------------------------------------------------------------------------- /training/src/configs/ds_z3_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": "auto", 3 | "train_micro_batch_size_per_gpu": "auto", 4 | "gradient_accumulation_steps": "auto", 5 | "optimizer": { 6 | "type": "AdamW", 7 | "params": { 8 | "lr": "auto", 9 | "betas": "auto", 10 | "eps": "auto", 11 | "weight_decay": "auto" 12 | } 13 | }, 14 | "scheduler": { 15 | "type": "WarmupCosineLR", 16 | "params": { 17 | "warmup_num_steps": "auto", 18 | "total_num_steps": "auto" 19 | } 20 | }, 21 | "zero_optimization": { 22 | "stage": 3, 23 | "overlap_comm": true, 24 | "reduce_bucket_size": "auto", 25 | "stage3_prefetch_bucket_size": "auto", 26 | "stage3_param_persistence_threshold": "auto", 27 | "stage3_max_live_parameters": 1e9, 28 | "stage3_max_reuse_distance": 1e9, 29 | "stage3_gather_16bit_weights_on_model_save": true 30 | }, 31 | "checkpoint": { 32 | "tag_validation": "Warn", 33 | "load_universal": false, 34 | "use_node_local_storage": true, 35 | "parallel_write":{ 36 | "pipeline_stage": true 37 | } 38 | }, 39 | "bf16": { 40 | "enabled": "auto" 41 | } 42 | } -------------------------------------------------------------------------------- /training/src/configs/ds_z3_offload_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": "auto", 3 | "train_micro_batch_size_per_gpu": "auto", 4 | "gradient_accumulation_steps": "auto", 5 | "optimizer": { 6 | 
"type": "AdamW", 7 | "params": { 8 | "lr": "auto", 9 | "betas": "auto", 10 | "eps": "auto", 11 | "weight_decay": "auto" 12 | } 13 | }, 14 | "scheduler": { 15 | "type": "WarmupCosineLR", 16 | "params": { 17 | "warmup_num_steps": "auto", 18 | "total_num_steps": "auto" 19 | } 20 | }, 21 | "zero_optimization": { 22 | "stage": 3, 23 | "offload_optimizer": { 24 | "device": "cpu", 25 | "pin_memory": false 26 | }, 27 | "offload_param": { 28 | "device": "cpu", 29 | "pin_memory": false 30 | }, 31 | "memory_efficient_linear": true, 32 | "overlap_comm": true, 33 | "reduce_bucket_size": "auto", 34 | "stage3_prefetch_bucket_size": "auto", 35 | "stage3_param_persistence_threshold": "auto", 36 | "stage3_max_live_parameters": 1e9, 37 | "stage3_max_reuse_distance": 1e9, 38 | "stage3_gather_16bit_weights_on_model_save": true 39 | }, 40 | "checkpoint": { 41 | "tag_validation": "Warn", 42 | "load_universal": false, 43 | "use_node_local_storage": true, 44 | "parallel_write":{ 45 | "pipeline_stage": true 46 | } 47 | }, 48 | "bf16": { 49 | "enabled": "auto" 50 | } 51 | } -------------------------------------------------------------------------------- /training/src/configs/project_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "WANDB_PROJECT": "ToolGen" 3 | } -------------------------------------------------------------------------------- /training/train.py: -------------------------------------------------------------------------------- 1 | from data.loading import load_datasets 2 | from models.loading import load_model, load_tokenizer 3 | from utils.setting import set_project, set_system, set_args, set_distributed_logging 4 | from dataclasses import field, dataclass 5 | from typing import Optional, Any 6 | import torch 7 | from data.loading import load_datasets 8 | import os 9 | import transformers 10 | from transformers import Trainer 11 | from typing import List 12 | from prompts.templates import null_template 13 | from utils.logging import get_logger 14 | from utils.distributed import get_rank, is_main_process 15 | 16 | 17 | @dataclass 18 | class TrainingArguments(transformers.TrainingArguments): 19 | model_name_or_path: str = field(default="") 20 | chat: bool = False 21 | architecture: str = field(default='causal') 22 | flash_attention: bool = False 23 | data_path: str = field(default="") 24 | cache_dir: Optional[str] = field(default=None) 25 | optim: str = field(default="adamw_torch") 26 | resume_training: bool = False 27 | per_device_train_batch_size = 8 28 | max_length: int = 2048 29 | learning_rate: float = 5e-5 30 | num_train_epochs: int = 3 31 | gather_weights: bool = True 32 | datasets: List[str] = field(default_factory=list) 33 | dataset_nums: List[int] = field(default_factory=int) 34 | template: str = field(default="llama-3") 35 | add_virtual_tokens: bool = False 36 | 37 | 38 | def train(): 39 | # set_system("src/configs/project_config.json") 40 | # set_distributed_logging(strict=True) 41 | parser = transformers.HfArgumentParser(TrainingArguments) 42 | args = parser.parse_args_into_dataclasses()[0] 43 | set_args(args) 44 | set_project(args) 45 | 46 | # Get rank 47 | rank = get_rank() 48 | Logger = get_logger("logs", level="INFO", rank=rank) 49 | 50 | # Load VAgent tokenizer 51 | tokenizer = load_tokenizer( 52 | args.model_name_or_path, 53 | cache_dir=args.cache_dir, 54 | virtual_tokens=args.add_virtual_tokens, 55 | ) 56 | 57 | Logger.info("---- Loading Datasets ----") 58 | dataset, collator = load_datasets( 59 | chat=args.chat, 60 | 
architecture=args.architecture, 61 | datasets=args.datasets, 62 | dataset_nums=args.dataset_nums, 63 | tokenizer=tokenizer, 64 | max_length=args.max_length, 65 | template=args.template, 66 | ) 67 | Logger.info(f"Data length: {len(dataset)}") 68 | 69 | Logger.info("---- Loading Model ----") 70 | model = load_model( 71 | args.model_name_or_path, 72 | architecture=args.architecture, 73 | tokenizer=tokenizer, 74 | flash_attention=args.flash_attention, 75 | cache_dir=args.cache_dir, 76 | virtual_tokens=args.add_virtual_tokens, 77 | ) 78 | 79 | trainer = Trainer( 80 | model, 81 | args=args, 82 | data_collator=collator, 83 | train_dataset=dataset, 84 | ) 85 | 86 | trainer.train(resume_from_checkpoint=args.resume_training) 87 | if is_main_process(): 88 | tokenizer.save_pretrained(args.output_dir) 89 | 90 | # Whether to gather weights before saving 91 | # This is prefered for small models 92 | if args.gather_weights: 93 | trainer.save_model(args.output_dir) 94 | else: 95 | trainer.deepspeed.save_checkpoint(args.output_dir) 96 | 97 | 98 | if __name__ == "__main__": 99 | train() 100 | -------------------------------------------------------------------------------- /training/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Reason-Wang/ToolGen/6839374a255810efe69deea4056eec5c55e25802/training/utils/__init__.py -------------------------------------------------------------------------------- /training/utils/distributed.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | def is_main_process(): 5 | # Check if the current process is the main process 6 | rank = int(os.environ.get('RANK', -1)) 7 | return rank == 0 or rank == -1 8 | 9 | 10 | def get_rank(): 11 | # When using this function, make sure to call it after deepspeed is initialized 12 | # Using launcher or deepspeed.initialize() 13 | # Get the current rank 14 | rank = int(os.environ.get('RANK', -1)) 15 | return rank 16 | -------------------------------------------------------------------------------- /training/utils/huggingface.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | from transformers import AutoTokenizer, AutoModelForCausalLM 3 | from huggingface_hub import HfApi, repo_info, create_repo 4 | from huggingface_hub.utils import RepositoryNotFoundError 5 | import torch 6 | 7 | 8 | def repo_exists(repo_id, repo_type: Optional[str]=None, token: Optional[str]=None): 9 | """ 10 | Check if a repository exists on the Hugging Face Hub 11 | 12 | Args: 13 | repo_id (str): The repository ID to check 14 | repo_type (str): The type of repository to check 15 | token (str): The Hugging Face API token 16 | 17 | Returns: 18 | bool: Whether the repository exists 19 | """ 20 | try: 21 | repo_info(repo_id, repo_type=repo_type, token=token) 22 | return True 23 | except RepositoryNotFoundError: 24 | return False 25 | 26 | 27 | def upload_model(model_name_or_path, repo_id, private=False, token=""): 28 | """ 29 | Upload a model to the Hugging Face Hub 30 | 31 | Args: 32 | model_name_or_path (str): The model name or path to upload 33 | repo_id (str): The repository ID to upload the model to 34 | """ 35 | # Load the model 36 | # tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) 37 | model = AutoModelForCausalLM.from_pretrained( 38 | model_name_or_path, 39 | torch_dtype=torch.bfloat16, 40 | device_map="cpu", 41 | ) 42 | 43 | if not 
repo_exists(repo_id, token=token): 44 | print(f"Repo {repo_id} does not exist, creating repo...") 45 | create_repo(repo_id, private=private, token=token) 46 | 47 | model.push_to_hub(repo_id, token=token) -------------------------------------------------------------------------------- /training/utils/logging.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Union 3 | from logging import WARNING, getLogger, INFO, StreamHandler, FileHandler, Formatter, DEBUG 4 | 5 | 6 | class Logger: 7 | def __init__(self, logger, rank: int): 8 | self.logger = logger 9 | self.rank = rank 10 | 11 | def info(self, msg): 12 | if self.rank in [-1, 0]: 13 | self.logger.info(msg) 14 | 15 | def debug(self, msg): 16 | if self.rank in [-1, 0]: 17 | self.logger.debug(msg) 18 | 19 | def warning(self, msg): 20 | if self.rank in [-1, 0]: 21 | self.logger.warning(msg) 22 | 23 | 24 | def get_logger(directory, level="INFO", rank: int=-1): 25 | # print(f"Local rank: {local_rank}") 26 | os.makedirs(directory, exist_ok=True) 27 | filename = directory + '/train' 28 | logger = getLogger(__name__) 29 | logger.propagate = False 30 | logger.handlers.clear() 31 | handler1 = StreamHandler() 32 | handler1.setFormatter(Formatter("%(message)s")) 33 | handler2 = FileHandler(filename=f"{filename}.log") 34 | handler2.setFormatter(Formatter("%(message)s")) 35 | logger.addHandler(handler1) 36 | logger.addHandler(handler2) 37 | # if level == "INFO": 38 | # handler2 = FileHandler(filename=f"{filename}.log") 39 | # handler2.setFormatter(Formatter("%(message)s")) 40 | # logger.addHandler(handler2) 41 | # logger.setLevel(INFO) 42 | # elif level == "DEBUG": 43 | # handler1 = StreamHandler() 44 | # handler1.setFormatter(Formatter("%(message)s")) 45 | # handler2 = FileHandler(filename=f"{filename}.log") 46 | # handler2.setFormatter(Formatter("%(message)s")) 47 | # logger.addHandler(handler1) 48 | # logger.addHandler(handler2) 49 | if level == "WARNING": 50 | logger.setLevel(WARNING) 51 | elif level == "INFO": 52 | logger.setLevel(INFO) 53 | elif level == "DEBUG": 54 | logger.setLevel(DEBUG) 55 | else: 56 | raise ValueError(f"Unknown level: {level}") 57 | 58 | logger = Logger(logger, rank) 59 | 60 | return logger 61 | 62 | 63 | if __name__=="__main__": 64 | logger = get_logger("test", level="INFO") 65 | logger.info("test") -------------------------------------------------------------------------------- /training/utils/setting.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | 5 | import wandb 6 | from utils.distributed import get_rank, is_main_process 7 | import warnings 8 | 9 | 10 | def set_system(config_path): 11 | with open(config_path, "r") as f: 12 | config = json.load(f) 13 | 14 | if "NinjaPath" in config: 15 | os.environ["PATH"] = config["NinjaPath"] + ":" + os.environ["PATH"] 16 | 17 | if "Environment" in config: 18 | for key, value in config["Environment"].items(): 19 | os.environ[key] = value 20 | 21 | return None 22 | 23 | 24 | def set_distributed_logging(strict: bool = False): 25 | ''' 26 | In default, only the main process will log INFO level 27 | Currently this function only controls logs from logging and warnings modules 28 | Some others libraries implemented their own logging system: 29 | - Deepspeed: implemented yellow color warnings 30 | But still need to be used carefully since some important logs might be missed 31 | ''' 32 | rank = get_rank() 33 | if is_main_process(): 34 | 
print(f"Rank {rank}: Setting logging level to INFO") 35 | logging.basicConfig(level=logging.INFO) 36 | else: 37 | if strict: 38 | print(f"Rank {rank}: Setting logging level to ERROR") 39 | logging.basicConfig(level=logging.ERROR) 40 | warnings.filterwarnings("ignore") 41 | else: 42 | print(f"Rank {rank}: Setting logging level to WARNING") 43 | logging.basicConfig(level=logging.WARNING) 44 | 45 | 46 | def set_args(args): 47 | if args.cache_dir is not None: 48 | # User has specified a cache directory 49 | pass 50 | else: 51 | # System setted cache directory 52 | if "HF_HUB_CACHE" in os.environ: 53 | args.cache_dir = os.environ["HF_HUB_CACHE"] 54 | # Use HF default cache directory 55 | else: 56 | args.cache_dir = None 57 | 58 | return None 59 | 60 | def set_project(args): 61 | with open("src/configs/project_config.json", "r") as f: 62 | project_config = json.load(f) 63 | 64 | if "WANDB_PROJECT" in project_config: 65 | os.environ["WANDB_PROJECT"] = project_config["WANDB_PROJECT"] 66 | if "WANDB_ENTITY" in project_config: 67 | os.environ["WANDB_ENTITY"] = project_config["WANDB_ENTITY"] 68 | 69 | # Detect if file exists 70 | keys_file = "src/configs/keys.json" 71 | if os.path.exists(keys_file): 72 | with open(keys_file, "r") as f: 73 | keys = json.load(f) 74 | 75 | if "WANDB_KEY" in keys: 76 | wandb.login(key=keys["WANDB_KEY"]) 77 | 78 | --------------------------------------------------------------------------------