├── .gitignore ├── README.md ├── cases ├── case-0.png ├── case-1.png └── case-2.png ├── config ├── archer │ ├── accelerate_config.yaml │ ├── archer_config.yaml │ └── default.yaml ├── ds_configs │ └── stage3-cosine.json ├── llama3-1 │ └── StepTool_ppo.json ├── qwen2 │ └── StepTool_ppo.json └── toolllama │ └── StepTool_ppo.json ├── data ├── model_predictions_converted │ └── qwen2 │ │ └── G123_example.json └── reward_annotation │ └── qwen2 │ └── G123_example_5.json ├── data_eval └── pass_rate_results │ ├── baseline-archer_cot │ ├── G1_category.csv │ ├── G1_category.json │ ├── G1_instruction.csv │ ├── G1_instruction.json │ ├── G1_tool.csv │ ├── G1_tool.json │ ├── G2_category.csv │ ├── G2_category.json │ ├── G2_instruction.csv │ ├── G2_instruction.json │ ├── G3_instruction.csv │ └── G3_instruction.json │ ├── baseline-archer_dfs │ ├── G1_category.csv │ ├── G1_category.json │ ├── G1_instruction.csv │ ├── G1_instruction.json │ ├── G1_tool.csv │ ├── G1_tool.json │ ├── G2_category.csv │ ├── G2_category.json │ ├── G2_instruction.csv │ ├── G2_instruction.json │ ├── G3_instruction.csv │ └── G3_instruction.json │ ├── baseline-eto_cot │ ├── G1_instruction.csv │ ├── G1_instruction.json │ ├── G1_tool.csv │ ├── G1_tool.json │ ├── G2_category.csv │ ├── G2_category.json │ ├── G2_instruction.csv │ ├── G2_instruction.json │ ├── G3_instruction.csv │ └── G3_instruction.json │ ├── baseline-eto_dfs │ ├── G1_category.csv │ ├── G1_category.json │ ├── G1_instruction.csv │ ├── G1_instruction.json │ ├── G1_tool.csv │ ├── G1_tool.json │ ├── G2_category.csv │ ├── G2_category.json │ ├── G2_instruction.csv │ ├── G2_instruction.json │ ├── G3_instruction.csv │ └── G3_instruction.json │ ├── baseline-ppo_cot │ ├── G1_category.csv │ ├── G1_category.json │ ├── G1_instruction.csv │ ├── G1_instruction.json │ ├── G1_tool.csv │ ├── G1_tool.json │ ├── G2_category.csv │ ├── G2_category.json │ ├── G2_instruction.csv │ ├── G2_instruction.json │ ├── G3_instruction.csv │ └── G3_instruction.json │ ├── baseline-ppo_dfs │ ├── G1_category.csv │ ├── G1_category.json │ ├── G1_instruction.csv │ ├── G1_instruction.json │ ├── G1_tool.csv │ ├── G1_tool.json │ ├── G2_category.csv │ ├── G2_category.json │ ├── G2_instruction.csv │ ├── G2_instruction.json │ ├── G3_instruction.csv │ └── G3_instruction.json │ ├── baseline-rft_cot │ ├── G1_category.csv │ ├── G1_category.json │ ├── G1_instruction.csv │ ├── G1_instruction.json │ ├── G1_tool.csv │ ├── G1_tool.json │ ├── G2_category.csv │ ├── G2_category.json │ ├── G2_instruction.csv │ ├── G2_instruction.json │ ├── G3_instruction.csv │ └── G3_instruction.json │ ├── baseline-rft_dfs │ ├── G1_category.csv │ ├── G1_category.json │ ├── G1_instruction.csv │ ├── G1_instruction.json │ ├── G1_tool.csv │ ├── G1_tool.json │ ├── G2_category.csv │ ├── G2_category.json │ ├── G2_instruction.csv │ ├── G2_instruction.json │ ├── G3_instruction.csv │ └── G3_instruction.json │ ├── steptool_cot │ ├── G1_category.csv │ ├── G1_category.json │ ├── G1_instruction.csv │ ├── G1_instruction.json │ ├── G1_tool.csv │ ├── G1_tool.json │ ├── G2_category.csv │ ├── G2_category.json │ ├── G2_instruction.csv │ ├── G2_instruction.json │ ├── G3_instruction.csv │ └── G3_instruction.json │ ├── steptool_dfs │ ├── G1_category.csv │ ├── G1_category.json │ ├── G1_instruction.csv │ ├── G1_instruction.json │ ├── G1_tool.csv │ ├── G1_tool.json │ ├── G2_category.csv │ ├── G2_category.json │ ├── G2_instruction.csv │ ├── G2_instruction.json │ ├── G3_instruction.csv │ └── G3_instruction.json │ ├── toolllama_sft_cot │ ├── G1_category.csv │ ├── G1_category.json 
│ ├── G1_instruction.csv │ ├── G1_instruction.json │ ├── G1_tool.csv │ ├── G1_tool.json │ ├── G2_category.csv │ ├── G2_category.json │ ├── G2_instruction.csv │ ├── G2_instruction.json │ ├── G3_instruction.csv │ └── G3_instruction.json │ └── toolllama_sft_dfs │ ├── G1_category.csv │ ├── G1_category.json │ ├── G1_instruction.csv │ ├── G1_instruction.json │ ├── G1_tool.csv │ ├── G1_tool.json │ ├── G2_category.csv │ ├── G2_category.json │ ├── G2_instruction.json │ ├── G3_instruction.csv │ └── G3_instruction.json ├── data_train ├── eto │ └── dpo_data_example.csv ├── llama3-1 │ ├── gpt4_dfs_G123_for_sft_example.json │ └── step_grained_for_ppo_example.csv ├── qwen2 │ ├── gpt4_dfs_G123_for_sft_example.json │ └── step_grained_for_ppo_example.csv ├── rft │ └── rft_data_example.json └── toolllama │ └── step_grained_for_ppo_example.csv ├── requirements.txt ├── scripts ├── baseline-archer │ ├── build_data.sh │ └── train_archer.sh ├── baseline-eto │ └── train_dpo.sh ├── baseline-ppo │ └── train_toolllama.sh ├── baseline-rft │ └── train_rft.sh ├── reward │ └── annotation_with_gpt.sh ├── sft │ ├── train_llama3-1.sh │ └── train_qwen2.sh └── steptool_train │ ├── train_llama3-1.sh │ ├── train_qwen2.sh │ └── train_toolllama.sh ├── scripts_eval ├── baseline-archer │ ├── inference_archer_vllm.sh │ ├── run_convert_answer.sh │ └── run_pass_rate.sh ├── baseline-eto │ ├── inference_eto_vllm.sh │ ├── run_convert_answer.sh │ └── run_pass_rate.sh ├── baseline-ppo │ ├── inference_ppo_vllm.sh │ ├── run_convert_answer.sh │ └── run_pass_rate.sh ├── baseline-rft │ ├── inference_rft_vllm.sh │ ├── run_convert_answer.sh │ └── run_pass_rate.sh ├── llama3-1 │ ├── inference_llama3-1_vllm.sh │ ├── run_conver_answer.sh │ ├── run_pass_rate.sh │ └── run_preference.sh ├── qwen2 │ ├── inference_qwen2_vllm.sh │ ├── run_convert_answer.sh │ ├── run_pass_rate.sh │ └── run_preference.sh ├── steptool │ ├── inference_steptool_vllm.sh │ ├── run_convert_answer.sh │ └── run_pass_rate.sh ├── toolllama-sft │ ├── inference_toolllama_vllm.sh │ ├── run_conver_answer.sh │ └── run_pass_rate.sh └── toolllama │ └── run_preference.sh ├── src ├── baseline-archer │ ├── archer_agent.py │ ├── archer_critic.py │ ├── archer_data.py │ ├── archer_environment.py │ ├── archer_trainer.py │ ├── build_archer_data.py │ ├── offpolicy_train_loop.py │ └── run.py ├── baseline-eto │ └── dpo_train.py ├── baseline-ppo │ └── ppo.py ├── baseline-rft │ └── rft.py ├── reward │ ├── annotation_by_rules.ipynb │ ├── annotation_with_gpt.py │ ├── evaluators │ │ ├── evaluator.py │ │ └── gpt-4-turbo-2024-04-09 │ │ │ ├── config.yaml │ │ │ └── template.txt │ └── openai_key.json ├── sft │ ├── llama3-1.py │ └── qwen2.py └── steptool │ ├── step_ppo.py │ └── step_ppotrainer.py └── stabletoolbench ├── config.yml ├── server ├── config.yml ├── main.py ├── requirements.txt └── utils.py ├── solvable_queries ├── test_instruction │ ├── G1_category.json │ ├── G1_instruction.json │ ├── G1_tool.json │ ├── G2_category.json │ ├── G2_instruction.json │ └── G3_instruction.json └── test_query_ids │ ├── G1_category.json │ ├── G1_instruction.json │ ├── G1_tool.json │ ├── G2_category.json │ ├── G2_instruction.json │ └── G3_instruction.json └── toolbench ├── inference ├── Algorithms │ ├── DFS.py │ ├── __init__.py │ ├── base_search.py │ └── single_chain.py ├── Downstream_tasks │ ├── __init__.py │ ├── base_env.py │ ├── rapidapi.py │ └── rapidapi_multithread.py ├── LLM │ ├── __init__.py │ ├── base_io.py │ ├── chatgpt_model.py │ ├── llama3_sft_model.py │ ├── qwen2_sft_model.py │ ├── retriever.py │ └── 
tool_llama_vllm.py ├── LLM_rank │ ├── __init__.py │ └── rank_candidate.py ├── Prompts │ ├── ReAct_prompts.py │ ├── Tree_search_prompts.py │ ├── __init__.py │ └── rank_prompts.py ├── Tree │ ├── Tree.py │ └── __init__.py ├── callbacks │ └── ServerEventCallback.py ├── qa_pipeline.py ├── qa_pipeline_multithread.py ├── qa_pipeline_open_domain.py ├── server.py ├── toolbench_server.py └── utils.py ├── model ├── __init__.py ├── apply_delta.py ├── compression.py ├── make_delta.py └── model_adapter.py ├── tool_conversation.py ├── tooleval ├── README.md ├── README_ZH.md ├── ToolBench.code-workspace ├── __init__.py ├── automatic_eval_sample.py ├── convert_answers.py ├── convert_to_answer_format.py ├── dataset │ └── __init__.py ├── eval_and_update_leaderboard.py ├── eval_pass_rate.py ├── eval_preference.py ├── eval_process_reward.py ├── evaluation │ ├── __init__.py │ ├── dataclass.py │ ├── methodcls.py │ └── usereval.py ├── evaluators │ ├── __init__.py │ ├── registered_cls │ │ ├── __init__.py │ │ ├── base.py │ │ ├── rtl.py │ │ ├── tooleval.py │ │ └── utils.py │ ├── tooleval_gpt-3.5-turbo_default │ │ ├── config.yaml │ │ └── template.txt │ ├── tooleval_gpt-3.5-turbo_fn │ │ ├── config.yaml │ │ └── template.txt │ └── tooleval_gpt-3.5-turbo_normalized │ │ ├── config.yaml │ │ └── template.txt ├── evaluators_comparison.py ├── requirements.txt ├── results │ ├── default_evalset │ │ ├── DFS │ │ │ └── win.csv │ │ └── gpt-3.5-turbo_CoT │ │ │ ├── G1_category.json │ │ │ ├── G1_instruction.json │ │ │ ├── G1_tool.json │ │ │ ├── G2_category.json │ │ │ ├── G2_instruction.json │ │ │ └── G3_instruction.json │ ├── leaderboard###default_evalset###tooleval_gpt-3.5-turbo_normalized###ChatGPT-DFSDT.csv │ └── leaderboard###default_evalset###tooleval_gpt-3.5-turbo_normalized###gpt-3.5-turbo_CoT.csv └── utils.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | wandb/ 2 | sft_ckpts/ 3 | **/__pycache__/ 4 | ckpts/ 5 | data_eval/* 6 | !data_eval/pass_rate_results 7 | experimental_results/ 8 | core* -------------------------------------------------------------------------------- /cases/case-0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yuyq18/StepTool/121a6acd11a899d947a5d3fa7269cf76624c1df3/cases/case-0.png -------------------------------------------------------------------------------- /cases/case-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yuyq18/StepTool/121a6acd11a899d947a5d3fa7269cf76624c1df3/cases/case-1.png -------------------------------------------------------------------------------- /cases/case-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yuyq18/StepTool/121a6acd11a899d947a5d3fa7269cf76624c1df3/cases/case-2.png -------------------------------------------------------------------------------- /config/archer/accelerate_config.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: true 3 | distributed_type: MULTI_GPU 4 | downcast_bf16: 'no' 5 | gpu_ids: 0,1,2,3 6 | machine_rank: 0 7 | main_training_function: main 8 | mixed_precision: 'bf16' 9 | num_machines: 1 10 | num_processes: 4 11 | rdzv_backend: static 12 | same_network: true 13 | tpu_env: [] 14 | tpu_use_cluster: false 15 | tpu_use_sudo: false 16 | use_cpu: false 17 | 
--------------------------------------------------------------------------------
/config/archer/archer_config.yaml:
--------------------------------------------------------------------------------
1 | defaults:
2 |   - default
3 |   - _self_
4 | 
5 | # checkpoint
6 | checkpoint_path: null
7 | basemodel: 'toolllama'
8 | save_path: 'output/archer_baseline/'
9 | env_load_path: 'data_train/archer/'
10 | 
11 | # model
12 | agent_type: "archer_toolllama"
13 | policy_lm: 'ToolBench/ToolLLaMA-2-7b-v2'
14 | max_new_tokens: 512
15 | use_bfloat16: True
16 | use_lora: True
17 | eos_str: ''
18 | 
19 | save_freq: 50
20 | eval_freq: 5
21 | 
22 | capacity: 100000 #replay buffer size
23 | rollout_size: 16 #number of rollout trajectories for each update
24 | eval_size: 4 #number of trajectories for evaluation
25 | batch_size: 4
26 | iterations: 100 #total number of iterations
27 | epochs: 20 #number of epochs for the critic each iteration
28 | actor_epochs: 1 #number of epochs for the actor each iteration
29 | warmup_iter: 10 #number of iterations without updating the policy
30 | grad_accum_steps: 8
31 | do_sample: True
32 | temperature: 1.0
33 | critic_lr: 1e-5
34 | lm_lr: 2e-6
35 | env_idx: null #set to null if don't want to reset to a specific environment
36 | gamma: 0.95 #discount factor
37 | tau: 0.1 #soft update parameter
38 | max_grad_norm: 10.0
39 | 
40 | # wandb logging
41 | use_wandb: True
42 | project_name: 'archer_baseline'
43 | run_name: 'toolllama_archer_iter100_epoch20_actor1'
44 | 
--------------------------------------------------------------------------------
/config/archer/default.yaml:
--------------------------------------------------------------------------------
1 | #cache directory of transformer
2 | cache_dir: '~/.cache/huggingface/hub/'
3 | 
4 | #token
5 | huggingface_token: ''
6 | wandb_key: ""
7 | 
8 | policy_lm: "gpt2"
9 | critic_lm: "roberta-base"
10 | agent_type: "archer_toolllama"
11 | use_baseline: False
12 | use_lora: False
13 | max_new_tokens: 32
14 | save_freq: 25
15 | eval_freq: 25
16 | 
17 | #training hyperparameters
18 | capacity: 100000 #replay buffer size
19 | rollout_size: 128 #number of rollout trajectories for each update
20 | eval_size: 32 #number of trajectories for evaluation
21 | batch_size: 8
22 | iterations: 2000 #total number of iterations
23 | epochs: 50 #number of epochs for the critic each iteration
24 | actor_epochs: 3 #number of epochs for the actor each iteration
25 | warmup_iter: 20 #number of iterations without updating the policy
26 | grad_accum_steps: 32
27 | do_sample: True
28 | temperature: 1.0
29 | critic_lr: 1e-5
30 | lm_lr: 1e-5
31 | env_idx: null #set to null if don't want to reset to a specific environment
32 | gamma: 0.95 #discount factor
33 | tau: 0.1 #soft update parameter
34 | max_grad_norm: 1.0
35 | 
36 | use_wandb: False
--------------------------------------------------------------------------------
/config/ds_configs/stage3-cosine.json:
--------------------------------------------------------------------------------
1 | {
2 |     "bf16": {
3 |         "enabled": "auto"
4 |     },
5 |     "fp16": {
6 |         "enabled": "auto",
7 |         "loss_scale": 0,
8 |         "loss_scale_window": 1000,
9 |         "initial_scale_power": 16,
10 |         "hysteresis": 2,
11 |         "min_loss_scale": 1
12 |     },
13 |     "zero_optimization": {
14 |         "stage": 3,
15 |         "offload_optimizer": {
16 |             "device": "cpu",
17 |             "pin_memory": true
18 |         },
19 |         "offload_param": {
20 |             "device": "cpu",
21 |             "pin_memory": true
22 |         },
23 |         "overlap_comm": true,
24 |         "contiguous_gradients": true,
25 |         "sub_group_size": 1e9,
26 |         "reduce_bucket_size": "auto",
27 |         "stage3_prefetch_bucket_size": "auto",
28 |         "stage3_param_persistence_threshold": "auto",
29 |         "stage3_max_live_parameters": 1e9,
30 |         "stage3_max_reuse_distance": 1e9,
31 |         "gather_16bit_weights_on_model_save": true
32 |     },
33 |     "gradient_accumulation_steps": "auto",
34 |     "gradient_clipping": "auto",
35 |     "steps_per_print": 1e5,
36 |     "train_batch_size": "auto",
37 |     "train_micro_batch_size_per_gpu": "auto",
38 |     "wall_clock_breakdown": false
39 | }
--------------------------------------------------------------------------------
/config/llama3-1/StepTool_ppo.json:
--------------------------------------------------------------------------------
1 | {
2 |     "peft_kwargs": {
3 |         "r": 8,
4 |         "lora_alpha": 16,
5 |         "bias": "none",
6 |         "task_type": "CAUSAL_LM"
7 |     },
8 |     "ppo_kwargs": {
9 |         "learning_rate": 1e-5,
10 |         "log_with": "wandb",
11 |         "remove_unused_columns": false,
12 |         "batch_size": 8,
13 |         "mini_batch_size": 2,
14 |         "gradient_accumulation_steps": 4,
15 |         "kl_penalty": "kl",
16 |         "init_kl_coef": 0.3,
17 |         "target_kl": 6,
18 |         "target": 6,
19 |         "horizon": 10000,
20 |         "gamma": 0.99
21 |     }
22 | }
--------------------------------------------------------------------------------
/config/qwen2/StepTool_ppo.json:
--------------------------------------------------------------------------------
1 | {
2 |     "peft_kwargs": {
3 |         "target_modules": ["gate_proj", "o_proj", "k_proj", "q_proj", "up_proj", "down_proj", "v_proj"],
4 |         "r": 8,
5 |         "lora_alpha": 16,
6 |         "bias": "none",
7 |         "task_type": "CAUSAL_LM"
8 |     },
9 |     "ppo_kwargs": {
10 |         "learning_rate": 1e-5,
11 |         "log_with": "wandb",
12 |         "remove_unused_columns": false,
13 |         "batch_size": 8,
14 |         "mini_batch_size": 2,
15 |         "gradient_accumulation_steps": 4,
16 |         "kl_penalty": "kl",
17 |         "init_kl_coef": 0.3,
18 |         "target_kl": 6,
19 |         "target": 6,
20 |         "horizon": 10000,
21 |         "gamma": 0.99
22 |     }
23 | }
--------------------------------------------------------------------------------
/config/toolllama/StepTool_ppo.json:
--------------------------------------------------------------------------------
1 | {
2 |     "peft_kwargs": {
3 |         "r": 16,
4 |         "lora_alpha": 16,
5 |         "bias": "none",
6 |         "task_type": "CAUSAL_LM"
7 |     },
8 |     "ppo_kwargs": {
9 |         "seed": 2024,
10 |         "learning_rate": 1e-5,
11 |         "log_with": "wandb",
12 |         "remove_unused_columns": false,
13 |         "batch_size": 8,
14 |         "mini_batch_size": 2,
15 |         "gradient_accumulation_steps": 4,
16 |         "kl_penalty": "kl",
17 |         "init_kl_coef": 0.3,
18 |         "target_kl": 6,
19 |         "target": 6,
20 |         "horizon": 10000,
21 |         "gamma": 0.99
22 |     }
23 | }
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | accelerate==0.33.0
2 | datasets==2.21.0
3 | trl==0.10.1
4 | wandb==0.17.8
5 | fastapi==0.95.1
6 | gradio==3.23.0
7 | httpx==0.24.0
8 | markdown-it-py==2.2.0
9 | numpy==1.24.3
10 | prompt_toolkit==3.0.47
11 | pydantic==1.10.7
12 | requests==2.32.3
13 | rich==13.3.5
14 | rouge==1.0.1
15 | sentencepiece==0.1.99
16 | shortuuid==1.0.11
17 | tiktoken==0.4.0
18 | tokenizers==0.19.1
19 | transformers==4.43.1
20 | uvicorn==0.22.0
21 | bitsandbytes==0.43.3
22 | peft==0.5.0
23 | langchain==0.0.229
24 | deepspeed==0.14.5
25 | sentence-transformers==2.2.2
26 | tensorboard==2.17.1
27 | openai==1.42.0
28 | scipy==1.14.1
29 | termcolor==2.4.0
30 | Flask==3.0.3
31 | Flask-Cors==4.0.1
32 | backoff==2.2.1
33 | slowapi==0.1.9
34 | httpx==0.24.0
35 | omegaconf==2.3.0
36 | 
--------------------------------------------------------------------------------
/scripts/baseline-archer/build_data.sh:
--------------------------------------------------------------------------------
1 | export DATA_FILE="data_train/toolllama/step_grained_for_ppo.csv"
2 | export SAVE_PATH="data_train/archer"
3 | 
4 | python src/baseline-archer/build_archer_data.py
--------------------------------------------------------------------------------
/scripts/baseline-archer/train_archer.sh:
--------------------------------------------------------------------------------
1 | 
2 | export ARCHER_CONFIG_NAME="archer_config.yaml"
3 | 
4 | accelerate launch --config_file config/archer/accelerate_config.yaml src/baseline-archer/run.py
--------------------------------------------------------------------------------
/scripts/baseline-eto/train_dpo.sh:
--------------------------------------------------------------------------------
1 | export TRAIN_PATH="data_train/eto"
2 | export CUDA_VISIBLE_DEVICES=0,1
3 | export WANDB_PROJECT="baselines"
4 | 
5 | python src/baseline-eto/dpo_train.py \
6 |     --model_name_or_path ToolBench/ToolLLaMA-2-7b-v2 \
7 |     --data_path ${TRAIN_PATH}/dpo_data_example.csv \
8 |     --bf16 True \
9 |     --output_dir "output/eto_baseline-3epoch" \
10 |     --report_to "wandb" \
11 |     --run_name "eto_baseline-3epoch" \
12 |     --num_train_epochs 3 \
13 |     --per_device_train_batch_size 1 \
14 |     --per_device_eval_batch_size 1 \
15 |     --gradient_accumulation_steps 8 \
16 |     --eval_strategy "epoch" \
17 |     --save_strategy "epoch" \
18 |     --save_total_limit 10 \
19 |     --seed 2024 \
20 |     --learning_rate 1e-4 \
21 |     --lr_scheduler_type "cosine" \
22 |     --logging_steps 1 \
23 |     --model_max_length 8192 \
24 |     --max_prompt_length 7000 \
25 |     --beta 0.1
--------------------------------------------------------------------------------
/scripts/baseline-ppo/train_toolllama.sh:
--------------------------------------------------------------------------------
1 | export PYTHONPATH=./
2 | export TRAIN_PATH="data_train"
3 | export TRAIN_SET="step_grained_for_ppo_example"
4 | export CUDA_VISIBLE_DEVICES="0,1,2,3"
5 | 
6 | export MODEL_TYPE="toolllama"
7 | # load the base model after sft pretrain
8 | export MODEL_PATH="ToolBench/ToolLLaMA-2-7b-v2"
9 | 
10 | python src/baseline-ppo/ppo.py \
11 |     --model_path ${MODEL_PATH} \
12 |     --model_type ${MODEL_TYPE} \
13 |     --config_path config/${MODEL_TYPE}/StepTool_ppo.json \
14 |     --data_file ${TRAIN_PATH}/${MODEL_TYPE}/${TRAIN_SET}.csv \
15 |     --max_context_len 4096 \
16 |     --max_response_len 1024 \
17 |     --epochs 5
18 | 
--------------------------------------------------------------------------------
/scripts/baseline-rft/train_rft.sh:
--------------------------------------------------------------------------------
1 | export TRAIN_PATH="data_train/rft"
2 | export NCCL_P2P_DISABLE=1
3 | export NCCL_IB_DISABLE=1
4 | export CUDA_VISIBLE_DEVICES=0,1,2,3
5 | export WANDB_PROJECT="baselines"
6 | torchrun \
7 |     --nproc_per_node 4 \
8 |     --nnodes 1 \
9 |     --node_rank 0 \
10 |     --master_addr localhost \
11 |     --master_port 6601 \
12 |     src/baseline-rft/rft.py \
13 |     --model_name_or_path ToolBench/ToolLLaMA-2-7b-v2 \
14 |     --data_path ${TRAIN_PATH}/rft_data_example.json \
15 |     --bf16 True \
16 |     --output_dir "output/rft_baseline-3epoch" \
17 |     --report_to "wandb" \
18 |     --run_name "rft_baseline-3epoch" \
19 |     --num_train_epochs 3 \
20 |     --per_device_train_batch_size 2 \
21 |     --per_device_eval_batch_size 2 \
22 |     --gradient_accumulation_steps 8 \
23 |     --eval_strategy "epoch" \
24 |     --save_strategy "epoch" \
25 |     --save_total_limit 10 \
26 |     --seed 2024 \
27 |     --learning_rate 5e-5 \
28 |     --weight_decay 0. \
29 |     --warmup_ratio 0.04 \
30 |     --lr_scheduler_type "cosine" \
31 |     --logging_steps 1 \
32 |     --model_max_length 8192 \
33 |     --gradient_checkpointing True \
34 |     --lazy_preprocess False \
35 |     --deepspeed config/ds_configs/stage3-cosine.json
36 | 
--------------------------------------------------------------------------------
/scripts/reward/annotation_with_gpt.sh:
--------------------------------------------------------------------------------
1 | # cd ../../toolbench/tooleval
2 | # export API_POOL_FILE=path/to/your/openai_key_json_file.json
3 | export PYTHONPATH="./:./stabletoolbench/toolbench/tooleval"
4 | export API_POOL_FILE=src/reward/openai_key.json
5 | export CONVERTED_ANSWER_PATH=data/model_predictions_converted
6 | export SAVE_PATH=data/reward_annotation/
7 | mkdir -p ${SAVE_PATH}
8 | 
9 | # export CANDIDATE_MODEL="virtual_qwen2_sft_dfs_fix_epoch3"
10 | export CANDIDATE_MODEL="qwen2"
11 | export EVAL_MODEL="gpt-4-turbo-2024-04-09"
12 | mkdir -p ${SAVE_PATH}/${CANDIDATE_MODEL}
13 | # unset HTTP_PROXY HTTPS_PROXY http_proxy https_proxy
14 | # --evaluators_cfg_path \
15 | python src/reward/annotation_with_gpt.py \
16 |     --converted_answer_path ${CONVERTED_ANSWER_PATH}/${CANDIDATE_MODEL} \
17 |     --save_path ${SAVE_PATH}/${CANDIDATE_MODEL} \
18 |     --reference_model ${CANDIDATE_MODEL} \
19 |     --evaluator ${EVAL_MODEL} \
20 |     --max_eval_threads 1 \
21 |     --task_num 5 \
22 |     --evaluate_times 3 \
23 |     --test_set G123_example \
--------------------------------------------------------------------------------
/scripts/sft/train_llama3-1.sh:
--------------------------------------------------------------------------------
1 | export NCCL_P2P_DISABLE=1
2 | export NCCL_IB_DISABLE=1
3 | export CUDA_VISIBLE_DEVICES=0,1,2,3
4 | export TRAIN_PATH="data_train"
5 | export TRAIN_SET="gpt4_dfs_G123_for_sft"
6 | 
7 | export MODEL_PATH="meta-llama/Meta-Llama-3.1-8B-Instruct"
8 | export MODEL_TYPE="llama3-1"
9 | export OUTPUT_DIR="sft_ckpts"
10 | export WANDB_PROJECT="SFT-Llama3-1"
11 | export WANDB_RUN_NAME="sft_with_gpt4_paths"
12 | 
13 | torchrun \
14 |     --nproc_per_node 4 \
15 |     --nnodes 1 \
16 |     --node_rank 0 \
17 |     --master_addr localhost \
18 |     --master_port 6601 \
19 |     src/sft/llama3-1.py \
20 |     --model_name_or_path ${MODEL_PATH} \
21 |     --data_path ${TRAIN_PATH}/${MODEL_TYPE}/${TRAIN_SET}.json \
22 |     --bf16 True \
23 |     --output_dir ${OUTPUT_DIR}/${MODEL_TYPE} \
24 |     --report_to "wandb" \
25 |     --run_name ${WANDB_RUN_NAME} \
26 |     --num_train_epochs 5 \
27 |     --per_device_train_batch_size 1 \
28 |     --per_device_eval_batch_size 1 \
29 |     --gradient_accumulation_steps 4 \
30 |     --eval_strategy "steps" \
31 |     --eval_steps 400 \
32 |     --save_strategy "steps" \
33 |     --save_steps 400 \
34 |     --save_total_limit 10 \
35 |     --learning_rate 2e-5 \
36 |     --weight_decay 0. \
37 |     --warmup_ratio 0.04 \
38 |     --lr_scheduler_type "cosine" \
39 |     --logging_steps 1 \
40 |     --model_max_length 8192 \
41 |     --gradient_checkpointing True \
42 |     --lazy_preprocess False \
43 |     --deepspeed config/ds_configs/stage3-cosine.json
44 | 
--------------------------------------------------------------------------------
/scripts/sft/train_qwen2.sh:
--------------------------------------------------------------------------------
1 | export NCCL_P2P_DISABLE=1
2 | export NCCL_IB_DISABLE=1
3 | export CUDA_VISIBLE_DEVICES=0,1,2,3
4 | export TRAIN_PATH="data_train"
5 | export TRAIN_SET="gpt4_dfs_G123_for_sft"
6 | 
7 | export MODEL_PATH="Qwen/Qwen2-7B-Instruct"
8 | export MODEL_TYPE="qwen2"
9 | export OUTPUT_DIR="sft_ckpts"
10 | export WANDB_PROJECT="SFT-Qwen2"
11 | export WANDB_RUN_NAME="sft_with_gpt4_paths"
12 | 
13 | torchrun \
14 |     --nproc_per_node 4 \
15 |     --nnodes 1 \
16 |     --node_rank 0 \
17 |     --master_addr localhost \
18 |     --master_port 6601 \
19 |     src/sft/qwen2.py \
20 |     --model_name_or_path ${MODEL_PATH} \
21 |     --data_path ${TRAIN_PATH}/${MODEL_TYPE}/${TRAIN_SET}.json \
22 |     --bf16 True \
23 |     --output_dir ${OUTPUT_DIR}/${MODEL_TYPE} \
24 |     --report_to "wandb" \
25 |     --run_name ${WANDB_RUN_NAME} \
26 |     --num_train_epochs 5 \
27 |     --per_device_train_batch_size 1 \
28 |     --per_device_eval_batch_size 1 \
29 |     --gradient_accumulation_steps 4 \
30 |     --eval_strategy "steps" \
31 |     --eval_steps 400 \
32 |     --save_strategy "steps" \
33 |     --save_steps 400 \
34 |     --save_total_limit 10 \
35 |     --learning_rate 2e-5 \
36 |     --weight_decay 0. \
37 |     --warmup_ratio 0.04 \
38 |     --lr_scheduler_type "cosine" \
39 |     --logging_steps 1 \
40 |     --model_max_length 8192 \
41 |     --gradient_checkpointing True \
42 |     --lazy_preprocess False \
43 |     --deepspeed config/ds_configs/stage3-cosine.json
44 | 
--------------------------------------------------------------------------------
/scripts/steptool_train/train_llama3-1.sh:
--------------------------------------------------------------------------------
1 | export PYTHONPATH=./
2 | export TRAIN_PATH="data_train"
3 | export TRAIN_SET="step_grained_for_ppo_example"
4 | export CUDA_VISIBLE_DEVICES="0,1,2,3"
5 | 
6 | export MODEL_TYPE="llama3-1"
7 | # load the base model after sft pretrain
8 | export MODEL_PATH="sft-ckpts/llama3-1/checkpoint-3600"
9 | 
10 | python src/steptool/step_ppo.py \
11 |     --model_path ${MODEL_PATH} \
12 |     --model_type ${MODEL_TYPE} \
13 |     --config_path config/${MODEL_TYPE}/StepTool_ppo.json \
14 |     --data_file ${TRAIN_PATH}/${MODEL_TYPE}/${TRAIN_SET}.csv \
15 |     --epochs 5
16 | 
17 | 
--------------------------------------------------------------------------------
/scripts/steptool_train/train_qwen2.sh:
--------------------------------------------------------------------------------
1 | export PYTHONPATH=./
2 | export TRAIN_PATH="data_train"
3 | export TRAIN_SET="step_grained_for_ppo_example"
4 | export CUDA_VISIBLE_DEVICES="0,1,2,3"
5 | 
6 | export MODEL_TYPE="qwen2"
7 | # load the base model after sft pretrain
8 | export MODEL_PATH="sft-ckpts/qwen2/checkpoint-3639"
9 | 
10 | python src/steptool/step_ppo.py \
11 |     --model_path ${MODEL_PATH} \
12 |     --model_type ${MODEL_TYPE} \
13 |     --config_path config/${MODEL_TYPE}/StepTool_ppo.json \
14 |     --data_file ${TRAIN_PATH}/${MODEL_TYPE}/${TRAIN_SET}.csv \
15 |     --epochs 5
16 | 
17 | 
--------------------------------------------------------------------------------
/scripts/steptool_train/train_toolllama.sh:
--------------------------------------------------------------------------------
1 | export PYTHONPATH=./
2 | 
export TRAIN_PATH="data_train" 3 | export TRAIN_SET="step_grained_for_ppo_example" 4 | export CUDA_VISIBLE_DEVICES="0,1,2,3" 5 | 6 | export MODEL_TYPE="toolllama" 7 | # load the base model after sft pretrain 8 | export MODEL_PATH="ToolBench/ToolLLaMA-2-7b-v2" 9 | 10 | python src/steptool/step_ppo.py \ 11 | --model_path ${MODEL_PATH} \ 12 | --model_type ${MODEL_TYPE} \ 13 | --config_path config/${MODEL_TYPE}/StepTool_ppo.json \ 14 | --data_file ${TRAIN_PATH}/${MODEL_TYPE}/${TRAIN_SET}.csv \ 15 | --max_context_len 4096 \ 16 | --max_response_len 1024 \ 17 | --epochs 5 18 | -------------------------------------------------------------------------------- /scripts_eval/baseline-archer/inference_archer_vllm.sh: -------------------------------------------------------------------------------- 1 | cd stabletoolbench 2 | export PYTHONPATH=./ 3 | export VLLM_API_BASE="http://127.0.0.1:8084/v1/" # the address of vllm.server 4 | export SERVICE_URL="http://localhost:8081/virtual" # the address of api server 5 | export MODEL_PATH="baseline-archer" # the name of vllm.server 6 | export STRATEGY="DFS_woFilter_w2" # or CoT@1 7 | 8 | export OUTPUT_DIR="data_eval/answer/baseline-archer_cot" # change it accordingly 9 | 10 | group=G1_instruction # G1_category, G1_tool, G2_category, G2_instruction, G3_instruction 11 | mkdir -p $OUTPUT_DIR; mkdir -p $OUTPUT_DIR/$group 12 | python toolbench/inference/qa_pipeline_multithread.py \ 13 | --backbone_model ToolLLaMA_vllm \ 14 | --model_path ${MODEL_PATH} \ 15 | --max_observation_length 1024 \ 16 | --method ${STRATEGY} \ 17 | --input_query_file solvable_queries/test_instruction/${group}.json \ 18 | --output_answer_file $OUTPUT_DIR/$group \ 19 | --max_query_count 30 \ 20 | --num_thread 4 -------------------------------------------------------------------------------- /scripts_eval/baseline-archer/run_convert_answer.sh: -------------------------------------------------------------------------------- 1 | cd stabletoolbench/toolbench/tooleval 2 | export RAW_ANSWER_PATH=../../../data_eval/answer 3 | export CONVERTED_ANSWER_PATH=../../../data_eval/model_predictions_converted 4 | export MODEL_NAME=baseline-archer_dfs # change it accordingly 5 | export STRATEGY="DFS_woFilter_w2" # or CoT@1 6 | export test_set=G1_instruction # G1_category, G1_tool, G2_category, G2_instruction, G3_instruction 7 | 8 | mkdir -p ${CONVERTED_ANSWER_PATH}/${MODEL_NAME} 9 | answer_dir=${RAW_ANSWER_PATH}/${MODEL_NAME}/${test_set} 10 | output_file=${CONVERTED_ANSWER_PATH}/${MODEL_NAME}/${test_set}.json 11 | 12 | python convert_to_answer_format.py\ 13 | --answer_dir ${answer_dir} \ 14 | --method ${STRATEGY} \ 15 | --output ${output_file} -------------------------------------------------------------------------------- /scripts_eval/baseline-archer/run_pass_rate.sh: -------------------------------------------------------------------------------- 1 | cd stabletoolbench/toolbench/tooleval 2 | export API_POOL_FILE=../../openai_key.json 3 | export CONVERTED_ANSWER_PATH=../../../data_eval/model_predictions_converted 4 | export SAVE_PATH=../../../data_eval/pass_rate_results 5 | mkdir -p ${SAVE_PATH} 6 | export CANDIDATE_MODEL="baseline-archer_cot" # change it accordingly 7 | export EVAL_MODEL=gpt-4-turbo-2024-04-09 8 | mkdir -p ${SAVE_PATH}/${CANDIDATE_MODEL} 9 | 10 | export test_set=G1_instruction # G1_category, G1_tool, G2_category, G2_instruction, G3_instruction 11 | 12 | python eval_pass_rate.py \ 13 | --converted_answer_path ${CONVERTED_ANSWER_PATH} \ 14 | --save_path ${SAVE_PATH}/${CANDIDATE_MODEL} \ 15 | 
--reference_model ${CANDIDATE_MODEL} \ 16 | --test_ids ../../solvable_queries/test_query_ids \ 17 | --max_eval_threads 1 \ 18 | --evaluate_times 3 \ 19 | --test_set ${test_set} \ 20 | # --overwrite -------------------------------------------------------------------------------- /scripts_eval/baseline-eto/inference_eto_vllm.sh: -------------------------------------------------------------------------------- 1 | cd stabletoolbench 2 | export PYTHONPATH=./ 3 | export VLLM_API_BASE="http://127.0.0.1:8084/v1/" # the address of vllm.server 4 | export SERVICE_URL="http://localhost:8081/virtual" # the address of api server 5 | export MODEL_PATH="baseline-eto" # the name of vllm.server 6 | export STRATEGY="DFS_woFilter_w2" # or CoT@1 7 | 8 | export OUTPUT_DIR="data_eval/answer/baseline-eto_cot" # change it accordingly 9 | 10 | group=G1_instruction # G1_category, G1_tool, G2_category, G2_instruction, G3_instruction 11 | mkdir -p $OUTPUT_DIR; mkdir -p $OUTPUT_DIR/$group 12 | python toolbench/inference/qa_pipeline_multithread.py \ 13 | --backbone_model ToolLLaMA_vllm \ 14 | --model_path ${MODEL_PATH} \ 15 | --max_observation_length 1024 \ 16 | --method ${STRATEGY} \ 17 | --input_query_file solvable_queries/test_instruction/${group}.json \ 18 | --output_answer_file $OUTPUT_DIR/$group \ 19 | --max_query_count 30 \ 20 | --num_thread 4 -------------------------------------------------------------------------------- /scripts_eval/baseline-eto/run_convert_answer.sh: -------------------------------------------------------------------------------- 1 | cd stabletoolbench/toolbench/tooleval 2 | export RAW_ANSWER_PATH=../../../data_eval/answer 3 | export CONVERTED_ANSWER_PATH=../../../data_eval/model_predictions_converted 4 | export MODEL_NAME=baseline-eto_dfs # change it accordingly 5 | export STRATEGY="DFS_woFilter_w2" # or CoT@1 6 | export test_set=G2_instruction # G1_category, G1_tool, G2_category, G2_instruction, G3_instruction 7 | 8 | mkdir -p ${CONVERTED_ANSWER_PATH}/${MODEL_NAME} 9 | answer_dir=${RAW_ANSWER_PATH}/${MODEL_NAME}/${test_set} 10 | output_file=${CONVERTED_ANSWER_PATH}/${MODEL_NAME}/${test_set}.json 11 | 12 | python convert_to_answer_format.py\ 13 | --answer_dir ${answer_dir} \ 14 | --method ${STRATEGY} \ 15 | --output ${output_file} -------------------------------------------------------------------------------- /scripts_eval/baseline-eto/run_pass_rate.sh: -------------------------------------------------------------------------------- 1 | cd stabletoolbench/toolbench/tooleval 2 | export API_POOL_FILE=../../openai_key.json 3 | export CONVERTED_ANSWER_PATH=../../../data_eval/model_predictions_converted 4 | export SAVE_PATH=../../../data_eval/pass_rate_results 5 | mkdir -p ${SAVE_PATH} 6 | export CANDIDATE_MODEL="baseline-eto_dfs" # change it accordingly 7 | export EVAL_MODEL=gpt-4-turbo-2024-04-09 8 | mkdir -p ${SAVE_PATH}/${CANDIDATE_MODEL} 9 | 10 | export test_set=G2_instruction # G1_category, G1_tool, G2_category, G2_instruction, G3_instruction 11 | 12 | python eval_pass_rate.py \ 13 | --converted_answer_path ${CONVERTED_ANSWER_PATH} \ 14 | --save_path ${SAVE_PATH}/${CANDIDATE_MODEL} \ 15 | --reference_model ${CANDIDATE_MODEL} \ 16 | --test_ids ../../solvable_queries/test_query_ids \ 17 | --max_eval_threads 15 \ 18 | --evaluate_times 3 \ 19 | --test_set ${test_set} \ 20 | # --overwrite -------------------------------------------------------------------------------- /scripts_eval/baseline-ppo/inference_ppo_vllm.sh: 
-------------------------------------------------------------------------------- 1 | cd stabletoolbench 2 | export PYTHONPATH=./ 3 | export VLLM_API_BASE="http://127.0.0.1:8084/v1/" # the address of vllm.server 4 | export SERVICE_URL="http://localhost:8081/virtual" # the address of api server 5 | export MODEL_PATH="baseline-ppo" # the name of vllm.server 6 | export STRATEGY="DFS_woFilter_w2" # or CoT@1 7 | 8 | export OUTPUT_DIR="data_eval/answer/baseline-ppo_cot" # change it accordingly 9 | 10 | group=G1_instruction # G1_category, G1_tool, G2_category, G2_instruction, G3_instruction 11 | mkdir -p $OUTPUT_DIR; mkdir -p $OUTPUT_DIR/$group 12 | python toolbench/inference/qa_pipeline_multithread.py \ 13 | --backbone_model ToolLLaMA_vllm \ 14 | --model_path ${MODEL_PATH} \ 15 | --max_observation_length 1024 \ 16 | --method ${STRATEGY} \ 17 | --input_query_file solvable_queries/test_instruction/${group}.json \ 18 | --output_answer_file $OUTPUT_DIR/$group \ 19 | --max_query_count 30 \ 20 | --num_thread 4 -------------------------------------------------------------------------------- /scripts_eval/baseline-ppo/run_convert_answer.sh: -------------------------------------------------------------------------------- 1 | cd stabletoolbench/toolbench/tooleval 2 | export RAW_ANSWER_PATH=../../../data_eval/answer 3 | export CONVERTED_ANSWER_PATH=../../../data_eval/model_predictions_converted 4 | export MODEL_NAME=baseline-ppo_dfs # change it accordingly 5 | export STRATEGY="DFS_woFilter_w2" # or CoT@1 6 | export test_set=G1_instruction # G1_category, G1_tool, G2_category, G2_instruction, G3_instruction 7 | 8 | mkdir -p ${CONVERTED_ANSWER_PATH}/${MODEL_NAME} 9 | answer_dir=${RAW_ANSWER_PATH}/${MODEL_NAME}/${test_set} 10 | output_file=${CONVERTED_ANSWER_PATH}/${MODEL_NAME}/${test_set}.json 11 | 12 | python convert_to_answer_format.py\ 13 | --answer_dir ${answer_dir} \ 14 | --method ${STRATEGY} \ 15 | --output ${output_file} -------------------------------------------------------------------------------- /scripts_eval/baseline-ppo/run_pass_rate.sh: -------------------------------------------------------------------------------- 1 | cd stabletoolbench/toolbench/tooleval 2 | export API_POOL_FILE=../../openai_key.json 3 | export CONVERTED_ANSWER_PATH=../../../data_eval/model_predictions_converted 4 | export SAVE_PATH=../../../data_eval/pass_rate_results 5 | mkdir -p ${SAVE_PATH} 6 | export CANDIDATE_MODEL="baseline-ppo_dfs" # change it accordingly 7 | export EVAL_MODEL=gpt-4-turbo-2024-04-09 8 | mkdir -p ${SAVE_PATH}/${CANDIDATE_MODEL} 9 | 10 | export test_set=G1_instruction # G1_category, G1_tool, G2_category, G2_instruction, G3_instruction 11 | 12 | python eval_pass_rate.py \ 13 | --converted_answer_path ${CONVERTED_ANSWER_PATH} \ 14 | --save_path ${SAVE_PATH}/${CANDIDATE_MODEL} \ 15 | --reference_model ${CANDIDATE_MODEL} \ 16 | --test_ids ../../solvable_queries/test_query_ids \ 17 | --max_eval_threads 1 \ 18 | --evaluate_times 3 \ 19 | --test_set ${test_set} \ 20 | # --overwrite -------------------------------------------------------------------------------- /scripts_eval/baseline-rft/inference_rft_vllm.sh: -------------------------------------------------------------------------------- 1 | cd stabletoolbench 2 | export PYTHONPATH=./ 3 | export VLLM_API_BASE="http://127.0.0.1:8084/v1/" # the address of vllm.server 4 | export SERVICE_URL="http://localhost:8081/virtual" # the address of api server 5 | export MODEL_PATH="baseline-rft" # the name of vllm.server 6 | export STRATEGY="DFS_woFilter_w2" # or CoT@1 7 
| 8 | export OUTPUT_DIR="data_eval/answer/baseline-rft_cot" # change it accordingly 9 | 10 | group=G1_instruction # G1_category, G1_tool, G2_category, G2_instruction, G3_instruction 11 | mkdir -p $OUTPUT_DIR; mkdir -p $OUTPUT_DIR/$group 12 | python toolbench/inference/qa_pipeline_multithread.py \ 13 | --backbone_model ToolLLaMA_vllm \ 14 | --model_path ${MODEL_PATH} \ 15 | --max_observation_length 1024 \ 16 | --method ${STRATEGY} \ 17 | --input_query_file solvable_queries/test_instruction/${group}.json \ 18 | --output_answer_file $OUTPUT_DIR/$group \ 19 | --max_query_count 30 \ 20 | --num_thread 4 -------------------------------------------------------------------------------- /scripts_eval/baseline-rft/run_convert_answer.sh: -------------------------------------------------------------------------------- 1 | cd stabletoolbench/toolbench/tooleval 2 | export RAW_ANSWER_PATH=../../../data_eval/answer 3 | export CONVERTED_ANSWER_PATH=../../../data_eval/model_predictions_converted 4 | export MODEL_NAME=baseline-rft_dfs # change it accordingly 5 | export STRATEGY="DFS_woFilter_w2" # or CoT@1 6 | export test_set=G1_instruction # G1_category, G1_tool, G2_category, G2_instruction, G3_instruction 7 | 8 | mkdir -p ${CONVERTED_ANSWER_PATH}/${MODEL_NAME} 9 | answer_dir=${RAW_ANSWER_PATH}/${MODEL_NAME}/${test_set} 10 | output_file=${CONVERTED_ANSWER_PATH}/${MODEL_NAME}/${test_set}.json 11 | 12 | python convert_to_answer_format.py\ 13 | --answer_dir ${answer_dir} \ 14 | --method ${STRATEGY} \ 15 | --output ${output_file} -------------------------------------------------------------------------------- /scripts_eval/baseline-rft/run_pass_rate.sh: -------------------------------------------------------------------------------- 1 | cd stabletoolbench/toolbench/tooleval 2 | export API_POOL_FILE=../../openai_key.json 3 | export CONVERTED_ANSWER_PATH=../../../data_eval/model_predictions_converted 4 | export SAVE_PATH=../../../data_eval/pass_rate_results 5 | mkdir -p ${SAVE_PATH} 6 | export CANDIDATE_MODEL="baseline-rft_dfs" # change it accordingly 7 | export EVAL_MODEL=gpt-4-turbo-2024-04-09 8 | mkdir -p ${SAVE_PATH}/${CANDIDATE_MODEL} 9 | 10 | export test_set=G1_instruction # G1_category, G1_tool, G2_category, G2_instruction, G3_instruction 11 | 12 | python eval_pass_rate.py \ 13 | --converted_answer_path ${CONVERTED_ANSWER_PATH} \ 14 | --save_path ${SAVE_PATH}/${CANDIDATE_MODEL} \ 15 | --reference_model ${CANDIDATE_MODEL} \ 16 | --test_ids ../../solvable_queries/test_query_ids \ 17 | --max_eval_threads 1 \ 18 | --evaluate_times 3 \ 19 | --test_set ${test_set} \ 20 | # --overwrite -------------------------------------------------------------------------------- /scripts_eval/llama3-1/inference_llama3-1_vllm.sh: -------------------------------------------------------------------------------- 1 | cd stabletoolbench 2 | export PYTHONPATH=./ 3 | export VLLM_API_BASE="http://127.0.0.1:8085/v1/" # the address of vllm.server 4 | export SERVICE_URL="http://127.0.0.1:8081/virtual" # the address of api server 5 | export MODEL_PATH="llama3-1" # the name of vllm.server 6 | export STRATEGY="DFS_woFilter_w2" # or CoT@1 7 | 8 | export OUTPUT_DIR="data_eval/answer/virtual_llama3-1_dfs" # change it accordingly 9 | 10 | group=G1_instruction # G1_category, G1_tool, G2_category, G2_instruction, G3_instruction 11 | mkdir -p $OUTPUT_DIR; mkdir -p $OUTPUT_DIR/$group 12 | python toolbench/inference/qa_pipeline_multithread.py \ 13 | --backbone_model llama3 \ 14 | --model_path ${MODEL_PATH} \ 15 | --max_observation_length 1024 \ 16 | 
--method ${STRATEGY} \ 17 | --input_query_file solvable_queries/test_instruction/${group}.json \ 18 | --output_answer_file $OUTPUT_DIR/$group \ 19 | --max_query_count 30 \ 20 | --num_thread 4 -------------------------------------------------------------------------------- /scripts_eval/llama3-1/run_conver_answer.sh: -------------------------------------------------------------------------------- 1 | cd stabletoolbench/toolbench/tooleval 2 | export RAW_ANSWER_PATH=../../data_eval/answer 3 | export CONVERTED_ANSWER_PATH=../../data_eval/model_predictions_converted 4 | export MODEL_NAME=virtual_llama3-1_dfs # change it accordingly 5 | export STRATEGY="DFS_woFilter_w2" # or CoT@1 6 | export test_set=G1_instruction # G1_category, G1_tool, G2_category, G2_instruction, G3_instruction 7 | 8 | mkdir -p ${CONVERTED_ANSWER_PATH}/${MODEL_NAME} 9 | answer_dir=${RAW_ANSWER_PATH}/${MODEL_NAME}/${test_set} 10 | output_file=${CONVERTED_ANSWER_PATH}/${MODEL_NAME}/${test_set}.json 11 | 12 | python convert_to_answer_format.py\ 13 | --answer_dir ${answer_dir} \ 14 | --method ${STRATEGY} \ 15 | --output ${output_file} -------------------------------------------------------------------------------- /scripts_eval/llama3-1/run_pass_rate.sh: -------------------------------------------------------------------------------- 1 | cd stabletoolbench/toolbench/tooleval 2 | export API_POOL_FILE=../../openai_key.json 3 | export CONVERTED_ANSWER_PATH=../../data_eval/model_predictions_converted 4 | export SAVE_PATH=../../data_eval/pass_rate_results 5 | mkdir -p ${SAVE_PATH} 6 | export CANDIDATE_MODEL="virtual_llama3-1_dfs" # change it accordingly 7 | export EVAL_MODEL=gpt-4-turbo-2024-04-09 8 | mkdir -p ${SAVE_PATH}/${CANDIDATE_MODEL} 9 | 10 | export test_set=G1_instruction # G1_category, G1_tool, G2_category, G2_instruction, G3_instruction 11 | 12 | python eval_pass_rate.py \ 13 | --converted_answer_path ${CONVERTED_ANSWER_PATH} \ 14 | --save_path ${SAVE_PATH}/${CANDIDATE_MODEL} \ 15 | --reference_model ${CANDIDATE_MODEL} \ 16 | --test_ids ../../solvable_queries/test_query_ids \ 17 | --max_eval_threads 1 \ 18 | --evaluate_times 3 \ 19 | --test_set ${test_set} \ 20 | # --overwrite -------------------------------------------------------------------------------- /scripts_eval/llama3-1/run_preference.sh: -------------------------------------------------------------------------------- 1 | cd toolbench/tooleval 2 | export CONVERTED_ANSWER_PATH=../../data_eval/model_predictions_converted 3 | export SAVE_PATH=../../data_eval/preference_results 4 | export PASS_RATE_PATH=../../data_eval/pass_rate_results 5 | 6 | export REFERENCE_MODEL=virtual_gpt3.5-0125_dfs # change it accordingly 7 | export CANDIDATE_MODEL=virtual_llama3-1_dfs # change it accordingly 8 | 9 | export EVAL_MODEL=gpt-4-turbo-2024-04-09 10 | mkdir -p ${SAVE_PATH}/${REFERENCE_MODEL}_${CANDIDATE_MODEL} 11 | 12 | export test_set=G1_instruction # G1_category, G1_tool, G2_category, G2_instruction, G3_instruction 13 | 14 | python eval_preference.py \ 15 | --converted_answer_path ${CONVERTED_ANSWER_PATH} \ 16 | --reference_model ${REFERENCE_MODEL} \ 17 | --output_model ${CANDIDATE_MODEL} \ 18 | --test_ids ../../solvable_queries/test_query_ids/ \ 19 | --save_path ${SAVE_PATH}/${REFERENCE_MODEL}_${CANDIDATE_MODEL} \ 20 | --pass_rate_result_path ${PASS_RATE_PATH} \ 21 | --max_eval_threads 30 \ 22 | --evaluate_times 3 \ 23 | --test_set ${test_set} \ 24 | # --overwrite -------------------------------------------------------------------------------- 
/scripts_eval/qwen2/inference_qwen2_vllm.sh: -------------------------------------------------------------------------------- 1 | cd stabletoolbench 2 | export PYTHONPATH=./ 3 | export VLLM_API_BASE="http://127.0.0.1:8084/v1/" # the address of vllm.server 4 | export SERVICE_URL="http://localhost:8081/virtual" # the address of api server 5 | export MODEL_PATH="qwen2" # the name of vllm.server 6 | export STRATEGY="DFS_woFilter_w2" # or CoT@1 7 | 8 | export OUTPUT_DIR="data_eval/answer/virtual_qwen2_dfs" # change it accordingly 9 | 10 | group=G1_instruction # G1_category, G1_tool, G2_category, G2_instruction, G3_instruction 11 | mkdir -p $OUTPUT_DIR; mkdir -p $OUTPUT_DIR/$group 12 | python toolbench/inference/qa_pipeline_multithread.py \ 13 | --backbone_model qwen2 \ 14 | --model_path ${MODEL_PATH} \ 15 | --max_observation_length 1024 \ 16 | --method ${STRATEGY} \ 17 | --input_query_file solvable_queries/test_instruction/${group}.json \ 18 | --output_answer_file $OUTPUT_DIR/$group \ 19 | --max_query_count 30 \ 20 | --num_thread 4 -------------------------------------------------------------------------------- /scripts_eval/qwen2/run_convert_answer.sh: -------------------------------------------------------------------------------- 1 | cd stabletoolbench/toolbench/tooleval 2 | export RAW_ANSWER_PATH=../../data_eval/answer 3 | export CONVERTED_ANSWER_PATH=../../data_eval/model_predictions_converted 4 | export MODEL_NAME=virtual_qwen2_dfs # change it accordingly 5 | export STRATEGY="DFS_woFilter_w2" # or CoT@1 6 | export test_set=G1_instruction # G1_category, G1_tool, G2_category, G2_instruction, G3_instruction 7 | 8 | mkdir -p ${CONVERTED_ANSWER_PATH}/${MODEL_NAME} 9 | answer_dir=${RAW_ANSWER_PATH}/${MODEL_NAME}/${test_set} 10 | output_file=${CONVERTED_ANSWER_PATH}/${MODEL_NAME}/${test_set}.json 11 | 12 | python convert_to_answer_format.py\ 13 | --answer_dir ${answer_dir} \ 14 | --method ${STRATEGY} \ 15 | --output ${output_file} -------------------------------------------------------------------------------- /scripts_eval/qwen2/run_pass_rate.sh: -------------------------------------------------------------------------------- 1 | cd stabletoolbench/toolbench/tooleval 2 | export API_POOL_FILE=../../openai_key.json 3 | export CONVERTED_ANSWER_PATH=../../data_eval/model_predictions_converted 4 | export SAVE_PATH=../../data_eval/pass_rate_results 5 | mkdir -p ${SAVE_PATH} 6 | export CANDIDATE_MODEL="virtual_qwen2_dfs" # change it accordingly 7 | export EVAL_MODEL=gpt-4-turbo-2024-04-09 8 | mkdir -p ${SAVE_PATH}/${CANDIDATE_MODEL} 9 | 10 | export test_set=G1_instruction # G1_category, G1_tool, G2_category, G2_instruction, G3_instruction 11 | 12 | python eval_pass_rate.py \ 13 | --converted_answer_path ${CONVERTED_ANSWER_PATH} \ 14 | --save_path ${SAVE_PATH}/${CANDIDATE_MODEL} \ 15 | --reference_model ${CANDIDATE_MODEL} \ 16 | --test_ids ../../solvable_queries/test_query_ids \ 17 | --max_eval_threads 1 \ 18 | --evaluate_times 3 \ 19 | --test_set ${test_set} \ 20 | # --overwrite -------------------------------------------------------------------------------- /scripts_eval/qwen2/run_preference.sh: -------------------------------------------------------------------------------- 1 | cd toolbench/tooleval 2 | export CONVERTED_ANSWER_PATH=../../data_eval/model_predictions_converted 3 | export SAVE_PATH=../../data_eval/preference_results 4 | export PASS_RATE_PATH=../../data_eval/pass_rate_results 5 | 6 | export REFERENCE_MODEL=virtual_gpt3.5-0125_dfs # change it accordingly 7 | export 
CANDIDATE_MODEL=virtual_qwen2_dfs # change it accordingly 8 | 9 | export EVAL_MODEL=gpt-4-turbo-2024-04-09 10 | mkdir -p ${SAVE_PATH}/${REFERENCE_MODEL}_${CANDIDATE_MODEL} 11 | 12 | export test_set=G1_instruction # G1_category, G1_tool, G2_category, G2_instruction, G3_instruction 13 | 14 | python eval_preference.py \ 15 | --converted_answer_path ${CONVERTED_ANSWER_PATH} \ 16 | --reference_model ${REFERENCE_MODEL} \ 17 | --output_model ${CANDIDATE_MODEL} \ 18 | --test_ids ../../solvable_queries/test_query_ids/ \ 19 | --save_path ${SAVE_PATH}/${REFERENCE_MODEL}_${CANDIDATE_MODEL} \ 20 | --pass_rate_result_path ${PASS_RATE_PATH} \ 21 | --max_eval_threads 30 \ 22 | --evaluate_times 3 \ 23 | --test_set ${test_set} \ 24 | # --overwrite -------------------------------------------------------------------------------- /scripts_eval/steptool/inference_steptool_vllm.sh: -------------------------------------------------------------------------------- 1 | cd stabletoolbench 2 | export PYTHONPATH=./ 3 | export VLLM_API_BASE="http://127.0.0.1:8084/v1/" # the address of vllm.server 4 | export SERVICE_URL="http://localhost:8081/virtual" # the address of api server 5 | export MODEL_PATH="steptool" # the name of vllm.server 6 | export STRATEGY="DFS_woFilter_w2" # or CoT@1 7 | 8 | export OUTPUT_DIR="data_eval/answer/steptool_cot" # change it accordingly 9 | 10 | group=G1_instruction # G1_category, G1_tool, G2_category, G2_instruction, G3_instruction 11 | mkdir -p $OUTPUT_DIR; mkdir -p $OUTPUT_DIR/$group 12 | python toolbench/inference/qa_pipeline_multithread.py \ 13 | --backbone_model ToolLLaMA_vllm \ 14 | --model_path ${MODEL_PATH} \ 15 | --max_observation_length 1024 \ 16 | --method ${STRATEGY} \ 17 | --input_query_file solvable_queries/test_instruction/${group}.json \ 18 | --output_answer_file $OUTPUT_DIR/$group \ 19 | --max_query_count 30 \ 20 | --num_thread 4 -------------------------------------------------------------------------------- /scripts_eval/steptool/run_convert_answer.sh: -------------------------------------------------------------------------------- 1 | cd stabletoolbench/toolbench/tooleval 2 | export RAW_ANSWER_PATH=../../../data_eval/answer 3 | export CONVERTED_ANSWER_PATH=../../../data_eval/model_predictions_converted 4 | export MODEL_NAME=steptool_dfs # change it accordingly 5 | export STRATEGY="DFS_woFilter_w2" # or CoT@1 6 | export test_set=G1_instruction # G1_category, G1_tool, G2_category, G2_instruction, G3_instruction 7 | 8 | mkdir -p ${CONVERTED_ANSWER_PATH}/${MODEL_NAME} 9 | answer_dir=${RAW_ANSWER_PATH}/${MODEL_NAME}/${test_set} 10 | output_file=${CONVERTED_ANSWER_PATH}/${MODEL_NAME}/${test_set}.json 11 | 12 | python convert_to_answer_format.py\ 13 | --answer_dir ${answer_dir} \ 14 | --method ${STRATEGY} \ 15 | --output ${output_file} -------------------------------------------------------------------------------- /scripts_eval/steptool/run_pass_rate.sh: -------------------------------------------------------------------------------- 1 | cd stabletoolbench/toolbench/tooleval 2 | export API_POOL_FILE=../../openai_key.json 3 | export CONVERTED_ANSWER_PATH=../../../data_eval/model_predictions_converted 4 | export SAVE_PATH=../../../data_eval/pass_rate_results 5 | mkdir -p ${SAVE_PATH} 6 | export CANDIDATE_MODEL="steptool_cot" # change it accordingly 7 | export EVAL_MODEL=gpt-4-turbo-2024-04-09 8 | mkdir -p ${SAVE_PATH}/${CANDIDATE_MODEL} 9 | 10 | export test_set=G1_instruction # G1_category, G1_tool, G2_category, G2_instruction, G3_instruction 11 | 12 | python eval_pass_rate.py \ 
13 | --converted_answer_path ${CONVERTED_ANSWER_PATH} \ 14 | --save_path ${SAVE_PATH}/${CANDIDATE_MODEL} \ 15 | --reference_model ${CANDIDATE_MODEL} \ 16 | --test_ids ../../solvable_queries/test_query_ids \ 17 | --max_eval_threads 1 \ 18 | --evaluate_times 3 \ 19 | --test_set ${test_set} \ 20 | # --overwrite -------------------------------------------------------------------------------- /scripts_eval/toolllama-sft/inference_toolllama_vllm.sh: -------------------------------------------------------------------------------- 1 | cd stabletoolbench 2 | export PYTHONPATH=./ 3 | export VLLM_API_BASE="http://127.0.0.1:8083/v1/" # the address of vllm.server 4 | export SERVICE_URL="http://127.0.0.1:8081/virtual" # the address of api server 5 | export MODEL_PATH="toolllama" # the name of vllm.server 6 | export STRATEGY="DFS_woFilter_w2" # or CoT@1 7 | 8 | export OUTPUT_DIR="data_eval/answer/toolllama_sft_dfs" # change it accordingly 9 | 10 | group=G1_instruction # G1_category, G1_tool, G2_category, G2_instruction, G3_instruction 11 | mkdir -p $OUTPUT_DIR; mkdir -p $OUTPUT_DIR/$group 12 | python toolbench/inference/qa_pipeline_multithread.py \ 13 | --backbone_model ToolLLaMA_vllm \ 14 | --model_path ${MODEL_PATH} \ 15 | --max_observation_length 1024 \ 16 | --method ${STRATEGY} \ 17 | --input_query_file solvable_queries/test_instruction/${group}.json \ 18 | --output_answer_file $OUTPUT_DIR/$group \ 19 | --max_query_count 30 \ 20 | --num_thread 4 -------------------------------------------------------------------------------- /scripts_eval/toolllama-sft/run_conver_answer.sh: -------------------------------------------------------------------------------- 1 | cd stabletoolbench/toolbench/tooleval 2 | export RAW_ANSWER_PATH=../../../data_eval/answer 3 | export CONVERTED_ANSWER_PATH=../../../data_eval/model_predictions_converted 4 | export MODEL_NAME=toolllama_sft_dfs # change it accordingly 5 | export STRATEGY="DFS_woFilter_w2" # or CoT@1 DFS_woFilter_w2 6 | export test_set=G1_tool # G1_category, G1_tool, G2_category, G2_instruction, G3_instruction 7 | 8 | mkdir -p ${CONVERTED_ANSWER_PATH}/${MODEL_NAME} 9 | answer_dir=${RAW_ANSWER_PATH}/${MODEL_NAME}/${test_set} 10 | output_file=${CONVERTED_ANSWER_PATH}/${MODEL_NAME}/${test_set}.json 11 | 12 | python convert_to_answer_format.py\ 13 | --answer_dir ${answer_dir} \ 14 | --method ${STRATEGY} \ 15 | --output ${output_file} -------------------------------------------------------------------------------- /scripts_eval/toolllama-sft/run_pass_rate.sh: -------------------------------------------------------------------------------- 1 | cd stabletoolbench/toolbench/tooleval 2 | export API_POOL_FILE=../../openai_key.json 3 | export CONVERTED_ANSWER_PATH=../../../data_eval/model_predictions_converted 4 | export SAVE_PATH=../../../data_eval/pass_rate_results 5 | mkdir -p ${SAVE_PATH} 6 | export CANDIDATE_MODEL="toolllama_sft_dfs" # change it accordingly 7 | export EVAL_MODEL=gpt-4-turbo-2024-04-09 8 | mkdir -p ${SAVE_PATH}/${CANDIDATE_MODEL} 9 | 10 | export test_set=G1_instruction # G1_category, G1_tool, G2_category, G2_instruction, G3_instruction 11 | 12 | python eval_pass_rate.py \ 13 | --converted_answer_path ${CONVERTED_ANSWER_PATH} \ 14 | --save_path ${SAVE_PATH}/${CANDIDATE_MODEL} \ 15 | --reference_model ${CANDIDATE_MODEL} \ 16 | --test_ids ../../solvable_queries/test_query_ids \ 17 | --max_eval_threads 15 \ 18 | --evaluate_times 3 \ 19 | --test_set ${test_set} \ 20 | # --overwrite 
--------------------------------------------------------------------------------
/scripts_eval/toolllama/run_preference.sh:
--------------------------------------------------------------------------------
1 | cd toolbench/tooleval
2 | export CONVERTED_ANSWER_PATH=../../data_eval/model_predictions_converted
3 | export SAVE_PATH=../../data_eval/preference_results
4 | export PASS_RATE_PATH=../../data_eval/pass_rate_results
5 | 
6 | export REFERENCE_MODEL=virtual_gpt3.5-0125_dfs # change it accordingly
7 | export CANDIDATE_MODEL=virtual_toolllama_dfs # change it accordingly
8 | 
9 | export EVAL_MODEL=gpt-4-turbo-2024-04-09
10 | mkdir -p ${SAVE_PATH}/${REFERENCE_MODEL}_${CANDIDATE_MODEL}
11 | 
12 | export test_set=G1_instruction # G1_category, G1_tool, G2_category, G2_instruction, G3_instruction
13 | 
14 | python eval_preference.py \
15 |     --converted_answer_path ${CONVERTED_ANSWER_PATH} \
16 |     --reference_model ${REFERENCE_MODEL} \
17 |     --output_model ${CANDIDATE_MODEL} \
18 |     --test_ids ../../solvable_queries/test_query_ids/ \
19 |     --save_path ${SAVE_PATH}/${REFERENCE_MODEL}_${CANDIDATE_MODEL} \
20 |     --pass_rate_result_path ${PASS_RATE_PATH} \
21 |     --max_eval_threads 30 \
22 |     --evaluate_times 3 \
23 |     --test_set ${test_set} \
24 |     # --overwrite
--------------------------------------------------------------------------------
/src/baseline-archer/archer_critic.py:
--------------------------------------------------------------------------------
1 | # Ref: https://github.com/YifeiZhou02/ArCHer
2 | 
3 | # @misc{zhou2024archer,
4 | #       title={ArCHer: Training Language Model Agents via Hierarchical Multi-Turn RL},
5 | #       author={Yifei Zhou and Andrea Zanette and Jiayi Pan and Sergey Levine and Aviral Kumar},
6 | #       year={2024},
7 | #       eprint={2402.19446},
8 | #       archivePrefix={arXiv},
9 | #       primaryClass={cs.LG}
10 | # }
11 | 
12 | import torch
13 | from transformers import AutoTokenizer, AutoModel
14 | import torch.nn as nn
15 | import numpy as np
16 | from transformers import RobertaTokenizer, RobertaModel
17 | class DoubleCritic(torch.nn.Module):
18 |     def __init__(self, device, accelerator, critic_lm, cache_dir, in_dim, out_dim):
19 |         super(DoubleCritic, self).__init__()
20 |         self.device = device
21 |         self.accelerator = accelerator
22 |         self.base_lm = AutoModel.from_pretrained(critic_lm, cache_dir=cache_dir).to(device)
23 |         self.base_tokenizer = AutoTokenizer.from_pretrained(critic_lm, cache_dir=cache_dir)
24 |         self.base_tokenizer.truncation_side = 'left'
25 |         self.critic1 = nn.Sequential(nn.Linear(in_dim*2, in_dim),\
26 |                                      nn.ReLU(),\
27 |                                      nn.Linear(in_dim, in_dim),\
28 |                                      nn.ReLU(),\
29 |                                      nn.Linear(in_dim, out_dim)).to(device)
30 |         self.critic2 = nn.Sequential(nn.Linear(in_dim*2, in_dim),\
31 |                                      nn.ReLU(),\
32 |                                      nn.Linear(in_dim, in_dim),\
33 |                                      nn.ReLU(),\
34 |                                      nn.Linear(in_dim, out_dim)).to(device)
35 |         self.v_critic1 = nn.Sequential(nn.Linear(in_dim, in_dim),\
36 |                                        nn.ReLU(),\
37 |                                        nn.Linear(in_dim, in_dim),\
38 |                                        nn.ReLU(),\
39 |                                        nn.Linear(in_dim, out_dim)).to(device)
40 |         self.v_critic2 = nn.Sequential(nn.Linear(in_dim, in_dim),\
41 |                                        nn.ReLU(),\
42 |                                        nn.Linear(in_dim, in_dim),\
43 |                                        nn.ReLU(),\
44 |                                        nn.Linear(in_dim, out_dim)).to(device)
45 | 
46 |     # def prepare(self):
47 |     #     self.base_lm, self.critic1, self.critic2, self.v_critic1, self.v_critic2 = \
48 |     #         self.accelerator.prepare(self.base_lm, self.critic1, self.critic2, self.v_critic1, self.v_critic2)
49 | 
50 |     def forward(self, observation, action, detach_model=False):
51 |         state_actions = [o + a for o,a in zip(observation, action)]
52 |         obs_ids = self.base_tokenizer(observation, padding = True, return_tensors='pt', max_length=512, truncation = True).to(self.device)
53 |         # breakpoint()
54 |         if detach_model:
55 |             with torch.no_grad():
56 |                 lm_states = self.base_lm(**obs_ids).pooler_output
57 |         else:
58 |             lm_states = self.base_lm(**obs_ids).pooler_output
59 |         action_ids = self.base_tokenizer(action, padding = True, return_tensors='pt', max_length=512, truncation = True).to(self.device)
60 |         # breakpoint()
61 |         if detach_model:
62 |             with torch.no_grad():
63 |                 action_states = self.base_lm(**action_ids).pooler_output
64 |         else:
65 |             action_states = self.base_lm(**action_ids).pooler_output
66 |         q_states = torch.cat([lm_states, action_states], dim = 1)
67 |         # print(action.size())
68 |         return self.critic1(q_states), self.critic2(q_states), self.v_critic1(lm_states), self.v_critic2(lm_states)
--------------------------------------------------------------------------------
/src/baseline-archer/archer_data.py:
--------------------------------------------------------------------------------
1 | # Ref: https://github.com/YifeiZhou02/ArCHer
2 | 
3 | # @misc{zhou2024archer,
4 | #       title={ArCHer: Training Language Model Agents via Hierarchical Multi-Turn RL},
5 | #       author={Yifei Zhou and Andrea Zanette and Jiayi Pan and Sergey Levine and Aviral Kumar},
6 | #       year={2024},
7 | #       eprint={2402.19446},
8 | #       archivePrefix={arXiv},
9 | #       primaryClass={cs.LG}
10 | # }
11 | 
12 | from torch.utils.data import Dataset, DataLoader
13 | import numpy as np
14 | class DummyDataset(Dataset):
15 |     def __init__(self, buffer):
16 |         self.buffer = buffer
17 | 
18 |     def __len__(self):
19 |         return len(self.buffer)
20 | 
21 |     def __getitem__(self, idx):
22 |         return self.buffer[idx]
23 | 
24 | 
25 | class ReplayBuffer:
26 |     def __init__(self, batch_size=2, capacity=10000):
27 |         self.max_size = capacity
28 |         self.size = 0
29 |         self.observations = None
30 |         self.rewards = None
31 |         self.next_observations = None
32 |         self.dones = None
33 |         self.batch_size = batch_size
34 |         self.actions = None
35 |         self.mc_returns = None
36 | 
37 |     def sample(self, batch_size=None):
38 |         if batch_size is None:
39 |             batch_size = self.batch_size
40 |         rand_indices = np.random.randint(0, self.size, size=(batch_size,)) % self.max_size
41 |         return {
42 |             "observation": self.observations[rand_indices],
43 |             "action": self.actions[rand_indices],
44 |             "reward": self.rewards[rand_indices],
45 |             "next_observation": self.next_observations[rand_indices],
46 |             "done": self.dones[rand_indices],
47 |             "mc_return": self.mc_returns[rand_indices],
48 |         }
49 | 
50 |     def __len__(self):
51 |         return self.size
52 | 
53 |     def insert(
54 |         self,
55 |         /,
56 |         observation,
57 |         action,
58 |         reward: np.ndarray,
59 |         next_observation,
60 |         done: np.ndarray,
61 |         mc_return,
62 |         **kwargs
63 |     ):
64 |         """
65 |         Insert a single transition into the replay buffer.
66 | 
67 |         Use like:
68 |             replay_buffer.insert(
69 |                 observation=observation,
70 |                 action=action,
71 |                 reward=reward,
72 |                 next_observation=next_observation,
73 |                 done=done,
74 |             )
75 |         """
76 |         if isinstance(reward, (float, int)):
77 |             reward = np.array(reward)
78 |         if isinstance(mc_return, (float, int)):
79 |             mc_return = np.array(mc_return)
80 |         if isinstance(done, bool):
81 |             done = np.array(done)
82 |         # print(next_observation)
83 |         # if isinstance(prompt_actionaction, int):
84 |         #     action = np.array(action, dtype=np.int64)
85 | 
86 |         if self.observations is None:
87 |             self.observations = np.array(['']*self.max_size, dtype = 'object')
88 |             self.actions = np.array(['']*self.max_size, dtype = 'object')
89 |             self.rewards = np.empty((self.max_size, *reward.shape), dtype=reward.dtype)
90 |             self.next_observations = np.array(['']*self.max_size, dtype = 'object')
91 |             self.dones = np.empty((self.max_size, *done.shape), dtype=done.dtype)
92 |             self.mc_returns = np.empty((self.max_size, *mc_return.shape), dtype=mc_return.dtype)
93 | 
94 |         assert reward.shape == ()
95 |         assert done.shape == ()
96 | 
97 |         self.observations[self.size % self.max_size] = observation
98 |         self.actions[self.size % self.max_size] = action
99 |         self.rewards[self.size % self.max_size] = reward
100 |         self.next_observations[self.size % self.max_size] = next_observation
101 |         self.dones[self.size % self.max_size] = done
102 |         self.mc_returns[self.size % self.max_size] = mc_return
103 | 
104 |         self.size += 1
--------------------------------------------------------------------------------
/src/baseline-archer/archer_environment.py:
--------------------------------------------------------------------------------
1 | # Ref: https://github.com/YifeiZhou02/ArCHer
2 | 
3 | # @misc{zhou2024archer,
4 | #       title={ArCHer: Training Language Model Agents via Hierarchical Multi-Turn RL},
5 | #       author={Yifei Zhou and Andrea Zanette and Jiayi Pan and Sergey Levine and Aviral Kumar},
6 | #       year={2024},
7 | #       eprint={2402.19446},
8 | #       archivePrefix={arXiv},
9 | #       primaryClass={cs.LG}
10 | # }
11 | 
12 | from tqdm import tqdm
13 | import numpy as np
14 | 
15 | def add_trajectory_reward(trajectory):
16 |     """
17 |     add trajectory reward to the dict of each interaction
18 |     """
19 |     trajectory_reward = np.sum([d["reward"] for d in trajectory])
20 |     for d in trajectory:
21 |         d.update({"trajectory_reward": trajectory_reward})
22 |     return trajectory
23 | 
24 | def add_mc_return(trajectory, gamma = 0.95):
25 |     """
26 |     add the discounted Monte-Carlo return to the dict of each interaction
27 |     """
28 |     trajectory_rewards = np.array([d["reward"] for d in trajectory]).reshape(1, -1)
29 |     gamma_row = np.cumprod(np.ones((1, trajectory_rewards.shape[1]))*gamma)
30 |     gamma_matrix = np.triu(gamma_row.reshape(1, -1 )/ gamma_row.reshape(-1, 1))
31 |     mc_returns = np.sum(trajectory_rewards*gamma_matrix, axis = 1)
32 |     for d, mc in zip(trajectory, mc_returns):
33 |         d.update({"mc_return": mc})
34 |     return trajectory
35 | 
36 | 
37 | def batch_interact_environment(agent, tokenizer, env, num_trajectories,\
38 |                                post_f = lambda x: x, use_tqdm = True, decode_f = lambda x: x,
39 |                                env_idx = None):
40 |     """
41 |     in a batched way, interact with the environments to get a list of trajectories
42 |     [[{"observation":, "next_observation":, "reward":, "done":},...],...]
43 | post_f: function to add additional attributes to the trajectory 44 | """ 45 | bsize = env.bsize 46 | all_trajectories = [] 47 | for num_t in tqdm(range(num_trajectories//bsize), disable = not use_tqdm): 48 | done = False 49 | trajectories = [[] for _ in range(bsize)] 50 | # obs = reset_to(env, 69) 51 | batch_obs = env.reset(idx=env_idx) 52 | batch_done = [False,]*bsize 53 | steps = 0 54 | while not all(batch_done): 55 | steps += 1 56 | # print(f"Environment stpes {str(steps)}") 57 | action = agent.get_action(batch_obs) 58 | batch_return = env.step(decode_f(action)) 59 | for i,result in zip(range(bsize), batch_return): 60 | if result is None: 61 | continue 62 | next_obs, r, done = result 63 | trajectories[i].append({"observation": batch_obs[i], \ 64 | "next_observation": next_obs, \ 65 | "reward": r, \ 66 | "done": done, \ 67 | "action": action[i]}) 68 | batch_obs[i] = next_obs 69 | batch_done[i] = done 70 | # obs = next_obs 71 | print(trajectories[0][-1]["next_observation"]) 72 | all_trajectories += [post_f(add_mc_return(add_trajectory_reward(trajectory)))\ 73 | for trajectory in trajectories] 74 | # breakpoint() 75 | # trajectories.append(post_f(add_trajectory_reward(trajectory))) 76 | return all_trajectories 77 | -------------------------------------------------------------------------------- /src/baseline-archer/build_archer_data.py: -------------------------------------------------------------------------------- 1 | from archer_data import ReplayBuffer 2 | import pandas as pd 3 | import numpy as np 4 | import os 5 | import torch 6 | import json 7 | 8 | model = "toolllama" 9 | buffer_batch_size = 2 10 | tool_data_file = os.environ.get("DATA_FILE", None) 11 | 12 | # bsize = 4 13 | df = pd.read_csv(tool_data_file, sep="\t") 14 | 15 | 16 | # build origin trajectory 17 | trajectories = [[] for _ in range(len(df))] 18 | 19 | MAX_LEN = 1024 20 | 21 | # TODO 22 | for i in range(0, len(df)): 23 | prompt_list = eval(df.iloc[i]["prompt"]) 24 | response_list = eval(df.iloc[i]["response"]) 25 | reward_list = eval(df.iloc[i]["reward"]) 26 | 27 | obs = prompt_list[0] 28 | next_obs = obs + response_list[0] + prompt_list[1] 29 | done = False 30 | if len(obs) > MAX_LEN: 31 | obs = obs[-MAX_LEN:] 32 | if len(next_obs) > MAX_LEN: 33 | 34 | next_obs = next_obs[-MAX_LEN:] 35 | trajectories[i].append({"observation": obs, \ 36 | "next_observation": next_obs, \ 37 | "reward": reward_list[0], \ 38 | "done": done, \ 39 | "action": response_list[0]}) 40 | for j in range(1, len(response_list)): 41 | obs = next_obs 42 | next_obs = obs + response_list[j] 43 | if j+1 < len(response_list): 44 | next_obs += prompt_list[j+1] 45 | else: 46 | done = True 47 | 48 | if len(obs) > MAX_LEN: 49 | obs = obs[-MAX_LEN:] 50 | if len(next_obs) > MAX_LEN: 51 | next_obs = next_obs[-MAX_LEN:] 52 | trajectories[i].append({"observation": obs, \ 53 | "next_observation": next_obs, \ 54 | "reward": reward_list[j], \ 55 | "done": done, \ 56 | "action": response_list[j]}) 57 | 58 | 59 | def add_trajectory_reward(trajectory): 60 | """ 61 | add trajectory reward to the dict of each interaction 62 | """ 63 | trajectory_reward = np.sum([d["reward"] for d in trajectory]) 64 | for d in trajectory: 65 | d.update({"trajectory_reward": trajectory_reward}) 66 | return trajectory 67 | 68 | def add_mc_return(trajectory, gamma = 0.95): 69 | """ 70 | add trajectory reward to the dict of each interaction 71 | """ 72 | trajectory_rewards = np.array([d["reward"] for d in trajectory]).reshape(1, -1) 73 | gamma_row = np.cumprod(np.ones((1, 
trajectory_rewards.shape[1]))*gamma) 74 | gamma_matrix = np.triu(gamma_row.reshape(1, -1 )/ gamma_row.reshape(-1, 1)) 75 | mc_returns = np.sum(trajectory_rewards*gamma_matrix, axis = 1) 76 | for d, mc in zip(trajectory, mc_returns): 77 | d.update({"mc_return": mc}) 78 | 79 | return trajectory 80 | 81 | all_trajectories = [add_mc_return(add_trajectory_reward(trajectory))\ 82 | for trajectory in trajectories] 83 | 84 | # save to json 85 | trajectory_json = {} 86 | for i in range(len(all_trajectories)): 87 | trajectory_json[i] = all_trajectories[i] 88 | 89 | with open("trajectories.json", "w") as f: 90 | json.dump(trajectory_json, f, indent=4, ensure_ascii=False) 91 | 92 | 93 | # build replay_buffer 94 | replay_buffer= ReplayBuffer(batch_size=buffer_batch_size) 95 | 96 | data = sum(all_trajectories, []) 97 | for t in data: 98 | replay_buffer.insert(**t) 99 | 100 | print(">>> Saving Replay Buffer") 101 | save_path = os.environ.get("SAVE_PATH", "save") 102 | os.makedirs(save_path, exist_ok=True) 103 | torch.save(replay_buffer, os.path.join(save_path, 'replay_buffer.pt')) 104 | torch.save(all_trajectories, os.path.join(save_path, 'trajectories.pt')) 105 | -------------------------------------------------------------------------------- /src/baseline-archer/offpolicy_train_loop.py: -------------------------------------------------------------------------------- 1 | from archer_environment import batch_interact_environment 2 | from archer_data import DummyDataset, ReplayBuffer 3 | import numpy as np 4 | from torch.utils.data import Dataset, DataLoader 5 | from tqdm import tqdm 6 | from archer_trainer import ArcherTrainer 7 | import wandb 8 | import threading 9 | import os 10 | import torch 11 | import time 12 | def offpolicy_train_loop(env,\ 13 | eval_env,\ 14 | agent,\ 15 | tokenizer,\ 16 | accelerator,\ 17 | warmup_iter: int = 20, 18 | rollout_size: int = 50,\ 19 | eval_size: int = 1, 20 | batch_size: int = 2, 21 | capacity: int = 500000, 22 | iterations: int = 10,\ 23 | epochs:int = 3, \ 24 | grad_accum_steps: int = 1,\ 25 | env_idx:int = None,\ 26 | do_sample: bool = False,\ 27 | temperature: float = 2.0,\ 28 | critic_lr: float= 1e-3,\ 29 | lm_lr: float = 1e-5,\ 30 | gamma: float = 0.9, 31 | tau: float = 0.1, 32 | use_wandb: bool = False, 33 | env_load_path: str = '', 34 | actor_epochs: int = 3, 35 | max_grad_norm: float = 0.01, 36 | save_path: str = None, 37 | save_freq: int = 25, 38 | eval_freq: int = 25, 39 | agent_type: str = "archer", 40 | decode_f: callable = lambda x: x, 41 | **kwargs): 42 | if agent_type.lower() == "archer_toolllama": 43 | trainer = ArcherTrainer(agent=agent,\ 44 | accelerator=accelerator,\ 45 | tokenizer=tokenizer,\ 46 | critic_lr = critic_lr,\ 47 | lm_lr = lm_lr,\ 48 | gamma = gamma,\ 49 | tau = tau,\ 50 | epochs = epochs,\ 51 | actor_epochs = actor_epochs, 52 | grad_accum_steps=grad_accum_steps, 53 | max_grad_norm=max_grad_norm) 54 | replay_buffer= ReplayBuffer(batch_size= batch_size, capacity=capacity) 55 | 56 | os.makedirs(save_path, exist_ok=True) 57 | all_trajectories = torch.load(os.path.join(env_load_path, 'trajectories.pt')) 58 | info = {"rollout.mean": np.mean([d[0]["trajectory_reward"] for d in all_trajectories]),\ 59 | "rollout.max": np.max([d[0]["trajectory_reward"] for d in all_trajectories]),\ 60 | "rollout.min": np.min([d[0]["trajectory_reward"] for d in all_trajectories])} 61 | 62 | replay_buffer = torch.load(os.path.join(env_load_path, 'replay_buffer.pt')) 63 | agent.prepare() 64 | #main training loop 65 | print(">>>start iterations") 66 | for i in 
tqdm(range(iterations)): # pre collected in replay_buffer.pt 67 | info = {} 68 | all_trajectories = torch.load(os.path.join(env_load_path, 'trajectories.pt')) 69 | replay_buffer = torch.load(os.path.join(env_load_path, 'replay_buffer.pt')) 70 | print("Training") 71 | if 'filtered' in agent_type.lower(): 72 | filtered_buffer= ReplayBuffer(batch_size= batch_size, capacity=capacity) 73 | episode_rewards = [d[0]["trajectory_reward"] for d in all_trajectories] 74 | cutoff = np.quantile(episode_rewards, 1 - 0.1) 75 | print("Episode Reward Cutoff: ", cutoff) 76 | filtered_trajectories = list(filter(lambda x: x[0]["trajectory_reward"] >= cutoff, all_trajectories)) 77 | data = sum(filtered_trajectories, []) 78 | for d in data: 79 | filtered_buffer.insert(**d) 80 | info.update(trainer.update(filtered_buffer, no_update_actor = (i < warmup_iter))) 81 | else: 82 | # data = list(filter(lambda x: x["reward"] >0, data)) 83 | info.update(trainer.update(replay_buffer, no_update_actor = (i < warmup_iter))) 84 | if use_wandb and accelerator.is_main_process: 85 | wandb.log(info) 86 | if (i+1) % save_freq == 0 and save_path is not None and accelerator.is_main_process: 87 | print("Saving") 88 | trainer.save(os.path.join(save_path, 'trainer.pt'), save_dir=save_path) 89 | torch.save(replay_buffer, os.path.join(save_path, 'replay_buffer.pt')) 90 | # return model -------------------------------------------------------------------------------- /src/baseline-archer/run.py: -------------------------------------------------------------------------------- 1 | # Ref: https://github.com/YifeiZhou02/ArCHer 2 | 3 | # @misc{zhou2024archer, 4 | # title={ArCHer: Training Language Model Agents via Hierarchical Multi-Turn RL}, 5 | # author={Yifei Zhou and Andrea Zanette and Jiayi Pan and Sergey Levine and Aviral Kumar}, 6 | # year={2024}, 7 | # eprint={2402.19446}, 8 | # archivePrefix={arXiv}, 9 | # primaryClass={cs.LG} 10 | # } 11 | 12 | import torch 13 | import transformers 14 | from tqdm import tqdm 15 | from archer_agent import ArcherAgent 16 | from offpolicy_train_loop import offpolicy_train_loop 17 | 18 | import torch.nn as nn 19 | import numpy as np 20 | import wandb 21 | from omegaconf import DictConfig, OmegaConf 22 | import os 23 | import hydra 24 | from accelerate import Accelerator 25 | from datetime import timedelta 26 | from accelerate import DistributedDataParallelKwargs, InitProcessGroupKwargs 27 | transformers.logging.set_verbosity_error() 28 | 29 | CONFIG_NAME = os.environ.get("ARCHER_CONFIG_NAME", None) 30 | @hydra.main(version_base=None, config_path="../../config/archer/", config_name=CONFIG_NAME) 31 | def main(config: "DictConfig"): 32 | print(">>> Configuration file: "+CONFIG_NAME+"<<<") 33 | print(OmegaConf.to_yaml(config)) 34 | try: 35 | from huggingface_hub import login 36 | login(token=config.huggingface_token) 37 | except: 38 | print(">>> Huggingface token not found.") 39 | 40 | accelerator = Accelerator(InitProcessGroupKwargs(timeout=timedelta(18000))) 41 | device = accelerator.device 42 | 43 | decode_f = lambda x:x 44 | # load decision model 45 | if config.agent_type.lower() == "archer_toolllama": 46 | print(">>> Using ArCHer agent with ToolLLAMA") 47 | agent = ArcherAgent(device=device, accelerator=accelerator, 48 | temperature=config.temperature, do_sample=config.do_sample, 49 | policy_lm=config.policy_lm, critic_lm=config.critic_lm, 50 | cache_dir=config.cache_dir, max_new_tokens=config.max_new_tokens, 51 | use_lora=config.use_lora, 52 | eos_str=config.eos_str) 53 | else: 54 | raise 
NotImplementedError("Agent not implemented.") 55 | tokenizer = agent.tokenizer 56 | if config.checkpoint_path is not None: 57 | state_dict = torch.load(config.checkpoint_path, map_location=device)['model_state_dict'] 58 | agent.model.load_state_dict(state_dict) 59 | 60 | if config.use_wandb and accelerator.is_main_process: 61 | wandb.login(key=config.wandb_key) 62 | wandb.init(project=config.project_name, name=config.run_name, config=dict(config)) 63 | 64 | offpolicy_train_loop(env = None, 65 | agent = agent, 66 | tokenizer = tokenizer, 67 | eval_env = None, 68 | accelerator = accelerator, 69 | decode_f=decode_f, 70 | **config) 71 | 72 | 73 | if __name__ == "__main__": 74 | main() 75 | -------------------------------------------------------------------------------- /src/baseline-eto/dpo_train.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import transformers 3 | from peft import LoraConfig, get_peft_model 4 | from dataclasses import dataclass, field 5 | from typing import Optional 6 | 7 | from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, AutoConfig 8 | from datasets import load_dataset 9 | from transformers.integrations import deepspeed 10 | from trl import ( 11 | DPOTrainer, 12 | DPOConfig 13 | ) 14 | 15 | @dataclass 16 | class ModelArguments: 17 | model_name_or_path: Optional[str] = field(default="facebook/opt-125m") 18 | 19 | @dataclass 20 | class DataArguments: 21 | data_path: str = field( 22 | default=None, metadata={"help": "Path to the training data."} 23 | ) 24 | 25 | @dataclass 26 | class TrainingArguments(DPOConfig): 27 | beta: float = field(default=0.2, metadata={"help": "The beta factor in DPO loss. Higher beta means less divergence from the initial policy. For the IPO loss, beta is the regularization parameter denoted by tau in the paper."}) 28 | model_max_length: int = field( 29 | default=8192, 30 | metadata={ 31 | "help": "Expanded maximum sequence length. Sequences will be right padded (and possibly truncated)." 32 | }, 33 | ) 34 | 35 | @dataclass 36 | class LoraArguments: 37 | lora_r: int = 16 38 | lora_alpha: int = 16 39 | lora_dropout: float = 0.05 40 | lora_bias: str = "none" 41 | 42 | class DPOTrain(): 43 | 44 | def __init__(self): 45 | pass 46 | 47 | def print_trainable_parameters(self, model): 48 | """ 49 | Prints the number of trainable parameters in the model. 
50 | """ 51 | trainable_params = 0 52 | all_param = 0 53 | for _, param in model.named_parameters(): 54 | all_param += param.numel() 55 | if param.requires_grad: 56 | trainable_params += param.numel() 57 | print( 58 | f"trainable params: {trainable_params} || all params: {all_param} || trainables%: {100 * trainable_params / all_param}" 59 | ) 60 | 61 | def run(self): 62 | global local_rank 63 | 64 | parser = transformers.HfArgumentParser( 65 | (ModelArguments, DataArguments, TrainingArguments, LoraArguments) 66 | ) 67 | model_args, data_args, training_args, lora_args = parser.parse_args_into_dataclasses() 68 | 69 | device_map = "auto" 70 | 71 | self.tokenizer = AutoTokenizer.from_pretrained( 72 | model_args.model_name_or_path, 73 | model_max_length=training_args.model_max_length, 74 | padding_side="right", 75 | use_fast=False, 76 | ) 77 | self.tokenizer.pad_token = self.tokenizer.unk_token 78 | 79 | # train_dataset = self.get_dpo_dataset(self.data_file) 80 | dataset = load_dataset('csv', data_files=data_args.data_path, delimiter='\t') 81 | print(dataset.keys()) 82 | train_val = dataset["train"].train_test_split( 83 | test_size=0.02, shuffle=True, seed=2024 84 | ) 85 | train_dataset = train_val["train"] 86 | val_dataset = train_val["test"] 87 | 88 | # Set RoPE scaling factor 89 | model_config = AutoConfig.from_pretrained( 90 | model_args.model_name_or_path, 91 | rope_scaling = { 92 | "factor": 2.0, 93 | "type": "linear" 94 | }, 95 | use_cache = False 96 | ) 97 | model_load_kwargs = { 98 | 'low_cpu_mem_usage': not deepspeed.is_deepspeed_zero3_enabled(), 99 | } 100 | model = AutoModelForCausalLM.from_pretrained( 101 | model_args.model_name_or_path, 102 | config = model_config, 103 | device_map=device_map, 104 | trust_remote_code=True, 105 | torch_dtype=torch.bfloat16, 106 | **model_load_kwargs 107 | ) 108 | 109 | lora_config = LoraConfig( 110 | r=lora_args.lora_r, 111 | lora_alpha=lora_args.lora_alpha, 112 | bias=lora_args.lora_bias, 113 | task_type="CAUSAL_LM", 114 | ) 115 | model = get_peft_model(model, lora_config) 116 | self.print_trainable_parameters(model) 117 | 118 | dpo_trainer = DPOTrainer( 119 | model=model, 120 | ref_model=None, 121 | args=training_args, 122 | train_dataset=train_dataset, 123 | eval_dataset=val_dataset, 124 | tokenizer=self.tokenizer, 125 | ) 126 | dpo_trainer.train() 127 | dpo_trainer.save_model() 128 | 129 | 130 | if __name__ == "__main__": 131 | DPOTrain_ = DPOTrain() 132 | DPOTrain_.run() -------------------------------------------------------------------------------- /src/baseline-ppo/ppo.py: -------------------------------------------------------------------------------- 1 | # PPO (Final Reward) 2 | 3 | import json 4 | import time 5 | from tqdm import tqdm 6 | import os 7 | import torch 8 | from peft import LoraConfig 9 | 10 | from argparse import ArgumentParser 11 | from transformers import AutoTokenizer 12 | from accelerate import Accelerator 13 | from datasets import load_dataset 14 | 15 | from trl import ( 16 | PPOTrainer, 17 | PPOConfig, 18 | AutoModelForCausalLMWithValueHead, 19 | ) 20 | 21 | import wandb 22 | import numpy as np 23 | import random 24 | 25 | def set_seed(seed): 26 | random.seed(seed) 27 | os.environ['PYTHONHASHSEED'] = str(seed) 28 | np.random.seed(seed) 29 | torch.manual_seed(seed) 30 | torch.cuda.manual_seed(seed) 31 | torch.cuda.manual_seed_all(seed) # if you are using multi-GPU. 
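# Expected layout of the training data file (inferred from formatting_func and
# run below; not documented elsewhere): a tab-separated file with the columns
#   prompt   - full conversation context fed to the policy model
#   response - the generated action to be scored
#   reward   - a stringified Python list of rewards; only its last element is
#              used here, i.e. this baseline optimizes the final trajectory
#              reward, matching the "PPO (Final Reward)" header above.
# set_seed above fixes the Python, NumPy and PyTorch RNGs; for fully
# deterministic CUDA kernels one could additionally set
# torch.backends.cudnn.deterministic = True (not done in this script).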
32 | 33 | 34 | class PPOTrain(): 35 | @staticmethod 36 | def parse_args(): 37 | parser = ArgumentParser() 38 | parser.add_argument('--config_path', default="config/dpo-test.json", type=str, required=True, help='Path to the config file') 39 | parser.add_argument('--model_path', default="ToolBench/ToolLLaMA-2-7b-v2", type=str, help='Path to the model') 40 | parser.add_argument('--data_file', required=True, type=str, help='Path to the data file') 41 | parser.add_argument('--model_type', default="ToolLlama", type=str, help='Type of the model') 42 | parser.add_argument('--epochs', default=3, type=int, help='Number of epochs to train') 43 | parser.add_argument('--max_length', default=1024, type=int, help='Max length of the input') 44 | parser.add_argument('--max_context_len', default=4096, type=int, help='Max context length') 45 | parser.add_argument('--max_response_len', default=1200, type=int, help='Max response length') 46 | return parser.parse_args() 47 | 48 | def __init__(self, args): 49 | self.config_path = args.config_path 50 | self.model_path = args.model_path 51 | self.data_file = args.data_file 52 | self.max_length = args.max_length 53 | self.epochs = args.epochs 54 | self.max_length = args.max_length 55 | self.max_context_len = args.max_context_len 56 | self.max_response_len = args.max_response_len 57 | wandb_project = "baseline-PPO" 58 | wandb_run_name = f"{args.model_type}" 59 | wandb.init(project=wandb_project, name=wandb_run_name) 60 | 61 | 62 | def print_trainable_parameters(self, model): 63 | """ 64 | Prints the number of trainable parameters in the model. 65 | """ 66 | trainable_params = 0 67 | all_param = 0 68 | for _, param in model.named_parameters(): 69 | all_param += param.numel() 70 | if param.requires_grad: 71 | trainable_params += param.numel() 72 | print( 73 | f"trainable params: {trainable_params} || all params: {all_param} || trainables%: {100 * trainable_params / all_param}" 74 | ) 75 | 76 | def formatting_func(self, examples): 77 | input_text = examples["prompt"] 78 | examples["query"] = self.tokenizer.encode(input_text, return_tensors='pt').squeeze(0) 79 | 80 | max_context_len = 4096 81 | max_response_len = 1200 82 | while len(examples["query"]) > max_context_len: 83 | examples["query"] = examples["query"][-max_context_len:] 84 | 85 | 86 | examples['response'] = self.tokenizer.encode(examples["response"], return_tensors='pt').squeeze(0) 87 | if len(examples['response']) > max_response_len: 88 | examples['response'] = examples['response'][:self.max_response_len] 89 | examples["label"] = torch.tensor(eval(examples["reward"])[-1], dtype=torch.float16) 90 | return examples 91 | 92 | def train(self, epochs: int = 1): 93 | base_dir = os.path.join('ckpts/', f'baseline-ppo_'+str(int(time.time()))) 94 | 95 | batch_steps = 0 96 | for epoch in range(epochs): 97 | print(f"==========================Epoch {epoch}==========================") 98 | 99 | for batch_id, batch in tqdm(enumerate(self.ppo_trainer.dataloader)): 100 | batch_steps += 1 101 | query_tensors, response_tensors = batch['query'], batch['response'] 102 | rewards = batch['label'] 103 | stats = self.ppo_trainer.step(query_tensors, response_tensors, rewards) 104 | self.ppo_trainer.log_stats(stats, batch, rewards, columns_to_log=[]) 105 | 106 | if batch_steps % 100 == 0: 107 | os.makedirs(base_dir, exist_ok=True) 108 | self.ppo_trainer.save_pretrained(os.path.join(base_dir, f'batch-{batch_steps}')) 109 | os.makedirs(base_dir, exist_ok=True) 110 | self.ppo_trainer.save_pretrained(os.path.join(base_dir, 
f'epoch-{epoch}')) 111 | 112 | 113 | def run(self): 114 | set_seed(2024) 115 | 116 | with open(self.config_path, 'r') as config_f: 117 | config = json.load(config_f) 118 | 119 | self.tokenizer = AutoTokenizer.from_pretrained(self.model_path, 120 | device_map= {"": Accelerator().process_index}) 121 | dataset = load_dataset('csv', data_files=self.data_file, delimiter='\t') 122 | 123 | peft_kwargs = config.get('peft_kwargs', {}) 124 | peft_config = LoraConfig(**peft_kwargs) 125 | 126 | formatted_dataset = dataset.map(self.formatting_func, batched=False, load_from_cache_file=False) 127 | formatted_dataset.set_format(type="torch") 128 | train_dataset = formatted_dataset["train"] 129 | 130 | ppo_kwargs = config.get('ppo_kwargs', {}) 131 | ppo_config = PPOConfig(**ppo_kwargs) 132 | 133 | model = AutoModelForCausalLMWithValueHead.from_pretrained( 134 | self.model_path, 135 | low_cpu_mem_usage=True, 136 | device_map="auto", 137 | peft_config=peft_config, 138 | torch_dtype=torch.bfloat16, 139 | ) 140 | 141 | self.print_trainable_parameters(model) 142 | 143 | def collator(data): 144 | return dict((key, [d[key] for d in data]) for key in data[0]) 145 | 146 | if self.tokenizer.pad_token is None: 147 | self.tokenizer.pad_token = self.tokenizer.eos_token 148 | model.config.pad_token_id = model.config.eos_token_id 149 | 150 | self.ppo_trainer = PPOTrainer( 151 | config=ppo_config, 152 | model=model, 153 | dataset=train_dataset, 154 | tokenizer=self.tokenizer, 155 | data_collator=collator 156 | ) 157 | 158 | self.train(epochs=args.epochs) 159 | 160 | 161 | if __name__ == "__main__": 162 | args = PPOTrain.parse_args() 163 | PPOTrain = PPOTrain(args) 164 | PPOTrain.run() -------------------------------------------------------------------------------- /src/reward/annotation_with_gpt.py: -------------------------------------------------------------------------------- 1 | from src.reward.evaluators.evaluator import ProcessRewardEvaluator 2 | from stabletoolbench.toolbench.tooleval.evaluators import load_registered_automatic_evaluator 3 | import os 4 | import json 5 | import random 6 | from concurrent.futures import ThreadPoolExecutor,as_completed 7 | import argparse 8 | from tqdm import tqdm 9 | from stabletoolbench.toolbench.tooleval.utils import get_steps 10 | import backoff 11 | 12 | abs_dir = os.path.split(__file__)[0] 13 | 14 | def parse_args(): 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument('--converted_answer_path', type=str, default="", required=True, help='converted answer path') 17 | parser.add_argument('--save_path', type=str, default="", required=False, help='result save path') 18 | parser.add_argument('--reference_model', type=str, default="", required=False, help='model predictions path') 19 | parser.add_argument('--reference_path', type=str, default=None, required=False, help='reference path') 20 | parser.add_argument('--task_num', type=int, default=None, required=False, help='task num') 21 | parser.add_argument('--evaluator', type=str, default="gpt-4-turbo-2024-04-09", required=False, help='which evaluator to use.') 22 | parser.add_argument('--max_eval_threads', type=int, default=30, required=False, help='max threads nums') 23 | parser.add_argument('--evaluate_times', type=int, default=4, required=False, help='how many times to predict with the evaluator for each solution path.') 24 | parser.add_argument('--test_set', nargs='+', default=['G1_instruction'], help='test set name') 25 | parser.add_argument('--overwrite', action='store_true', help='whether to overwrite the existing 
result file') 26 | return parser.parse_args() 27 | 28 | if __name__ == "__main__": 29 | args = parse_args() 30 | evaluators = [load_registered_automatic_evaluator(evaluator_name=args.evaluator, evaluators_cfg_path=os.path.join(abs_dir,'evaluators')) for _ in range(args.max_eval_threads)] 31 | @backoff.on_exception(backoff.expo, Exception, max_time=15) 32 | def compute_process_reward(query_id, example, evaluate_time): 33 | global evaluators 34 | evaluator = random.choice(evaluators) 35 | answer_steps, answer_steps_list, final_step = get_steps(example) 36 | succeed_tool_calling_list, contributions, answer_status = evaluator.evaluate_process_reward( 37 | { 38 | 'query':example['query'], 39 | 'available_tools':example['available_tools'], 40 | }, 41 | answer_steps_list[:-1], 42 | example['answer'], 43 | ) 44 | process_reward = { 45 | "succeed_tool_calling": succeed_tool_calling_list, 46 | "contributions": contributions, 47 | } 48 | return query_id, process_reward, answer_status, evaluate_time 49 | 50 | reference_model = args.reference_model 51 | output_list = [] 52 | 53 | for test_set in args.test_set: 54 | 55 | save_file = f"{args.save_path}/{test_set}.json" 56 | if args.task_num: 57 | save_file = f"{args.save_path}/{test_set}_{args.task_num}.json" 58 | 59 | reference_path = f"{args.converted_answer_path}/{test_set}.json" 60 | reference_examples = json.load(open(reference_path, "r")) 61 | if args.task_num: 62 | reference_examples = {k:reference_examples[k] for k in list(reference_examples.keys())[:args.task_num]} 63 | 64 | if os.path.exists(save_file) and not args.overwrite: 65 | old_existed_ids = list(json.load(open(save_file, "r")).keys()) 66 | old_label_cnt = json.load(open(save_file, "r")) 67 | existed_ids = [] 68 | label_cnt = {} 69 | for query_id in old_existed_ids: 70 | ans = old_label_cnt[query_id] 71 | if len(ans['process_reward'].keys()) == args.evaluate_times: 72 | existed_ids.append(query_id) 73 | label_cnt[query_id] = ans 74 | else: 75 | existed_ids = [] 76 | label_cnt = {} 77 | 78 | with ThreadPoolExecutor(args.max_eval_threads) as pool: 79 | future = [] 80 | 81 | for query_id in reference_examples: 82 | if query_id in existed_ids: 83 | continue 84 | for i in range(args.evaluate_times): 85 | example = reference_examples[query_id] 86 | future.append(pool.submit( 87 | compute_process_reward, 88 | query_id, 89 | example, 90 | evaluate_time=i 91 | )) 92 | 93 | for thd in tqdm(as_completed(future),total=len(future),ncols=100): 94 | query_id, process_reward, is_solved, evaluate_time = thd.result() 95 | example = reference_examples[query_id] 96 | query = example["query"] 97 | tool_names = [] 98 | for tool_dict in example["available_tools"]: 99 | tool_name = tool_dict["function"]["name"] 100 | tool_names.append(tool_name) 101 | answer_steps, answer_steps_list, final_step = get_steps(example) 102 | if query_id not in label_cnt: 103 | label_cnt[query_id] = {} 104 | label_cnt[query_id]["query"] = query 105 | label_cnt[query_id]["tool_names"] = tool_names 106 | label_cnt[query_id]["answer_steps"] = answer_steps_list[:-1] 107 | # label_cnt[query_id]["mid_steps_reward"] = mid_steps_reward # parsed 108 | if 'process_reward' not in label_cnt[query_id]: 109 | label_cnt[query_id]["process_reward"] = {} 110 | label_cnt[query_id]["process_reward"][evaluate_time] = process_reward 111 | label_cnt[query_id]["final_step"] = final_step 112 | 113 | if 'is_solved' not in label_cnt[query_id]: 114 | label_cnt[query_id]["is_solved"] = {} 115 | label_cnt[query_id]["is_solved"][evaluate_time] = str(is_solved) 
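            # Each record in label_cnt accumulated here ends up with the shape
            #   { "query": ..., "tool_names": [...], "answer_steps": [...],
            #     "process_reward": { evaluate_time: { "succeed_tool_calling": [0/1 per step],
            #                                          "contributions": [0-5 per step] } },
            #     "final_step": ..., "is_solved": { evaluate_time: "Solved" / "Unsolved" / "Unsure" } }
            # collected over the `evaluate_times` evaluations of the same solution path.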
116 | # print("========== Finish and Dump into json file===========", query_id, is_solved, evaluate_time) 117 | 118 | json.dump(label_cnt, open(save_file, "w"), ensure_ascii=False, indent=4) 119 | 120 | json.dump(label_cnt, open(save_file, "w"), ensure_ascii=False, indent=4) 121 | -------------------------------------------------------------------------------- /src/reward/evaluators/evaluator.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Any 2 | 3 | from stabletoolbench.toolbench.tooleval.evaluators.registered_cls.utils import register_evaluator 4 | from stabletoolbench.toolbench.tooleval.evaluators.registered_cls.rtl import ReinforceToolLearningEvaluator 5 | 6 | from enum import Enum 7 | 8 | class AnswerStatus(Enum): 9 | Unsure = "Unsure" 10 | Unsolved = "Unsolved" 11 | Solved = "Solved" 12 | 13 | @register_evaluator 14 | class ProcessRewardEvaluator(ReinforceToolLearningEvaluator): 15 | def evaluate_process_reward(self, 16 | task_description:Dict, 17 | mid_steps, 18 | answer:Dict[Any,Any]): 19 | ret = self.function_call( 20 | 'evaluate_process_reward', 21 | { 22 | 'query': task_description['query'], 23 | 'mid_steps': mid_steps, 24 | 'final_answer':answer['final_answer'], 25 | } 26 | ) 27 | answer_status = AnswerStatus(ret['final_answer_status']) 28 | return ret['succeed_tool_calling'], ret['contribution_to_final_answer'], answer_status 29 | -------------------------------------------------------------------------------- /src/reward/evaluators/gpt-4-turbo-2024-04-09/config.yaml: -------------------------------------------------------------------------------- 1 | evaluator_name: "gpt-4-turbo-2024-04-09" 2 | registered_cls_name: "ProcessRewardEvaluator" 3 | prompt_template: "template.txt" 4 | fn_completions: "normalized_openai_completions" 5 | apis_json: "your/path/to/api_pool.json" 6 | completions_kwargs: 7 | model: "gpt-4-turbo-2024-04-09" 8 | max_tokens: 1000 9 | temperature: 0 10 | timeout: 10 11 | functions: 12 | - name: "evaluate_process_reward" 13 | description: "Evaluate the entire task-solving process, including tool calls, the contribution of each intermediate step to the final answer, and the status of the final answer." 14 | parameters: 15 | type: "object" 16 | properties: 17 | succeed_tool_calling: 18 | type: "array" 19 | description: "Provide a binary score (0 or 1) indicating whether **each intermediate step** successfully called the tool." 20 | items: 21 | type: "number" 22 | description: "0 for unsuccessful tool calls, 1 for successful tool calls" 23 | contribution_to_final_answer: 24 | type: "array" 25 | description: "Provide a score (0 to 5) to assess how much **each intermediate step** contributed to the final answer." 26 | items: 27 | type: "number" 28 | description: "0 indicates no contribution, and 5 indicates maximum contribution." 29 | final_answer_status: 30 | type: "string" 31 | enum: ["Unsure", "Unsolved", "Solved"] 32 | description: "Indicate the status of the final answer. Choose from: 'Unsure', 'Unsolved', or 'Solved'." 
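        # Note: succeed_tool_calling and contribution_to_final_answer are expected to
        # contain one score per intermediate step passed in as mid_steps (see template.txt).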
33 | required: ["succeed_tool_calling", "contribution_to_final_answer", "final_answer_status"] 34 | 35 | - name: "check_answer_status" 36 | description: "Parse the json answer with layerd nodes and return the answer_status about the answer" 37 | parameters: 38 | type: "object" 39 | properties: 40 | answer_status: 41 | type: "string" 42 | enum: ["Unsure","Unsolved","Solved"] 43 | required: ["answer_status"] 44 | fn_completion_parser: "index_parser" 45 | batch_size: 1 46 | -------------------------------------------------------------------------------- /src/reward/evaluators/gpt-4-turbo-2024-04-09/template.txt: -------------------------------------------------------------------------------- 1 | 2 | evaluate_process_reward 3 | 4 | Query: 5 | {query} 6 | 7 | Intermediate Steps: 8 | {mid_steps} 9 | 10 | Final Answer: 11 | {final_answer} 12 | 13 | Based on the query, intermediate steps, and final answer, evaluate the entire task-solving process using the following criteria: 14 | 15 | 1. **Successful Tool Calling**: For each intermediate step, indicate whether a tool was successfully called, with a score of 0 (no) or 1 (yes). 16 | 2. **Contribution to Final Answer**: Rate the contribution of each intermediate step to the final answer on a scale of 0 to 5. 17 | 3. **Final Answer Status**: Determine the final answer status as 'Solved', 'Unsure', or 'Unsolved'. 18 | 19 | Please call the `evaluate_process_reward` function to return your evaluation. 20 | 21 | -------------------------------------------------------------------------------- /src/reward/openai_key.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "api_key": "", 4 | "api_base": "" 5 | } 6 | ] -------------------------------------------------------------------------------- /stabletoolbench/config.yml: -------------------------------------------------------------------------------- 1 | api_key: 2 | api_base: 3 | toolbench_key: 4 | tool_root_dir: server/tools -------------------------------------------------------------------------------- /stabletoolbench/server/config.yml: -------------------------------------------------------------------------------- 1 | api_key: 2 | api_base: 3 | model: 4 | temperature: 0 5 | toolbench_url: 6 | rapidapi_key: 7 | tools_folder: "./tools" 8 | cache_folder: "./tool_response_cache" 9 | is_save: true 10 | port: 8081 11 | -------------------------------------------------------------------------------- /stabletoolbench/server/requirements.txt: -------------------------------------------------------------------------------- 1 | fastapi==0.110.0 2 | openai 3 | pydantic==1.10.11 4 | PyYAML==6.0.1 5 | PyYAML==6.0.1 6 | Requests==2.31.0 7 | slowapi==0.1.9 8 | tenacity==8.2.2 9 | uvicorn==0.28.0 10 | -------------------------------------------------------------------------------- /stabletoolbench/server/utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | categories = [ 4 | "Sports", 5 | "Finance", 6 | "Data", 7 | "Entertainment", 8 | "Travel", 9 | "Location", 10 | "Science", 11 | "Food", 12 | "Transportation", 13 | "Music", 14 | "Business", 15 | "Visual Recognition", 16 | "Tools", 17 | "Text Analysis", 18 | "Weather", 19 | "Gaming", 20 | "SMS", 21 | "Events", 22 | "Health and Fitness", 23 | "Payments", 24 | "Financial", 25 | "Translation", 26 | "Storage", 27 | "Logistics", 28 | "Database", 29 | "Search", 30 | "Reward", 31 | "Mapping", 32 | "Artificial%20Intelligence%2FMachine%20Learning", 33 | "Email", 34 
| "News, Media", 35 | "Video, Images", 36 | "eCommerce", 37 | "Medical", 38 | "Devices", 39 | "Business Software", 40 | "Advertising", 41 | "Education", 42 | "Media", 43 | "Social", 44 | "Commerce", 45 | "Communication", 46 | "Other", 47 | "Monitoring", 48 | "Energy", 49 | "Jobs", 50 | "Movies", 51 | "Cryptography", 52 | "Cybersecurity" 53 | ] 54 | 55 | def standardize_category(category): 56 | save_category = category.replace(" ", "_").replace(",", "_").replace("/", "_") 57 | while " " in save_category or "," in save_category: 58 | save_category = save_category.replace(" ", "_").replace(",", "_") 59 | save_category = save_category.replace("__", "_") 60 | return save_category 61 | 62 | def standardize(string): 63 | res = re.compile("[^\\u4e00-\\u9fa5^a-z^A-Z^0-9^_]") 64 | string = res.sub("_", string) 65 | string = re.sub(r"(_)\1+","_", string).lower() 66 | while True: 67 | if len(string) == 0: 68 | return string 69 | if string[0] == "_": 70 | string = string[1:] 71 | else: 72 | break 73 | while True: 74 | if len(string) == 0: 75 | return string 76 | if string[-1] == "_": 77 | string = string[:-1] 78 | else: 79 | break 80 | if string[0].isdigit(): 81 | string = "get_" + string 82 | return string 83 | 84 | def change_name(name): 85 | change_list = ["from", "class", "return", "false", "true", "id", "and"] 86 | if name in change_list: 87 | name = "is_" + name 88 | return name 89 | -------------------------------------------------------------------------------- /stabletoolbench/solvable_queries/test_query_ids/G1_category.json: -------------------------------------------------------------------------------- 1 | { 2 | "28": 0, 3 | "29": 0, 4 | "1198": 0, 5 | "1301": 0, 6 | "4153": 0, 7 | "4155": 0, 8 | "4266": 0, 9 | "4273": 0, 10 | "4286": 0, 11 | "4328": 0, 12 | "4343": 0, 13 | "4366": 0, 14 | "4415": 0, 15 | "4424": 0, 16 | "4428": 0, 17 | "4465": 0, 18 | "4471": 0, 19 | "6504": 0, 20 | "6511": 0, 21 | "6521": 0, 22 | "9604": 0, 23 | "9661": 0, 24 | "9679": 0, 25 | "9708": 0, 26 | "9719": 0, 27 | "12535": 0, 28 | "12642": 0, 29 | "12671": 0, 30 | "12688": 0, 31 | "12744": 0, 32 | "12759": 0, 33 | "12770": 0, 34 | "12788": 0, 35 | "12790": 0, 36 | "12805": 0, 37 | "12819": 0, 38 | "12875": 0, 39 | "12884": 0, 40 | "12954": 0, 41 | "15137": 0, 42 | "18268": 0, 43 | "18286": 0, 44 | "18319": 0, 45 | "18337": 0, 46 | "21257": 0, 47 | "21313": 0, 48 | "21400": 0, 49 | "21447": 0, 50 | "21453": 0, 51 | "21477": 0, 52 | "21518": 0, 53 | "23486": 0, 54 | "25341": 0, 55 | "25344": 0, 56 | "25348": 0, 57 | "26577": 0, 58 | "26613": 0, 59 | "26661": 0, 60 | "26698": 0, 61 | "26701": 0, 62 | "29592": 0, 63 | "29647": 0, 64 | "29653": 0, 65 | "29719": 0, 66 | "29724": 0, 67 | "29746": 0, 68 | "29778": 0, 69 | "29816": 0, 70 | "29824": 0, 71 | "29844": 0, 72 | "29859": 0, 73 | "29917": 0, 74 | "34747": 0, 75 | "34773": 0, 76 | "34774": 0, 77 | "34811": 0, 78 | "37847": 0, 79 | "37876": 0, 80 | "38008": 0, 81 | "38021": 0, 82 | "38028": 0, 83 | "38045": 0, 84 | "38099": 0, 85 | "38125": 0, 86 | "41983": 0, 87 | "43312": 0, 88 | "43364": 0, 89 | "43375": 0, 90 | "46424": 0, 91 | "46455": 0, 92 | "46528": 0, 93 | "46662": 0, 94 | "46676": 0, 95 | "46688": 0, 96 | "46760": 0, 97 | "51809": 0, 98 | "51817": 0, 99 | "54484": 0, 100 | "54640": 0, 101 | "54658": 0, 102 | "54697": 0, 103 | "54801": 0, 104 | "54839": 0, 105 | "54844": 0, 106 | "56809": 0, 107 | "59862": 0, 108 | "59890": 0, 109 | "59905": 0, 110 | "62870": 0, 111 | "62960": 0, 112 | "63066": 0, 113 | "63151": 0, 114 | "65185": 0, 115 | "65190": 0, 
116 | "67087": 0, 117 | "67089": 0, 118 | "68407": 0, 119 | "68448": 0, 120 | "68470": 0, 121 | "68553": 0, 122 | "71583": 0, 123 | "71638": 0, 124 | "71685": 0, 125 | "71692": 0, 126 | "71741": 0, 127 | "71801": 0, 128 | "71823": 0, 129 | "77144": 0, 130 | "77171": 0, 131 | "77200": 0, 132 | "77208": 0, 133 | "77247": 0, 134 | "77261": 0, 135 | "80170": 0, 136 | "80175": 0, 137 | "80248": 0, 138 | "80298": 0, 139 | "80491": 0, 140 | "80500": 0, 141 | "80504": 0, 142 | "80519": 0, 143 | "82434": 0, 144 | "84935": 0, 145 | "84974": 0, 146 | "86069": 0, 147 | "86231": 0, 148 | "86251": 0, 149 | "86288": 0, 150 | "86297": 0, 151 | "86335": 0, 152 | "86488": 0, 153 | "86502": 0, 154 | "86535": 0 155 | } -------------------------------------------------------------------------------- /stabletoolbench/solvable_queries/test_query_ids/G1_instruction.json: -------------------------------------------------------------------------------- 1 | { 2 | "588": 0, 3 | "608": 0, 4 | "1073": 0, 5 | "1572": 0, 6 | "1856": 0, 7 | "2121": 0, 8 | "2144": 0, 9 | "2213": 0, 10 | "2354": 0, 11 | "2399": 0, 12 | "3308": 0, 13 | "3510": 0, 14 | "3723": 0, 15 | "3922": 0, 16 | "4505": 0, 17 | "5116": 0, 18 | "5810": 0, 19 | "5965": 0, 20 | "6618": 0, 21 | "6736": 0, 22 | "6959": 0, 23 | "7043": 0, 24 | "7497": 0, 25 | "7658": 0, 26 | "7989": 0, 27 | "8025": 0, 28 | "9921": 0, 29 | "9984": 0, 30 | "10160": 0, 31 | "10770": 0, 32 | "11653": 0, 33 | "11686": 0, 34 | "12204": 0, 35 | "13095": 0, 36 | "14714": 0, 37 | "15511": 0, 38 | "16196": 0, 39 | "16970": 0, 40 | "17038": 0, 41 | "17223": 0, 42 | "17952": 0, 43 | "20704": 0, 44 | "21596": 0, 45 | "22781": 0, 46 | "22937": 0, 47 | "23163": 0, 48 | "23248": 0, 49 | "23982": 0, 50 | "24146": 0, 51 | "24810": 0, 52 | "25052": 0, 53 | "25658": 0, 54 | "26063": 0, 55 | "26752": 0, 56 | "26892": 0, 57 | "27847": 0, 58 | "28751": 0, 59 | "29059": 0, 60 | "29291": 0, 61 | "29322": 0, 62 | "31117": 0, 63 | "31267": 0, 64 | "31402": 0, 65 | "32001": 0, 66 | "32285": 0, 67 | "32309": 0, 68 | "32617": 0, 69 | "32652": 0, 70 | "32807": 0, 71 | "33112": 0, 72 | "33330": 0, 73 | "33889": 0, 74 | "34266": 0, 75 | "34823": 0, 76 | "35112": 0, 77 | "36068": 0, 78 | "36197": 0, 79 | "36717": 0, 80 | "37421": 0, 81 | "38494": 0, 82 | "40019": 0, 83 | "40054": 0, 84 | "40436": 0, 85 | "40699": 0, 86 | "41389": 0, 87 | "41444": 0, 88 | "41806": 0, 89 | "42351": 0, 90 | "43269": 0, 91 | "43821": 0, 92 | "44482": 0, 93 | "44533": 0, 94 | "44619": 0, 95 | "44774": 0, 96 | "45490": 0, 97 | "45775": 0, 98 | "46403": 0, 99 | "47301": 0, 100 | "47838": 0, 101 | "48059": 0, 102 | "49267": 0, 103 | "49991": 0, 104 | "51043": 0, 105 | "52534": 0, 106 | "52734": 0, 107 | "55223": 0, 108 | "55323": 0, 109 | "55489": 0, 110 | "55721": 0, 111 | "56226": 0, 112 | "56236": 0, 113 | "56666": 0, 114 | "58096": 0, 115 | "58949": 0, 116 | "59266": 0, 117 | "59954": 0, 118 | "60837": 0, 119 | "60936": 0, 120 | "61654": 0, 121 | "62012": 0, 122 | "62757": 0, 123 | "63730": 0, 124 | "63962": 0, 125 | "65637": 0, 126 | "66018": 0, 127 | "67007": 0, 128 | "67522": 0, 129 | "67966": 0, 130 | "68221": 0, 131 | "68327": 0, 132 | "68335": 0, 133 | "69206": 0, 134 | "70610": 0, 135 | "71402": 0, 136 | "72373": 0, 137 | "72659": 0, 138 | "73529": 0, 139 | "73762": 0, 140 | "74322": 0, 141 | "75338": 0, 142 | "75390": 0, 143 | "76554": 0, 144 | "76957": 0, 145 | "77471": 0, 146 | "77514": 0, 147 | "77855": 0, 148 | "78406": 0, 149 | "79053": 0, 150 | "79620": 0, 151 | "80884": 0, 152 | "81195": 0, 153 | "81581": 0, 154 | 
"82314": 0, 155 | "82701": 0, 156 | "83742": 0, 157 | "83819": 0, 158 | "83950": 0, 159 | "84845": 0, 160 | "85152": 0, 161 | "86084": 0, 162 | "86143": 0, 163 | "87632": 0, 164 | "88193": 0 165 | } -------------------------------------------------------------------------------- /stabletoolbench/solvable_queries/test_query_ids/G1_tool.json: -------------------------------------------------------------------------------- 1 | { 2 | "394": 0, 3 | "692": 0, 4 | "1617": 0, 5 | "2412": 0, 6 | "2513": 0, 7 | "2701": 0, 8 | "3007": 0, 9 | "3215": 0, 10 | "3221": 0, 11 | "3287": 0, 12 | "5085": 0, 13 | "6677": 0, 14 | "7474": 0, 15 | "7903": 0, 16 | "7971": 0, 17 | "8129": 0, 18 | "8443": 0, 19 | "8655": 0, 20 | "8722": 0, 21 | "9039": 0, 22 | "9238": 0, 23 | "9792": 0, 24 | "9956": 0, 25 | "10221": 0, 26 | "10277": 0, 27 | "11924": 0, 28 | "13495": 0, 29 | "13497": 0, 30 | "13499": 0, 31 | "13537": 0, 32 | "13826": 0, 33 | "14198": 0, 34 | "15058": 0, 35 | "15335": 0, 36 | "15931": 0, 37 | "16133": 0, 38 | "16700": 0, 39 | "17978": 0, 40 | "18761": 0, 41 | "19662": 0, 42 | "19696": 0, 43 | "20358": 0, 44 | "21785": 0, 45 | "22077": 0, 46 | "22514": 0, 47 | "24777": 0, 48 | "25164": 0, 49 | "25483": 0, 50 | "25687": 0, 51 | "26542": 0, 52 | "26820": 0, 53 | "26961": 0, 54 | "27819": 0, 55 | "28028": 0, 56 | "28229": 0, 57 | "28240": 0, 58 | "28788": 0, 59 | "30660": 0, 60 | "31708": 0, 61 | "32177": 0, 62 | "33971": 0, 63 | "34211": 0, 64 | "34696": 0, 65 | "34946": 0, 66 | "35056": 0, 67 | "35382": 0, 68 | "36378": 0, 69 | "36687": 0, 70 | "37553": 0, 71 | "38414": 0, 72 | "38551": 0, 73 | "39392": 0, 74 | "39393": 0, 75 | "42077": 0, 76 | "42348": 0, 77 | "42934": 0, 78 | "43110": 0, 79 | "43557": 0, 80 | "43585": 0, 81 | "43933": 0, 82 | "44066": 0, 83 | "44793": 0, 84 | "44845": 0, 85 | "45370": 0, 86 | "45371": 0, 87 | "45418": 0, 88 | "45422": 0, 89 | "45533": 0, 90 | "46409": 0, 91 | "46413": 0, 92 | "47032": 0, 93 | "48480": 0, 94 | "48483": 0, 95 | "48950": 0, 96 | "49173": 0, 97 | "49529": 0, 98 | "49531": 0, 99 | "49830": 0, 100 | "50984": 0, 101 | "51600": 0, 102 | "52332": 0, 103 | "53120": 0, 104 | "53924": 0, 105 | "53959": 0, 106 | "54421": 0, 107 | "55589": 0, 108 | "56049": 0, 109 | "56495": 0, 110 | "58412": 0, 111 | "58705": 0, 112 | "58826": 0, 113 | "64662": 0, 114 | "65119": 0, 115 | "65125": 0, 116 | "65425": 0, 117 | "65584": 0, 118 | "65624": 0, 119 | "65673": 0, 120 | "66052": 0, 121 | "66927": 0, 122 | "68228": 0, 123 | "69319": 0, 124 | "69540": 0, 125 | "69717": 0, 126 | "69972": 0, 127 | "69973": 0, 128 | "70158": 0, 129 | "70359": 0, 130 | "70672": 0, 131 | "70835": 0, 132 | "72543": 0, 133 | "73151": 0, 134 | "73587": 0, 135 | "73739": 0, 136 | "74709": 0, 137 | "74989": 0, 138 | "75659": 0, 139 | "76706": 0, 140 | "76740": 0, 141 | "76966": 0, 142 | "77375": 0, 143 | "77908": 0, 144 | "78490": 0, 145 | "78791": 0, 146 | "78994": 0, 147 | "79741": 0, 148 | "81549": 0, 149 | "83931": 0, 150 | "85155": 0, 151 | "85562": 0, 152 | "85582": 0, 153 | "85759": 0, 154 | "86105": 0, 155 | "86735": 0, 156 | "87540": 0, 157 | "87616": 0, 158 | "87714": 0, 159 | "88197": 0 160 | } -------------------------------------------------------------------------------- /stabletoolbench/solvable_queries/test_query_ids/G2_category.json: -------------------------------------------------------------------------------- 1 | { 2 | "43": 0, 3 | "61": 0, 4 | "75": 0, 5 | "83": 0, 6 | "3432": 0, 7 | "3442": 0, 8 | "3456": 0, 9 | "3463": 0, 10 | "3482": 0, 11 | "3494": 0, 12 | "3534": 0, 13 | 
"3558": 0, 14 | "3609": 0, 15 | "3640": 0, 16 | "3645": 0, 17 | "3652": 0, 18 | "3672": 0, 19 | "3786": 0, 20 | "3843": 0, 21 | "3929": 0, 22 | "3942": 0, 23 | "3990": 0, 24 | "4006": 0, 25 | "4031": 0, 26 | "4095": 0, 27 | "4176": 0, 28 | "4271": 0, 29 | "13338": 0, 30 | "13354": 0, 31 | "13384": 0, 32 | "13385": 0, 33 | "13487": 0, 34 | "13517": 0, 35 | "13533": 0, 36 | "13555": 0, 37 | "13559": 0, 38 | "13586": 0, 39 | "13592": 0, 40 | "13639": 0, 41 | "13699": 0, 42 | "13745": 0, 43 | "13778": 0, 44 | "13795": 0, 45 | "13838": 0, 46 | "13951": 0, 47 | "14036": 0, 48 | "14117": 0, 49 | "14161": 0, 50 | "14185": 0, 51 | "14333": 0, 52 | "14384": 0, 53 | "14400": 0, 54 | "14533": 0, 55 | "14595": 0, 56 | "14605": 0, 57 | "14628": 0, 58 | "14732": 0, 59 | "14802": 0, 60 | "29606": 0, 61 | "29701": 0, 62 | "33046": 0, 63 | "33055": 0, 64 | "33156": 0, 65 | "33171": 0, 66 | "33255": 0, 67 | "33263": 0, 68 | "33271": 0, 69 | "33295": 0, 70 | "33431": 0, 71 | "33457": 0, 72 | "33481": 0, 73 | "33632": 0, 74 | "33716": 0, 75 | "42534": 0, 76 | "42547": 0, 77 | "42608": 0, 78 | "42635": 0, 79 | "42649": 0, 80 | "42701": 0, 81 | "42708": 0, 82 | "42729": 0, 83 | "42748": 0, 84 | "42882": 0, 85 | "42885": 0, 86 | "42957": 0, 87 | "43070": 0, 88 | "43076": 0, 89 | "43102": 0, 90 | "43200": 0, 91 | "43201": 0, 92 | "43230": 0, 93 | "43258": 0, 94 | "43316": 0, 95 | "43368": 0, 96 | "43505": 0, 97 | "43612": 0, 98 | "43663": 0, 99 | "43713": 0, 100 | "43724": 0, 101 | "43994": 0, 102 | "44010": 0, 103 | "44040": 0, 104 | "50937": 0, 105 | "62159": 0, 106 | "62261": 0, 107 | "71363": 0, 108 | "71501": 0, 109 | "71675": 0, 110 | "71756": 0, 111 | "71980": 0, 112 | "72000": 0, 113 | "72004": 0, 114 | "72040": 0, 115 | "72118": 0, 116 | "72271": 0, 117 | "72274": 0, 118 | "72357": 0, 119 | "72406": 0, 120 | "72458": 0, 121 | "72585": 0, 122 | "72618": 0, 123 | "72827": 0, 124 | "79652": 0, 125 | "79681": 0 126 | } -------------------------------------------------------------------------------- /stabletoolbench/solvable_queries/test_query_ids/G2_instruction.json: -------------------------------------------------------------------------------- 1 | { 2 | "1643": 0, 3 | "4746": 0, 4 | "5744": 0, 5 | "7257": 0, 6 | "9834": 0, 7 | "9957": 0, 8 | "9959": 0, 9 | "10097": 0, 10 | "10941": 0, 11 | "11627": 0, 12 | "11820": 0, 13 | "12034": 0, 14 | "12142": 0, 15 | "12507": 0, 16 | "12509": 0, 17 | "12634": 0, 18 | "12742": 0, 19 | "12773": 0, 20 | "12894": 0, 21 | "12961": 0, 22 | "12974": 0, 23 | "15067": 0, 24 | "15439": 0, 25 | "15929": 0, 26 | "17233": 0, 27 | "17864": 0, 28 | "19186": 0, 29 | "19850": 0, 30 | "22262": 0, 31 | "24131": 0, 32 | "25866": 0, 33 | "26341": 0, 34 | "26837": 0, 35 | "27543": 0, 36 | "29044": 0, 37 | "29499": 0, 38 | "30246": 0, 39 | "30501": 0, 40 | "34056": 0, 41 | "34437": 0, 42 | "34667": 0, 43 | "34980": 0, 44 | "35139": 0, 45 | "36115": 0, 46 | "37074": 0, 47 | "38666": 0, 48 | "44321": 0, 49 | "45688": 0, 50 | "47748": 0, 51 | "48039": 0, 52 | "48770": 0, 53 | "49308": 0, 54 | "50058": 0, 55 | "50406": 0, 56 | "50656": 0, 57 | "50658": 0, 58 | "51289": 0, 59 | "52115": 0, 60 | "54151": 0, 61 | "54246": 0, 62 | "54739": 0, 63 | "54775": 0, 64 | "54793": 0, 65 | "55251": 0, 66 | "55671": 0, 67 | "56101": 0, 68 | "56133": 0, 69 | "56155": 0, 70 | "56266": 0, 71 | "62997": 0, 72 | "63490": 0, 73 | "65457": 0, 74 | "65468": 0, 75 | "65521": 0, 76 | "65607": 0, 77 | "67514": 0, 78 | "67887": 0, 79 | "67969": 0, 80 | "68308": 0, 81 | "69637": 0, 82 | "70369": 0, 83 | "70435": 0, 84 | 
"70543": 0, 85 | "73783": 0, 86 | "73991": 0, 87 | "75279": 0, 88 | "75958": 0, 89 | "76230": 0, 90 | "76512": 0, 91 | "78631": 0, 92 | "78838": 0, 93 | "79476": 0, 94 | "79633": 0, 95 | "79640": 0, 96 | "79644": 0, 97 | "79645": 0, 98 | "81337": 0, 99 | "83220": 0, 100 | "83236": 0, 101 | "84074": 0, 102 | "84585": 0, 103 | "84593": 0, 104 | "85051": 0, 105 | "85129": 0, 106 | "86555": 0, 107 | "87064": 0 108 | } -------------------------------------------------------------------------------- /stabletoolbench/solvable_queries/test_query_ids/G3_instruction.json: -------------------------------------------------------------------------------- 1 | { 2 | "455": 0, 3 | "456": 0, 4 | "457": 0, 5 | "459": 0, 6 | "460": 0, 7 | "1983": 0, 8 | "1984": 0, 9 | "1985": 0, 10 | "1989": 0, 11 | "1991": 0, 12 | "5863": 0, 13 | "5864": 0, 14 | "5865": 0, 15 | "8031": 0, 16 | "8032": 0, 17 | "8034": 0, 18 | "8334": 0, 19 | "8335": 0, 20 | "8337": 0, 21 | "9341": 0, 22 | "9343": 0, 23 | "9344": 0, 24 | "9345": 0, 25 | "9346": 0, 26 | "9349": 0, 27 | "10898": 0, 28 | "11644": 0, 29 | "11645": 0, 30 | "11647": 0, 31 | "11648": 0, 32 | "11649": 0, 33 | "11650": 0, 34 | "13773": 0, 35 | "13774": 0, 36 | "13777": 0, 37 | "13779": 0, 38 | "13780": 0, 39 | "14485": 0, 40 | "14489": 0, 41 | "14938": 0, 42 | "14950": 0, 43 | "18978": 0, 44 | "18979": 0, 45 | "18980": 0, 46 | "18982": 0, 47 | "18984": 0, 48 | "18987": 0, 49 | "18988": 0, 50 | "18990": 0, 51 | "18992": 0, 52 | "19272": 0, 53 | "19274": 0, 54 | "19281": 0, 55 | "20022": 0, 56 | "20024": 0, 57 | "20026": 0, 58 | "20027": 0, 59 | "20028": 0, 60 | "20029": 0, 61 | "20030": 0, 62 | "21682": 0 63 | } -------------------------------------------------------------------------------- /stabletoolbench/toolbench/inference/Algorithms/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yuyq18/StepTool/121a6acd11a899d947a5d3fa7269cf76624c1df3/stabletoolbench/toolbench/inference/Algorithms/__init__.py -------------------------------------------------------------------------------- /stabletoolbench/toolbench/inference/Algorithms/base_search.py: -------------------------------------------------------------------------------- 1 | from Downstream_tasks.base_env import base_env 2 | 3 | class base_search_method: 4 | """For the base tree search method, you need to support the following functions""" 5 | 6 | def __init__(self,llm,io_func: base_env, process_id=0, callbacks = None): 7 | """Args: 8 | llm: The interface of the LLM 9 | io_func(base_env): Interface to the environment, 10 | process_id (int, optional): In multiprocessing annotation, this describes the process id. Defaults to 0. 11 | callbacks (_type_, optional): _description_. Defaults to None. 12 | """ 13 | pass 14 | 15 | def to_json(self,answer=False,process=True): 16 | ''' 17 | return a json object, 18 | If "answer" = True. must have the following field to make answer annotation 19 | If "process" = True. 
You need provide the full information of the tree searching process 20 | 21 | "answer_generation": { 22 | "valid_data": bool, 23 | "final_answer": string, 24 | "finish_type": enum["give_up","give_answer"] 25 | "train_messages": [ [openAI-message] ], 26 | } 27 | ''' 28 | raise NotImplementedError 29 | 30 | def start(self, **args): 31 | """This is the entry point of the searching process""" 32 | raise NotImplementedError 33 | 34 | -------------------------------------------------------------------------------- /stabletoolbench/toolbench/inference/Downstream_tasks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yuyq18/StepTool/121a6acd11a899d947a5d3fa7269cf76624c1df3/stabletoolbench/toolbench/inference/Downstream_tasks/__init__.py -------------------------------------------------------------------------------- /stabletoolbench/toolbench/inference/Downstream_tasks/base_env.py: -------------------------------------------------------------------------------- 1 | class base_env: 2 | 3 | def __init__(self): 4 | self.task_description = "" 5 | self.input_description = "" 6 | self.tool_names = [] 7 | self.functions = [] 8 | 9 | def restart(self): 10 | ''' 11 | Restrat the environment 12 | ''' 13 | raise NotImplementedError 14 | 15 | def get_score(self): 16 | ''' 17 | Get the value of the current state 18 | A fake function, used to search in oracle mode, which is not actually used (and impossible to obtain) 19 | ''' 20 | raise NotImplementedError 21 | 22 | def step(self, action, input_str): 23 | ''' 24 | Perform an interaction in natural language mode 25 | return value (output str, status code) 26 | ''' 27 | raise NotImplementedError 28 | 29 | def check_success(self): 30 | ''' 31 | Returns 1 if successful, otherwise returns 0 32 | ''' 33 | raise NotImplementedError 34 | 35 | def to_json(self): 36 | raise NotImplementedError -------------------------------------------------------------------------------- /stabletoolbench/toolbench/inference/LLM/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yuyq18/StepTool/121a6acd11a899d947a5d3fa7269cf76624c1df3/stabletoolbench/toolbench/inference/LLM/__init__.py -------------------------------------------------------------------------------- /stabletoolbench/toolbench/inference/LLM/base_io.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | def base_io(input_str): 4 | pass -------------------------------------------------------------------------------- /stabletoolbench/toolbench/inference/LLM/retriever.py: -------------------------------------------------------------------------------- 1 | import time 2 | import pandas as pd 3 | from sentence_transformers import SentenceTransformer, util 4 | import json 5 | import re 6 | import os, torch 7 | from toolbench.utils import standardize, standardize_category, change_name, process_retrieval_ducoment 8 | 9 | 10 | class ToolRetriever: 11 | def __init__(self, corpus_tsv_path = "", model_path=""): 12 | self.corpus_tsv_path = corpus_tsv_path 13 | self.model_path = model_path 14 | self.model_name = model_path.split('/')[-1] 15 | self.corpus, self.corpus2tool = self.build_retrieval_corpus() 16 | self.embedder = self.build_retrieval_embedder() 17 | self.corpus_embeddings = self.build_corpus_embeddings() 18 | 19 | def build_retrieval_corpus(self): 20 | print("Building corpus...") 21 | documents_df = 
pd.read_csv(self.corpus_tsv_path, sep='\t') 22 | corpus, corpus2tool = process_retrieval_ducoment(documents_df) 23 | corpus_ids = list(corpus.keys()) 24 | corpus = [corpus[cid] for cid in corpus_ids] 25 | return corpus, corpus2tool 26 | 27 | def build_retrieval_embedder(self): 28 | print("Building embedder...") 29 | embedder = SentenceTransformer(self.model_path) 30 | return embedder 31 | 32 | def build_corpus_embeddings(self): 33 | print("Building corpus embeddings with embedder...") 34 | embedding_save_path = self.corpus_tsv_path.replace('.tsv', f'_{self.model_name}_embeddings.pt') 35 | if os.path.exists(embedding_save_path): 36 | print("Loading pre-computed corpus embeddings...") 37 | corpus_embeddings = torch.load(embedding_save_path) 38 | return corpus_embeddings 39 | print("Computing corpus embeddings...") 40 | corpus_embeddings = self.embedder.encode(self.corpus, convert_to_tensor=True) 41 | 42 | torch.save(corpus_embeddings, embedding_save_path) 43 | return corpus_embeddings 44 | 45 | def retrieving(self, query, top_k=5, excluded_tools={}): 46 | print("Retrieving...") 47 | start = time.time() 48 | query_embedding = self.embedder.encode(query, convert_to_tensor=True) 49 | hits = util.semantic_search(query_embedding, self.corpus_embeddings, top_k=10*top_k, score_function=util.cos_sim) 50 | retrieved_tools = [] 51 | for rank, hit in enumerate(hits[0]): 52 | # import pdb; pdb.set_trace() 53 | try: 54 | category, tool_name, api_name = self.corpus2tool[self.corpus[hit['corpus_id']]].split('[SEP]') 55 | except: 56 | print(self.corpus2tool[self.corpus[hit['corpus_id']]]) 57 | import pdb; pdb.set_trace() 58 | category = standardize_category(category) 59 | tool_name = standardize(tool_name) # standardizing 60 | api_name = change_name(standardize(api_name)) # standardizing 61 | if category in excluded_tools: 62 | if tool_name in excluded_tools[category]: 63 | top_k += 1 64 | continue 65 | tmp_dict = { 66 | "category": category, 67 | "tool_name": tool_name, 68 | "api_name": api_name 69 | } 70 | retrieved_tools.append(tmp_dict) 71 | return retrieved_tools -------------------------------------------------------------------------------- /stabletoolbench/toolbench/inference/LLM_rank/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yuyq18/StepTool/121a6acd11a899d947a5d3fa7269cf76624c1df3/stabletoolbench/toolbench/inference/LLM_rank/__init__.py -------------------------------------------------------------------------------- /stabletoolbench/toolbench/inference/LLM_rank/rank_candidate.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Evaluate the score of a query corresponding to different candidates 3 | ''' 4 | 5 | from Prompts.rank_prompts import LLM_PAIRWISE_RANK_SUBFIX_SYSTEM_PROMPT, LLM_PAIRWISE_RANK_USER_PROMPT 6 | import random 7 | from Tree.Tree import tree_node 8 | 9 | 10 | def rank2symmetry(llm_interface, LLM_rank_args, cand1,cand2): 11 | ''' 12 | Use llm to compare the height, due to the sequence, you need to compare each of the two in the front 13 | ''' 14 | single_rank_func = LLM_rank_args["rank_func"] 15 | score = [0,0] 16 | bigger1,query_count1, total_tokens1 = single_rank_func(llm_interface, LLM_rank_args, cand1,cand2) 17 | score[1 - bigger1] += 1 18 | bigger2,query_count2, total_tokens2 = single_rank_func(llm_interface, LLM_rank_args, cand2,cand1) 19 | score[bigger2] += 1 20 | if score[0] > score[1]: 21 | return 1 , query_count1 + query_count2, total_tokens1 + 
total_tokens2 22 | elif score[0] < score[1]: 23 | return -1, query_count1 + query_count2, total_tokens1 + total_tokens2 24 | else: 25 | return 0, query_count1 + query_count2, total_tokens1 + total_tokens2 26 | 27 | 28 | 29 | def rank2_subfix(llm_interface,LLM_rank_args, cand1,cand2): 30 | ''' 31 | Assumed that the two candidates have a long common prefix 32 | ''' 33 | anscestor_interesction = tree_node.find_ancestor_intersection(cand1,cand2) 34 | assert anscestor_interesction != None 35 | intersect_trice = anscestor_interesction.get_former_trice_from_this_node(end_node=None) 36 | trice_1 = cand1.get_former_trice_from_this_node(end_node=anscestor_interesction) 37 | trice_2 = cand2.get_former_trice_from_this_node(end_node=anscestor_interesction) 38 | 39 | system_message = LLM_PAIRWISE_RANK_SUBFIX_SYSTEM_PROMPT 40 | system_message = system_message.replace("{task_description}", LLM_rank_args["task_description"]) 41 | system_message = system_message.replace("{intersect_trice}", intersect_trice) 42 | system_message = system_message.replace("{candidate_A}",trice_1) 43 | system_message = system_message.replace("{candidate_B}",trice_2) 44 | llm_interface.change_messages([{"role":"system","content":system_message}, 45 | {"role":"user","content":LLM_PAIRWISE_RANK_USER_PROMPT}, 46 | ]) 47 | output,error_code, total_tokens = llm_interface.parse(functions=LLM_rank_args["functions"],function_call="none",process_id=LLM_rank_args["process_id"]) 48 | if output["content"].strip().lower()[-1] == "a": 49 | return 1, 1, total_tokens 50 | else: 51 | return 0, 1, total_tokens 52 | 53 | def sum_based_rankn(llm_interface,LLM_rank_args, candidates): 54 | ''' 55 | All pairs are sorted pairwise, sum the total points, and choose the best 56 | ''' 57 | total_querys = 0 58 | total_tokens = 0 59 | scores = [0]*len(candidates) 60 | for i in range(len(candidates)-1): 61 | for j in range(i+1,len(candidates)): 62 | pairwise_rank,query_count,rank2_tokens = rank2symmetry(llm_interface,LLM_rank_args, candidates[i],candidates[j]) 63 | total_querys += query_count 64 | total_tokens += rank2_tokens 65 | if pairwise_rank > 0: 66 | scores[i] += 1 67 | elif pairwise_rank < 0: 68 | scores[j] += 1 69 | else: 70 | scores[i] += 0.5 71 | scores[j] += 0.5 72 | return scores, total_querys, total_tokens 73 | 74 | 75 | 76 | if __name__ == "__main__": 77 | random.seed(42) 78 | # candidates = [ 79 | # "234", 80 | # "66.5", 81 | # "77.1", 82 | # "88.967", 83 | # "pi", 84 | # # "e", 85 | # # "ln(2)" 86 | # ] 87 | candidates = [ 88 | "77.1", 89 | "88.967", 90 | "pi", 91 | "66.5", 92 | "234", 93 | "ln(2)" 94 | ] 95 | ''' 96 | starting_delta: 97 | 50 -> 42.85% 98 | 100 -> 35.99% 99 | 150 -> 29.66% 100 | 200 -> 24.03% 101 | ''' 102 | -------------------------------------------------------------------------------- /stabletoolbench/toolbench/inference/Prompts/ReAct_prompts.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | FORMAT_INSTRUCTIONS_SYSTEM_FUNCTION = """You are AutoGPT, you can use many tools(functions) to do the following task. 5 | First I will give you the task description, and your task start. 6 | At each step, you need to give your thought to analyze the status now and what to do next, with a function call to actually excute your step. 7 | After the call, you will get the call result, and you are now in a new state. 8 | Then you will analyze your status now, then decide what to do next... 9 | After many (Thought-call) pairs, you finally perform the task, then you can give your finial answer. 
10 | Remember: 11 | 1.the state change is irreversible, you can't go back to one of the former state, if you want to restart the task, say "I give up and restart". 12 | 2.All the thought is short, at most in 5 sentence. 13 | 3.You can do more then one trys, so if your plan is to continusly try some conditions, you can do one of the conditions per try. 14 | Let's Begin! 15 | Task description: {task_description}""" 16 | 17 | FORMAT_INSTRUCTIONS_USER_FUNCTION = """ 18 | {input_description} 19 | Begin! 20 | """ 21 | 22 | FORMAT_INSTRUCTIONS_SYSTEM_FUNCTION_ZEROSHOT = """Answer the following questions as best you can. Specifically, you have access to the following APIs: 23 | 24 | {func_str} 25 | 26 | Use the following format: 27 | Thought: you should always think about what to do 28 | Action: the action to take, should be one of {func_list} 29 | Action Input: the input to the action 30 | End Action 31 | 32 | Begin! Remember: (1) Follow the format, i.e, 33 | Thought: 34 | Action: 35 | Action Input: 36 | End Action 37 | (2)The Action: MUST be one of the following:{func_list} 38 | (3)If you believe that you have obtained enough information (which can be judge from the history observations) that can answer the task, please call: 39 | Action: Finish 40 | Action Input: {{"return_type": "give_answer", "final_answer": your answer string}}. 41 | Question: {question} 42 | 43 | Here are the history actions and observations: 44 | """ 45 | -------------------------------------------------------------------------------- /stabletoolbench/toolbench/inference/Prompts/Tree_search_prompts.py: -------------------------------------------------------------------------------- 1 | DIVERSITY_PROMPT='''This is not the first time you try this task, all previous trails failed. 2 | Before you generate my thought for this state, I will first show you your previous actions for this state, and then you must generate actions that is different from all of them. Here are some previous actions candidates: 3 | {previous_candidate} 4 | Remember you are now in the intermediate state of a trail, you will first analyze the now state and previous action candidates, then make actions that is different from all the previous.''' 5 | 6 | 7 | -------------------------------------------------------------------------------- /stabletoolbench/toolbench/inference/Prompts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yuyq18/StepTool/121a6acd11a899d947a5d3fa7269cf76624c1df3/stabletoolbench/toolbench/inference/Prompts/__init__.py -------------------------------------------------------------------------------- /stabletoolbench/toolbench/inference/Prompts/rank_prompts.py: -------------------------------------------------------------------------------- 1 | 2 | LLM_PAIRWISE_RANK_SUBFIX_SYSTEM_PROMPT = ''' 3 | You are value-GPT, which is an expert of defining which trail is better, which trail is more close to solving the task. 
4 | All candidate tries to solve this task with some funciton calls: 5 | ******************************* 6 | {{TASK_DESCRIPTION}} 7 | {task_description} 8 | {{END_TASK_DESCRIPTION}} 9 | ******************************* 10 | First, all candidate do the following things: 11 | {intersect_trice} 12 | After that, there are two candidates A and B, they do different things: 13 | ******************************* 14 | {{CANDIDATE_A_START}} 15 | {candidate_A} 16 | {{CANDIDATE_A_END}} 17 | ******************************* 18 | {{CANDIDATE_B_START}} 19 | {candidate_B} 20 | {{CANDIDATE_B_END}} 21 | Which try do you think is more helpful to solving the task? 22 | ''' 23 | 24 | 25 | 26 | 27 | LLM_PAIRWISE_RANK_USER_PROMPT = ''' 28 | Tell me which candidate is better in ONE Word: "A" or "B":''' -------------------------------------------------------------------------------- /stabletoolbench/toolbench/inference/Tree/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yuyq18/StepTool/121a6acd11a899d947a5d3fa7269cf76624c1df3/stabletoolbench/toolbench/inference/Tree/__init__.py -------------------------------------------------------------------------------- /stabletoolbench/toolbench/inference/callbacks/ServerEventCallback.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict, List, Union 2 | import queue 3 | class ServerEventCallback(): 4 | """Base callback handler""" 5 | 6 | def __init__(self, queue: queue.Queue, *args, **kwargs): 7 | super().__init__(*args, **kwargs) 8 | self.queue = queue 9 | self.llm_block_id = 0 10 | self.tool_block_id = 0 11 | self.tool_descriptions = {} 12 | 13 | def add_to_queue(self, method_name: str, block_id, **kwargs: Any): 14 | data = { 15 | "method_name": method_name, 16 | "block_id": block_id, 17 | } 18 | data.update(kwargs) 19 | self.queue.put(data) 20 | 21 | def on_tool_retrieval_start(self): 22 | # tools should be of the form 23 | # {tool_name, tool_desc} 24 | self.add_to_queue( 25 | "on_tool_retrieval_start", 26 | "recommendation-1", 27 | ) 28 | print("on_tool_retrieval_start method called") 29 | 30 | def on_tool_retrieval_end(self, tools): 31 | # tool should be of the form 32 | # {tool_name, tool_desc} 33 | self.add_to_queue( 34 | "on_tool_retrieval_end", 35 | "recommendation-1", 36 | recommendations=tools 37 | ) 38 | self.tool_descriptions = { 39 | tool["name"]: tool for tool in tools 40 | } 41 | print("on_tool_retrieval_end method called") 42 | def on_request_start(self, user_input: str, method: str) -> Any: 43 | self.tool_block_id = 0 44 | self.llm_block_id = 0 45 | self.add_to_queue( 46 | "on_request_start", 47 | block_id="start", 48 | user_input=user_input, 49 | method=method 50 | ) 51 | def on_request_end(self, outputs: str, chain: List[Any]): 52 | self.add_to_queue( 53 | "on_request_end", 54 | block_id="end", 55 | output=outputs, 56 | chain=chain 57 | ) 58 | def on_request_error(self, error: str): 59 | self.add_to_queue( 60 | "on_request_error", 61 | block_id="error", 62 | error=error 63 | ) 64 | 65 | # keep 66 | def on_chain_start(self, inputs: str, depth: int) -> Any: 67 | """Run when chain starts running.""" 68 | print("on_chain_start method called") 69 | self.llm_block_id += 1 70 | block_id = "llm-" + str(self.llm_block_id) 71 | self.add_to_queue( 72 | "on_chain_start", 73 | block_id=block_id, 74 | messages=inputs, 75 | depth=depth 76 | ) 77 | return block_id 78 | 79 | # this one needs the block_id memorized 80 | def 
on_chain_end(self, block_id: str, depth: int) -> Any: 81 | self.add_to_queue( 82 | "on_chain_end", 83 | block_id=block_id, 84 | # output=output, 85 | depth=depth 86 | ) 87 | print("on_chain_end method called") 88 | 89 | def on_chain_error(self, error: Union[Exception, KeyboardInterrupt], **kwargs: Any) -> Any: 90 | method_name = "on_chain_error" 91 | self.add_to_queue(method_name, error=error, **kwargs) 92 | print("on_chain_error method called") 93 | 94 | def on_llm_start( 95 | self, messages: str, depth: int 96 | ) -> Any: 97 | """Run when LLM starts running.""" 98 | self.add_to_queue( 99 | "on_llm_start", 100 | block_id="llm-" + str(self.llm_block_id), 101 | messages=messages, 102 | depth=depth 103 | ) 104 | print("on_llm_start method called") 105 | 106 | def on_llm_new_token(self, token: str, **kwargs: Any) -> Any: 107 | """Run on new LLM token. Only available when streaming is enabled.""" 108 | method_name = "on_llm_new_token" 109 | self.add_to_queue(method_name, token=token, **kwargs) 110 | print("on_llm_new_token method called") 111 | 112 | def on_llm_end(self, response: str, depth: int) -> Any: 113 | """Run when LLM ends running.""" 114 | self.add_to_queue( 115 | "on_llm_end", 116 | block_id="llm-" + str(self.llm_block_id), 117 | response=response, 118 | depth=depth 119 | ) 120 | print("on_llm_end method called") 121 | 122 | def on_llm_error(self, error: Union[Exception, KeyboardInterrupt]) -> Any: 123 | """Run when LLM errors.""" 124 | self.add_to_queue( 125 | "on_llm_error", 126 | block_id="llm-" + str(self.llm_block_id), 127 | message=str(error), 128 | error=error 129 | ) 130 | print("on_llm_error method called") 131 | 132 | def on_agent_action(self, action, action_input, depth: int) -> str: 133 | self.tool_block_id += 1 134 | block_id="tool-" + str(self.tool_block_id) 135 | self.add_to_queue( 136 | "on_agent_action", 137 | block_id=block_id, 138 | action=action, 139 | action_input = action_input, 140 | depth=depth 141 | ) 142 | print("on_agent_action method called") 143 | return block_id 144 | 145 | def on_tool_start(self, tool_name: str, tool_input: str, depth: int) -> Any: 146 | method_name = "on_tool_start" 147 | tool_description = "Tool not found in tool descriptions" 148 | if tool_name in self.tool_descriptions: 149 | tool_description = self.tool_descriptions[tool_name] 150 | else: 151 | print(self.tool_descriptions) 152 | print("Key", tool_name, "not found in tool descriptions") 153 | self.add_to_queue( 154 | method_name, 155 | block_id="tool-" + str(self.tool_block_id), 156 | tool_name=tool_name, 157 | tool_description=tool_description, 158 | tool_input=tool_input, 159 | depth=depth 160 | ) 161 | print("on_tool_start method called") 162 | 163 | def on_tool_end(self, output: str, status:int, depth: int) -> Any: 164 | method_name = "on_tool_end" 165 | self.add_to_queue( 166 | method_name, 167 | block_id="tool-" + str(self.tool_block_id), 168 | output=output, 169 | status= status, 170 | depth=depth 171 | ) 172 | print("on_tool_end method called") 173 | 174 | def on_tool_error(self, error: Union[Exception, KeyboardInterrupt]) -> Any: 175 | method_name = "on_tool_error" 176 | self.add_to_queue( 177 | method_name, 178 | error=error 179 | ) 180 | print("on_tool_error method called") 181 | 182 | def on_agent_end(self, block_id:str, depth: int): 183 | self.add_to_queue( 184 | "on_agent_end", 185 | block_id=block_id, 186 | depth=depth 187 | ) 188 | print("on_agent_end method called") -------------------------------------------------------------------------------- 
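ServerEventCallback only enqueues event dicts; something on the serving side still has to drain the queue. The sketch below is a minimal, hypothetical consumer illustrating that event protocol — the import path and the `run_pipeline` callable (assumed to accept a `callbacks=[...]` argument) are assumptions for illustration, not the repository's actual server entry point.

```python
import queue
import threading

from toolbench.inference.callbacks.ServerEventCallback import ServerEventCallback


def stream_events(run_pipeline):
    """Drain callback events while the pipeline runs in a worker thread.

    `run_pipeline` is a hypothetical callable taking `callbacks=[...]`;
    substitute the real inference entry point.
    """
    event_queue = queue.Queue()
    callback = ServerEventCallback(event_queue)

    worker = threading.Thread(target=run_pipeline, kwargs={"callbacks": [callback]})
    worker.start()

    while worker.is_alive() or not event_queue.empty():
        try:
            # Every event carries "method_name" and "block_id", plus
            # method-specific fields such as "messages", "output", "depth".
            event = event_queue.get(timeout=0.5)
        except queue.Empty:
            continue
        yield event
        if event["method_name"] in ("on_request_end", "on_request_error"):
            break
```

A web server could forward each yielded dict to the client (for example as a server-sent event), using `block_id` to group streamed LLM and tool blocks in the UI.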
/stabletoolbench/toolbench/inference/qa_pipeline.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Close-domain QA Pipeline 3 | ''' 4 | 5 | import argparse, os 6 | import yaml 7 | from toolbench.inference.Downstream_tasks.rapidapi import pipeline_runner 8 | 9 | 10 | if __name__ == "__main__": 11 | 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument('--backbone_model', type=str, default="toolllama", required=False, help='chatgpt_function or davinci or toolllama') 14 | parser.add_argument('--chatgpt_model', type=str, default="gpt-4-turbo-2024-04-09", required=False, help='gpt-3.5-turbo or gpt-4') 15 | # parser.add_argument('--base_url', type=str, default="https://api.openai.com/v1", required=False, help='openai api url') 16 | # parser.add_argument('--openai_key', type=str, default="", required=False, help='openai key for chatgpt_function or davinci model') 17 | parser.add_argument('--config_file', type=str, default='config.yml', help='Api configuration file') 18 | parser.add_argument('--model_path', type=str, default="your_model_path/", required=False, help='') 19 | # parser.add_argument('--tool_root_dir', type=str, default="your_tools_path/", required=True, help='') 20 | parser.add_argument("--lora", action="store_true", help="Load lora model or not.") 21 | parser.add_argument('--lora_path', type=str, default="your_lora_path if lora", required=False, help='') 22 | parser.add_argument('--max_observation_length', type=int, default=1024, required=False, help='maximum observation length') 23 | parser.add_argument('--max_source_sequence_length', type=int, default=4096, required=False, help='original maximum model sequence length') 24 | parser.add_argument('--max_sequence_length', type=int, default=8192, required=False, help='maximum model sequence length') 25 | parser.add_argument('--observ_compress_method', type=str, default="truncate", choices=["truncate", "filter", "random"], required=False, help='observation compress method') 26 | parser.add_argument('--method', type=str, default="CoT@1", required=False, help='method for answer generation: CoT@n,Reflexion@n,BFS,DFS,UCT_vote') 27 | parser.add_argument('--input_query_file', type=str, default="", required=False, help='input path') 28 | parser.add_argument('--output_answer_file', type=str, default="",required=False, help='output path') 29 | # parser.add_argument('--toolbench_key', type=str, default="",required=False, help='your toolbench key to request rapidapi service') 30 | parser.add_argument('--rapidapi_key', type=str, default="",required=False, help='your rapidapi key to request rapidapi service') 31 | parser.add_argument('--use_rapidapi_key', action="store_true", help="To use customized rapidapi service or not.") 32 | parser.add_argument('--api_customization', action="store_true", help="To use customized api or not.") 33 | 34 | args = parser.parse_args() 35 | 36 | CONFIG = yaml.load(open(args.config_file, 'r'), Loader=yaml.FullLoader) 37 | os.environ["OPENAI_API_BASE"] = CONFIG['api_base'] 38 | os.environ["OPENAI_KEY"] = CONFIG['api_key'] 39 | os.environ["TOOLBENCH_KEY"] = CONFIG['toolbench_key'] 40 | os.environ["TOOL_ROOT_DIR"] = CONFIG['tool_root_dir'] 41 | 42 | pipeline_runner = pipeline_runner(args) 43 | pipeline_runner.run() 44 | 45 | -------------------------------------------------------------------------------- /stabletoolbench/toolbench/inference/qa_pipeline_multithread.py: -------------------------------------------------------------------------------- 1 | ''' 2 | 
Close-domain QA Pipeline 3 | ''' 4 | 5 | import argparse, os 6 | import yaml 7 | from toolbench.inference.Downstream_tasks.rapidapi_multithread import pipeline_runner 8 | 9 | 10 | if __name__ == "__main__": 11 | 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument('--backbone_model', type=str, default="toolllama", required=False, help='chatgpt_function or davinci or toolllama') 14 | parser.add_argument('--chatgpt_model', type=str, default="gpt-4-turbo-2024-04-09", required=False, help='gpt-3.5-turbo or gpt-4') 15 | # parser.add_argument('--base_url', type=str, default="https://api.openai.com/v1", required=False, help='openai api url') 16 | # parser.add_argument('--openai_key', type=str, default="", required=False, help='openai key for chatgpt_function or davinci model') 17 | parser.add_argument('--config_file', type=str, default='config.yml', help='Api configuration file') 18 | parser.add_argument('--model_path', type=str, default="your_model_path/", required=False, help='') 19 | # parser.add_argument('--tool_root_dir', type=str, default="your_tools_path/", required=True, help='') 20 | parser.add_argument("--lora", action="store_true", help="Load lora model or not.") 21 | parser.add_argument('--lora_path', type=str, default="your_lora_path if lora", required=False, help='') 22 | parser.add_argument('--max_observation_length', type=int, default=1024, required=False, help='maximum observation length') 23 | parser.add_argument('--max_source_sequence_length', type=int, default=4096, required=False, help='original maximum model sequence length') 24 | parser.add_argument('--max_sequence_length', type=int, default=8192, required=False, help='maximum model sequence length') 25 | parser.add_argument('--single_chain_max_step', type=int, default=12, required=False, help='maximum step for single chain') 26 | parser.add_argument('--max_query_count', type=int, default=30, required=False, help='maximum query count') 27 | parser.add_argument('--observ_compress_method', type=str, default="truncate", choices=["truncate", "filter", "random"], required=False, help='observation compress method') 28 | parser.add_argument('--method', type=str, default="CoT@1", required=False, help='method for answer generation: CoT@n,Reflexion@n,BFS,DFS,UCT_vote') 29 | parser.add_argument('--input_query_file', type=str, default="", required=False, help='input path') 30 | parser.add_argument('--output_answer_file', type=str, default="",required=False, help='output path') 31 | # parser.add_argument('--toolbench_key', type=str, default="",required=False, help='your toolbench key to request rapidapi service') 32 | parser.add_argument('--rapidapi_key', type=str, default="",required=False, help='your rapidapi key to request rapidapi service') 33 | parser.add_argument('--use_rapidapi_key', action="store_true", help="To use customized rapidapi service or not.") 34 | parser.add_argument('--api_customization', action="store_true", help="To use customized api or not.") 35 | parser.add_argument('--num_thread', type=int, default=1, required=False, help='number of threads') 36 | parser.add_argument('--disable_tqdm', action="store_true", help="disable tqdm or not.") 37 | parser.add_argument('--overwrite', action='store_true', help='overwrite existing runs') 38 | parser.add_argument('--easy_tool', action='store_true', help='use easy tool baseline or not') 39 | 40 | args = parser.parse_args() 41 | if args.overwrite: 42 | os.system(f"rm -rf {args.output_answer_file}") 43 | 44 | CONFIG = yaml.load(open(args.config_file, 'r'), 
Loader=yaml.FullLoader) 45 | os.environ["OPENAI_API_BASE"] = CONFIG['api_base'] 46 | os.environ["OPENAI_KEY"] = CONFIG['api_key'] 47 | os.environ["TOOLBENCH_KEY"] = CONFIG['toolbench_key'] 48 | os.environ["TOOL_ROOT_DIR"] = CONFIG['tool_root_dir'] 49 | 50 | pipeline_runner = pipeline_runner(args) 51 | pipeline_runner.run() 52 | 53 | -------------------------------------------------------------------------------- /stabletoolbench/toolbench/inference/qa_pipeline_open_domain.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Open-domain QA Pipeline 3 | ''' 4 | import argparse 5 | from toolbench.inference.Downstream_tasks.rapidapi import pipeline_runner 6 | 7 | 8 | if __name__ == "__main__": 9 | 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--corpus_tsv_path', type=str, default="your_retrival_corpus_path/", required=False, help='') 12 | parser.add_argument('--retrieval_model_path', type=str, default="your_model_path/", required=False, help='') 13 | parser.add_argument('--retrieved_api_nums', type=int, default=5, required=False, help='') 14 | parser.add_argument('--backbone_model', type=str, default="toolllama", required=False, help='chatgpt_function or davinci or toolllama') 15 | parser.add_argument('--chatgpt_model', type=str, default="gpt-4-turbo-2024-04-09", required=False, help='gpt-3.5-turbo or gpt-4') 16 | parser.add_argument('--base_url', type=str, default="https://api.openai.com/v1", required=False, help='openai api url') 17 | parser.add_argument('--openai_key', type=str, default="", required=False, help='openai key for chatgpt_function or davinci model') 18 | parser.add_argument('--model_path', type=str, default="your_model_path/", required=False, help='') 19 | parser.add_argument('--tool_root_dir', type=str, default="your_tools_path/", required=True, help='') 20 | parser.add_argument("--lora", action="store_true", help="Load lora model or not.") 21 | parser.add_argument('--lora_path', type=str, default="your_lora_path if lora", required=False, help='') 22 | parser.add_argument('--max_observation_length', type=int, default=1024, required=False, help='maximum observation length') 23 | parser.add_argument('--max_source_sequence_length', type=int, default=4096, required=False, help='original maximum model sequence length') 24 | parser.add_argument('--max_sequence_length', type=int, default=8192, required=False, help='maximum model sequence length') 25 | parser.add_argument('--observ_compress_method', type=str, default="truncate", choices=["truncate", "filter", "random"], required=False, help='maximum observation length') 26 | parser.add_argument('--method', type=str, default="CoT@1", required=False, help='method for answer generation: CoT@n,Reflexion@n,BFS,DFS,UCT_vote') 27 | parser.add_argument('--input_query_file', type=str, default="", required=False, help='input path') 28 | parser.add_argument('--output_answer_file', type=str, default="",required=False, help='output path') 29 | parser.add_argument('--toolbench_key', type=str, default="",required=False, help='your toolbench key to request rapidapi service') 30 | parser.add_argument('--rapidapi_key', type=str, default="",required=False, help='your rapidapi key to request rapidapi service') 31 | parser.add_argument('--use_rapidapi_key', action="store_true", help="To use customized rapidapi service or not.") 32 | parser.add_argument('--api_customization', action="store_true", help="To use customized api or not. 
NOT SUPPORTED currently under open domain setting.") 33 | 34 | args = parser.parse_args() 35 | 36 | pipeline_runner = pipeline_runner(args, add_retrieval=True) 37 | pipeline_runner.run() 38 | -------------------------------------------------------------------------------- /stabletoolbench/toolbench/model/__init__.py: -------------------------------------------------------------------------------- 1 | from toolbench.model.model_adapter import ( 2 | load_model, 3 | get_conversation_template, 4 | add_model_args, 5 | ) 6 | -------------------------------------------------------------------------------- /stabletoolbench/toolbench/model/apply_delta.py: -------------------------------------------------------------------------------- 1 | """ 2 | Apply the delta weights on top of a base model. 3 | 4 | Usage: 5 | python3 -m fastchat.model.apply_delta --base ~/model_weights/llama-7b --target ~/model_weights/vicuna-7b --delta lmsys/vicuna-7b-delta-v1.1 6 | """ 7 | import argparse 8 | import gc 9 | import glob 10 | import json 11 | import os 12 | import shutil 13 | import tempfile 14 | 15 | from huggingface_hub import snapshot_download 16 | import torch 17 | from torch import nn 18 | from tqdm import tqdm 19 | from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig 20 | 21 | 22 | GB = 1 << 30 23 | 24 | 25 | def split_files(model_path, tmp_path, split_size): 26 | if not os.path.exists(model_path): 27 | model_path = snapshot_download(repo_id=model_path) 28 | if not os.path.exists(tmp_path): 29 | os.makedirs(tmp_path) 30 | 31 | file_pattern = os.path.join(model_path, "pytorch_model-*.bin") 32 | files = glob.glob(file_pattern) 33 | 34 | part = 0 35 | try: 36 | for file_path in tqdm(files): 37 | state_dict = torch.load(file_path) 38 | new_state_dict = {} 39 | 40 | current_size = 0 41 | for name, param in state_dict.items(): 42 | param_size = param.numel() * param.element_size() 43 | 44 | if current_size + param_size > split_size: 45 | new_file_name = f"pytorch_model-{part}.bin" 46 | new_file_path = os.path.join(tmp_path, new_file_name) 47 | torch.save(new_state_dict, new_file_path) 48 | current_size = 0 49 | new_state_dict = None 50 | gc.collect() 51 | new_state_dict = {} 52 | part += 1 53 | 54 | new_state_dict[name] = param 55 | current_size += param_size 56 | 57 | new_file_name = f"pytorch_model-{part}.bin" 58 | new_file_path = os.path.join(tmp_path, new_file_name) 59 | torch.save(new_state_dict, new_file_path) 60 | new_state_dict = None 61 | gc.collect() 62 | new_state_dict = {} 63 | part += 1 64 | except Exception as e: 65 | print(f"An error occurred during split_files: {e}") 66 | shutil.rmtree(tmp_path) 67 | raise 68 | 69 | 70 | def apply_delta_low_cpu_mem(base_model_path, target_model_path, delta_path): 71 | delta_tokenizer = AutoTokenizer.from_pretrained(delta_path, use_fast=False) 72 | delta_config = AutoConfig.from_pretrained(delta_path) 73 | 74 | if os.path.exists(target_model_path): 75 | shutil.rmtree(target_model_path) 76 | os.makedirs(target_model_path) 77 | 78 | split_size = 4 * GB 79 | 80 | with tempfile.TemporaryDirectory() as tmp_base_path, tempfile.TemporaryDirectory() as tmp_delta_path: 81 | print(f"Split files for the base model to {tmp_base_path}") 82 | split_files(base_model_path, tmp_base_path, split_size) 83 | print(f"Split files for the delta weights to {tmp_delta_path}") 84 | split_files(delta_path, tmp_delta_path, split_size) 85 | 86 | base_pattern = os.path.join(tmp_base_path, "pytorch_model-*.bin") 87 | base_files = glob.glob(base_pattern) 88 | delta_pattern = 
os.path.join(tmp_delta_path, "pytorch_model-*.bin") 89 | delta_files = glob.glob(delta_pattern) 90 | delta_state_dict = torch.load(delta_files[0]) 91 | 92 | print("Applying the delta") 93 | weight_map = {} 94 | total_size = 0 95 | 96 | for i, base_file in tqdm(enumerate(base_files)): 97 | state_dict = torch.load(base_file) 98 | file_name = f"pytorch_model-{i}.bin" 99 | for name, param in state_dict.items(): 100 | if name not in delta_state_dict: 101 | for delta_file in delta_files: 102 | delta_state_dict = torch.load(delta_file) 103 | gc.collect() 104 | if name in delta_state_dict: 105 | break 106 | 107 | state_dict[name] += delta_state_dict[name] 108 | weight_map[name] = file_name 109 | total_size += param.numel() * param.element_size() 110 | gc.collect() 111 | torch.save(state_dict, os.path.join(target_model_path, file_name)) 112 | 113 | with open( 114 | os.path.join(target_model_path, "pytorch_model.bin.index.json"), "w" 115 | ) as f: 116 | json.dump( 117 | {"weight_map": weight_map, "metadata": {"total_size": total_size}}, f 118 | ) 119 | 120 | print(f"Saving the target model to {target_model_path}") 121 | delta_tokenizer.save_pretrained(target_model_path) 122 | delta_config.save_pretrained(target_model_path) 123 | 124 | 125 | def apply_delta(base_model_path, target_model_path, delta_path): 126 | print(f"Loading the delta weights from {delta_path}") 127 | delta_tokenizer = AutoTokenizer.from_pretrained(delta_path, use_fast=False) 128 | delta = AutoModelForCausalLM.from_pretrained( 129 | delta_path, torch_dtype=torch.float16, low_cpu_mem_usage=True 130 | ) 131 | 132 | print(f"Loading the base model from {base_model_path}") 133 | base = AutoModelForCausalLM.from_pretrained( 134 | base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True 135 | ) 136 | 137 | print("Applying the delta") 138 | for name, param in tqdm(base.state_dict().items(), desc="Applying delta"): 139 | assert name in delta.state_dict() 140 | param.data += delta.state_dict()[name] 141 | 142 | print(f"Saving the target model to {target_model_path}") 143 | base.save_pretrained(target_model_path) 144 | delta_tokenizer.save_pretrained(target_model_path) 145 | 146 | 147 | if __name__ == "__main__": 148 | parser = argparse.ArgumentParser() 149 | parser.add_argument("--base-model-path", type=str, required=True) 150 | parser.add_argument("--target-model-path", type=str, required=True) 151 | parser.add_argument("--delta-path", type=str, required=True) 152 | parser.add_argument( 153 | "--low-cpu-mem", 154 | action="store_true", 155 | help="Lower the cpu memory usage. 
This will split large files and use " 156 | "disk as swap to reduce the memory usage below 10GB.", 157 | ) 158 | args = parser.parse_args() 159 | 160 | if args.low_cpu_mem: 161 | apply_delta_low_cpu_mem( 162 | args.base_model_path, args.target_model_path, args.delta_path 163 | ) 164 | else: 165 | apply_delta(args.base_model_path, args.target_model_path, args.delta_path) 166 | -------------------------------------------------------------------------------- /stabletoolbench/toolbench/model/compression.py: -------------------------------------------------------------------------------- 1 | import dataclasses 2 | import os 3 | 4 | import torch 5 | import torch.nn as nn 6 | from torch.nn import functional as F 7 | from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig 8 | 9 | 10 | @dataclasses.dataclass 11 | class CompressionConfig: 12 | """Group-wise quantization.""" 13 | 14 | num_bits: int 15 | group_size: int 16 | group_dim: int 17 | symmetric: bool 18 | enabled: bool = True 19 | 20 | 21 | default_compression_config = CompressionConfig( 22 | num_bits=8, group_size=256, group_dim=1, symmetric=True, enabled=True 23 | ) 24 | 25 | 26 | class CLinear(nn.Module): 27 | """Compressed Linear Layer.""" 28 | 29 | def __init__(self, weight=None, bias=None, device=None): 30 | super().__init__() 31 | self.weight = weight 32 | self.bias = bias 33 | 34 | def forward(self, input): 35 | return F.linear(input.to(self.weight.dtype), self.weight, self.bias) 36 | 37 | 38 | def compress_module(module, target_device): 39 | for name, child in module.named_children(): 40 | if isinstance(child, nn.Linear): 41 | setattr( 42 | module, 43 | name, 44 | CLinear(child.weight, child.bias, target_device), 45 | ) 46 | compress_module(child, target_device) 47 | 48 | 49 | def get_compressed_list(module, prefix=""): 50 | compressed_list = [] 51 | for name, child in module.named_children(): 52 | if isinstance(child, nn.Linear): 53 | full_name = f"{prefix}.{name}.weight" if prefix else f"{name}.weight" 54 | compressed_list.append(full_name) 55 | compressed_list.extend( 56 | get_compressed_list(child, full_name) 57 | ) 58 | return compressed_list 59 | 60 | 61 | def apply_compressed_weight(module, compressed_state_dict, target_device, prefix=""): 62 | for name, child in module.named_children(): 63 | if isinstance(child, nn.Linear): 64 | full_name = f"{prefix}.{name}.weight" if prefix else f"{name}.weight" 65 | setattr( 66 | module, 67 | name, 68 | CLinear( 69 | compressed_state_dict[full_name], child.bias, target_device 70 | ), 71 | ) 72 | apply_compressed_weight(child, compressed_state_dict, target_device, full_name) 73 | 74 | 75 | def load_compress_model(model_path, device, torch_dtype): 76 | # partially load model 77 | tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False) 78 | base_pattern = os.path.join(model_path, "pytorch_model-*.bin") 79 | files = glob.glob(base_pattern) 80 | 81 | config = AutoConfig.from_pretrained( 82 | model_path, low_cpu_mem_usage=True, torch_dtype=torch_dtype 83 | ) 84 | model = AutoModelForCausalLM.from_config(config) 85 | linear_weights = get_compressed_list(model) 86 | 87 | compressed_state_dict = {} 88 | 89 | for filename in files: 90 | tmp_state_dict = torch.load(filename) 91 | for name in tmp_state_dict: 92 | if name in linear_weights: 93 | tensor = tmp_state_dict[name].to(device).data.to(torch_dtype) 94 | compressed_state_dict[name] = compress( 95 | tensor, default_compression_config 96 | ) 97 | else: 98 | compressed_state_dict[name] = 
tmp_state_dict[name].to(device) 99 | tmp_state_dict[name] = None 100 | tensor = None 101 | torch.cuda.empty_cache() 102 | 103 | for name, param in model.named_parameters(): 104 | if name not in linear_weights: 105 | param.data = compressed_state_dict[name] 106 | apply_compressed_weight(model, compressed_state_dict, device) 107 | 108 | model.to(device) 109 | 110 | return model, tokenizer 111 | 112 | 113 | def compress(tensor, config): 114 | """Simulate group-wise quantization.""" 115 | if not config.enabled: 116 | return tensor 117 | 118 | group_size, num_bits, group_dim, symmetric = ( 119 | config.group_size, 120 | config.num_bits, 121 | config.group_dim, 122 | config.symmetric, 123 | ) 124 | assert num_bits <= 8 125 | 126 | original_shape = tensor.shape 127 | num_groups = (original_shape[group_dim] + group_size - 1) // group_size 128 | new_shape = ( 129 | original_shape[:group_dim] 130 | + (num_groups, group_size) 131 | + original_shape[group_dim + 1 :] 132 | ) 133 | 134 | # Pad 135 | pad_len = group_size - original_shape[group_dim] % group_size 136 | if pad_len != 0: 137 | pad_shape = ( 138 | original_shape[:group_dim] + (pad_len,) + original_shape[group_dim + 1 :] 139 | ) 140 | tensor = torch.cat( 141 | [tensor, torch.zeros(pad_shape, dtype=tensor.dtype, device=tensor.device)], 142 | dim=group_dim, 143 | ) 144 | data = tensor.view(new_shape) 145 | 146 | # Quantize 147 | if symmetric: 148 | B = 2 ** (num_bits - 1) - 1 149 | scale = B / torch.max(data.abs(), dim=group_dim + 1, keepdim=True)[0] 150 | data = data * scale 151 | data = data.clamp_(-B, B).round_().to(torch.int8) 152 | return data, scale, original_shape 153 | else: 154 | B = 2**num_bits - 1 155 | mn = torch.min(data, dim=group_dim + 1, keepdim=True)[0] 156 | mx = torch.max(data, dim=group_dim + 1, keepdim=True)[0] 157 | 158 | scale = B / (mx - mn) 159 | data = data - mn 160 | data *= scale 161 | 162 | data = data.clamp_(0, B).round_().to(torch.uint8) 163 | return data, mn, scale, original_shape 164 | 165 | 166 | def decompress(packed_data, config): 167 | """Simulate group-wise dequantization.""" 168 | if not config.enabled: 169 | return packed_data 170 | 171 | group_size, num_bits, group_dim, symmetric = ( 172 | config.group_size, 173 | config.num_bits, 174 | config.group_dim, 175 | config.symmetric, 176 | ) 177 | 178 | # Dequantize 179 | if symmetric: 180 | data, scale, original_shape = packed_data 181 | data = data / scale 182 | else: 183 | data, mn, scale, original_shape = packed_data 184 | data = data / scale 185 | data += mn 186 | 187 | # Unpad 188 | pad_len = group_size - original_shape[group_dim] % group_size 189 | if pad_len: 190 | padded_original_shape = ( 191 | original_shape[:group_dim] 192 | + (original_shape[group_dim] + pad_len,) 193 | + original_shape[group_dim + 1 :] 194 | ) 195 | data = data.reshape(padded_original_shape) 196 | indices = [slice(0, x) for x in original_shape] 197 | return data[indices].contiguous() 198 | else: 199 | return data.view(original_shape) 200 | -------------------------------------------------------------------------------- /stabletoolbench/toolbench/model/make_delta.py: -------------------------------------------------------------------------------- 1 | """ 2 | Make the delta weights by subtracting base weights. 
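(Such a delta can later be re-applied on top of the base weights with apply_delta.py to reconstruct the target model.)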
3 | 4 | Usage: 5 | python3 -m fastchat.model.make_delta --base ~/model_weights/llama-13b --target ~/model_weights/vicuna-13b --delta ~/model_weights/vicuna-13b-delta --hub-repo-id lmsys/vicuna-13b-delta-v1.1 6 | """ 7 | import argparse 8 | 9 | import torch 10 | from tqdm import tqdm 11 | from transformers import AutoTokenizer, AutoModelForCausalLM 12 | 13 | 14 | def make_delta(base_model_path, target_model_path, delta_path): 15 | print(f"Loading the base model from {base_model_path}") 16 | base = AutoModelForCausalLM.from_pretrained( 17 | base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True 18 | ) 19 | 20 | print(f"Loading the target model from {target_model_path}") 21 | target = AutoModelForCausalLM.from_pretrained( 22 | target_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True 23 | ) 24 | target_tokenizer = AutoTokenizer.from_pretrained(target_model_path, use_fast=False) 25 | 26 | print("Calculating the delta") 27 | for name, param in tqdm(target.state_dict().items(), desc="Calculating delta"): 28 | assert name in base.state_dict() 29 | param.data -= base.state_dict()[name] 30 | 31 | print(f"Saving the delta to {delta_path}") 32 | if args.hub_repo_id: 33 | kwargs = {"push_to_hub": True, "repo_id": args.hub_repo_id} 34 | else: 35 | kwargs = {} 36 | target.save_pretrained(delta_path, **kwargs) 37 | target_tokenizer.save_pretrained(delta_path, **kwargs) 38 | 39 | 40 | if __name__ == "__main__": 41 | parser = argparse.ArgumentParser() 42 | parser.add_argument("--base-model-path", type=str, required=True) 43 | parser.add_argument("--target-model-path", type=str, required=True) 44 | parser.add_argument("--delta-path", type=str, required=True) 45 | parser.add_argument("--hub-repo-id", type=str) 46 | args = parser.parse_args() 47 | 48 | make_delta(args.base_model_path, args.target_model_path, args.delta_path) 49 | -------------------------------------------------------------------------------- /stabletoolbench/toolbench/tooleval/README_ZH.md: -------------------------------------------------------------------------------- 1 |
2 | 🛠️Tool Eval🤖 3 |
4 | 5 | 通过在ToolBench上对LLaMA进行微调,我们得到了**ToolLLaMA**。考虑到人工评估非常耗时,我们借鉴[AlpacaEval](https://tatsu-lab.github.io/alpaca_eval/)开发了一个高效的机器自动评估**ToolEval**,其中包含两个评估指标: 6 | 7 | - **通过率**:计算在有限的OpenAI API调用次数内成功完成指令的比例。 8 | 9 | - **偏好**:通过比较给定指令的两个答案(动作序列)来衡量。我们预先定义了一组更好答案的标准,这些标准被组织成ChatGPT的提示。我们向评估器提供测试指令和两个候选答案,并获得其偏好。我们对每个答案对进行多次评估以提高系统的可靠性。然后,我们计算**优胜率**(被评估器选择为更优的百分比。有关详细信息,请参阅我们的论文。 10 | 11 | 为了验证ChatGPT评估器在通过率和胜率方面的可靠性,我们从四种不同的方法(ChatGPT+ReACT,ChatGPT+DFSDT,ToolLLaMA+DFSDT和GPT4+DFSDT)中进行采样,为每种方法的300个测试指令获取解决方案对。然后,我们请人类标注ChatGPT+DFSDT,ToolLLaMA+DFSDT和GPT4+DFSDT的通过率,以及ChatGPT+ReACT和ChatGPT+DFSDT之间的胜率。 12 | 13 | 我们的ChatGPT评估器在通过率方面与人类标注者具有高达**87.1%**的一致性,在胜率方面具有**80.3%**的一致性。这个结果表明,我们的评估器生成的评估结果与人类非常相似,并且可以视为在通过率和胜率上模拟人类评估的可靠评估器。 14 | 有关ToolEval的更多细节,请参阅我们的论文。 15 | 16 | ## 🚀用法 17 | 18 | ### Install 19 | Install Package (python>=3.9) 20 | ```bash 21 | pip install -r requirements.txt 22 | ``` 23 | 24 | ### Evaluation 25 | *若要复现结果,直接通过[Google Drive](https://drive.google.com/drive/folders/1yBUQ732mPu-KclJnuQELEhtKakdXFc3J)下载我们的`reproduction_data.zip`,解压后置`reproduction_data`于`ToolBench/data/`下即可,可以跳过数据准备流程。* 26 | - 数据准备。若要使用 ToolEval 评估您自己的模型和方法,首先需要为六个测试子集准备所有的模型预测。创建一个以您的模型和方法命名的目录,例如 `chatgpt_cot`,然后将每个测试集的预测放在该目录下。目录的文件结构应如下: 27 | ``` 28 | ├── /chatgpt_cot/ 29 | │ ├── /G1_instruction/ 30 | │ │ ├── /10160_CoT@1.json 31 | │ │ └── ... 32 | │ ├── /G1_tool/ 33 | │ │ ├── /10221_CoT@1.json 34 | │ │ └── ... 35 | │ ├── ... 36 | │ ├── /G3_instruction/ 37 | │ │ ├── /10221_CoT@1.json 38 | │ │ └── ... 39 | ``` 40 | 41 | 然后对模型预测进行预处理: 42 | 43 | ```bash 44 | export RAW_ANSWER_PATH=../../data/reproduction_data/model_predictions/ 45 | export CONVERTED_ANSWER_PATH=../../data/reproduction_data/model_predictions_converted/ 46 | export MODEL_NAME=chatgpt_cot 47 | export METHOD=CoT 48 | mkdir ${CONVERTED_ANSWER_PATH}/${MODEL_NAME} 49 | for test_set in G1_instruction G1_category G1_tool G2_category G2_instruction G3_instruction 50 | do 51 | answer_dir=${RAW_ANSWER_PATH}/${MODEL_NAME}/${test_set} 52 | output_file=${CONVERTED_ANSWER_PATH}/${MODEL_NAME}/${test_set}.json 53 | python convert_to_answer_format.py\ 54 | --answer_dir ${answer_dir} \ 55 | --method ${METHOD} \ 56 | --output ${output_file} 57 | done 58 | ``` 59 | 之后,检查`${CONVERTED_ANSWER_PATH}/${MODEL_NAME}`下是否有测试集的预处理JSON文件。如果有,你就可以准备运行以下评估过程了。如果没有,请检查模型的预测是否有问题。 60 | 61 | - OpenAI Key 62 | 准备您的OpenAI Key来搭建我们的evaluator。Key需要被存储到一个json file中,如`path/to/your/openai_key_json_file.json`: 63 | ```bash 64 | [ 65 | { 66 | "username": "your_user_name", 67 | "passwd": "your_password", 68 | "api_key": "your_openai_key", 69 | "organization": "your_organization" 70 | }, 71 | ... 72 | ] 73 | ``` 74 | - Pass rate. 75 | ```bash 76 | export CONVERTED_ANSWER_PATH=../../data/reproduction_data/model_predictions_converted/ 77 | export SAVE_PATH=pass_rate_results 78 | export CANDIDATE_MODEL=chatgpt_cot 79 | export API_POOL_FILE=path/to/your/openai_key_json_file.json 80 | 81 | python eval_pass_rate.py \ 82 | --converted_answer_path ${CONVERTED_ANSWER_PATH} \ 83 | --save_path ${SAVE_PATH} \ 84 | --reference_model ${CANDIDATE_MODEL} \ 85 | --test_ids ../../data/test_query_ids/ \ 86 | --max_eval_threads 20 \ 87 | --evaluate_times 4 88 | 89 | ``` 90 | 91 | 结果文件会被存储至${SAVE_PATH}中。 92 | 93 | - Win rate. 
以下示例以ChatGPT-ReACT作为参考模型,GPT4-ReACT作为候选模型。请注意,您首先需要获取两个模型的pass rate结果,然后运行以下命令来评估GPT4-ReACT的win rate结果: 94 | ```bash 95 | export CONVERTED_ANSWER_PATH=../../data/reproduction_data/model_predictions_converted/ 96 | export SAVE_PATH=preference_results 97 | export PASS_TARE_PATH=pass_rate_results 98 | export REFERENCE_MODEL=chatgpt_cot 99 | export CANDIDATE_MODEL=gpt-4-0613_cot 100 | export API_POOL_FILE=path/to/your/openai_key_json_file.json 101 | 102 | python eval_preference.py \ 103 | --converted_answer_path ${CONVERTED_ANSWER_PATH} \ 104 | --reference_model ${REFERENCE_MODEL} \ 105 | --output_model ${CANDIDATE_MODEL} \ 106 | --test_ids ../../data/test_query_ids/ \ 107 | --save_path ${SAVE_PATH} \ 108 | --pass_rate_result_path ${PASS_TARE_PATH} \ 109 | --max_eval_threads 20 \ 110 | --use_pass_rate true \ 111 | --evaluate_times 4 112 | ``` 113 | 114 | 结果文件会被存储至${SAVE_PATH}中。 115 | 116 | ### 评估新方法 117 | 要评估除了ReACT和DFSDT之外的方法,您需要遵循以上Data preparation的步骤准备您的预处理好的answer数据。预处理好的answer数据需遵循以下json格式: 118 | 119 | ```json 120 | [ 121 | { 122 | "method":"method name", 123 | "total_steps": int, // a integer count total steps in answer details 124 | "final_answer": "final answer from the method", 125 | "answer_details":[{ 126 | "role":"node role, can be system, user, assistant and tool", 127 | "message":"message for the node", 128 | "next":[//next steps, can have multiple elements if the node have multiple candidates. 129 | { 130 | "role":"", 131 | "message":"", 132 | "next":[...] 133 | }, 134 | ...//more candidates 135 | ] 136 | }] 137 | } 138 | ... // more answers for the give query in the testdata 139 | ] 140 | ``` 141 | 142 | 143 | ### 更新排行榜 144 | 145 | 如果您想将您的模型的结果上传到[ToolEval Leaderboard](https://openbmb.github.io/ToolBench/),请您将您的结果文件整理成上述格式发送给我们(urtoolbench@gmail.com)或者开一个pull request。 146 | 我们将运行评测脚本更新结果并将您的模型添加到排行榜中。 147 | 148 | 149 | ### 创建新的自动评估器 150 | 如果您想创建新的自动评估器,您需要按下列步骤进行: 151 | 1. 在路径`toolbench/tooleval/evaluators`下创建一个评测器配置文件目录,命名与你的评测器名一致。在其中添加`config.yaml`文件与`template.txt`文件。具体配置方式可参考`toolbench/tooleval/evaluators/tooleval_gpt-3.5-turbo_normalized`中的实现。 152 | 2. 创建你的evaluator类并实现`fn_completions`函数在文件夹`toolbench/tooleval/evaluators/registered_cls`中,或者你可以使用我们预先定义好的类例如`OpenAINormalizedEvaluator`。 153 | 完成后将配置文件中`registered_cls_name`字段填写为该类的名称。 154 | 这里给出一个例子: 155 | ```Python 156 | from evaluators import register_evaluator,BaseEvaluator 157 | from typing import Dict,List 158 | 159 | @register_evaluator 160 | class MyEvaluator(BaseEvaluator): 161 | def __init__(self,config): 162 | super().__init__( 163 | fn_completions=self.fn_completions, 164 | ) 165 | # set your configures here 166 | 167 | def fn_completions(self,query:Dict,answers:List[Dict])->int: 168 | # implement your evaluator here 169 | # return the index of the preferred answer 170 | return 0 171 | ``` 172 | 其中register_evaluator是一个装饰器,用于注册评估器,BaseEvaluator是一个基类,用于实现评估器的基本功能。 173 | 3. 测试评估器的性能,运行脚本`evaluators_comparison.py`。 174 | -------------------------------------------------------------------------------- /stabletoolbench/toolbench/tooleval/ToolBench.code-workspace: -------------------------------------------------------------------------------- 1 | { 2 | "folders": [ 3 | { 4 | "path": "../../../ToolBench" 5 | }, 6 | { 7 | "path": "../.." 
8 | }, 9 | { 10 | "path": "../../../STC/RapidAPI-Server" 11 | } 12 | ], 13 | "settings": { 14 | "git.ignoreLimitWarning": true 15 | } 16 | } -------------------------------------------------------------------------------- /stabletoolbench/toolbench/tooleval/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yuyq18/StepTool/121a6acd11a899d947a5d3fa7269cf76624c1df3/stabletoolbench/toolbench/tooleval/__init__.py -------------------------------------------------------------------------------- /stabletoolbench/toolbench/tooleval/automatic_eval_sample.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import time 4 | from concurrent.futures import ThreadPoolExecutor,as_completed 5 | from tqdm import tqdm 6 | import numpy as np 7 | import argparse 8 | import random 9 | from evaluation import UserEvaluation,BaseToolMethod 10 | from evaluators import load_registered_automatic_evaluator 11 | from typing import List,Dict,Callable 12 | import pandas as pd 13 | 14 | abs_dir = os.path.split(__file__)[0] 15 | 16 | 17 | def parse_args(): 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument('--output',default=os.path.join(abs_dir,'dataset','test.json'),help='where to store the method output.') 20 | parser.add_argument('--method',default='unknown',help='what the name of the method.') 21 | parser.add_argument('--ref_method',default='gpt-3.5-turbo_CoT',help='what the reference method is') 22 | parser.add_argument('--ref_output',default=os.path.join(abs_dir,'dataset','ref_sample.json'),help='where the reference answer stored') 23 | parser.add_argument('--evaluators_cfg_path',default=os.path.join(abs_dir,'evaluators'),help='where the evaluators config files are stored') 24 | parser.add_argument('--evaluator',default='tooleval_gpt-3.5-turbo_normalized',help='which evaluator to use') 25 | parser.add_argument('--max_eval_threads',default=16,type=int,help='how many threads to use for evaluation') 26 | parser.add_argument('--evalset',default='default_evalset',help='which the evaluation dataset to use') 27 | parser.add_argument('--eval_server_address',default='http://localhost:8000',help='the address of the evaluation server') 28 | parser.add_argument('--use_existed_output',default=False,action='store_true',help='whether to use the existed output') 29 | 30 | return parser.parse_args() 31 | 32 | 33 | ## !!define your method here !! 34 | class SampleMethod(BaseToolMethod): 35 | def __init__(self): 36 | super().__init__() 37 | def forward(self,query:str,tools:List[Dict],tool_func:Callable)->Dict: 38 | return {} 39 | def convert_result_to_dict(self,result): 40 | return { 41 | 'method': 'sample', 42 | 'total_steps': 0, 43 | 'final_answer': '', 44 | 'answer_details': [] 45 | } 46 | 47 | if __name__=='__main__': 48 | args = parse_args() 49 | 50 | exec_generating_method_outputs = True 51 | if os.path.exists(args.output): 52 | print('Output file {} already exists!'.format(args.output)) 53 | if args.use_existed_output: 54 | exec_generating_method_outputs = False 55 | else: 56 | print('Overwrite? 
(y/n)') 57 | exec_generating_method_outputs = input()=='y' 58 | 59 | if exec_generating_method_outputs: 60 | ## change the SampleMethod to your method 61 | usereval = UserEvaluation(SampleMethod(),args.eval_server_address,args.evalset) 62 | print('Generating method outputs...') 63 | results = usereval.run() 64 | print('Saving method outputs...') 65 | with open(args.output,'w') as f: 66 | json.dump(results,f) 67 | else: 68 | print('Use existed output.') 69 | results = json.load(open(args.output)) 70 | 71 | print('Loading reference answer for evaluation...') 72 | try: 73 | ref_output = json.load(open(args.ref_output)) 74 | except: 75 | raise Exception('Cannot load reference answer from {}\n Please Download before evaluation!'.format(args.ref_output)) 76 | 77 | print('Loading automatic evaluators...') 78 | evaluators = [load_registered_automatic_evaluator(vars(args)) for _ in range(args.max_eval_threads)] 79 | 80 | def get_preference(qid,query,tools,ref_ans,ans,): 81 | global evaluators 82 | evaluator = random.choice(evaluators) 83 | ret = evaluator.annotate_preference( 84 | query, 85 | tools, 86 | [ref_ans,ans]) 87 | return qid,ret 88 | def get_most_preferred(d:list)->np.ndarray: 89 | if np.iterable(d): 90 | d = np.asanyarray(d) 91 | bins = np.bincount(d) 92 | max_val = np.max(bins) 93 | argmax = np.where(max_val==bins)[0] 94 | return argmax 95 | else: 96 | return np.asarray([d]) 97 | 98 | print('Evaluating...') 99 | prefer_dict = {} 100 | with ThreadPoolExecutor(args.max_eval_threads) as pool: 101 | future = [] 102 | for qid in ref_output.keys(): 103 | try: 104 | future.append(pool.submit( 105 | get_preference, 106 | qid, 107 | ref_output[qid]['query'], 108 | ref_output[qid]['available_tools'], 109 | ref_output[qid]['answer'], 110 | results[qid]['answer'] 111 | )) 112 | except KeyError as e: 113 | print('Warning : Missing answer for query {} in answer file! 
'.format(e)) 114 | 115 | for thd in tqdm(as_completed(future),total=len(future),ncols=100): 116 | qid,preference = thd.result() 117 | prefer_dict[qid] = get_most_preferred(preference)[0] 118 | 119 | prefer = list(prefer_dict.values()) 120 | 121 | prefer = np.array(prefer) 122 | df = pd.DataFrame.from_dict([{ 123 | 'Method':args.method, 124 | 'Win Rate':prefer.mean(), 125 | 'Std Error':np.std(prefer)/np.sqrt(len(prefer)) 126 | }]) 127 | print('###### Leaderboard vs {} ######'.format(args.ref_method)) 128 | print(df) 129 | save_file = os.path.join(abs_dir,'results',args.evalset,args.method) 130 | os.makedirs(save_file,exist_ok=True) 131 | df.to_csv(os.path.join(save_file,'win.csv')) 132 | -------------------------------------------------------------------------------- /stabletoolbench/toolbench/tooleval/convert_answers.py: -------------------------------------------------------------------------------- 1 | from convert_to_answer_format import process_invalid_data,process_valid_data 2 | import json 3 | from glob import glob 4 | import os 5 | 6 | save_dir = 'path/to/save/dir' 7 | 8 | groups_dirs = ['path/to/dataset/eval/result/folders'] 9 | 10 | for groups_dir in groups_dirs: 11 | method = os.path.split(groups_dir)[1] 12 | print(method) 13 | groups_save_dir = os.path.join(save_dir,method) 14 | os.makedirs(groups_save_dir,exist_ok=True) 15 | groups = [os.path.split(g)[1] for g in glob(groups_dir+'/*')] 16 | full_answer = {} 17 | for g in groups: 18 | print(g) 19 | answer_dict = {} 20 | files = glob(os.path.join(groups_dir,g,'*.json')) 21 | for file in files: 22 | qid = os.path.split(file)[1].split('_')[0] 23 | try: 24 | data = json.load(open(file)) 25 | except: 26 | print('Read error: ',file) 27 | continue 28 | if not data['answer_generation']['valid_data']: 29 | answer_dict[qid] = process_invalid_data(method,data) 30 | else: 31 | answer_dict[qid] = process_valid_data(method,data['answer_generation']) 32 | json.dump(answer_dict,open(os.path.join(groups_save_dir,f'{g}.json'),'w')) 33 | full_answer.update(answer_dict) 34 | # json.dump(full_answer,open(os.path.join(groups_save_dir,f'fullanswer.json'),'w')) -------------------------------------------------------------------------------- /stabletoolbench/toolbench/tooleval/dataset/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yuyq18/StepTool/121a6acd11a899d947a5d3fa7269cf76624c1df3/stabletoolbench/toolbench/tooleval/dataset/__init__.py -------------------------------------------------------------------------------- /stabletoolbench/toolbench/tooleval/eval_process_reward.py: -------------------------------------------------------------------------------- 1 | from evaluators import load_registered_automatic_evaluator 2 | import os 3 | import json 4 | import csv 5 | from evaluators.registered_cls.rtl import AnswerStatus, TaskStatus, AnswerPass 6 | import random 7 | from concurrent.futures import ThreadPoolExecutor,as_completed 8 | import argparse 9 | from tqdm import tqdm 10 | import numpy as np 11 | from utils import test_sets, get_steps 12 | import backoff 13 | 14 | abs_dir = os.path.split(__file__)[0] 15 | 16 | def parse_args(): 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument('--converted_answer_path', type=str, default="", required=True, help='converted answer path') 19 | parser.add_argument('--save_path', type=str, default="", required=False, help='result save path') 20 | parser.add_argument('--reference_model', type=str, default="", required=False, 
help='model predictions path') 21 | parser.add_argument('--reference_path', type=str, default=None, required=False, help='reference path') 22 | parser.add_argument('--test_ids', type=str, default="", required=True, help='model predictions path') 23 | parser.add_argument('--task_num', type=int, default=None, required=False, help='task num') 24 | parser.add_argument('--evaluator', type=str, default="tooleval_gpt-3.5-turbo_default", required=False, help='which evaluator to use.') 25 | parser.add_argument('--max_eval_threads', type=int, default=30, required=False, help='max threads nums') 26 | parser.add_argument('--evaluate_times', type=int, default=4, required=False, help='how many times to predict with the evaluator for each solution path.') 27 | parser.add_argument('--test_set', nargs='+', default=['G1_instruction'], help='test set name') 28 | parser.add_argument('--overwrite', action='store_true', help='whether to overwrite the existing result file') 29 | return parser.parse_args() 30 | 31 | if __name__ == "__main__": 32 | args = parse_args() 33 | evaluators = [load_registered_automatic_evaluator(evaluator_name=args.evaluator, evaluators_cfg_path=os.path.join(abs_dir,'evaluators')) for _ in range(args.max_eval_threads)] 34 | 35 | @backoff.on_exception(backoff.expo, Exception, max_time=15) 36 | def compute_process_reward(query_id, example, evaluate_time): 37 | global evaluators 38 | evaluator = random.choice(evaluators) 39 | answer_steps, answer_steps_list, final_step = get_steps(example) 40 | 41 | succeed_tool_calling_list, contributions, answer_status = evaluator.evaluate_process_reward( 42 | { 43 | 'query':example['query'], 44 | 'available_tools':example['available_tools'], 45 | }, 46 | answer_steps_list[:-1], 47 | example['answer'], 48 | ) 49 | process_reward = { 50 | "succeed_tool_calling": succeed_tool_calling_list, 51 | "contributions": contributions, 52 | } 53 | return query_id, process_reward, answer_status, evaluate_time 54 | 55 | reference_model = args.reference_model 56 | output_list = [] 57 | 58 | for test_set in args.test_set: 59 | 60 | save_file = f"{args.save_path}/{test_set}.json" 61 | if args.task_num: 62 | save_file = f"{args.save_path}/{test_set}_{args.task_num}.json" 63 | 64 | reference_path = f"{args.converted_answer_path}/{test_set}.json" 65 | reference_examples = json.load(open(reference_path, "r")) 66 | if args.task_num: 67 | reference_examples = {k:reference_examples[k] for k in list(reference_examples.keys())[:args.task_num]} 68 | if os.path.exists(save_file) and not args.overwrite: 69 | old_existed_ids = list(json.load(open(save_file, "r")).keys()) 70 | old_label_cnt = json.load(open(save_file, "r")) 71 | existed_ids = [] 72 | label_cnt = {} 73 | for query_id in old_existed_ids: 74 | ans = old_label_cnt[query_id] 75 | if len(ans['process_reward'].keys()) == args.evaluate_times: 76 | existed_ids.append(query_id) 77 | label_cnt[query_id] = ans 78 | else: 79 | existed_ids = [] 80 | label_cnt = {} 81 | 82 | with ThreadPoolExecutor(args.max_eval_threads) as pool: 83 | future = [] 84 | 85 | for query_id in reference_examples: 86 | if query_id in existed_ids: 87 | continue 88 | for i in range(args.evaluate_times): 89 | example = reference_examples[query_id] 90 | future.append(pool.submit( 91 | compute_process_reward, 92 | query_id, 93 | example, 94 | evaluate_time=i 95 | )) 96 | 97 | for thd in tqdm(as_completed(future),total=len(future),ncols=100): 98 | query_id, process_reward, is_solved, evaluate_time = thd.result() 99 | example = reference_examples[query_id] 100 | 
query = example["query"] 101 | tool_names = [] 102 | for tool_dict in example["available_tools"]: 103 | tool_name = tool_dict["function"]["name"] 104 | tool_names.append(tool_name) 105 | answer_steps, answer_steps_list, final_step = get_steps(example) 106 | if query_id not in label_cnt: 107 | label_cnt[query_id] = {} 108 | label_cnt[query_id]["query"] = query 109 | label_cnt[query_id]["tool_names"] = tool_names 110 | label_cnt[query_id]["answer_steps"] = answer_steps_list[:-1] 111 | # label_cnt[query_id]["mid_steps_reward"] = mid_steps_reward # parsed 112 | if 'process_reward' not in label_cnt[query_id]: 113 | label_cnt[query_id]["process_reward"] = {} 114 | label_cnt[query_id]["process_reward"][evaluate_time] = process_reward 115 | label_cnt[query_id]["final_step"] = final_step 116 | 117 | if 'is_solved' not in label_cnt[query_id]: 118 | label_cnt[query_id]["is_solved"] = {} 119 | label_cnt[query_id]["is_solved"][evaluate_time] = str(is_solved) 120 | # print("========== Finish and Dump into json file===========", query_id, is_solved, evaluate_time) 121 | 122 | json.dump(label_cnt, open(save_file, "w"), ensure_ascii=False, indent=4) 123 | 124 | json.dump(label_cnt, open(save_file, "w"), ensure_ascii=False, indent=4) 125 | -------------------------------------------------------------------------------- /stabletoolbench/toolbench/tooleval/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | from .usereval import UserEvaluation 2 | from .methodcls import BaseToolMethod 3 | from .dataclass import ExecutionGraph,ExecutionNode,DirectedEdge -------------------------------------------------------------------------------- /stabletoolbench/toolbench/tooleval/evaluation/methodcls.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List,Callable 2 | 3 | class BaseToolMethod: 4 | def __init__(self): 5 | pass 6 | def convert_result_to_dict(self,result): 7 | '''Return Format 8 | -------- 9 | { 10 | 'method': 'method name', 11 | 'total_steps': int, 12 | 'final_answer': 'answer', 13 | 'answer_details': [{ 14 | "role": "system", 15 | "message": "", 16 | "next": [ 17 | { 18 | "role": "user", 19 | "message": "I am planning ...", 20 | "next": [ 21 | { 22 | "role": "tool", 23 | "message": "{'name': 'Finish', 'arguments': '{\\n \"return_type\": \"give_answer\",\\n \"final_answer\": \"I encountere...", 24 | "next": [] 25 | } 26 | ] 27 | } 28 | ] 29 | }] 30 | } 31 | 32 | ''' 33 | pass 34 | def forward(self,query:str,tools:List[Dict],tool_func:Callable)->Dict: 35 | pass 36 | 37 | def __call__(self,query:str,tools:List[Dict],tool_func:Callable)->Dict: 38 | result = self.forward(query,tools,tool_func) 39 | return self.convert_result_to_dict(result) 40 | 41 | -------------------------------------------------------------------------------- /stabletoolbench/toolbench/tooleval/evaluation/usereval.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from tqdm import tqdm 3 | from typing import Union, Dict, List, Optional,Tuple 4 | from .methodcls import BaseToolMethod 5 | from .dataclass import * 6 | import json 7 | 8 | class UserEvaluation: 9 | def __init__(self, 10 | method:BaseToolMethod, 11 | eval_server_addr='http://localhost:8000', 12 | evalset='eval20230718'): 13 | self.eval_server_addr = eval_server_addr 14 | self.evalset = evalset 15 | self.method = method 16 | res = requests.post(self.eval_server_addr+'/neweval',json=self.evalset) 17 | if 
res.status_code != 200: 18 | raise Exception('Failed to obtain new evaluation id! Error: '+res.text) 19 | ret = res.json() 20 | self.eval_id = ret['evaluation_id'] 21 | self.len = ret['len'] 22 | 23 | def get_new_question(self)->Tuple[str,List]: 24 | res = requests.post(self.eval_server_addr+'/next_question',json=self.eval_id) 25 | if res.status_code == 204: 26 | raise EvalCompleted() 27 | if res.status_code != 200: 28 | raise Exception('Failed to obtain new question!') 29 | 30 | self.question = Question(**res.json()) 31 | self.tool_name_to_id = {} 32 | tools = [tool.model_dump() for tool in self.question.available_tools] 33 | for tool in tools: 34 | self.tool_name_to_id[tool['name']] = tool.pop('tid') 35 | 36 | 37 | return self.question.query,tools 38 | def tool_func(self,tool_name:str,tool_args:str)->requests.Response: 39 | tid = self.tool_name_to_id[tool_name] 40 | # res = requests.post(self.eval_server_addr+'/api',json={ 41 | # 'evaluation_id':self.eval_id, 42 | # 'tool_id':tid, 43 | # 'tool_args':tool_args 44 | # }) 45 | res = requests.post(self.eval_server_addr+'/rapidapi',json={ 46 | 'evaluation_id':self.eval_id, 47 | 'tool_id':tid, 48 | 'tool_args':tool_args 49 | }) 50 | 51 | return res 52 | def _forward(self,query:str,tools:List[Dict])->Dict: 53 | method_ret = self.method(query,tools,self.tool_func) 54 | 55 | return self.question.qid,{ 56 | 'query':query, 57 | 'available_tools':tools, 58 | 'answer':method_ret 59 | } 60 | 61 | 62 | def run(self)->Dict: 63 | results = {} 64 | for _ in tqdm(range(self.len),ncols=100): 65 | try: 66 | qid,ret = self._forward(*self.get_new_question()) 67 | except EvalCompleted: 68 | return results 69 | results[qid] = ret 70 | return results 71 | -------------------------------------------------------------------------------- /stabletoolbench/toolbench/tooleval/evaluators/__init__.py: -------------------------------------------------------------------------------- 1 | from .registered_cls import BaseEvaluator,register_evaluator,get_evaluator_cls 2 | 3 | __all__=['register_evaluator','get_evaluator_cls','BaseEvaluator','load_registered_automatic_evaluator'] 4 | 5 | 6 | 7 | def load_registered_automatic_evaluator(config:dict={},evaluator_name=None,evaluators_cfg_path=None)->BaseEvaluator: 8 | import os 9 | import yaml 10 | 11 | evaluator_name = config['evaluator'] if evaluator_name is None else evaluator_name 12 | cfg_path = config['evaluators_cfg_path'] if evaluators_cfg_path is None else evaluators_cfg_path 13 | cfg_path = os.path.join(cfg_path,evaluator_name) 14 | 15 | cls_name = yaml.load(open(os.path.join(cfg_path,'config.yaml')),Loader=yaml.FullLoader)['registered_cls_name'] 16 | 17 | evaluator:BaseEvaluator = get_evaluator_cls(cls_name)(cfg_path) 18 | return evaluator -------------------------------------------------------------------------------- /stabletoolbench/toolbench/tooleval/evaluators/registered_cls/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import BaseEvaluator 2 | from .utils import register_evaluator,get_evaluator_cls 3 | 4 | __all__ = ['register_evaluator','get_evaluator_cls','BaseEvaluator'] 5 | 6 | import os 7 | import importlib 8 | current_dir = os.path.dirname(__file__) 9 | 10 | for item in os.listdir(current_dir): 11 | item_path = os.path.join(current_dir, item) 12 | 13 | if os.path.isfile(item_path) and item != '__init__.py' and item.endswith('.py'): 14 | module_name = item[:-3] 15 | 16 | full_module_path = f"{__name__}.{module_name}" 17 | 18 | imported_module = 
importlib.import_module(full_module_path) 19 | 20 | globals()[module_name] = imported_module 21 | -------------------------------------------------------------------------------- /stabletoolbench/toolbench/tooleval/evaluators/registered_cls/base.py: -------------------------------------------------------------------------------- 1 | import random 2 | from typing import List, Union, Dict, Any, Callable 3 | import os 4 | import yaml 5 | from .utils import register_evaluator 6 | 7 | def process_answer(answer: Dict): 8 | answer['final_answer'] = answer['final_answer'][:1000] 9 | answer['answer_details'] = answer['answer_details'][:3000] 10 | # breakpoint() 11 | answer.pop('method', None) 12 | return answer 13 | 14 | 15 | def process_tools(tools: List[Dict]): 16 | for tool in tools: 17 | tool.pop('description', None) 18 | tool.pop('parameters', None) 19 | return tools 20 | 21 | @register_evaluator 22 | class BaseEvaluator: 23 | """Base class for evaluators. 24 | 25 | Attributes: 26 | ---------- 27 | fn_completions : Callable[[Dict,List[Dict]],int] 28 | The completion function of the evaluator, used to get annotated results. 29 | This function should take two arguments: `task_description`:Dict and `answers`:List[Dict], return a int stand for the index of best answer. 30 | 31 | Functions: 32 | --------- 33 | annotate_preference : Callable 34 | Annotate and return the index of the preferred answer. 35 | 36 | """ 37 | def __init__(self, 38 | fn_completions: Callable[[Dict,List[Dict]],int] = None, 39 | *args, 40 | **kwargs): 41 | self.fn_completions = fn_completions 42 | def annotate_preference(self, 43 | query: str, 44 | available_tools: List[Dict[Any, Any]], 45 | answers:List[Dict], 46 | multisample=False, 47 | sample_n=4, 48 | task_status=None, 49 | answer_statuss=[None, None]) -> Union[List[int], int]: 50 | """Annotate and return the index of the preferred answer. 51 | 52 | For given query, available tools, and two answers, return the index of the preferred answer by calling function `fn_completions` of the evaluator. 53 | 54 | Parameters: 55 | ---------- 56 | query : str 57 | The query of the task. 58 | available_tools : List[Dict[Any, Any]] 59 | The list of available tools for the task. The specific format of the tool is defined in `tooleval/evaluation/dataclass.py` 60 | answers : List[Dict] 61 | The list of answers for comparison. 62 | multisample : bool, optional 63 | Whether to use multisample to get the preference. If True, the function will return a list of preferences, otherwise return a single preference. 64 | sample_n : int, optional 65 | The number of samples to get the preference. 66 | 67 | Returns: 68 | ------- 69 | preference : Union[List[int], int] 70 | The index of the preferred answer. If `multisample` is True, return a list of preferences, otherwise return a single preference. 
71 | 72 | Raise: 73 | ----- 74 | 75 | """ 76 | answers_processed = [process_answer(ans) for ans in answers] 77 | available_tools = process_tools(available_tools) 78 | 79 | def shuffle_run() -> int: 80 | indexs = list(range(len(answers_processed))) 81 | random.shuffle(indexs) 82 | 83 | answers_projected = [answers_processed[idx] for idx in indexs] 84 | # breakpoint() 85 | preferred_index = self.fn_completions( 86 | { 87 | 'query':query, 88 | 'available_tools':available_tools, 89 | }, 90 | answers_projected, 91 | task_status, 92 | answer_statuss 93 | ) 94 | if preferred_index in indexs: 95 | return indexs.index(preferred_index) 96 | raise ValueError(f'Preferred index {preferred_index} is invalid!') 97 | 98 | if not multisample: 99 | return shuffle_run() 100 | else: 101 | prefers = [shuffle_run() for _ in range(sample_n)] 102 | return prefers 103 | 104 | @register_evaluator 105 | class ToolEvalEvaluator(BaseEvaluator): 106 | """ToolEval common evaluator class. 107 | 108 | Attributes: 109 | ---------- 110 | cfg_path : str 111 | A path store the configuration of the evaluator. 112 | 113 | 114 | """ 115 | def __init__(self, 116 | cfg_path: str = None, 117 | ): 118 | eval_config = yaml.load(open(os.path.join(cfg_path,'config.yaml')),Loader=yaml.FullLoader) 119 | template = open(os.path.join(cfg_path,eval_config['prompt_template'])).read() 120 | 121 | super().__init__( 122 | fn_completions=getattr(self,eval_config['fn_completions']) 123 | ) 124 | self.eval_config = eval_config 125 | self.template = template -------------------------------------------------------------------------------- /stabletoolbench/toolbench/tooleval/evaluators/registered_cls/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | from typing import List,Dict 4 | import requests 5 | from tenacity import retry, wait_random_exponential, stop_after_attempt 6 | 7 | from openai import OpenAI 8 | import random 9 | 10 | __registered_evaluators__ = {} 11 | 12 | def register_evaluator(cls): 13 | """ 14 | Decorator function to register classes with the registered_evaluators list. 15 | """ 16 | __registered_evaluators__[cls.__name__] = cls 17 | return cls 18 | 19 | def get_evaluator_cls(clsname): 20 | """ 21 | Return the evaluator class with the given name. 
22 | """ 23 | try: 24 | return __registered_evaluators__.get(clsname) 25 | except: 26 | raise ModuleNotFoundError('Cannot find evaluator class {}'.format(clsname)) 27 | 28 | 29 | class OpenaiPoolRequest: 30 | def __init__(self, pool_json_file=None): 31 | self.pool:List[Dict] = [] 32 | __pool_file = pool_json_file 33 | if os.environ.get('API_POOL_FILE',None) is not None: 34 | __pool_file = os.environ.get('API_POOL_FILE') 35 | self.now_pos = random.randint(-1, len(self.pool)) 36 | if os.path.exists(__pool_file): 37 | self.pool = json.load(open(__pool_file)) 38 | self.now_pos = random.randint(-1, len(self.pool)) 39 | # print(__pool_file) 40 | if os.environ.get('OPENAI_KEY',None) is not None: 41 | self.pool.append({ 42 | 'api_key':os.environ.get('OPENAI_KEY'), 43 | 'api_base':os.environ.get('OPENAI_API_BASE',None), 44 | 'organization':os.environ.get('OPENAI_ORG',None), 45 | 'api_type':os.environ.get('OPENAI_TYPE',None), 46 | 'api_version':os.environ.get('OPENAI_VER',None) 47 | }) 48 | 49 | # @retry(wait=wait_random_exponential(multiplier=1, max=30), stop=stop_after_attempt(10),reraise=True) 50 | def request(self,messages,**kwargs): 51 | self.now_pos = (self.now_pos + 1) % len(self.pool) 52 | key_pos = self.now_pos 53 | item = self.pool[key_pos] 54 | # print(len(self.pool)) 55 | api_key = item['api_key'] 56 | api_base = item.get('api_base', None) 57 | client = OpenAI(api_key=api_key,base_url=api_base) 58 | response = client.chat.completions.create(messages=messages,**kwargs) 59 | return response 60 | 61 | def __call__(self,messages,**kwargs): 62 | return self.request(messages,**kwargs) 63 | -------------------------------------------------------------------------------- /stabletoolbench/toolbench/tooleval/evaluators/tooleval_gpt-3.5-turbo_default/config.yaml: -------------------------------------------------------------------------------- 1 | evaluator_name: "tooleval_gpt-3.5-turbo_default" 2 | registered_cls_name: "ReinforceToolLearningEvaluator" 3 | prompt_template: "template.txt" 4 | fn_completions: "normalized_openai_completions" 5 | apis_json: "your/path/to/api_pool.json" 6 | completions_kwargs: 7 | model: "gpt-3.5-turbo-16k" 8 | max_tokens: 1000 9 | temperature: 0 10 | timeout: 10 11 | functions: 12 | - name: "evaluate_process_reward" 13 | description: "Evaluate the entire task-solving process, including tool calls, the contribution of each intermediate step to the final answer, and the status of the final answer." 14 | parameters: 15 | type: "object" 16 | properties: 17 | succeed_tool_calling: 18 | type: "array" 19 | description: "Provide a binary score (0 or 1) indicating whether **each intermediate step** successfully called the tool." 20 | items: 21 | type: "number" 22 | description: "0 for unsuccessful tool calls, 1 for successful tool calls" 23 | contribution_to_final_answer: 24 | type: "array" 25 | description: "Provide a score (0 to 5) to assess how much **each intermediate step** contributed to the final answer." 26 | items: 27 | type: "number" 28 | description: "0 indicates no contribution, and 5 indicates maximum contribution." 29 | final_answer_status: 30 | type: "string" 31 | enum: ["Unsure", "Unsolved", "Solved"] 32 | description: "Indicate the status of the final answer. Choose from: 'Unsure', 'Unsolved', or 'Solved'." 
33 | required: ["succeed_tool_calling", "contribution_to_final_answer", "final_answer_status"] 34 | 35 | - name: "check_answer_status" 36 | description: "Parse the json answer with layerd nodes and return the answer_status about the answer" 37 | parameters: 38 | type: "object" 39 | properties: 40 | answer_status: 41 | type: "string" 42 | enum: ["Unsure","Unsolved","Solved"] 43 | required: ["answer_status"] 44 | - name: "parse_answer_status" 45 | description: "Parse the json answer with layerd nodes and return the answer_status about the answer" 46 | parameters: 47 | type: "object" 48 | properties: 49 | answer_status: 50 | type: "string" 51 | enum: ["Unsure","Unsolved","Solved"] 52 | required: ["answer_status"] 53 | - name: "check_task_solvable" 54 | description: "Parse the task description and return the task_status about the task" 55 | parameters: 56 | type: "object" 57 | properties: 58 | task_status: 59 | type: "string" 60 | enum: ["Unsure","Unsolvable","Solvable"] 61 | required: ["task_status"] 62 | - name: "select_better_answer" 63 | description: "Select the better answer with a comprehensive investigation on given aspects. You should ignore the impact of the order of candidate answers." 64 | parameters: 65 | type: "object" 66 | properties: 67 | index: 68 | type: "number" 69 | description: "The `index` value in the selected better answer." 70 | required: ["index"] 71 | fn_completion_parser: "index_parser" 72 | batch_size: 1 73 | -------------------------------------------------------------------------------- /stabletoolbench/toolbench/tooleval/evaluators/tooleval_gpt-3.5-turbo_default/template.txt: -------------------------------------------------------------------------------- 1 | 2 | evaluate_process_reward 3 | 4 | Query: 5 | {query} 6 | 7 | Intermediate Steps: 8 | {mid_steps} 9 | 10 | Final Answer: 11 | {final_answer} 12 | 13 | Based on the query, intermediate steps, and final answer, evaluate the entire task-solving process using the following criteria: 14 | 15 | 1. **Successful Tool Calling**: For each intermediate step, indicate whether a tool was successfully called, with a score of 0 (no) or 1 (yes). 16 | 2. **Contribution to Final Answer**: Rate the contribution of each intermediate step to the final answer on a scale of 0 to 5. 17 | 3. **Final Answer Status**: Determine the final answer status as 'Solved', 'Unsure', or 'Unsolved'. 18 | 19 | Please call the `evaluate_process_reward` function to return your evaluation. 20 | 21 | 22 | 23 | 24 | 25 | check_answer_status 26 | 27 | Giving the query and answer, you need give `answer_status` of the answer by following rules: 28 | 1. If the answer is a sorry message or not a positive/straight response for the given query, return "Unsolved". 29 | 2. If the answer is a positive/straight response for the given query, you have to further check. 30 | 2.1 If the answer is not sufficient to determine whether the solve the query or not, return "Unsure". 31 | 2.2 If you are confident that the answer is sufficient to determine whether the solve the query or not, return "Solved" or "Unsolved". 32 | 33 | Query: 34 | {query} 35 | Answer: 36 | {answer} 37 | 38 | Now give your reason in "content" and `answer_status` of JSON to `check_answer_status`. 39 | 40 | 41 | 42 | 43 | parse_answer_status 44 | 45 | Giving the query and the correspond execution detail of an answer, you need give `answer_status` of the answer by following rules: 46 | 1. If all 'tool' nodes' message indicate that there are errors happened, return "Unsolved" 47 | 2. 
If you find the information in the "final_answer" is not true/valid according to the messages in 'tool' nodes, return "Unsolved" 48 | 3. If you are unable to verify the authenticity and validity of the information, return "Unsure" 49 | 4. If there are 'tool' node in the chain contains successful func calling and those calling indeed solve the query, return "Solved" 50 | 51 | Query: 52 | {query} 53 | Answer: 54 | {answer} 55 | 56 | Now you are requested to give reason in "content" and `answer_status` of JSON to `parse_answer_status`. 57 | 58 | 59 | 60 | 61 | check_task_solvable 62 | 63 | Please check whether the given task solvable with following rules: 64 | 1. If the `query` provide invalid information (e.g. invalid email address or phone number), return "Unsolvable" 65 | 2. If the `query` needs more information to solve (e.g. the target restaurant name in a navigation task), return "Unsolvable" 66 | 3. If you are unable to draw a conclusion, return "Unsure" 67 | 4. If the currently `available_tools` are enough to solve the query, return "Solvable" 68 | 69 | Task: 70 | {task} 71 | 72 | Now give your reason in "content" and `task_status` of JSON to `check_task_solvable`. 73 | 74 | 75 | 76 | 77 | 78 | 79 | select_better_answer 80 | 81 | Query: 82 | {query} 83 | 84 | Answer_0: 85 | {answer_0} 86 | 87 | Answer_1: 88 | {answer_1} 89 | 90 | Given above query and answers in JSON format, you must follow the rules to select the relatively better answer and give the index of the answer **(0 for Answer_0, 1 for Answer_1)**: 91 | 1. Compare the value of "final_answer" in following aspects: 92 | - Informative: whether it contains all necessary information to reply to the query. 93 | - Factuality: whether it accurately describes what has been done, and what failed in the end. 94 | - Reasoning: If answer does not solve the query, whether gives a detailed and accurate reason for failure. 95 | 2. If you cannot determine yet, compare the value of "answer_details" in following aspects: 96 | - Tool calling costs: calculating the percentage of failed and replicated tools calling. 97 | - Running costs: calculating the total tokens T used in execution. 98 | - Milestone: calculating the milestone(fixed subtasks) reached in execution. 99 | - Exploration: whether tries potential useful tools in execution. Just count times of successful tool calling with different tools/arguments in execution. 100 | 101 | If you have made your decision, calling `select_better_answer`, else if you cannot determine, select a random answer. 102 | 103 | -------------------------------------------------------------------------------- /stabletoolbench/toolbench/tooleval/evaluators/tooleval_gpt-3.5-turbo_fn/config.yaml: -------------------------------------------------------------------------------- 1 | evaluator_name: "tooleval_gpt-3.5-turbo_fn" 2 | registered_cls_name: "OpenAIEvaluator" 3 | prompt_template: "template.txt" 4 | fn_completions: "openai_completions" 5 | apis_json: "your/path/to/api_pool.json" 6 | completions_kwargs: 7 | model: "gpt-3.5-turbo-16k" 8 | max_tokens: 100 9 | temperature: 0 10 | timeout: 10 11 | function_call: 12 | name: "choose_preference" 13 | functions: 14 | - name: "choose_preference" 15 | description: "Choose the preferred answer for the query within all given answers." 16 | parameters: 17 | type: "object" 18 | properties: 19 | preference: 20 | type: "number" 21 | description: "The index of the preferred answer in all given answers." 
22 | required: [ "preference" ] 23 | fn_completion_parser: "index_parser" 24 | batch_size: 1 25 | -------------------------------------------------------------------------------- /stabletoolbench/toolbench/tooleval/evaluators/tooleval_gpt-3.5-turbo_fn/template.txt: -------------------------------------------------------------------------------- 1 | 2 | system 3 | You are a helpful annotator, that help user to annotate data. 4 | 5 | 6 | user 7 | Giving task description and candidate answers, I want you to choose one preferred answer based on the rules. To do so, I will give you the task description that given to the models, and the candidate answers in a list for chosen. To choose the one preferred answer, you need to first analyse answers based on rules, then give the index number of the preferred answer of JSON to `choose_preference`. 8 | 9 | Here are the preference rules: 10 | 1. if both answers give the none empty `final_answer`, check whether the given `final_answer` solves the given query. 11 | 1.1 if both answers solve the query, choose one with smaller `total_steps`. 12 | 1.1.1 if `total_steps` are same, choose one answer with better `final_answer` quality. 13 | 1.2 if one answer solve while the other not, chose the answer that solve query. 14 | 1.3 if both answers failed, check the `answer_details` to choose one with considering following preference: 15 | 1.3.1 check `response` and prefer more successful tool calling. 16 | 1.3.2 check `name` and prefer using more various tool usage. 17 | 1.3.3 prefer smaller `total_steps`. 18 | 2. if one give none empty `final_answer` while other not, choose the one give `final_answer`. 19 | 3. if both failed to give none empty `final_answer`, following 1.3 to choose one with better `answer_details`. 20 | 21 | Here is the task description in JSON format: 22 | {task_description} 23 | 24 | Here are the candidate answers in JSON format: 25 | {answers} 26 | 27 | Now choose the preferred answer by analysing results and the rules given, return the index in range [0,1]. 
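The config.yaml and template.txt above define the `tooleval_gpt-3.5-turbo_fn` evaluator: the config forces the annotator model to answer through the `choose_preference` function, and the template carries the comparison rules. As a rough illustration of that round trip (a hedged sketch, not the repository's `OpenAIEvaluator` or `index_parser` code, which is not reproduced in this listing), assuming `OPENAI_API_KEY` is set and `filled_prompt` is the template with `{task_description}` and `{answers}` already substituted:

import json
from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment

def annotate_once(filled_prompt: str) -> int:
    """Ask gpt-3.5-turbo-16k to pick an answer via `choose_preference`
    and return the index it chose (0 for Answer_0, 1 for Answer_1)."""
    response = client.chat.completions.create(
        model="gpt-3.5-turbo-16k",   # same model as completions_kwargs above
        temperature=0,
        max_tokens=100,
        messages=[{"role": "user", "content": filled_prompt}],
        functions=[{
            "name": "choose_preference",
            "description": "Choose the preferred answer for the query "
                           "within all given answers.",
            "parameters": {
                "type": "object",
                "properties": {"preference": {"type": "number"}},
                "required": ["preference"],
            },
        }],
        function_call={"name": "choose_preference"},  # force the function call
    )
    arguments = response.choices[0].message.function_call.arguments
    return int(json.loads(arguments)["preference"])

The real evaluator also splits the template into system and user messages and retries on parse failures; the sketch collapses that into a single user message for brevity.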
28 | -------------------------------------------------------------------------------- /stabletoolbench/toolbench/tooleval/evaluators/tooleval_gpt-3.5-turbo_normalized/config.yaml: -------------------------------------------------------------------------------- 1 | evaluator_name: "tooleval_gpt-3.5-turbo_normalized" 2 | registered_cls_name: "OpenAINormalizedEvaluator" 3 | prompt_template: "template.txt" 4 | fn_completions: "normalized_openai_completions" 5 | apis_json: "your/path/to/api_pool.json" 6 | completions_kwargs: 7 | model: "gpt-3.5-turbo-16k" 8 | max_tokens: 100 9 | temperature: 0 10 | timeout: 10 11 | functions: 12 | - name: "parse_answer_details" 13 | description: "Parse the json answer with layerd nodes and return the informations about the answer" 14 | parameters: 15 | type: "object" 16 | properties: 17 | succeed_tool_calling: 18 | type: "number" 19 | description: "Give the number of times that the 'tool' nodes' message is called successfully without any errors in the response" 20 | used_tool_types: 21 | type: "number" 22 | description: "Give the number of different 'name' in 'tool' nodes' message" 23 | required: [ "succeed_tool_calling", "used_tool_types"] 24 | - name: "select_best_final_answer" 25 | description: "For given query, select the best answer in answers list and return the index of the best answer" 26 | parameters: 27 | type: "object" 28 | properties: 29 | best_answer_index: 30 | type: "number" 31 | description: "The index of the best answer in the answer list, start from 0" 32 | required: [ "best_answer_index"] 33 | - name: "check_solve_query" 34 | description: "Check whether the given answer solve the given query, return true or false" 35 | parameters: 36 | type: "object" 37 | properties: 38 | is_solved: 39 | type: "boolean" 40 | description: "true if solved and false if not" 41 | required: ["is_solved"] 42 | fn_completion_parser: "index_parser" 43 | batch_size: 1 44 | -------------------------------------------------------------------------------- /stabletoolbench/toolbench/tooleval/evaluators/tooleval_gpt-3.5-turbo_normalized/template.txt: -------------------------------------------------------------------------------- 1 | 2 | parse_answer_details 3 | 4 | Giving answer details in the following JSON format: 5 | {answer_details} 6 | 7 | I want you to parse the answer details and give the information of JSON to `parse_answer_details`. Now parse the answer. 8 | 9 | 10 | 11 | select_best_final_answer 12 | 13 | For query {query}, you have the following answers in JSON format: 14 | {final_answers} 15 | 16 | I want you to select the best answer from the above answers and give the index of the answer of JSON to `select_best_final_answer`. Now select the best answer. 17 | 18 | 19 | 20 | check_solve_query 21 | 22 | Please check whether the answer solve the query or not. 23 | Query: 24 | {query} 25 | 26 | Answer: 27 | {final_answer} 28 | 29 | Now give your judgment of JSON to `check_solve_query`, remember do not be too strict. 
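The `{answer_details}`, `{final_answers}` and `{final_answer}` placeholders in this template receive answers that `BaseEvaluator.annotate_preference` (in `registered_cls/base.py`, shown earlier in this listing) has already passed through `process_answer()`. A short, self-contained usage example of that pre-processing step; the answer dict is made up, and the import assumes the snippet is run from the `tooleval/` directory with requirements.txt installed:

# Run from stabletoolbench/toolbench/tooleval/ so the package imports resolve.
from evaluators.registered_cls.base import process_answer

example_answer = {
    "method": "steptool_cot",          # process_answer() removes this field
    "total_steps": 3,
    "final_answer": "x" * 5000,        # truncated to 1000 characters
    "answer_details": "[{'role': 'system', 'message': '', 'next': []}]",
}

processed = process_answer(example_answer)
assert "method" not in processed
assert len(processed["final_answer"]) == 1000
print(processed.keys())  # dict_keys(['total_steps', 'final_answer', 'answer_details'])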
30 | 31 | -------------------------------------------------------------------------------- /stabletoolbench/toolbench/tooleval/evaluators_comparison.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import json 3 | from concurrent.futures import ThreadPoolExecutor,as_completed 4 | from tqdm import tqdm 5 | from evaluators import load_registered_automatic_evaluator 6 | import os 7 | import numpy as np 8 | import copy 9 | from typing import List 10 | from scipy.stats import pearsonr,spearmanr 11 | import random 12 | random.seed(42) 13 | 14 | abs_dir = os.path.split(__file__)[0] 15 | annotated_data = json.load(open(os.path.join(abs_dir,'dataset/human_cross_annotated_data.json'))) 16 | NUM_WORKERS=16 17 | 18 | def get_most_preferred(d:list)->np.ndarray: 19 | if np.iterable(d): 20 | d = np.asanyarray(d) 21 | bins = np.bincount(d) 22 | max_val = np.max(bins) 23 | argmax = np.where(max_val==bins)[0] 24 | return argmax 25 | else: 26 | return np.asarray([d]) 27 | 28 | def agreement_score(x,ref:list)->float: 29 | majority_x = get_most_preferred(x) 30 | majority_ref = get_most_preferred(ref) 31 | score_unit = 1/len(majority_x)/len(majority_ref) 32 | score = 0.0 33 | for x in majority_x: 34 | if x in majority_ref: 35 | score += score_unit 36 | return score 37 | def get_correlation(x,y): 38 | x= np.asarray(x) 39 | y = np.asarray(y) 40 | x = x+1 41 | y = y+1 42 | if np.var(x)==0 or np.var(y)==0: 43 | return float(random.choice(get_most_preferred(x))==random.choice(get_most_preferred(y))) 44 | return pearsonr(x,y)[0] 45 | 46 | def test_on_annotated_data(evaluator_cfg)->List[List[int]]: 47 | evaluators = [load_registered_automatic_evaluator(evaluator_cfg) for _ in range(NUM_WORKERS)] 48 | def get_preference(idx): 49 | data = annotated_data[idx] 50 | def process_tools(tools:list): 51 | for tool in tools: 52 | tool.pop('description',None) 53 | tool.pop('parameters',None) 54 | return tools 55 | 56 | tools = process_tools(data['available_tools']) 57 | ret = evaluators[idx%NUM_WORKERS].annotate_preference( 58 | data['query'], 59 | tools, 60 | data['answers'],multisample=True) 61 | return idx,ret 62 | prefer_dict = {} 63 | with ThreadPoolExecutor(NUM_WORKERS) as pool: 64 | # future = [pool.submit(get_preference,idx) for idx in range(100)] 65 | future = [pool.submit(get_preference,idx) for idx in range(len(annotated_data))] 66 | for thd in tqdm(as_completed(future),total=len(future),ncols=100): 67 | if thd.exception() is not None: 68 | pool.shutdown(cancel_futures=True) 69 | raise thd.exception() 70 | exit(-1) 71 | idx,preference = thd.result() 72 | prefer_dict[idx] = preference 73 | prefer = [prefer_dict[idx] for idx in range(len(future))] 74 | return prefer 75 | 76 | def get_popped_and_rest(d:list,index:int): 77 | l = copy.deepcopy(d) 78 | popped = l.pop(index) 79 | return popped,l 80 | 81 | def calculate_human_performance(): 82 | human_agreement = [] 83 | variance = [] 84 | for data in annotated_data: 85 | agreement_scores = [ 86 | agreement_score(*get_popped_and_rest(data['preference'],idx)) 87 | for idx in range(len(data['preference'])) 88 | ] 89 | human_agreement.append(np.mean(agreement_scores)) 90 | variance.append(np.var([1-agreement_scores[idx] for idx in range(len(agreement_scores))])) 91 | 92 | 93 | return { 94 | 'human_agreement':np.mean(human_agreement), 95 | 'bias':0, 96 | 'variance':np.mean(variance) 97 | } 98 | 99 | 100 | 101 | def calculate_evaluator_performance(evaluator_preference,human_preference): 102 | human_agreement = [] 103 
| bias = [] 104 | variance = [] 105 | assert len(evaluator_preference)==len(human_preference),'length of evaluator_preference and human_preference should be the same!' 106 | correlation = [] 107 | for idx in range(len(evaluator_preference)): 108 | human_pref = human_preference[idx] 109 | evaluator_pref = evaluator_preference[idx] 110 | 111 | human_agreement.append([ 112 | agreement_score(pref,human_pref) for pref in evaluator_pref 113 | ]) 114 | bias.append( 115 | 1 - agreement_score(human_pref,evaluator_pref) 116 | ) 117 | variance.append( 118 | np.var([1-score for score in human_agreement[-1]]) 119 | ) 120 | correlation.append(get_correlation(human_pref,evaluator_pref)) 121 | 122 | return{ 123 | 'correlation': np.mean(correlation), 124 | 'human_agreement':np.mean(np.mean(human_agreement,axis=1)), 125 | 'bias':np.mean(bias), 126 | 'variance':np.mean(variance) 127 | } 128 | 129 | if __name__=='__main__': 130 | evaluators = ['tooleval_gpt-3.5-turbo_normalized',] 131 | human_perference = [ 132 | data['preference'] for data in annotated_data 133 | ] 134 | 135 | evaluator_performance = [calculate_human_performance()] 136 | for evaluator in evaluators: 137 | if not os.path.exists(os.path.join(abs_dir,'dataset',f'performance_{evaluator}.npy')): 138 | evaluator_cfg = { 139 | 'evaluators_cfg_path':os.path.join(abs_dir,'evaluators'), 140 | 'evaluator':evaluator 141 | } 142 | evaluator_perference = test_on_annotated_data(evaluator_cfg) 143 | np.save(os.path.join(abs_dir,'dataset',f'performance_{evaluator}.npy'),evaluator_perference) 144 | 145 | evaluator_perference = np.load(os.path.join(abs_dir,'dataset',f'performance_{evaluator}.npy'),allow_pickle=True) 146 | performance = calculate_evaluator_performance(evaluator_perference,human_perference) 147 | print(performance) 148 | evaluator_performance.append(performance) 149 | 150 | df = pd.DataFrame(evaluator_performance,index=['human']+evaluators) 151 | df.to_csv(os.path.join(abs_dir,'dataset','evaluator_performance.csv')) 152 | print(df) -------------------------------------------------------------------------------- /stabletoolbench/toolbench/tooleval/requirements.txt: -------------------------------------------------------------------------------- 1 | tqdm 2 | numpy 3 | pandas 4 | pydantic 5 | tenacity 6 | openai 7 | pyyaml -------------------------------------------------------------------------------- /stabletoolbench/toolbench/tooleval/results/default_evalset/DFS/win.csv: -------------------------------------------------------------------------------- 1 | ,Method,Win Rate,Std Error 2 | 0,DFS,, 3 | -------------------------------------------------------------------------------- /stabletoolbench/toolbench/tooleval/results/leaderboard###default_evalset###tooleval_gpt-3.5-turbo_normalized###ChatGPT-DFSDT.csv: -------------------------------------------------------------------------------- 1 | Method,WinRate,G1_instruction_WinRate,G1_tool_WinRate,G1_category_WinRate,G2_instruction_WinRate,G2_category_WinRate,G3_instruction_WinRate 2 | GPT4-DFSDT,70.4,60,71.5,67,79.5,77.5,71 3 | GPT4-ReACT,64.4,53.5,50,53.5,67,72,47 4 | ChatGPT-DFSDT,64.3,54.5,65,60.5,75,71.5,62 5 | ToolLLaMA-DFSDT-Retriever,63.1,64,64,60.5,81.5,68.5,65 6 | ToolLLaMA-DFSDT,60,57,61,62,77,77,66 7 | ChatGPT-ReACT,50,41.5,44,44.5,42.5,46.5,22 8 | Text-Davinci-003-DFSDT,46.3,43.5,44,46,37,42,46 9 | Claude-2-DFSDT,43.5,20.5,31,18.5,17,20.5,28 10 | Claude-2-ReACT,34.4,5.5,3.5,5.5,6,6,14 11 | Text-Davinci-003-ReACT,33.2,12,20,20,8.5,14.5,24 
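The `win.csv` files under `results/` hold the Method, Win Rate and Std Error columns written by `automatic_eval_sample.py` (earlier in this listing); the two summary columns are simply the mean of the per-query preference votes and the standard error of that mean. A minimal re-computation with made-up votes:

import numpy as np
import pandas as pd

# Hypothetical per-query outcomes: 1 = the evaluated method was preferred
# over the reference answer, 0 = the reference was preferred.
prefer = np.array([1, 0, 1, 1, 0, 1, 1, 1])

df = pd.DataFrame.from_dict([{
    "Method": "my_method",                               # made-up name
    "Win Rate": prefer.mean(),                           # 0.75 for these votes
    "Std Error": np.std(prefer) / np.sqrt(len(prefer)),  # about 0.153
}])
print(df)  # same three columns as win.csv above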
-------------------------------------------------------------------------------- /stabletoolbench/toolbench/tooleval/results/leaderboard###default_evalset###tooleval_gpt-3.5-turbo_normalized###gpt-3.5-turbo_CoT.csv: -------------------------------------------------------------------------------- 1 | Method,WinRate,StdError,G1_tool_WinRate,G2_instruction_WinRate,G1_category_WinRate,G1_instruction_WinRate,G2_category_WinRate,G3_instruction_WinRate,G1_tool_StdError,G2_instruction_StdError,G1_category_StdError,G1_instruction_StdError,G2_category_StdError,G3_instruction_StdError 2 | llama-65B-finetuned-5k_CoT,0.675,0.0191213231759729,0.55,0.74,0.55,0.67,0.8,0.74,0.049749371855331,0.0438634243989226,0.049749371855331,0.0470212717820349,0.04,0.0438634243989226 3 | llama-65B-finetuned-1k_CoT,0.666110183639399,0.0192690903060015,0.49,0.696969696969697,0.53,0.66,0.86,0.76,0.0499899989997999,0.0461883428464987,0.0499099188538711,0.047370877129308,0.0346987031457949,0.0427083130081252 4 | llama-65B-finetuned-300_CoT,0.5383333333333333,0.0203523362932267,0.41,0.66,0.43,0.51,0.65,0.57,0.0491833305094317,0.047370877129308,0.0495075751779462,0.0499899989997999,0.0476969600708472,0.0495075751779462 5 | gpt-3.5-turbo_CoT,0.5,0.0,0.5,0.5,0.5,0.5,0.5,0.5,0.0,0.0,0.0,0.0,0.0,0.0 6 | -------------------------------------------------------------------------------- /stabletoolbench/toolbench/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | import torch 4 | import transformers 5 | import transformers.models.llama.modeling_llama 6 | from functools import partial 7 | 8 | 9 | def process_system_message(system_message, functions): 10 | assert "with a function call to actually excute your step." in system_message 11 | # we find that following ReACT format and merging the thought node and function call node is easier for model to learn to integrate the action input json string in its prediction than learn to predict a json string directly. 12 | system_message = system_message.replace("with a function call to actually excute your step.", "with a function call to actually excute your step. Your output should follow this format:\nThought:\nAction\nAction Input:\n") 13 | # add all the function dicts in the prompt. 
14 | system_message = system_message + "\nSpecifically, you have access to the following APIs: " + str(functions) 15 | return system_message 16 | 17 | def get_gpu_memory(max_gpus=None): 18 | """Get available memory for each GPU.""" 19 | gpu_memory = [] 20 | num_gpus = ( 21 | torch.cuda.device_count() 22 | if max_gpus is None 23 | else min(max_gpus, torch.cuda.device_count()) 24 | ) 25 | 26 | for gpu_id in range(num_gpus): 27 | with torch.cuda.device(gpu_id): 28 | device = torch.cuda.current_device() 29 | gpu_properties = torch.cuda.get_device_properties(device) 30 | total_memory = gpu_properties.total_memory / (1024**3) 31 | allocated_memory = torch.cuda.memory_allocated() / (1024**3) 32 | available_memory = total_memory - allocated_memory 33 | gpu_memory.append(available_memory) 34 | return gpu_memory 35 | 36 | 37 | def standardize_category(category): 38 | save_category = category.replace(" ", "_").replace(",", "_").replace("/", "_") 39 | while " " in save_category or "," in save_category: 40 | save_category = save_category.replace(" ", "_").replace(",", "_") 41 | save_category = save_category.replace("__", "_") 42 | return save_category 43 | 44 | def standardize(string): 45 | res = re.compile("[^\\u4e00-\\u9fa5^a-z^A-Z^0-9^_]") 46 | string = res.sub("_", string) 47 | string = re.sub(r"(_)\1+","_", string).lower() 48 | while True: 49 | if len(string) == 0: 50 | return string 51 | if string[0] == "_": 52 | string = string[1:] 53 | else: 54 | break 55 | while True: 56 | if len(string) == 0: 57 | return string 58 | if string[-1] == "_": 59 | string = string[:-1] 60 | else: 61 | break 62 | if string[0].isdigit(): 63 | string = "get_" + string 64 | return string 65 | 66 | def change_name(name): 67 | change_list = ["from", "class", "return", "false", "true", "id", "and"] 68 | if name in change_list: 69 | name = "is_" + name 70 | return name 71 | 72 | # code adapted from https://huggingface.co/kaiokendev/superhot-13b-8k-no-rlhf-test/blob/main/llama_rope_scaled_monkey_patch.py 73 | class CondenseRotaryEmbedding(torch.nn.Module): 74 | def __init__(self, dim, ratio, max_position_embeddings=2048, base=10000, device=None): 75 | super().__init__() 76 | inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float().to(device) / dim)) 77 | self.register_buffer("inv_freq", inv_freq) 78 | 79 | # Build here to make `torch.jit.trace` work. 80 | self.ratio = ratio 81 | max_position_embeddings *= ratio 82 | print(f"Condensing Positional embeddings from {max_position_embeddings} to {max_position_embeddings // ratio}") 83 | self.max_seq_len_cached = max_position_embeddings 84 | t = torch.arange(self.max_seq_len_cached, device=self.inv_freq.device, dtype=self.inv_freq.dtype) / ratio 85 | freqs = torch.einsum("i,j->ij", t, self.inv_freq) 86 | # Different from paper, but it uses a different permutation in order to obtain the same calculation 87 | emb = torch.cat((freqs, freqs), dim=-1) 88 | dtype = torch.get_default_dtype() 89 | self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False) 90 | self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False) 91 | 92 | def forward(self, x, seq_len=None): 93 | # x: [bs, num_attention_heads, seq_len, head_size] 94 | # This `if` block is unlikely to be run after we build sin/cos in `__init__`. Keep the logic here just in case. 
95 | if seq_len > self.max_seq_len_cached: 96 | self.max_seq_len_cached = seq_len 97 | t = torch.arange(self.max_seq_len_cached, device=x.device, dtype=self.inv_freq.dtype) / self.ratio 98 | freqs = torch.einsum("i,j->ij", t, self.inv_freq) 99 | # Different from paper, but it uses a different permutation in order to obtain the same calculation 100 | emb = torch.cat((freqs, freqs), dim=-1).to(x.device) 101 | self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(x.dtype), persistent=False) 102 | self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(x.dtype), persistent=False) 103 | return ( 104 | self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype), 105 | self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype), 106 | ) 107 | 108 | def replace_llama_with_condense(ratio): 109 | transformers.models.llama.modeling_llama.LlamaRotaryEmbedding = partial(CondenseRotaryEmbedding, ratio=ratio) 110 | 111 | 112 | def process_retrieval_ducoment(documents_df): 113 | ir_corpus = {} 114 | corpus2tool = {} 115 | for row in documents_df.itertuples(): 116 | doc = json.loads(row.document_content) 117 | ir_corpus[row.docid] = (doc.get('category_name', '') or '') + ', ' + \ 118 | (doc.get('tool_name', '') or '') + ', ' + \ 119 | (doc.get('api_name', '') or '') + ', ' + \ 120 | (doc.get('api_description', '') or '') + \ 121 | ', required_params: ' + json.dumps(doc.get('required_parameters', '')) + \ 122 | ', optional_params: ' + json.dumps(doc.get('optional_parameters', '')) + \ 123 | ', return_schema: ' + json.dumps(doc.get('template_response', '')) 124 | corpus2tool[(doc.get('category_name', '') or '') + ', ' + \ 125 | (doc.get('tool_name', '') or '') + ', ' + \ 126 | (doc.get('api_name', '') or '') + ', ' + \ 127 | (doc.get('api_description', '') or '') + \ 128 | ', required_params: ' + json.dumps(doc.get('required_parameters', '')) + \ 129 | ', optional_params: ' + json.dumps(doc.get('optional_parameters', '')) + \ 130 | ', return_schema: ' + json.dumps(doc.get('template_response', ''))] = doc['category_name'] + '[SEP]' + doc['tool_name'] + '[SEP]' + doc['api_name'] 131 | return ir_corpus, corpus2tool 132 | --------------------------------------------------------------------------------
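The name-sanitisation helpers in `toolbench/utils.py` above (`standardize`, `change_name`) are pure string functions, so their behaviour is easy to check directly. A short self-contained example with made-up input strings; it assumes the snippet is run from the `stabletoolbench/` directory so that `toolbench` is importable:

from toolbench.utils import standardize, change_name

# Separators are collapsed to "_" and the result is lower-cased.
assert standardize("YouTube Search_v3") == "youtube_search_v3"
# Names that start with a digit get a "get_" prefix.
assert standardize("2021-stats") == "get_2021_stats"
# Names that clash with common keywords or builtins get an "is_" prefix.
assert change_name("from") == "is_from"
print("toolbench.utils name sanitisation behaves as expected")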