├── .gitignore
├── README.md
├── cases
├── case-0.png
├── case-1.png
└── case-2.png
├── config
├── archer
│ ├── accelerate_config.yaml
│ ├── archer_config.yaml
│ └── default.yaml
├── ds_configs
│ └── stage3-cosine.json
├── llama3-1
│ └── StepTool_ppo.json
├── qwen2
│ └── StepTool_ppo.json
└── toolllama
│ └── StepTool_ppo.json
├── data
├── model_predictions_converted
│ └── qwen2
│ │ └── G123_example.json
└── reward_annotation
│ └── qwen2
│ └── G123_example_5.json
├── data_eval
└── pass_rate_results
│ ├── baseline-archer_cot
│ ├── G1_category.csv
│ ├── G1_category.json
│ ├── G1_instruction.csv
│ ├── G1_instruction.json
│ ├── G1_tool.csv
│ ├── G1_tool.json
│ ├── G2_category.csv
│ ├── G2_category.json
│ ├── G2_instruction.csv
│ ├── G2_instruction.json
│ ├── G3_instruction.csv
│ └── G3_instruction.json
│ ├── baseline-archer_dfs
│ ├── G1_category.csv
│ ├── G1_category.json
│ ├── G1_instruction.csv
│ ├── G1_instruction.json
│ ├── G1_tool.csv
│ ├── G1_tool.json
│ ├── G2_category.csv
│ ├── G2_category.json
│ ├── G2_instruction.csv
│ ├── G2_instruction.json
│ ├── G3_instruction.csv
│ └── G3_instruction.json
│ ├── baseline-eto_cot
│ ├── G1_instruction.csv
│ ├── G1_instruction.json
│ ├── G1_tool.csv
│ ├── G1_tool.json
│ ├── G2_category.csv
│ ├── G2_category.json
│ ├── G2_instruction.csv
│ ├── G2_instruction.json
│ ├── G3_instruction.csv
│ └── G3_instruction.json
│ ├── baseline-eto_dfs
│ ├── G1_category.csv
│ ├── G1_category.json
│ ├── G1_instruction.csv
│ ├── G1_instruction.json
│ ├── G1_tool.csv
│ ├── G1_tool.json
│ ├── G2_category.csv
│ ├── G2_category.json
│ ├── G2_instruction.csv
│ ├── G2_instruction.json
│ ├── G3_instruction.csv
│ └── G3_instruction.json
│ ├── baseline-ppo_cot
│ ├── G1_category.csv
│ ├── G1_category.json
│ ├── G1_instruction.csv
│ ├── G1_instruction.json
│ ├── G1_tool.csv
│ ├── G1_tool.json
│ ├── G2_category.csv
│ ├── G2_category.json
│ ├── G2_instruction.csv
│ ├── G2_instruction.json
│ ├── G3_instruction.csv
│ └── G3_instruction.json
│ ├── baseline-ppo_dfs
│ ├── G1_category.csv
│ ├── G1_category.json
│ ├── G1_instruction.csv
│ ├── G1_instruction.json
│ ├── G1_tool.csv
│ ├── G1_tool.json
│ ├── G2_category.csv
│ ├── G2_category.json
│ ├── G2_instruction.csv
│ ├── G2_instruction.json
│ ├── G3_instruction.csv
│ └── G3_instruction.json
│ ├── baseline-rft_cot
│ ├── G1_category.csv
│ ├── G1_category.json
│ ├── G1_instruction.csv
│ ├── G1_instruction.json
│ ├── G1_tool.csv
│ ├── G1_tool.json
│ ├── G2_category.csv
│ ├── G2_category.json
│ ├── G2_instruction.csv
│ ├── G2_instruction.json
│ ├── G3_instruction.csv
│ └── G3_instruction.json
│ ├── baseline-rft_dfs
│ ├── G1_category.csv
│ ├── G1_category.json
│ ├── G1_instruction.csv
│ ├── G1_instruction.json
│ ├── G1_tool.csv
│ ├── G1_tool.json
│ ├── G2_category.csv
│ ├── G2_category.json
│ ├── G2_instruction.csv
│ ├── G2_instruction.json
│ ├── G3_instruction.csv
│ └── G3_instruction.json
│ ├── steptool_cot
│ ├── G1_category.csv
│ ├── G1_category.json
│ ├── G1_instruction.csv
│ ├── G1_instruction.json
│ ├── G1_tool.csv
│ ├── G1_tool.json
│ ├── G2_category.csv
│ ├── G2_category.json
│ ├── G2_instruction.csv
│ ├── G2_instruction.json
│ ├── G3_instruction.csv
│ └── G3_instruction.json
│ ├── steptool_dfs
│ ├── G1_category.csv
│ ├── G1_category.json
│ ├── G1_instruction.csv
│ ├── G1_instruction.json
│ ├── G1_tool.csv
│ ├── G1_tool.json
│ ├── G2_category.csv
│ ├── G2_category.json
│ ├── G2_instruction.csv
│ ├── G2_instruction.json
│ ├── G3_instruction.csv
│ └── G3_instruction.json
│ ├── toolllama_sft_cot
│ ├── G1_category.csv
│ ├── G1_category.json
│ ├── G1_instruction.csv
│ ├── G1_instruction.json
│ ├── G1_tool.csv
│ ├── G1_tool.json
│ ├── G2_category.csv
│ ├── G2_category.json
│ ├── G2_instruction.csv
│ ├── G2_instruction.json
│ ├── G3_instruction.csv
│ └── G3_instruction.json
│ └── toolllama_sft_dfs
│ ├── G1_category.csv
│ ├── G1_category.json
│ ├── G1_instruction.csv
│ ├── G1_instruction.json
│ ├── G1_tool.csv
│ ├── G1_tool.json
│ ├── G2_category.csv
│ ├── G2_category.json
│ ├── G2_instruction.json
│ ├── G3_instruction.csv
│ └── G3_instruction.json
├── data_train
├── eto
│ └── dpo_data_example.csv
├── llama3-1
│ ├── gpt4_dfs_G123_for_sft_example.json
│ └── step_grained_for_ppo_example.csv
├── qwen2
│ ├── gpt4_dfs_G123_for_sft_example.json
│ └── step_grained_for_ppo_example.csv
├── rft
│ └── rft_data_example.json
└── toolllama
│ └── step_grained_for_ppo_example.csv
├── requirements.txt
├── scripts
├── baseline-archer
│ ├── build_data.sh
│ └── train_archer.sh
├── baseline-eto
│ └── train_dpo.sh
├── baseline-ppo
│ └── train_toolllama.sh
├── baseline-rft
│ └── train_rft.sh
├── reward
│ └── annotation_with_gpt.sh
├── sft
│ ├── train_llama3-1.sh
│ └── train_qwen2.sh
└── steptool_train
│ ├── train_llama3-1.sh
│ ├── train_qwen2.sh
│ └── train_toolllama.sh
├── scripts_eval
├── baseline-archer
│ ├── inference_archer_vllm.sh
│ ├── run_convert_answer.sh
│ └── run_pass_rate.sh
├── baseline-eto
│ ├── inference_eto_vllm.sh
│ ├── run_convert_answer.sh
│ └── run_pass_rate.sh
├── baseline-ppo
│ ├── inference_ppo_vllm.sh
│ ├── run_convert_answer.sh
│ └── run_pass_rate.sh
├── baseline-rft
│ ├── inference_rft_vllm.sh
│ ├── run_convert_answer.sh
│ └── run_pass_rate.sh
├── llama3-1
│ ├── inference_llama3-1_vllm.sh
│ ├── run_conver_answer.sh
│ ├── run_pass_rate.sh
│ └── run_preference.sh
├── qwen2
│ ├── inference_qwen2_vllm.sh
│ ├── run_convert_answer.sh
│ ├── run_pass_rate.sh
│ └── run_preference.sh
├── steptool
│ ├── inference_steptool_vllm.sh
│ ├── run_convert_answer.sh
│ └── run_pass_rate.sh
├── toolllama-sft
│ ├── inference_toolllama_vllm.sh
│ ├── run_conver_answer.sh
│ └── run_pass_rate.sh
└── toolllama
│ └── run_preference.sh
├── src
├── baseline-archer
│ ├── archer_agent.py
│ ├── archer_critic.py
│ ├── archer_data.py
│ ├── archer_environment.py
│ ├── archer_trainer.py
│ ├── build_archer_data.py
│ ├── offpolicy_train_loop.py
│ └── run.py
├── baseline-eto
│ └── dpo_train.py
├── baseline-ppo
│ └── ppo.py
├── baseline-rft
│ └── rft.py
├── reward
│ ├── annotation_by_rules.ipynb
│ ├── annotation_with_gpt.py
│ ├── evaluators
│ │ ├── evaluator.py
│ │ └── gpt-4-turbo-2024-04-09
│ │ │ ├── config.yaml
│ │ │ └── template.txt
│ └── openai_key.json
├── sft
│ ├── llama3-1.py
│ └── qwen2.py
└── steptool
│ ├── step_ppo.py
│ └── step_ppotrainer.py
└── stabletoolbench
├── config.yml
├── server
├── config.yml
├── main.py
├── requirements.txt
└── utils.py
├── solvable_queries
├── test_instruction
│ ├── G1_category.json
│ ├── G1_instruction.json
│ ├── G1_tool.json
│ ├── G2_category.json
│ ├── G2_instruction.json
│ └── G3_instruction.json
└── test_query_ids
│ ├── G1_category.json
│ ├── G1_instruction.json
│ ├── G1_tool.json
│ ├── G2_category.json
│ ├── G2_instruction.json
│ └── G3_instruction.json
└── toolbench
├── inference
├── Algorithms
│ ├── DFS.py
│ ├── __init__.py
│ ├── base_search.py
│ └── single_chain.py
├── Downstream_tasks
│ ├── __init__.py
│ ├── base_env.py
│ ├── rapidapi.py
│ └── rapidapi_multithread.py
├── LLM
│ ├── __init__.py
│ ├── base_io.py
│ ├── chatgpt_model.py
│ ├── llama3_sft_model.py
│ ├── qwen2_sft_model.py
│ ├── retriever.py
│ └── tool_llama_vllm.py
├── LLM_rank
│ ├── __init__.py
│ └── rank_candidate.py
├── Prompts
│ ├── ReAct_prompts.py
│ ├── Tree_search_prompts.py
│ ├── __init__.py
│ └── rank_prompts.py
├── Tree
│ ├── Tree.py
│ └── __init__.py
├── callbacks
│ └── ServerEventCallback.py
├── qa_pipeline.py
├── qa_pipeline_multithread.py
├── qa_pipeline_open_domain.py
├── server.py
├── toolbench_server.py
└── utils.py
├── model
├── __init__.py
├── apply_delta.py
├── compression.py
├── make_delta.py
└── model_adapter.py
├── tool_conversation.py
├── tooleval
├── README.md
├── README_ZH.md
├── ToolBench.code-workspace
├── __init__.py
├── automatic_eval_sample.py
├── convert_answers.py
├── convert_to_answer_format.py
├── dataset
│ └── __init__.py
├── eval_and_update_leaderboard.py
├── eval_pass_rate.py
├── eval_preference.py
├── eval_process_reward.py
├── evaluation
│ ├── __init__.py
│ ├── dataclass.py
│ ├── methodcls.py
│ └── usereval.py
├── evaluators
│ ├── __init__.py
│ ├── registered_cls
│ │ ├── __init__.py
│ │ ├── base.py
│ │ ├── rtl.py
│ │ ├── tooleval.py
│ │ └── utils.py
│ ├── tooleval_gpt-3.5-turbo_default
│ │ ├── config.yaml
│ │ └── template.txt
│ ├── tooleval_gpt-3.5-turbo_fn
│ │ ├── config.yaml
│ │ └── template.txt
│ └── tooleval_gpt-3.5-turbo_normalized
│ │ ├── config.yaml
│ │ └── template.txt
├── evaluators_comparison.py
├── requirements.txt
├── results
│ ├── default_evalset
│ │ ├── DFS
│ │ │ └── win.csv
│ │ └── gpt-3.5-turbo_CoT
│ │ │ ├── G1_category.json
│ │ │ ├── G1_instruction.json
│ │ │ ├── G1_tool.json
│ │ │ ├── G2_category.json
│ │ │ ├── G2_instruction.json
│ │ │ └── G3_instruction.json
│ ├── leaderboard###default_evalset###tooleval_gpt-3.5-turbo_normalized###ChatGPT-DFSDT.csv
│ └── leaderboard###default_evalset###tooleval_gpt-3.5-turbo_normalized###gpt-3.5-turbo_CoT.csv
└── utils.py
└── utils.py
/.gitignore:
--------------------------------------------------------------------------------
1 | wandb/
2 | sft_ckpts/
3 | **/__pycache__/
4 | ckpts/
5 | data_eval/*
6 | !data_eval/pass_rate_results
7 | experimental_results/
8 | core*
--------------------------------------------------------------------------------
/cases/case-0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yuyq18/StepTool/121a6acd11a899d947a5d3fa7269cf76624c1df3/cases/case-0.png
--------------------------------------------------------------------------------
/cases/case-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yuyq18/StepTool/121a6acd11a899d947a5d3fa7269cf76624c1df3/cases/case-1.png
--------------------------------------------------------------------------------
/cases/case-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yuyq18/StepTool/121a6acd11a899d947a5d3fa7269cf76624c1df3/cases/case-2.png
--------------------------------------------------------------------------------
/config/archer/accelerate_config.yaml:
--------------------------------------------------------------------------------
1 | compute_environment: LOCAL_MACHINE
2 | debug: true
3 | distributed_type: MULTI_GPU
4 | downcast_bf16: 'no'
5 | gpu_ids: 0,1,2,3
6 | machine_rank: 0
7 | main_training_function: main
8 | mixed_precision: 'bf16'
9 | num_machines: 1
10 | num_processes: 4
11 | rdzv_backend: static
12 | same_network: true
13 | tpu_env: []
14 | tpu_use_cluster: false
15 | tpu_use_sudo: false
16 | use_cpu: false
17 |
--------------------------------------------------------------------------------
/config/archer/archer_config.yaml:
--------------------------------------------------------------------------------
1 | defaults:
2 | - default
3 | - _self_
4 |
5 | # checkpoint
6 | checkpoint_path: null
7 | basemodel: 'toolllama'
8 | save_path: 'output/archer_baseline/'
9 | env_load_path: 'data_train/archer/'
10 |
11 | # model
12 | agent_type: "archer_toolllama"
13 | policy_lm: 'ToolBench/ToolLLaMA-2-7b-v2'
14 | max_new_tokens: 512
15 | use_bfloat16: True
16 | use_lora: True
17 | eos_str: ''
18 |
19 | save_freq: 50
20 | eval_freq: 5
21 |
22 | capacity: 100000 #replay buffer size
23 | rollout_size: 16 #number of rollout trajectories for each update
24 | eval_size: 4 #number of trajectories for evaluation
25 | batch_size: 4
26 | iterations: 100 #total number of iterations
27 | epochs: 20 #number of epochs for the critic each iteration
28 | actor_epochs: 1 #number of epochs for the actor each iteration
29 | warmup_iter: 10 #number of iterations without updating the policy
30 | grad_accum_steps: 8
31 | do_sample: True
32 | temperature: 1.0
33 | critic_lr: 1e-5
34 | lm_lr: 2e-6
35 | env_idx: null #set to null if you don't want to reset to a specific environment
36 | gamma: 0.95 #discount factor
37 | tau: 0.1 #soft update parameter
38 | max_grad_norm: 10.0
39 |
40 | # wandb logging
41 | use_wandb: True
42 | project_name: 'archer_baseline'
43 | run_name: 'toolllama_archer_iter100_epoch20_actor1'
44 |
--------------------------------------------------------------------------------
/config/archer/default.yaml:
--------------------------------------------------------------------------------
1 | #cache directory of transformer
2 | cache_dir: '~/.cache/huggingface/hub/'
3 |
4 | #token
5 | huggingface_token: ''
6 | wandb_key: ""
7 |
8 | policy_lm: "gpt2"
9 | critic_lm: "roberta-base"
10 | agent_type: "archer_toolllama"
11 | use_baseline: False
12 | use_lora: False
13 | max_new_tokens: 32
14 | save_freq: 25
15 | eval_freq: 25
16 |
17 | #training hyperparameters
18 | capacity: 100000 #replay buffer size
19 | rollout_size: 128 #number of rollout trajectories for each update
20 | eval_size: 32 #number of trajectories for evaluation
21 | batch_size: 8
22 | iterations: 2000 #total number of iterations
23 | epochs: 50 #number of epochs for the critic each iteration
24 | actor_epochs: 3 #number of epochs for the actor each iteration
25 | warmup_iter: 20 #number of iterations without updating the policy
26 | grad_accum_steps: 32
27 | do_sample: True
28 | temperature: 1.0
29 | critic_lr: 1e-5
30 | lm_lr: 1e-5
31 | env_idx: null #set to null if you don't want to reset to a specific environment
32 | gamma: 0.95 #discount factor
33 | tau: 0.1 #soft update parameter
34 | max_grad_norm: 1.0
35 |
36 | use_wandb: False
--------------------------------------------------------------------------------
/config/ds_configs/stage3-cosine.json:
--------------------------------------------------------------------------------
1 | {
2 | "bf16": {
3 | "enabled": "auto"
4 | },
5 | "fp16": {
6 | "enabled": "auto",
7 | "loss_scale": 0,
8 | "loss_scale_window": 1000,
9 | "initial_scale_power": 16,
10 | "hysteresis": 2,
11 | "min_loss_scale": 1
12 | },
13 | "zero_optimization": {
14 | "stage": 3,
15 | "offload_optimizer": {
16 | "device": "cpu",
17 | "pin_memory": true
18 | },
19 | "offload_param": {
20 | "device": "cpu",
21 | "pin_memory": true
22 | },
23 | "overlap_comm": true,
24 | "contiguous_gradients": true,
25 | "sub_group_size": 1e9,
26 | "reduce_bucket_size": "auto",
27 | "stage3_prefetch_bucket_size": "auto",
28 | "stage3_param_persistence_threshold": "auto",
29 | "stage3_max_live_parameters": 1e9,
30 | "stage3_max_reuse_distance": 1e9,
31 | "gather_16bit_weights_on_model_save": true
32 | },
33 | "gradient_accumulation_steps": "auto",
34 | "gradient_clipping": "auto",
35 | "steps_per_print": 1e5,
36 | "train_batch_size": "auto",
37 | "train_micro_batch_size_per_gpu": "auto",
38 | "wall_clock_breakdown": false
39 | }
--------------------------------------------------------------------------------
/config/llama3-1/StepTool_ppo.json:
--------------------------------------------------------------------------------
1 | {
2 | "peft_kwargs": {
3 | "r": 8,
4 | "lora_alpha": 16,
5 | "bias": "none",
6 | "task_type": "CAUSAL_LM"
7 | },
8 | "ppo_kwargs": {
9 | "learning_rate": 1e-5,
10 | "log_with": "wandb",
11 | "remove_unused_columns": false,
12 | "batch_size": 8,
13 | "mini_batch_size": 2,
14 | "gradient_accumulation_steps": 4,
15 | "kl_penalty": "kl",
16 | "init_kl_coef": 0.3,
17 | "target_kl": 6,
18 | "target": 6,
19 | "horizon": 10000,
20 | "gamma": 0.99
21 | }
22 | }
--------------------------------------------------------------------------------
/config/qwen2/StepTool_ppo.json:
--------------------------------------------------------------------------------
1 | {
2 | "peft_kwargs": {
3 | "target_modules": ["gate_proj", "o_proj", "k_proj", "q_proj", "up_proj", "down_proj", "v_proj"],
4 | "r": 8,
5 | "lora_alpha": 16,
6 | "bias": "none",
7 | "task_type": "CAUSAL_LM"
8 | },
9 | "ppo_kwargs": {
10 | "learning_rate": 1e-5,
11 | "log_with": "wandb",
12 | "remove_unused_columns": false,
13 | "batch_size": 8,
14 | "mini_batch_size": 2,
15 | "gradient_accumulation_steps": 4,
16 | "kl_penalty": "kl",
17 | "init_kl_coef": 0.3,
18 | "target_kl": 6,
19 | "target": 6,
20 | "horizon": 10000,
21 | "gamma": 0.99
22 | }
23 | }
--------------------------------------------------------------------------------
/config/toolllama/StepTool_ppo.json:
--------------------------------------------------------------------------------
1 | {
2 | "peft_kwargs": {
3 | "r": 16,
4 | "lora_alpha": 16,
5 | "bias": "none",
6 | "task_type": "CAUSAL_LM"
7 | },
8 | "ppo_kwargs": {
9 | "seed": 2024,
10 | "learning_rate": 1e-5,
11 | "log_with": "wandb",
12 | "remove_unused_columns": false,
13 | "batch_size": 8,
14 | "mini_batch_size": 2,
15 | "gradient_accumulation_steps": 4,
16 | "kl_penalty": "kl",
17 | "init_kl_coef": 0.3,
18 | "target_kl": 6,
19 | "target": 6,
20 | "horizon": 10000,
21 | "gamma": 0.99
22 | }
23 | }
--------------------------------------------------------------------------------
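
The three config/*/StepTool_ppo.json files above share the same two-part layout: "peft_kwargs" holds the LoRA adapter hyperparameters and "ppo_kwargs" holds the PPO training hyperparameters. A minimal sketch of how such a file could be consumed, assuming (the loading code itself is not shown in this dump) that src/steptool/step_ppo.py reads it into peft's LoraConfig and trl's PPOConfig:

    # sketch only: how a StepTool_ppo.json might map onto peft/trl config objects
    import json
    from peft import LoraConfig
    from trl import PPOConfig

    with open("config/toolllama/StepTool_ppo.json") as f:
        cfg = json.load(f)

    peft_config = LoraConfig(**cfg["peft_kwargs"])  # r, lora_alpha, bias, task_type, optional target_modules
    ppo_config = PPOConfig(**cfg["ppo_kwargs"])     # learning_rate, batch sizes, KL-control settings, gamma

Note that only the qwen2 variant pins target_modules explicitly; the other two leave it unset, so peft falls back to the default target modules it infers for the base architecture.
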
/requirements.txt:
--------------------------------------------------------------------------------
1 | accelerate==0.33.0
2 | datasets==2.21.0
3 | trl==0.10.1
4 | wandb==0.17.8
5 | fastapi==0.95.1
6 | gradio==3.23.0
7 | httpx==0.24.0
8 | markdown-it-py==2.2.0
9 | numpy==1.24.3
10 | prompt_toolkit==3.0.47
11 | pydantic==1.10.7
12 | requests==2.32.3
13 | rich==13.3.5
14 | rouge==1.0.1
15 | sentencepiece==0.1.99
16 | shortuuid==1.0.11
17 | tiktoken==0.4.0
18 | tokenizers==0.19.1
19 | transformers==4.43.1
20 | uvicorn==0.22.0
21 | bitsandbytes==0.43.3
22 | peft==0.5.0
23 | langchain==0.0.229
24 | deepspeed==0.14.5
25 | sentence-transformers==2.2.2
26 | tensorboard==2.17.1
27 | openai==1.42.0
28 | scipy==1.14.1
29 | termcolor==2.4.0
30 | Flask==3.0.3
31 | Flask-Cors==4.0.1
32 | backoff==2.2.1
33 | slowapi==0.1.9
34 | httpx==0.24.0
35 | omegaconf==2.3.0
36 |
--------------------------------------------------------------------------------
/scripts/baseline-archer/build_data.sh:
--------------------------------------------------------------------------------
1 | export DATA_FILE="data_train/toolllama/step_grained_for_ppo.csv"
2 | export SAVE_PATH="data_train/archer"
3 |
4 | python src/baseline-archer/build_archer_data.py
--------------------------------------------------------------------------------
/scripts/baseline-archer/train_archer.sh:
--------------------------------------------------------------------------------
1 |
2 | export ARCHER_CONFIG_NAME="archer_config.yaml"
3 |
4 | accelerate launch --config_file config/archer/accelerate_config.yaml src/baseline-archer/run.py
--------------------------------------------------------------------------------
/scripts/baseline-eto/train_dpo.sh:
--------------------------------------------------------------------------------
1 | export TRAIN_PATH="data_train/eto"
2 | export CUDA_VISIBLE_DEVICES=0,1
3 | export WANDB_PROJECT="baselines"
4 |
5 | python src/baseline-eto/dpo_train.py \
6 | --model_name_or_path ToolBench/ToolLLaMA-2-7b-v2 \
7 | --data_path ${TRAIN_PATH}/dpo_data_example.csv \
8 | --bf16 True \
9 | --output_dir "output/eto_baseline-3epoch" \
10 | --report_to "wandb" \
11 | --run_name "eto_baseline-3epoch" \
12 | --num_train_epochs 3 \
13 | --per_device_train_batch_size 1 \
14 | --per_device_eval_batch_size 1 \
15 | --gradient_accumulation_steps 8 \
16 | --eval_strategy "epoch" \
17 | --save_strategy "epoch" \
18 | --save_total_limit 10 \
19 | --seed 2024 \
20 | --learning_rate 1e-4 \
21 | --lr_scheduler_type "cosine" \
22 | --logging_steps 1 \
23 | --model_max_length 8192 \
24 | --max_prompt_length 7000 \
25 | --beta 0.1
--------------------------------------------------------------------------------
/scripts/baseline-ppo/train_toolllama.sh:
--------------------------------------------------------------------------------
1 | export PYTHONPATH=./
2 | export TRAIN_PATH="data_train"
3 | export TRAIN_SET="step_grained_for_ppo_example"
4 | export CUDA_VISIBLE_DEVICES="0,1,2,3"
5 |
6 | export MODEL_TYPE="toolllama"
7 | # load the base model after SFT pre-training
8 | export MODEL_PATH="ToolBench/ToolLLaMA-2-7b-v2"
9 |
10 | python src/baseline-ppo/ppo.py \
11 | --model_path ${MODEL_PATH} \
12 | --model_type ${MODEL_TYPE} \
13 | --config_path config/${MODEL_TYPE}/StepTool_ppo.json \
14 | --data_file ${TRAIN_PATH}/${MODEL_TYPE}/${TRAIN_SET}.csv \
15 | --max_context_len 4096 \
16 | --max_response_len 1024 \
17 | --epochs 5
18 |
--------------------------------------------------------------------------------
/scripts/baseline-rft/train_rft.sh:
--------------------------------------------------------------------------------
1 | export TRAIN_PATH="data_train/rft"
2 | export NCCL_P2P_DISABLE=1
3 | export NCCL_IB_DISABLE=1
4 | export CUDA_VISIBLE_DEVICES=0,1,2,3
5 | export WANDB_PROJECT="baselines"
6 | torchrun \
7 | --nproc_per_node 4 \
8 | --nnodes 1 \
9 | --node_rank 0 \
10 | --master_addr localhost \
11 | --master_port 6601 \
12 | src/baseline-rft/rft.py \
13 | --model_name_or_path ToolBench/ToolLLaMA-2-7b-v2 \
14 | --data_path ${TRAIN_PATH}/rft_data_example.json \
15 | --bf16 True \
16 | --output_dir "output/rft_baseline-3epoch" \
17 | --report_to "wandb" \
18 | --run_name "rft_baseline-3epoch" \
19 | --num_train_epochs 3 \
20 | --per_device_train_batch_size 2 \
21 | --per_device_eval_batch_size 2 \
22 | --gradient_accumulation_steps 8 \
23 | --eval_strategy "epoch" \
24 | --save_strategy "epoch" \
25 | --save_total_limit 10 \
26 | --seed 2024 \
27 | --learning_rate 5e-5 \
28 | --weight_decay 0. \
29 | --warmup_ratio 0.04 \
30 | --lr_scheduler_type "cosine" \
31 | --logging_steps 1 \
32 | --model_max_length 8192 \
33 | --gradient_checkpointing True \
34 | --lazy_preprocess False \
35 | --deepspeed config/ds_configs/stage3-cosine.json
36 |
--------------------------------------------------------------------------------
/scripts/reward/annotation_with_gpt.sh:
--------------------------------------------------------------------------------
1 | # cd ../../toolbench/tooleval
2 | # export API_POOL_FILE=path/to/your/openai_key_json_file.json
3 | export PYTHONPATH="./:./stabletoolbench/toolbench/tooleval"
4 | export API_POOL_FILE=src/reward/openai_key.json
5 | export CONVERTED_ANSWER_PATH=data/model_predictions_converted
6 | export SAVE_PATH=data/reward_annotation/
7 | mkdir -p ${SAVE_PATH}
8 |
9 | # export CANDIDATE_MODEL="virtual_qwen2_sft_dfs_fix_epoch3"
10 | export CANDIDATE_MODEL="qwen2"
11 | export EVAL_MODEL="gpt-4-turbo-2024-04-09"
12 | mkdir -p ${SAVE_PATH}/${CANDIDATE_MODEL}
13 | # unset HTTP_PROXY HTTPS_PROXY http_proxy https_proxy
14 | # --evaluators_cfg_path \
15 | python src/reward/annotation_with_gpt.py \
16 | --converted_answer_path ${CONVERTED_ANSWER_PATH}/${CANDIDATE_MODEL} \
17 | --save_path ${SAVE_PATH}/${CANDIDATE_MODEL} \
18 | --reference_model ${CANDIDATE_MODEL} \
19 | --evaluator ${EVAL_MODEL} \
20 | --max_eval_threads 1 \
21 | --task_num 5 \
22 | --evaluate_times 3 \
23 | --test_set G123_example \
--------------------------------------------------------------------------------
/scripts/sft/train_llama3-1.sh:
--------------------------------------------------------------------------------
1 | export NCCL_P2P_DISABLE=1
2 | export NCCL_IB_DISABLE=1
3 | export CUDA_VISIBLE_DEVICES=0,1,2,3
4 | export TRAIN_PATH="data_train"
5 | export TRAIN_SET="gpt4_dfs_G123_for_sft"
6 |
7 | export MODEL_PATH="meta-llama/Meta-Llama-3.1-8B-Instruct"
8 | export MODEL_TYPE="llama3-1"
9 | export OUTPUT_DIR="sft_ckpts"
10 | export WANDB_PROJECT="SFT-Llama3-1"
11 | export WANDB_RUN_NAME="sft_with_gpt4_paths"
12 |
13 | torchrun \
14 | --nproc_per_node 4 \
15 | --nnodes 1 \
16 | --node_rank 0 \
17 | --master_addr localhost \
18 | --master_port 6601 \
19 | src/sft/llama3-1.py \
20 | --model_name_or_path ${MODEL_PATH} \
21 | --data_path ${TRAIN_PATH}/${MODEL_TYPE}/${TRAIN_SET}.json \
22 | --bf16 True \
23 | --output_dir ${OUTPUT_DIR}/${MODEL_TYPE} \
24 | --report_to "wandb" \
25 | --run_name ${WANDB_RUN_NAME} \
26 | --num_train_epochs 5 \
27 | --per_device_train_batch_size 1 \
28 | --per_device_eval_batch_size 1 \
29 | --gradient_accumulation_steps 4 \
30 | --eval_strategy "steps" \
31 | --eval_steps 400 \
32 | --save_strategy "steps" \
33 | --save_steps 400 \
34 | --save_total_limit 10 \
35 | --learning_rate 2e-5 \
36 | --weight_decay 0. \
37 | --warmup_ratio 0.04 \
38 | --lr_scheduler_type "cosine" \
39 | --logging_steps 1 \
40 | --model_max_length 8192 \
41 | --gradient_checkpointing True \
42 | --lazy_preprocess False \
43 | --deepspeed config/ds_configs/stage3-cosine.json
44 |
--------------------------------------------------------------------------------
/scripts/sft/train_qwen2.sh:
--------------------------------------------------------------------------------
1 | export NCCL_P2P_DISABLE=1
2 | export NCCL_IB_DISABLE=1
3 | export CUDA_VISIBLE_DEVICES=0,1,2,3
4 | export TRAIN_PATH="data_train"
5 | export TRAIN_SET="gpt4_dfs_G123_for_sft"
6 |
7 | export MODEL_PATH="Qwen/Qwen2-7B-Instruct"
8 | export MODEL_TYPE="qwen2"
9 | export OUTPUT_DIR="sft_ckpts"
10 | export WANDB_PROJECT="SFT-Qwen2"
11 | export WANDB_RUN_NAME="sft_with_gpt4_paths"
12 |
13 | torchrun \
14 | --nproc_per_node 4 \
15 | --nnodes 1 \
16 | --node_rank 0 \
17 | --master_addr localhost \
18 | --master_port 6601 \
19 | src/sft/qwen2.py \
20 | --model_name_or_path ${MODEL_PATH} \
21 | --data_path ${TRAIN_PATH}/${MODEL_TYPE}/${TRAIN_SET}.json \
22 | --bf16 True \
23 | --output_dir ${OUTPUT_DIR}/${MODEL_TYPE} \
24 | --report_to "wandb" \
25 | --run_name ${WANDB_RUN_NAME} \
26 | --num_train_epochs 5 \
27 | --per_device_train_batch_size 1 \
28 | --per_device_eval_batch_size 1 \
29 | --gradient_accumulation_steps 4 \
30 | --eval_strategy "steps" \
31 | --eval_steps 400 \
32 | --save_strategy "steps" \
33 | --save_steps 400 \
34 | --save_total_limit 10 \
35 | --learning_rate 2e-5 \
36 | --weight_decay 0. \
37 | --warmup_ratio 0.04 \
38 | --lr_scheduler_type "cosine" \
39 | --logging_steps 1 \
40 | --model_max_length 8192 \
41 | --gradient_checkpointing True \
42 | --lazy_preprocess False \
43 | --deepspeed config/ds_configs/stage3-cosine.json
44 |
--------------------------------------------------------------------------------
/scripts/steptool_train/train_llama3-1.sh:
--------------------------------------------------------------------------------
1 | export PYTHONPATH=./
2 | export TRAIN_PATH="data_train"
3 | export TRAIN_SET="step_grained_for_ppo_example"
4 | export CUDA_VISIBLE_DEVICES="0,1,2,3"
5 |
6 | export MODEL_TYPE="llama3-1"
7 | # load the base model after SFT pre-training
8 | export MODEL_PATH="sft_ckpts/llama3-1/checkpoint-3600"
9 |
10 | python src/steptool/step_ppo.py \
11 | --model_path ${MODEL_PATH} \
12 | --model_type ${MODEL_TYPE} \
13 | --config_path config/${MODEL_TYPE}/StepTool_ppo.json \
14 | --data_file ${TRAIN_PATH}/${MODEL_TYPE}/${TRAIN_SET}.csv \
15 | --epochs 5
16 |
17 |
--------------------------------------------------------------------------------
/scripts/steptool_train/train_qwen2.sh:
--------------------------------------------------------------------------------
1 | export PYTHONPATH=./
2 | export TRAIN_PATH="data_train"
3 | export TRAIN_SET="step_grained_for_ppo_example"
4 | export CUDA_VISIBLE_DEVICES="0,1,2,3"
5 |
6 | export MODEL_TYPE="qwen2"
7 | # load the base model after SFT pre-training
8 | export MODEL_PATH="sft_ckpts/qwen2/checkpoint-3639"
9 |
10 | python src/steptool/step_ppo.py \
11 | --model_path ${MODEL_PATH} \
12 | --model_type ${MODEL_TYPE} \
13 | --config_path config/${MODEL_TYPE}/StepTool_ppo.json \
14 | --data_file ${TRAIN_PATH}/${MODEL_TYPE}/${TRAIN_SET}.csv \
15 | --epochs 5
16 |
17 |
--------------------------------------------------------------------------------
/scripts/steptool_train/train_toolllama.sh:
--------------------------------------------------------------------------------
1 | export PYTHONPATH=./
2 | export TRAIN_PATH="data_train"
3 | export TRAIN_SET="step_grained_for_ppo_example"
4 | export CUDA_VISIBLE_DEVICES="0,1,2,3"
5 |
6 | export MODEL_TYPE="toolllama"
7 | # load the base model after SFT pre-training
8 | export MODEL_PATH="ToolBench/ToolLLaMA-2-7b-v2"
9 |
10 | python src/steptool/step_ppo.py \
11 | --model_path ${MODEL_PATH} \
12 | --model_type ${MODEL_TYPE} \
13 | --config_path config/${MODEL_TYPE}/StepTool_ppo.json \
14 | --data_file ${TRAIN_PATH}/${MODEL_TYPE}/${TRAIN_SET}.csv \
15 | --max_context_len 4096 \
16 | --max_response_len 1024 \
17 | --epochs 5
18 |
--------------------------------------------------------------------------------
/scripts_eval/baseline-archer/inference_archer_vllm.sh:
--------------------------------------------------------------------------------
1 | cd stabletoolbench
2 | export PYTHONPATH=./
3 | export VLLM_API_BASE="http://127.0.0.1:8084/v1/" # the address of vllm.server
4 | export SERVICE_URL="http://localhost:8081/virtual" # the address of api server
5 | export MODEL_PATH="baseline-archer" # the name of vllm.server
6 | export STRATEGY="DFS_woFilter_w2" # or CoT@1
7 |
8 | export OUTPUT_DIR="data_eval/answer/baseline-archer_cot" # change it accordingly
9 |
10 | group=G1_instruction # G1_category, G1_tool, G2_category, G2_instruction, G3_instruction
11 | mkdir -p $OUTPUT_DIR; mkdir -p $OUTPUT_DIR/$group
12 | python toolbench/inference/qa_pipeline_multithread.py \
13 | --backbone_model ToolLLaMA_vllm \
14 | --model_path ${MODEL_PATH} \
15 | --max_observation_length 1024 \
16 | --method ${STRATEGY} \
17 | --input_query_file solvable_queries/test_instruction/${group}.json \
18 | --output_answer_file $OUTPUT_DIR/$group \
19 | --max_query_count 30 \
20 | --num_thread 4
--------------------------------------------------------------------------------
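
This and the other *_vllm.sh inference scripts assume two services are already running before the pipeline starts: an OpenAI-compatible vLLM server at VLLM_API_BASE that serves the trained checkpoint under the alias given in MODEL_PATH, and the StableToolBench virtual API server at SERVICE_URL. A hedged sketch of the vLLM side (the checkpoint path is a placeholder; the exact launch flags used by the authors are not part of this dump):

    # serve a local checkpoint under the alias expected by MODEL_PATH
    python -m vllm.entrypoints.openai.api_server \
        --model /path/to/trained/checkpoint \
        --served-model-name baseline-archer \
        --port 8084

With the server up, the script can then be run from the repository root, e.g. bash scripts_eval/baseline-archer/inference_archer_vllm.sh.
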
/scripts_eval/baseline-archer/run_convert_answer.sh:
--------------------------------------------------------------------------------
1 | cd stabletoolbench/toolbench/tooleval
2 | export RAW_ANSWER_PATH=../../../data_eval/answer
3 | export CONVERTED_ANSWER_PATH=../../../data_eval/model_predictions_converted
4 | export MODEL_NAME=baseline-archer_dfs # change it accordingly
5 | export STRATEGY="DFS_woFilter_w2" # or CoT@1
6 | export test_set=G1_instruction # G1_category, G1_tool, G2_category, G2_instruction, G3_instruction
7 |
8 | mkdir -p ${CONVERTED_ANSWER_PATH}/${MODEL_NAME}
9 | answer_dir=${RAW_ANSWER_PATH}/${MODEL_NAME}/${test_set}
10 | output_file=${CONVERTED_ANSWER_PATH}/${MODEL_NAME}/${test_set}.json
11 |
12 | python convert_to_answer_format.py\
13 | --answer_dir ${answer_dir} \
14 | --method ${STRATEGY} \
15 | --output ${output_file}
--------------------------------------------------------------------------------
/scripts_eval/baseline-archer/run_pass_rate.sh:
--------------------------------------------------------------------------------
1 | cd stabletoolbench/toolbench/tooleval
2 | export API_POOL_FILE=../../openai_key.json
3 | export CONVERTED_ANSWER_PATH=../../../data_eval/model_predictions_converted
4 | export SAVE_PATH=../../../data_eval/pass_rate_results
5 | mkdir -p ${SAVE_PATH}
6 | export CANDIDATE_MODEL="baseline-archer_cot" # change it accordingly
7 | export EVAL_MODEL=gpt-4-turbo-2024-04-09
8 | mkdir -p ${SAVE_PATH}/${CANDIDATE_MODEL}
9 |
10 | export test_set=G1_instruction # G1_category, G1_tool, G2_category, G2_instruction, G3_instruction
11 |
12 | python eval_pass_rate.py \
13 | --converted_answer_path ${CONVERTED_ANSWER_PATH} \
14 | --save_path ${SAVE_PATH}/${CANDIDATE_MODEL} \
15 | --reference_model ${CANDIDATE_MODEL} \
16 | --test_ids ../../solvable_queries/test_query_ids \
17 | --max_eval_threads 1 \
18 | --evaluate_times 3 \
19 | --test_set ${test_set} \
20 | # --overwrite
--------------------------------------------------------------------------------
/scripts_eval/baseline-eto/inference_eto_vllm.sh:
--------------------------------------------------------------------------------
1 | cd stabletoolbench
2 | export PYTHONPATH=./
3 | export VLLM_API_BASE="http://127.0.0.1:8084/v1/" # the address of vllm.server
4 | export SERVICE_URL="http://localhost:8081/virtual" # the address of api server
5 | export MODEL_PATH="baseline-eto" # the name of vllm.server
6 | export STRATEGY="DFS_woFilter_w2" # or CoT@1
7 |
8 | export OUTPUT_DIR="data_eval/answer/baseline-eto_cot" # change it accordingly
9 |
10 | group=G1_instruction # G1_category, G1_tool, G2_category, G2_instruction, G3_instruction
11 | mkdir -p $OUTPUT_DIR; mkdir -p $OUTPUT_DIR/$group
12 | python toolbench/inference/qa_pipeline_multithread.py \
13 | --backbone_model ToolLLaMA_vllm \
14 | --model_path ${MODEL_PATH} \
15 | --max_observation_length 1024 \
16 | --method ${STRATEGY} \
17 | --input_query_file solvable_queries/test_instruction/${group}.json \
18 | --output_answer_file $OUTPUT_DIR/$group \
19 | --max_query_count 30 \
20 | --num_thread 4
--------------------------------------------------------------------------------
/scripts_eval/baseline-eto/run_convert_answer.sh:
--------------------------------------------------------------------------------
1 | cd stabletoolbench/toolbench/tooleval
2 | export RAW_ANSWER_PATH=../../../data_eval/answer
3 | export CONVERTED_ANSWER_PATH=../../../data_eval/model_predictions_converted
4 | export MODEL_NAME=baseline-eto_dfs # change it accordingly
5 | export STRATEGY="DFS_woFilter_w2" # or CoT@1
6 | export test_set=G2_instruction # G1_category, G1_tool, G2_category, G2_instruction, G3_instruction
7 |
8 | mkdir -p ${CONVERTED_ANSWER_PATH}/${MODEL_NAME}
9 | answer_dir=${RAW_ANSWER_PATH}/${MODEL_NAME}/${test_set}
10 | output_file=${CONVERTED_ANSWER_PATH}/${MODEL_NAME}/${test_set}.json
11 |
12 | python convert_to_answer_format.py\
13 | --answer_dir ${answer_dir} \
14 | --method ${STRATEGY} \
15 | --output ${output_file}
--------------------------------------------------------------------------------
/scripts_eval/baseline-eto/run_pass_rate.sh:
--------------------------------------------------------------------------------
1 | cd stabletoolbench/toolbench/tooleval
2 | export API_POOL_FILE=../../openai_key.json
3 | export CONVERTED_ANSWER_PATH=../../../data_eval/model_predictions_converted
4 | export SAVE_PATH=../../../data_eval/pass_rate_results
5 | mkdir -p ${SAVE_PATH}
6 | export CANDIDATE_MODEL="baseline-eto_dfs" # change it accordingly
7 | export EVAL_MODEL=gpt-4-turbo-2024-04-09
8 | mkdir -p ${SAVE_PATH}/${CANDIDATE_MODEL}
9 |
10 | export test_set=G2_instruction # G1_category, G1_tool, G2_category, G2_instruction, G3_instruction
11 |
12 | python eval_pass_rate.py \
13 | --converted_answer_path ${CONVERTED_ANSWER_PATH} \
14 | --save_path ${SAVE_PATH}/${CANDIDATE_MODEL} \
15 | --reference_model ${CANDIDATE_MODEL} \
16 | --test_ids ../../solvable_queries/test_query_ids \
17 | --max_eval_threads 15 \
18 | --evaluate_times 3 \
19 | --test_set ${test_set} \
20 | # --overwrite
--------------------------------------------------------------------------------
/scripts_eval/baseline-ppo/inference_ppo_vllm.sh:
--------------------------------------------------------------------------------
1 | cd stabletoolbench
2 | export PYTHONPATH=./
3 | export VLLM_API_BASE="http://127.0.0.1:8084/v1/" # the address of vllm.server
4 | export SERVICE_URL="http://localhost:8081/virtual" # the address of api server
5 | export MODEL_PATH="baseline-ppo" # the name of vllm.server
6 | export STRATEGY="DFS_woFilter_w2" # or CoT@1
7 |
8 | export OUTPUT_DIR="data_eval/answer/baseline-ppo_cot" # change it accordingly
9 |
10 | group=G1_instruction # G1_category, G1_tool, G2_category, G2_instruction, G3_instruction
11 | mkdir -p $OUTPUT_DIR; mkdir -p $OUTPUT_DIR/$group
12 | python toolbench/inference/qa_pipeline_multithread.py \
13 | --backbone_model ToolLLaMA_vllm \
14 | --model_path ${MODEL_PATH} \
15 | --max_observation_length 1024 \
16 | --method ${STRATEGY} \
17 | --input_query_file solvable_queries/test_instruction/${group}.json \
18 | --output_answer_file $OUTPUT_DIR/$group \
19 | --max_query_count 30 \
20 | --num_thread 4
--------------------------------------------------------------------------------
/scripts_eval/baseline-ppo/run_convert_answer.sh:
--------------------------------------------------------------------------------
1 | cd stabletoolbench/toolbench/tooleval
2 | export RAW_ANSWER_PATH=../../../data_eval/answer
3 | export CONVERTED_ANSWER_PATH=../../../data_eval/model_predictions_converted
4 | export MODEL_NAME=baseline-ppo_dfs # change it accordingly
5 | export STRATEGY="DFS_woFilter_w2" # or CoT@1
6 | export test_set=G1_instruction # G1_category, G1_tool, G2_category, G2_instruction, G3_instruction
7 |
8 | mkdir -p ${CONVERTED_ANSWER_PATH}/${MODEL_NAME}
9 | answer_dir=${RAW_ANSWER_PATH}/${MODEL_NAME}/${test_set}
10 | output_file=${CONVERTED_ANSWER_PATH}/${MODEL_NAME}/${test_set}.json
11 |
12 | python convert_to_answer_format.py\
13 | --answer_dir ${answer_dir} \
14 | --method ${STRATEGY} \
15 | --output ${output_file}
--------------------------------------------------------------------------------
/scripts_eval/baseline-ppo/run_pass_rate.sh:
--------------------------------------------------------------------------------
1 | cd stabletoolbench/toolbench/tooleval
2 | export API_POOL_FILE=../../openai_key.json
3 | export CONVERTED_ANSWER_PATH=../../../data_eval/model_predictions_converted
4 | export SAVE_PATH=../../../data_eval/pass_rate_results
5 | mkdir -p ${SAVE_PATH}
6 | export CANDIDATE_MODEL="baseline-ppo_dfs" # change it accordingly
7 | export EVAL_MODEL=gpt-4-turbo-2024-04-09
8 | mkdir -p ${SAVE_PATH}/${CANDIDATE_MODEL}
9 |
10 | export test_set=G1_instruction # G1_category, G1_tool, G2_category, G2_instruction, G3_instruction
11 |
12 | python eval_pass_rate.py \
13 | --converted_answer_path ${CONVERTED_ANSWER_PATH} \
14 | --save_path ${SAVE_PATH}/${CANDIDATE_MODEL} \
15 | --reference_model ${CANDIDATE_MODEL} \
16 | --test_ids ../../solvable_queries/test_query_ids \
17 | --max_eval_threads 1 \
18 | --evaluate_times 3 \
19 | --test_set ${test_set} \
20 | # --overwrite
--------------------------------------------------------------------------------
/scripts_eval/baseline-rft/inference_rft_vllm.sh:
--------------------------------------------------------------------------------
1 | cd stabletoolbench
2 | export PYTHONPATH=./
3 | export VLLM_API_BASE="http://127.0.0.1:8084/v1/" # the address of vllm.server
4 | export SERVICE_URL="http://localhost:8081/virtual" # the address of api server
5 | export MODEL_PATH="baseline-rft" # the name of vllm.server
6 | export STRATEGY="DFS_woFilter_w2" # or CoT@1
7 |
8 | export OUTPUT_DIR="data_eval/answer/baseline-rft_cot" # change it accordingly
9 |
10 | group=G1_instruction # G1_category, G1_tool, G2_category, G2_instruction, G3_instruction
11 | mkdir -p $OUTPUT_DIR; mkdir -p $OUTPUT_DIR/$group
12 | python toolbench/inference/qa_pipeline_multithread.py \
13 | --backbone_model ToolLLaMA_vllm \
14 | --model_path ${MODEL_PATH} \
15 | --max_observation_length 1024 \
16 | --method ${STRATEGY} \
17 | --input_query_file solvable_queries/test_instruction/${group}.json \
18 | --output_answer_file $OUTPUT_DIR/$group \
19 | --max_query_count 30 \
20 | --num_thread 4
--------------------------------------------------------------------------------
/scripts_eval/baseline-rft/run_convert_answer.sh:
--------------------------------------------------------------------------------
1 | cd stabletoolbench/toolbench/tooleval
2 | export RAW_ANSWER_PATH=../../../data_eval/answer
3 | export CONVERTED_ANSWER_PATH=../../../data_eval/model_predictions_converted
4 | export MODEL_NAME=baseline-rft_dfs # change it accordingly
5 | export STRATEGY="DFS_woFilter_w2" # or CoT@1
6 | export test_set=G1_instruction # G1_category, G1_tool, G2_category, G2_instruction, G3_instruction
7 |
8 | mkdir -p ${CONVERTED_ANSWER_PATH}/${MODEL_NAME}
9 | answer_dir=${RAW_ANSWER_PATH}/${MODEL_NAME}/${test_set}
10 | output_file=${CONVERTED_ANSWER_PATH}/${MODEL_NAME}/${test_set}.json
11 |
12 | python convert_to_answer_format.py\
13 | --answer_dir ${answer_dir} \
14 | --method ${STRATEGY} \
15 | --output ${output_file}
--------------------------------------------------------------------------------
/scripts_eval/baseline-rft/run_pass_rate.sh:
--------------------------------------------------------------------------------
1 | cd stabletoolbench/toolbench/tooleval
2 | export API_POOL_FILE=../../openai_key.json
3 | export CONVERTED_ANSWER_PATH=../../../data_eval/model_predictions_converted
4 | export SAVE_PATH=../../../data_eval/pass_rate_results
5 | mkdir -p ${SAVE_PATH}
6 | export CANDIDATE_MODEL="baseline-rft_dfs" # change it accordingly
7 | export EVAL_MODEL=gpt-4-turbo-2024-04-09
8 | mkdir -p ${SAVE_PATH}/${CANDIDATE_MODEL}
9 |
10 | export test_set=G1_instruction # G1_category, G1_tool, G2_category, G2_instruction, G3_instruction
11 |
12 | python eval_pass_rate.py \
13 | --converted_answer_path ${CONVERTED_ANSWER_PATH} \
14 | --save_path ${SAVE_PATH}/${CANDIDATE_MODEL} \
15 | --reference_model ${CANDIDATE_MODEL} \
16 | --test_ids ../../solvable_queries/test_query_ids \
17 | --max_eval_threads 1 \
18 | --evaluate_times 3 \
19 | --test_set ${test_set} \
20 | # --overwrite
--------------------------------------------------------------------------------
/scripts_eval/llama3-1/inference_llama3-1_vllm.sh:
--------------------------------------------------------------------------------
1 | cd stabletoolbench
2 | export PYTHONPATH=./
3 | export VLLM_API_BASE="http://127.0.0.1:8085/v1/" # the address of vllm.server
4 | export SERVICE_URL="http://127.0.0.1:8081/virtual" # the address of api server
5 | export MODEL_PATH="llama3-1" # the name of vllm.server
6 | export STRATEGY="DFS_woFilter_w2" # or CoT@1
7 |
8 | export OUTPUT_DIR="data_eval/answer/virtual_llama3-1_dfs" # change it accordingly
9 |
10 | group=G1_instruction # G1_category, G1_tool, G2_category, G2_instruction, G3_instruction
11 | mkdir -p $OUTPUT_DIR; mkdir -p $OUTPUT_DIR/$group
12 | python toolbench/inference/qa_pipeline_multithread.py \
13 | --backbone_model llama3 \
14 | --model_path ${MODEL_PATH} \
15 | --max_observation_length 1024 \
16 | --method ${STRATEGY} \
17 | --input_query_file solvable_queries/test_instruction/${group}.json \
18 | --output_answer_file $OUTPUT_DIR/$group \
19 | --max_query_count 30 \
20 | --num_thread 4
--------------------------------------------------------------------------------
/scripts_eval/llama3-1/run_conver_answer.sh:
--------------------------------------------------------------------------------
1 | cd stabletoolbench/toolbench/tooleval
2 | export RAW_ANSWER_PATH=../../data_eval/answer
3 | export CONVERTED_ANSWER_PATH=../../data_eval/model_predictions_converted
4 | export MODEL_NAME=virtual_llama3-1_dfs # change it accordingly
5 | export STRATEGY="DFS_woFilter_w2" # or CoT@1
6 | export test_set=G1_instruction # G1_category, G1_tool, G2_category, G2_instruction, G3_instruction
7 |
8 | mkdir -p ${CONVERTED_ANSWER_PATH}/${MODEL_NAME}
9 | answer_dir=${RAW_ANSWER_PATH}/${MODEL_NAME}/${test_set}
10 | output_file=${CONVERTED_ANSWER_PATH}/${MODEL_NAME}/${test_set}.json
11 |
12 | python convert_to_answer_format.py\
13 | --answer_dir ${answer_dir} \
14 | --method ${STRATEGY} \
15 | --output ${output_file}
--------------------------------------------------------------------------------
/scripts_eval/llama3-1/run_pass_rate.sh:
--------------------------------------------------------------------------------
1 | cd stabletoolbench/toolbench/tooleval
2 | export API_POOL_FILE=../../openai_key.json
3 | export CONVERTED_ANSWER_PATH=../../data_eval/model_predictions_converted
4 | export SAVE_PATH=../../data_eval/pass_rate_results
5 | mkdir -p ${SAVE_PATH}
6 | export CANDIDATE_MODEL="virtual_llama3-1_dfs" # change it accordingly
7 | export EVAL_MODEL=gpt-4-turbo-2024-04-09
8 | mkdir -p ${SAVE_PATH}/${CANDIDATE_MODEL}
9 |
10 | export test_set=G1_instruction # G1_category, G1_tool, G2_category, G2_instruction, G3_instruction
11 |
12 | python eval_pass_rate.py \
13 | --converted_answer_path ${CONVERTED_ANSWER_PATH} \
14 | --save_path ${SAVE_PATH}/${CANDIDATE_MODEL} \
15 | --reference_model ${CANDIDATE_MODEL} \
16 | --test_ids ../../solvable_queries/test_query_ids \
17 | --max_eval_threads 1 \
18 | --evaluate_times 3 \
19 | --test_set ${test_set} \
20 | # --overwrite
--------------------------------------------------------------------------------
/scripts_eval/llama3-1/run_preference.sh:
--------------------------------------------------------------------------------
1 | cd toolbench/tooleval
2 | export CONVERTED_ANSWER_PATH=../../data_eval/model_predictions_converted
3 | export SAVE_PATH=../../data_eval/preference_results
4 | export PASS_RATE_PATH=../../data_eval/pass_rate_results
5 |
6 | export REFERENCE_MODEL=virtual_gpt3.5-0125_dfs # change it accordingly
7 | export CANDIDATE_MODEL=virtual_llama3-1_dfs # change it accordingly
8 |
9 | export EVAL_MODEL=gpt-4-turbo-2024-04-09
10 | mkdir -p ${SAVE_PATH}/${REFERENCE_MODEL}_${CANDIDATE_MODEL}
11 |
12 | export test_set=G1_instruction # G1_category, G1_tool, G2_category, G2_instruction, G3_instruction
13 |
14 | python eval_preference.py \
15 | --converted_answer_path ${CONVERTED_ANSWER_PATH} \
16 | --reference_model ${REFERENCE_MODEL} \
17 | --output_model ${CANDIDATE_MODEL} \
18 | --test_ids ../../solvable_queries/test_query_ids/ \
19 | --save_path ${SAVE_PATH}/${REFERENCE_MODEL}_${CANDIDATE_MODEL} \
20 | --pass_rate_result_path ${PASS_RATE_PATH} \
21 | --max_eval_threads 30 \
22 | --evaluate_times 3 \
23 | --test_set ${test_set} \
24 | # --overwrite
--------------------------------------------------------------------------------
/scripts_eval/qwen2/inference_qwen2_vllm.sh:
--------------------------------------------------------------------------------
1 | cd stabletoolbench
2 | export PYTHONPATH=./
3 | export VLLM_API_BASE="http://127.0.0.1:8084/v1/" # the address of vllm.server
4 | export SERVICE_URL="http://localhost:8081/virtual" # the address of api server
5 | export MODEL_PATH="qwen2" # the name of vllm.server
6 | export STRATEGY="DFS_woFilter_w2" # or CoT@1
7 |
8 | export OUTPUT_DIR="data_eval/answer/virtual_qwen2_dfs" # change it accordingly
9 |
10 | group=G1_instruction # G1_category, G1_tool, G2_category, G2_instruction, G3_instruction
11 | mkdir -p $OUTPUT_DIR; mkdir -p $OUTPUT_DIR/$group
12 | python toolbench/inference/qa_pipeline_multithread.py \
13 | --backbone_model qwen2 \
14 | --model_path ${MODEL_PATH} \
15 | --max_observation_length 1024 \
16 | --method ${STRATEGY} \
17 | --input_query_file solvable_queries/test_instruction/${group}.json \
18 | --output_answer_file $OUTPUT_DIR/$group \
19 | --max_query_count 30 \
20 | --num_thread 4
--------------------------------------------------------------------------------
/scripts_eval/qwen2/run_convert_answer.sh:
--------------------------------------------------------------------------------
1 | cd stabletoolbench/toolbench/tooleval
2 | export RAW_ANSWER_PATH=../../data_eval/answer
3 | export CONVERTED_ANSWER_PATH=../../data_eval/model_predictions_converted
4 | export MODEL_NAME=virtual_qwen2_dfs # change it accordingly
5 | export STRATEGY="DFS_woFilter_w2" # or CoT@1
6 | export test_set=G1_instruction # G1_category, G1_tool, G2_category, G2_instruction, G3_instruction
7 |
8 | mkdir -p ${CONVERTED_ANSWER_PATH}/${MODEL_NAME}
9 | answer_dir=${RAW_ANSWER_PATH}/${MODEL_NAME}/${test_set}
10 | output_file=${CONVERTED_ANSWER_PATH}/${MODEL_NAME}/${test_set}.json
11 |
12 | python convert_to_answer_format.py\
13 | --answer_dir ${answer_dir} \
14 | --method ${STRATEGY} \
15 | --output ${output_file}
--------------------------------------------------------------------------------
/scripts_eval/qwen2/run_pass_rate.sh:
--------------------------------------------------------------------------------
1 | cd stabletoolbench/toolbench/tooleval
2 | export API_POOL_FILE=../../openai_key.json
3 | export CONVERTED_ANSWER_PATH=../../data_eval/model_predictions_converted
4 | export SAVE_PATH=../../data_eval/pass_rate_results
5 | mkdir -p ${SAVE_PATH}
6 | export CANDIDATE_MODEL="virtual_qwen2_dfs" # change it accordingly
7 | export EVAL_MODEL=gpt-4-turbo-2024-04-09
8 | mkdir -p ${SAVE_PATH}/${CANDIDATE_MODEL}
9 |
10 | export test_set=G1_instruction # G1_category, G1_tool, G2_category, G2_instruction, G3_instruction
11 |
12 | python eval_pass_rate.py \
13 | --converted_answer_path ${CONVERTED_ANSWER_PATH} \
14 | --save_path ${SAVE_PATH}/${CANDIDATE_MODEL} \
15 | --reference_model ${CANDIDATE_MODEL} \
16 | --test_ids ../../solvable_queries/test_query_ids \
17 | --max_eval_threads 1 \
18 | --evaluate_times 3 \
19 | --test_set ${test_set} \
20 | # --overwrite
--------------------------------------------------------------------------------
/scripts_eval/qwen2/run_preference.sh:
--------------------------------------------------------------------------------
1 | cd toolbench/tooleval
2 | export CONVERTED_ANSWER_PATH=../../data_eval/model_predictions_converted
3 | export SAVE_PATH=../../data_eval/preference_results
4 | export PASS_RATE_PATH=../../data_eval/pass_rate_results
5 |
6 | export REFERENCE_MODEL=virtual_gpt3.5-0125_dfs # change it accordingly
7 | export CANDIDATE_MODEL=virtual_qwen2_dfs # change it accordingly
8 |
9 | export EVAL_MODEL=gpt-4-turbo-2024-04-09
10 | mkdir -p ${SAVE_PATH}/${REFERENCE_MODEL}_${CANDIDATE_MODEL}
11 |
12 | export test_set=G1_instruction # G1_category, G1_tool, G2_category, G2_instruction, G3_instruction
13 |
14 | python eval_preference.py \
15 | --converted_answer_path ${CONVERTED_ANSWER_PATH} \
16 | --reference_model ${REFERENCE_MODEL} \
17 | --output_model ${CANDIDATE_MODEL} \
18 | --test_ids ../../solvable_queries/test_query_ids/ \
19 | --save_path ${SAVE_PATH}/${REFERENCE_MODEL}_${CANDIDATE_MODEL} \
20 | --pass_rate_result_path ${PASS_RATE_PATH} \
21 | --max_eval_threads 30 \
22 | --evaluate_times 3 \
23 | --test_set ${test_set} \
24 | # --overwrite
--------------------------------------------------------------------------------
/scripts_eval/steptool/inference_steptool_vllm.sh:
--------------------------------------------------------------------------------
1 | cd stabletoolbench
2 | export PYTHONPATH=./
3 | export VLLM_API_BASE="http://127.0.0.1:8084/v1/" # the address of vllm.server
4 | export SERVICE_URL="http://localhost:8081/virtual" # the address of api server
5 | export MODEL_PATH="steptool" # the name of vllm.server
6 | export STRATEGY="DFS_woFilter_w2" # or CoT@1
7 |
8 | export OUTPUT_DIR="data_eval/answer/steptool_cot" # change it accordingly
9 |
10 | group=G1_instruction # G1_category, G1_tool, G2_category, G2_instruction, G3_instruction
11 | mkdir -p $OUTPUT_DIR; mkdir -p $OUTPUT_DIR/$group
12 | python toolbench/inference/qa_pipeline_multithread.py \
13 | --backbone_model ToolLLaMA_vllm \
14 | --model_path ${MODEL_PATH} \
15 | --max_observation_length 1024 \
16 | --method ${STRATEGY} \
17 | --input_query_file solvable_queries/test_instruction/${group}.json \
18 | --output_answer_file $OUTPUT_DIR/$group \
19 | --max_query_count 30 \
20 | --num_thread 4
--------------------------------------------------------------------------------
/scripts_eval/steptool/run_convert_answer.sh:
--------------------------------------------------------------------------------
1 | cd stabletoolbench/toolbench/tooleval
2 | export RAW_ANSWER_PATH=../../../data_eval/answer
3 | export CONVERTED_ANSWER_PATH=../../../data_eval/model_predictions_converted
4 | export MODEL_NAME=steptool_dfs # change it accordingly
5 | export STRATEGY="DFS_woFilter_w2" # or CoT@1
6 | export test_set=G1_instruction # G1_category, G1_tool, G2_category, G2_instruction, G3_instruction
7 |
8 | mkdir -p ${CONVERTED_ANSWER_PATH}/${MODEL_NAME}
9 | answer_dir=${RAW_ANSWER_PATH}/${MODEL_NAME}/${test_set}
10 | output_file=${CONVERTED_ANSWER_PATH}/${MODEL_NAME}/${test_set}.json
11 |
12 | python convert_to_answer_format.py\
13 | --answer_dir ${answer_dir} \
14 | --method ${STRATEGY} \
15 | --output ${output_file}
--------------------------------------------------------------------------------
/scripts_eval/steptool/run_pass_rate.sh:
--------------------------------------------------------------------------------
1 | cd stabletoolbench/toolbench/tooleval
2 | export API_POOL_FILE=../../openai_key.json
3 | export CONVERTED_ANSWER_PATH=../../../data_eval/model_predictions_converted
4 | export SAVE_PATH=../../../data_eval/pass_rate_results
5 | mkdir -p ${SAVE_PATH}
6 | export CANDIDATE_MODEL="steptool_cot" # change it accordingly
7 | export EVAL_MODEL=gpt-4-turbo-2024-04-09
8 | mkdir -p ${SAVE_PATH}/${CANDIDATE_MODEL}
9 |
10 | export test_set=G1_instruction # G1_category, G1_tool, G2_category, G2_instruction, G3_instruction
11 |
12 | python eval_pass_rate.py \
13 | --converted_answer_path ${CONVERTED_ANSWER_PATH} \
14 | --save_path ${SAVE_PATH}/${CANDIDATE_MODEL} \
15 | --reference_model ${CANDIDATE_MODEL} \
16 | --test_ids ../../solvable_queries/test_query_ids \
17 | --max_eval_threads 1 \
18 | --evaluate_times 3 \
19 | --test_set ${test_set} \
20 | # --overwrite
--------------------------------------------------------------------------------
/scripts_eval/toolllama-sft/inference_toolllama_vllm.sh:
--------------------------------------------------------------------------------
1 | cd stabletoolbench
2 | export PYTHONPATH=./
3 | export VLLM_API_BASE="http://127.0.0.1:8083/v1/" # the address of vllm.server
4 | export SERVICE_URL="http://127.0.0.1:8081/virtual" # the address of api server
5 | export MODEL_PATH="toolllama" # the name of vllm.server
6 | export STRATEGY="DFS_woFilter_w2" # or CoT@1
7 |
8 | export OUTPUT_DIR="data_eval/answer/toolllama_sft_dfs" # change it accordingly
9 |
10 | group=G1_instruction # G1_category, G1_tool, G2_category, G2_instruction, G3_instruction
11 | mkdir -p $OUTPUT_DIR; mkdir -p $OUTPUT_DIR/$group
12 | python toolbench/inference/qa_pipeline_multithread.py \
13 | --backbone_model ToolLLaMA_vllm \
14 | --model_path ${MODEL_PATH} \
15 | --max_observation_length 1024 \
16 | --method ${STRATEGY} \
17 | --input_query_file solvable_queries/test_instruction/${group}.json \
18 | --output_answer_file $OUTPUT_DIR/$group \
19 | --max_query_count 30 \
20 | --num_thread 4
--------------------------------------------------------------------------------
/scripts_eval/toolllama-sft/run_conver_answer.sh:
--------------------------------------------------------------------------------
1 | cd stabletoolbench/toolbench/tooleval
2 | export RAW_ANSWER_PATH=../../../data_eval/answer
3 | export CONVERTED_ANSWER_PATH=../../../data_eval/model_predictions_converted
4 | export MODEL_NAME=toolllama_sft_dfs # change it accordingly
5 | export STRATEGY="DFS_woFilter_w2" # or CoT@1
6 | export test_set=G1_tool # G1_category, G1_tool, G2_category, G2_instruction, G3_instruction
7 |
8 | mkdir -p ${CONVERTED_ANSWER_PATH}/${MODEL_NAME}
9 | answer_dir=${RAW_ANSWER_PATH}/${MODEL_NAME}/${test_set}
10 | output_file=${CONVERTED_ANSWER_PATH}/${MODEL_NAME}/${test_set}.json
11 |
12 | python convert_to_answer_format.py\
13 | --answer_dir ${answer_dir} \
14 | --method ${STRATEGY} \
15 | --output ${output_file}
--------------------------------------------------------------------------------
/scripts_eval/toolllama-sft/run_pass_rate.sh:
--------------------------------------------------------------------------------
1 | cd stabletoolbench/toolbench/tooleval
2 | export API_POOL_FILE=../../openai_key.json
3 | export CONVERTED_ANSWER_PATH=../../../data_eval/model_predictions_converted
4 | export SAVE_PATH=../../../data_eval/pass_rate_results
5 | mkdir -p ${SAVE_PATH}
6 | export CANDIDATE_MODEL="toolllama_sft_dfs" # change it accordingly
7 | export EVAL_MODEL=gpt-4-turbo-2024-04-09
8 | mkdir -p ${SAVE_PATH}/${CANDIDATE_MODEL}
9 |
10 | export test_set=G1_instruction # G1_category, G1_tool, G2_category, G2_instruction, G3_instruction
11 |
12 | python eval_pass_rate.py \
13 | --converted_answer_path ${CONVERTED_ANSWER_PATH} \
14 | --save_path ${SAVE_PATH}/${CANDIDATE_MODEL} \
15 | --reference_model ${CANDIDATE_MODEL} \
16 | --test_ids ../../solvable_queries/test_query_ids \
17 | --max_eval_threads 15 \
18 | --evaluate_times 3 \
19 | --test_set ${test_set} \
20 | # --overwrite
--------------------------------------------------------------------------------
/scripts_eval/toolllama/run_preference.sh:
--------------------------------------------------------------------------------
1 | cd toolbench/tooleval
2 | export CONVERTED_ANSWER_PATH=../../data_eval/model_predictions_converted
3 | export SAVE_PATH=../../data_eval/preference_results
4 | export PASS_RATE_PATH=../../data_eval/pass_rate_results
5 |
6 | export REFERENCE_MODEL=virtual_gpt3.5-0125_dfs # change it accordingly
7 | export CANDIDATE_MODEL=virtual_toolllama_dfs # change it accordingly
8 |
9 | export EVAL_MODEL=gpt-4-turbo-2024-04-09
10 | mkdir -p ${SAVE_PATH}/${REFERENCE_MODEL}_${CANDIDATE_MODEL}
11 |
12 | export test_set=G1_instruction # G1_category, G1_tool, G2_category, G2_instruction, G3_instruction
13 |
14 | python eval_preference.py \
15 | --converted_answer_path ${CONVERTED_ANSWER_PATH} \
16 | --reference_model ${REFERENCE_MODEL} \
17 | --output_model ${CANDIDATE_MODEL} \
18 | --test_ids ../../solvable_queries/test_query_ids/ \
19 | --save_path ${SAVE_PATH}/${REFERENCE_MODEL}_${CANDIDATE_MODEL} \
20 | --pass_rate_result_path ${PASS_RATE_PATH} \
21 | --max_eval_threads 30 \
22 | --evaluate_times 3 \
23 | --test_set ${test_set} \
24 | # --overwrite
--------------------------------------------------------------------------------
/src/baseline-archer/archer_critic.py:
--------------------------------------------------------------------------------
1 | # Ref: https://github.com/YifeiZhou02/ArCHer
2 |
3 | # @misc{zhou2024archer,
4 | # title={ArCHer: Training Language Model Agents via Hierarchical Multi-Turn RL},
5 | # author={Yifei Zhou and Andrea Zanette and Jiayi Pan and Sergey Levine and Aviral Kumar},
6 | # year={2024},
7 | # eprint={2402.19446},
8 | # archivePrefix={arXiv},
9 | # primaryClass={cs.LG}
10 | # }
11 |
12 | import torch
13 | from transformers import AutoTokenizer, AutoModel
14 | import torch.nn as nn
15 | import numpy as np
16 | from transformers import RobertaTokenizer, RobertaModel
17 | class DoubleCritic(torch.nn.Module):
18 | def __init__(self, device, accelerator, critic_lm, cache_dir, in_dim, out_dim):
19 | super(DoubleCritic, self).__init__()
20 | self.device = device
21 | self.accelerator = accelerator
22 | self.base_lm = AutoModel.from_pretrained(critic_lm, cache_dir=cache_dir).to(device)
23 | self.base_tokenizer = AutoTokenizer.from_pretrained(critic_lm, cache_dir=cache_dir)
24 | self.base_tokenizer.truncation_side = 'left'
25 | self.critic1 = nn.Sequential(nn.Linear(in_dim*2, in_dim),\
26 | nn.ReLU(),\
27 | nn.Linear(in_dim, in_dim),\
28 | nn.ReLU(),\
29 | nn.Linear(in_dim, out_dim)).to(device)
30 | self.critic2 = nn.Sequential(nn.Linear(in_dim*2, in_dim),\
31 | nn.ReLU(),\
32 | nn.Linear(in_dim, in_dim),\
33 | nn.ReLU(),\
34 | nn.Linear(in_dim, out_dim)).to(device)
35 | self.v_critic1 = nn.Sequential(nn.Linear(in_dim, in_dim),\
36 | nn.ReLU(),\
37 | nn.Linear(in_dim, in_dim),\
38 | nn.ReLU(),\
39 | nn.Linear(in_dim, out_dim)).to(device)
40 | self.v_critic2 = nn.Sequential(nn.Linear(in_dim, in_dim),\
41 | nn.ReLU(),\
42 | nn.Linear(in_dim, in_dim),\
43 | nn.ReLU(),\
44 | nn.Linear(in_dim, out_dim)).to(device)
45 |
46 | # def prepare(self):
47 | # self.base_lm, self.critic1, self.critic2, self.v_critic1, self.v_critic2 = \
48 | # self.accelerator.prepare(self.base_lm, self.critic1, self.critic2, self.v_critic1, self.v_critic2)
49 |
50 | def forward(self, observation, action, detach_model=False):
51 | state_actions = [o + a for o,a in zip(observation, action)]
52 | obs_ids = self.base_tokenizer(observation, padding = True, return_tensors='pt', max_length=512, truncation = True).to(self.device)
53 | # breakpoint()
54 | if detach_model:
55 | with torch.no_grad():
56 | lm_states = self.base_lm(**obs_ids).pooler_output
57 | else:
58 | lm_states = self.base_lm(**obs_ids).pooler_output
59 | action_ids = self.base_tokenizer(action, padding = True, return_tensors='pt', max_length=512, truncation = True).to(self.device)
60 | # breakpoint()
61 | if detach_model:
62 | with torch.no_grad():
63 | action_states = self.base_lm(**action_ids).pooler_output
64 | else:
65 | action_states = self.base_lm(**action_ids).pooler_output
66 | q_states = torch.cat([lm_states, action_states], dim = 1)
67 | # print(action.size())
68 | return self.critic1(q_states), self.critic2(q_states), self.v_critic1(lm_states), self.v_critic2(lm_states)
--------------------------------------------------------------------------------
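Note: a minimal usage sketch of DoubleCritic (illustrative, not part of the repository), assuming a RoBERTa-style encoder such as roberta-base (hidden size 768) whose outputs expose pooler_output, run from src/baseline-archer/:

import torch
from archer_critic import DoubleCritic

# Instantiate the twin Q-heads and V-heads on top of the encoder.
device = "cuda" if torch.cuda.is_available() else "cpu"
critic = DoubleCritic(device=device, accelerator=None,
                      critic_lm="roberta-base", cache_dir=None,
                      in_dim=768, out_dim=1)

# Score one (observation, action) pair; detach_model=True skips encoder gradients.
q1, q2, v1, v2 = critic(["User: what is the weather in Paris today?"],
                        ["Action: call get_weather"], detach_model=True)
print(q1.shape, v1.shape)  # torch.Size([1, 1]) torch.Size([1, 1])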
/src/baseline-archer/archer_data.py:
--------------------------------------------------------------------------------
1 | # Ref: https://github.com/YifeiZhou02/ArCHer
2 |
3 | # @misc{zhou2024archer,
4 | # title={ArCHer: Training Language Model Agents via Hierarchical Multi-Turn RL},
5 | # author={Yifei Zhou and Andrea Zanette and Jiayi Pan and Sergey Levine and Aviral Kumar},
6 | # year={2024},
7 | # eprint={2402.19446},
8 | # archivePrefix={arXiv},
9 | # primaryClass={cs.LG}
10 | # }
11 |
12 | from torch.utils.data import Dataset, DataLoader
13 | import numpy as np
14 | class DummyDataset(Dataset):
15 | def __init__(self, buffer):
16 | self.buffer = buffer
17 |
18 | def __len__(self):
19 | return len(self.buffer)
20 |
21 | def __getitem__(self, idx):
22 | return self.buffer[idx]
23 |
24 |
25 | class ReplayBuffer:
26 | def __init__(self, batch_size=2, capacity=10000):
27 | self.max_size = capacity
28 | self.size = 0
29 | self.observations = None
30 | self.rewards = None
31 | self.next_observations = None
32 | self.dones = None
33 | self.batch_size = batch_size
34 | self.actions = None
35 | self.mc_returns = None
36 |
37 | def sample(self, batch_size=None):
38 | if batch_size is None:
39 | batch_size = self.batch_size
40 | rand_indices = np.random.randint(0, self.size, size=(batch_size,)) % self.max_size
41 | return {
42 | "observation": self.observations[rand_indices],
43 | "action": self.actions[rand_indices],
44 | "reward": self.rewards[rand_indices],
45 | "next_observation": self.next_observations[rand_indices],
46 | "done": self.dones[rand_indices],
47 | "mc_return": self.mc_returns[rand_indices],
48 | }
49 |
50 | def __len__(self):
51 | return self.size
52 |
53 | def insert(
54 | self,
55 | /,
56 | observation,
57 | action,
58 | reward: np.ndarray,
59 | next_observation,
60 | done: np.ndarray,
61 | mc_return,
62 | **kwargs
63 | ):
64 | """
65 | Insert a single transition into the replay buffer.
66 |
67 | Use like:
68 | replay_buffer.insert(
69 | observation=observation,
70 | action=action,
71 | reward=reward,
72 | next_observation=next_observation,
73 | done=done,
74 | )
75 | """
76 | if isinstance(reward, (float, int)):
77 | reward = np.array(reward)
78 | if isinstance(mc_return, (float, int)):
79 | mc_return = np.array(mc_return)
80 | if isinstance(done, bool):
81 | done = np.array(done)
82 | # print(next_observation)
83 | # if isinstance(prompt_actionaction, int):
84 | # action = np.array(action, dtype=np.int64)
85 |
86 | if self.observations is None:
87 | self.observations = np.array(['']*self.max_size, dtype = 'object')
88 | self.actions = np.array(['']*self.max_size, dtype = 'object')
89 | self.rewards = np.empty((self.max_size, *reward.shape), dtype=reward.dtype)
90 | self.next_observations = np.array(['']*self.max_size, dtype = 'object')
91 | self.dones = np.empty((self.max_size, *done.shape), dtype=done.dtype)
92 | self.mc_returns = np.empty((self.max_size, *mc_return.shape), dtype=mc_return.dtype)
93 |
94 | assert reward.shape == ()
95 | assert done.shape == ()
96 |
97 | self.observations[self.size % self.max_size] = observation
98 | self.actions[self.size % self.max_size] = action
99 | self.rewards[self.size % self.max_size] = reward
100 | self.next_observations[self.size % self.max_size] = next_observation
101 | self.dones[self.size % self.max_size] = done
102 | self.mc_returns[self.size % self.max_size] = mc_return
103 |
104 | self.size += 1
--------------------------------------------------------------------------------
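Note: a minimal sketch of how ReplayBuffer is used (illustrative values, run from src/baseline-archer/); transitions are passed as keyword arguments and sample() returns numpy batches keyed by field name:

from archer_data import ReplayBuffer

buffer = ReplayBuffer(batch_size=2, capacity=100)
# Each transition stores string observation/action and scalar reward / mc_return.
buffer.insert(observation="obs 0", action="act 0", reward=1.0,
              next_observation="obs 1", done=False, mc_return=1.475)
buffer.insert(observation="obs 1", action="act 1", reward=0.5,
              next_observation="obs 2", done=True, mc_return=0.5)

batch = buffer.sample(batch_size=2)
print(batch["observation"], batch["reward"])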
/src/baseline-archer/archer_environment.py:
--------------------------------------------------------------------------------
1 | # Ref: https://github.com/YifeiZhou02/ArCHer
2 |
3 | # @misc{zhou2024archer,
4 | # title={ArCHer: Training Language Model Agents via Hierarchical Multi-Turn RL},
5 | # author={Yifei Zhou and Andrea Zanette and Jiayi Pan and Sergey Levine and Aviral Kumar},
6 | # year={2024},
7 | # eprint={2402.19446},
8 | # archivePrefix={arXiv},
9 | # primaryClass={cs.LG}
10 | # }
11 |
12 | from tqdm import tqdm
13 | import numpy as np
14 |
15 | def add_trajectory_reward(trajectory):
16 | """
17 | add trajectory reward to the dict of each interaction
18 | """
19 | trajectory_reward = np.sum([d["reward"] for d in trajectory])
20 | for d in trajectory:
21 | d.update({"trajectory_reward": trajectory_reward})
22 | return trajectory
23 |
24 | def add_mc_return(trajectory, gamma = 0.95):
25 | """
26 |     add the discounted Monte Carlo return to the dict of each interaction
27 | """
28 | trajectory_rewards = np.array([d["reward"] for d in trajectory]).reshape(1, -1)
29 | gamma_row = np.cumprod(np.ones((1, trajectory_rewards.shape[1]))*gamma)
30 | gamma_matrix = np.triu(gamma_row.reshape(1, -1 )/ gamma_row.reshape(-1, 1))
31 | mc_returns = np.sum(trajectory_rewards*gamma_matrix, axis = 1)
32 | for d, mc in zip(trajectory, mc_returns):
33 | d.update({"mc_return": mc})
34 | return trajectory
35 |
36 |
37 | def batch_interact_environment(agent, tokenizer, env, num_trajectories,\
38 | post_f = lambda x: x, use_tqdm = True, decode_f = lambda x: x,
39 | env_idx = None):
40 | """
41 |     in a batched way, interact with the environments to get a list of trajectories
42 | [[{"observation":, "next_observation":, "reward":, "done":},...],...]
43 | post_f: function to add additional attributes to the trajectory
44 | """
45 | bsize = env.bsize
46 | all_trajectories = []
47 | for num_t in tqdm(range(num_trajectories//bsize), disable = not use_tqdm):
48 | done = False
49 | trajectories = [[] for _ in range(bsize)]
50 | # obs = reset_to(env, 69)
51 | batch_obs = env.reset(idx=env_idx)
52 | batch_done = [False,]*bsize
53 | steps = 0
54 | while not all(batch_done):
55 | steps += 1
56 |             # print(f"Environment steps {str(steps)}")
57 | action = agent.get_action(batch_obs)
58 | batch_return = env.step(decode_f(action))
59 | for i,result in zip(range(bsize), batch_return):
60 | if result is None:
61 | continue
62 | next_obs, r, done = result
63 | trajectories[i].append({"observation": batch_obs[i], \
64 | "next_observation": next_obs, \
65 | "reward": r, \
66 | "done": done, \
67 | "action": action[i]})
68 | batch_obs[i] = next_obs
69 | batch_done[i] = done
70 | # obs = next_obs
71 | print(trajectories[0][-1]["next_observation"])
72 | all_trajectories += [post_f(add_mc_return(add_trajectory_reward(trajectory)))\
73 | for trajectory in trajectories]
74 | # breakpoint()
75 | # trajectories.append(post_f(add_trajectory_reward(trajectory)))
76 | return all_trajectories
77 |
--------------------------------------------------------------------------------
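Note: in add_mc_return above, gamma_matrix[i, j] = gamma**(j - i) for j >= i (and 0 below the diagonal), so each step receives the discounted sum of its own and all later rewards. A quick illustrative check (run from src/baseline-archer/):

from archer_environment import add_mc_return

traj = [{"reward": 0.0}, {"reward": 0.0}, {"reward": 1.0}]
traj = add_mc_return(traj, gamma=0.5)
# mc_return[i] = sum_{j >= i} gamma**(j - i) * reward[j]
print([d["mc_return"] for d in traj])  # [0.25, 0.5, 1.0]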
/src/baseline-archer/build_archer_data.py:
--------------------------------------------------------------------------------
1 | from archer_data import ReplayBuffer
2 | import pandas as pd
3 | import numpy as np
4 | import os
5 | import torch
6 | import json
7 |
8 | model = "toolllama"
9 | buffer_batch_size = 2
10 | tool_data_file = os.environ.get("DATA_FILE", None)
11 |
12 | # bsize = 4
13 | df = pd.read_csv(tool_data_file, sep="\t")
14 |
15 |
16 | # build origin trajectory
17 | trajectories = [[] for _ in range(len(df))]
18 |
19 | MAX_LEN = 1024
20 |
21 | # TODO
22 | for i in range(0, len(df)):
23 | prompt_list = eval(df.iloc[i]["prompt"])
24 | response_list = eval(df.iloc[i]["response"])
25 | reward_list = eval(df.iloc[i]["reward"])
26 |
27 | obs = prompt_list[0]
28 | next_obs = obs + response_list[0] + prompt_list[1]
29 | done = False
30 | if len(obs) > MAX_LEN:
31 | obs = obs[-MAX_LEN:]
32 | if len(next_obs) > MAX_LEN:
33 |         next_obs = next_obs[-MAX_LEN:]
34 |
35 | trajectories[i].append({"observation": obs, \
36 | "next_observation": next_obs, \
37 | "reward": reward_list[0], \
38 | "done": done, \
39 | "action": response_list[0]})
40 | for j in range(1, len(response_list)):
41 | obs = next_obs
42 | next_obs = obs + response_list[j]
43 | if j+1 < len(response_list):
44 | next_obs += prompt_list[j+1]
45 | else:
46 | done = True
47 |
48 | if len(obs) > MAX_LEN:
49 | obs = obs[-MAX_LEN:]
50 | if len(next_obs) > MAX_LEN:
51 | next_obs = next_obs[-MAX_LEN:]
52 | trajectories[i].append({"observation": obs, \
53 | "next_observation": next_obs, \
54 | "reward": reward_list[j], \
55 | "done": done, \
56 | "action": response_list[j]})
57 |
58 |
59 | def add_trajectory_reward(trajectory):
60 | """
61 | add trajectory reward to the dict of each interaction
62 | """
63 | trajectory_reward = np.sum([d["reward"] for d in trajectory])
64 | for d in trajectory:
65 | d.update({"trajectory_reward": trajectory_reward})
66 | return trajectory
67 |
68 | def add_mc_return(trajectory, gamma = 0.95):
69 | """
70 |     add the discounted Monte Carlo return to the dict of each interaction
71 | """
72 | trajectory_rewards = np.array([d["reward"] for d in trajectory]).reshape(1, -1)
73 | gamma_row = np.cumprod(np.ones((1, trajectory_rewards.shape[1]))*gamma)
74 | gamma_matrix = np.triu(gamma_row.reshape(1, -1 )/ gamma_row.reshape(-1, 1))
75 | mc_returns = np.sum(trajectory_rewards*gamma_matrix, axis = 1)
76 | for d, mc in zip(trajectory, mc_returns):
77 | d.update({"mc_return": mc})
78 |
79 | return trajectory
80 |
81 | all_trajectories = [add_mc_return(add_trajectory_reward(trajectory))\
82 | for trajectory in trajectories]
83 |
84 | # save to json
85 | trajectory_json = {}
86 | for i in range(len(all_trajectories)):
87 | trajectory_json[i] = all_trajectories[i]
88 |
89 | with open("trajectories.json", "w") as f:
90 | json.dump(trajectory_json, f, indent=4, ensure_ascii=False)
91 |
92 |
93 | # build replay_buffer
94 | replay_buffer= ReplayBuffer(batch_size=buffer_batch_size)
95 |
96 | data = sum(all_trajectories, [])
97 | for t in data:
98 | replay_buffer.insert(**t)
99 |
100 | print(">>> Saving Replay Buffer")
101 | save_path = os.environ.get("SAVE_PATH", "save")
102 | os.makedirs(save_path, exist_ok=True)
103 | torch.save(replay_buffer, os.path.join(save_path, 'replay_buffer.pt'))
104 | torch.save(all_trajectories, os.path.join(save_path, 'trajectories.pt'))
105 |
--------------------------------------------------------------------------------
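Note: the script reads a tab-separated file whose prompt / response / reward columns are Python-literal lists with one entry per interaction step, as implied by the eval() calls above. An illustrative row (the text content is hypothetical):

import pandas as pd

row = {
    "prompt":   str(["System + user turn 1 ...", "Tool observation 1 ..."]),
    "response": str(["Thought and tool call 1", "Final answer"]),
    "reward":   str([0.5, 1.0]),
}
pd.DataFrame([row]).to_csv("archer_data_example.tsv", sep="\t", index=False)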
/src/baseline-archer/offpolicy_train_loop.py:
--------------------------------------------------------------------------------
1 | from archer_environment import batch_interact_environment
2 | from archer_data import DummyDataset, ReplayBuffer
3 | import numpy as np
4 | from torch.utils.data import Dataset, DataLoader
5 | from tqdm import tqdm
6 | from archer_trainer import ArcherTrainer
7 | import wandb
8 | import threading
9 | import os
10 | import torch
11 | import time
12 | def offpolicy_train_loop(env,\
13 | eval_env,\
14 | agent,\
15 | tokenizer,\
16 | accelerator,\
17 | warmup_iter: int = 20,
18 | rollout_size: int = 50,\
19 | eval_size: int = 1,
20 | batch_size: int = 2,
21 | capacity: int = 500000,
22 | iterations: int = 10,\
23 | epochs:int = 3, \
24 | grad_accum_steps: int = 1,\
25 | env_idx:int = None,\
26 | do_sample: bool = False,\
27 | temperature: float = 2.0,\
28 | critic_lr: float= 1e-3,\
29 | lm_lr: float = 1e-5,\
30 | gamma: float = 0.9,
31 | tau: float = 0.1,
32 | use_wandb: bool = False,
33 | env_load_path: str = '',
34 | actor_epochs: int = 3,
35 | max_grad_norm: float = 0.01,
36 | save_path: str = None,
37 | save_freq: int = 25,
38 | eval_freq: int = 25,
39 | agent_type: str = "archer",
40 | decode_f: callable = lambda x: x,
41 | **kwargs):
42 | if agent_type.lower() == "archer_toolllama":
43 | trainer = ArcherTrainer(agent=agent,\
44 | accelerator=accelerator,\
45 | tokenizer=tokenizer,\
46 | critic_lr = critic_lr,\
47 | lm_lr = lm_lr,\
48 | gamma = gamma,\
49 | tau = tau,\
50 | epochs = epochs,\
51 | actor_epochs = actor_epochs,
52 | grad_accum_steps=grad_accum_steps,
53 | max_grad_norm=max_grad_norm)
54 | replay_buffer= ReplayBuffer(batch_size= batch_size, capacity=capacity)
55 |
56 | os.makedirs(save_path, exist_ok=True)
57 | all_trajectories = torch.load(os.path.join(env_load_path, 'trajectories.pt'))
58 | info = {"rollout.mean": np.mean([d[0]["trajectory_reward"] for d in all_trajectories]),\
59 | "rollout.max": np.max([d[0]["trajectory_reward"] for d in all_trajectories]),\
60 | "rollout.min": np.min([d[0]["trajectory_reward"] for d in all_trajectories])}
61 |
62 | replay_buffer = torch.load(os.path.join(env_load_path, 'replay_buffer.pt'))
63 | agent.prepare()
64 | #main training loop
65 | print(">>>start iterations")
66 | for i in tqdm(range(iterations)): # pre collected in replay_buffer.pt
67 | info = {}
68 | all_trajectories = torch.load(os.path.join(env_load_path, 'trajectories.pt'))
69 | replay_buffer = torch.load(os.path.join(env_load_path, 'replay_buffer.pt'))
70 | print("Training")
71 | if 'filtered' in agent_type.lower():
72 | filtered_buffer= ReplayBuffer(batch_size= batch_size, capacity=capacity)
73 | episode_rewards = [d[0]["trajectory_reward"] for d in all_trajectories]
74 | cutoff = np.quantile(episode_rewards, 1 - 0.1)
75 | print("Episode Reward Cutoff: ", cutoff)
76 | filtered_trajectories = list(filter(lambda x: x[0]["trajectory_reward"] >= cutoff, all_trajectories))
77 | data = sum(filtered_trajectories, [])
78 | for d in data:
79 | filtered_buffer.insert(**d)
80 | info.update(trainer.update(filtered_buffer, no_update_actor = (i < warmup_iter)))
81 | else:
82 | # data = list(filter(lambda x: x["reward"] >0, data))
83 | info.update(trainer.update(replay_buffer, no_update_actor = (i < warmup_iter)))
84 | if use_wandb and accelerator.is_main_process:
85 | wandb.log(info)
86 | if (i+1) % save_freq == 0 and save_path is not None and accelerator.is_main_process:
87 | print("Saving")
88 | trainer.save(os.path.join(save_path, 'trainer.pt'), save_dir=save_path)
89 | torch.save(replay_buffer, os.path.join(save_path, 'replay_buffer.pt'))
90 | # return model
--------------------------------------------------------------------------------
/src/baseline-archer/run.py:
--------------------------------------------------------------------------------
1 | # Ref: https://github.com/YifeiZhou02/ArCHer
2 |
3 | # @misc{zhou2024archer,
4 | # title={ArCHer: Training Language Model Agents via Hierarchical Multi-Turn RL},
5 | # author={Yifei Zhou and Andrea Zanette and Jiayi Pan and Sergey Levine and Aviral Kumar},
6 | # year={2024},
7 | # eprint={2402.19446},
8 | # archivePrefix={arXiv},
9 | # primaryClass={cs.LG}
10 | # }
11 |
12 | import torch
13 | import transformers
14 | from tqdm import tqdm
15 | from archer_agent import ArcherAgent
16 | from offpolicy_train_loop import offpolicy_train_loop
17 |
18 | import torch.nn as nn
19 | import numpy as np
20 | import wandb
21 | from omegaconf import DictConfig, OmegaConf
22 | import os
23 | import hydra
24 | from accelerate import Accelerator
25 | from datetime import timedelta
26 | from accelerate import DistributedDataParallelKwargs, InitProcessGroupKwargs
27 | transformers.logging.set_verbosity_error()
28 |
29 | CONFIG_NAME = os.environ.get("ARCHER_CONFIG_NAME", None)
30 | @hydra.main(version_base=None, config_path="../../config/archer/", config_name=CONFIG_NAME)
31 | def main(config: "DictConfig"):
32 | print(">>> Configuration file: "+CONFIG_NAME+"<<<")
33 | print(OmegaConf.to_yaml(config))
34 | try:
35 | from huggingface_hub import login
36 | login(token=config.huggingface_token)
37 | except:
38 | print(">>> Huggingface token not found.")
39 |
40 |     accelerator = Accelerator(kwargs_handlers=[InitProcessGroupKwargs(timeout=timedelta(18000))])
41 | device = accelerator.device
42 |
43 | decode_f = lambda x:x
44 | # load decision model
45 | if config.agent_type.lower() == "archer_toolllama":
46 | print(">>> Using ArCHer agent with ToolLLAMA")
47 | agent = ArcherAgent(device=device, accelerator=accelerator,
48 | temperature=config.temperature, do_sample=config.do_sample,
49 | policy_lm=config.policy_lm, critic_lm=config.critic_lm,
50 | cache_dir=config.cache_dir, max_new_tokens=config.max_new_tokens,
51 | use_lora=config.use_lora,
52 | eos_str=config.eos_str)
53 | else:
54 | raise NotImplementedError("Agent not implemented.")
55 | tokenizer = agent.tokenizer
56 | if config.checkpoint_path is not None:
57 | state_dict = torch.load(config.checkpoint_path, map_location=device)['model_state_dict']
58 | agent.model.load_state_dict(state_dict)
59 |
60 | if config.use_wandb and accelerator.is_main_process:
61 | wandb.login(key=config.wandb_key)
62 | wandb.init(project=config.project_name, name=config.run_name, config=dict(config))
63 |
64 | offpolicy_train_loop(env = None,
65 | agent = agent,
66 | tokenizer = tokenizer,
67 | eval_env = None,
68 | accelerator = accelerator,
69 | decode_f=decode_f,
70 | **config)
71 |
72 |
73 | if __name__ == "__main__":
74 | main()
75 |
--------------------------------------------------------------------------------
/src/baseline-eto/dpo_train.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import transformers
3 | from peft import LoraConfig, get_peft_model
4 | from dataclasses import dataclass, field
5 | from typing import Optional
6 |
7 | from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, AutoConfig
8 | from datasets import load_dataset
9 | from transformers.integrations import deepspeed
10 | from trl import (
11 | DPOTrainer,
12 | DPOConfig
13 | )
14 |
15 | @dataclass
16 | class ModelArguments:
17 | model_name_or_path: Optional[str] = field(default="facebook/opt-125m")
18 |
19 | @dataclass
20 | class DataArguments:
21 | data_path: str = field(
22 | default=None, metadata={"help": "Path to the training data."}
23 | )
24 |
25 | @dataclass
26 | class TrainingArguments(DPOConfig):
27 | beta: float = field(default=0.2, metadata={"help": "The beta factor in DPO loss. Higher beta means less divergence from the initial policy. For the IPO loss, beta is the regularization parameter denoted by tau in the paper."})
28 | model_max_length: int = field(
29 | default=8192,
30 | metadata={
31 | "help": "Expanded maximum sequence length. Sequences will be right padded (and possibly truncated)."
32 | },
33 | )
34 |
35 | @dataclass
36 | class LoraArguments:
37 | lora_r: int = 16
38 | lora_alpha: int = 16
39 | lora_dropout: float = 0.05
40 | lora_bias: str = "none"
41 |
42 | class DPOTrain():
43 |
44 | def __init__(self):
45 | pass
46 |
47 | def print_trainable_parameters(self, model):
48 | """
49 | Prints the number of trainable parameters in the model.
50 | """
51 | trainable_params = 0
52 | all_param = 0
53 | for _, param in model.named_parameters():
54 | all_param += param.numel()
55 | if param.requires_grad:
56 | trainable_params += param.numel()
57 | print(
58 | f"trainable params: {trainable_params} || all params: {all_param} || trainables%: {100 * trainable_params / all_param}"
59 | )
60 |
61 | def run(self):
62 | global local_rank
63 |
64 | parser = transformers.HfArgumentParser(
65 | (ModelArguments, DataArguments, TrainingArguments, LoraArguments)
66 | )
67 | model_args, data_args, training_args, lora_args = parser.parse_args_into_dataclasses()
68 |
69 | device_map = "auto"
70 |
71 | self.tokenizer = AutoTokenizer.from_pretrained(
72 | model_args.model_name_or_path,
73 | model_max_length=training_args.model_max_length,
74 | padding_side="right",
75 | use_fast=False,
76 | )
77 | self.tokenizer.pad_token = self.tokenizer.unk_token
78 |
79 | # train_dataset = self.get_dpo_dataset(self.data_file)
80 | dataset = load_dataset('csv', data_files=data_args.data_path, delimiter='\t')
81 | print(dataset.keys())
82 | train_val = dataset["train"].train_test_split(
83 | test_size=0.02, shuffle=True, seed=2024
84 | )
85 | train_dataset = train_val["train"]
86 | val_dataset = train_val["test"]
87 |
88 | # Set RoPE scaling factor
89 | model_config = AutoConfig.from_pretrained(
90 | model_args.model_name_or_path,
91 | rope_scaling = {
92 | "factor": 2.0,
93 | "type": "linear"
94 | },
95 | use_cache = False
96 | )
97 | model_load_kwargs = {
98 | 'low_cpu_mem_usage': not deepspeed.is_deepspeed_zero3_enabled(),
99 | }
100 | model = AutoModelForCausalLM.from_pretrained(
101 | model_args.model_name_or_path,
102 | config = model_config,
103 | device_map=device_map,
104 | trust_remote_code=True,
105 | torch_dtype=torch.bfloat16,
106 | **model_load_kwargs
107 | )
108 |
109 | lora_config = LoraConfig(
110 | r=lora_args.lora_r,
111 | lora_alpha=lora_args.lora_alpha,
112 | bias=lora_args.lora_bias,
113 | task_type="CAUSAL_LM",
114 | )
115 | model = get_peft_model(model, lora_config)
116 | self.print_trainable_parameters(model)
117 |
118 | dpo_trainer = DPOTrainer(
119 | model=model,
120 | ref_model=None,
121 | args=training_args,
122 | train_dataset=train_dataset,
123 | eval_dataset=val_dataset,
124 | tokenizer=self.tokenizer,
125 | )
126 | dpo_trainer.train()
127 | dpo_trainer.save_model()
128 |
129 |
130 | if __name__ == "__main__":
131 | DPOTrain_ = DPOTrain()
132 | DPOTrain_.run()
--------------------------------------------------------------------------------
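Note: the tab-separated file passed via --data_path is consumed as a preference dataset; TRL's DPOTrainer expects "prompt", "chosen" and "rejected" text columns. An illustrative row (the text content is hypothetical):

import pandas as pd

pairs = pd.DataFrame([{
    "prompt":   "User: Book a table for two tonight.\nAssistant:",
    "chosen":   "Thought: call restaurant_search ... Final Answer: ...",
    "rejected": "I am unable to help with that.",
}])
pairs.to_csv("dpo_pairs_example.tsv", sep="\t", index=False)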
/src/baseline-ppo/ppo.py:
--------------------------------------------------------------------------------
1 | # PPO (Final Reward)
2 |
3 | import json
4 | import time
5 | from tqdm import tqdm
6 | import os
7 | import torch
8 | from peft import LoraConfig
9 |
10 | from argparse import ArgumentParser
11 | from transformers import AutoTokenizer
12 | from accelerate import Accelerator
13 | from datasets import load_dataset
14 |
15 | from trl import (
16 | PPOTrainer,
17 | PPOConfig,
18 | AutoModelForCausalLMWithValueHead,
19 | )
20 |
21 | import wandb
22 | import numpy as np
23 | import random
24 |
25 | def set_seed(seed):
26 | random.seed(seed)
27 | os.environ['PYTHONHASHSEED'] = str(seed)
28 | np.random.seed(seed)
29 | torch.manual_seed(seed)
30 | torch.cuda.manual_seed(seed)
31 | torch.cuda.manual_seed_all(seed) # if you are using multi-GPU.
32 |
33 |
34 | class PPOTrain():
35 | @staticmethod
36 | def parse_args():
37 | parser = ArgumentParser()
38 | parser.add_argument('--config_path', default="config/dpo-test.json", type=str, required=True, help='Path to the config file')
39 | parser.add_argument('--model_path', default="ToolBench/ToolLLaMA-2-7b-v2", type=str, help='Path to the model')
40 | parser.add_argument('--data_file', required=True, type=str, help='Path to the data file')
41 | parser.add_argument('--model_type', default="ToolLlama", type=str, help='Type of the model')
42 | parser.add_argument('--epochs', default=3, type=int, help='Number of epochs to train')
43 | parser.add_argument('--max_length', default=1024, type=int, help='Max length of the input')
44 | parser.add_argument('--max_context_len', default=4096, type=int, help='Max context length')
45 | parser.add_argument('--max_response_len', default=1200, type=int, help='Max response length')
46 | return parser.parse_args()
47 |
48 | def __init__(self, args):
49 | self.config_path = args.config_path
50 | self.model_path = args.model_path
51 | self.data_file = args.data_file
52 | self.max_length = args.max_length
53 | self.epochs = args.epochs
54 | self.max_length = args.max_length
55 | self.max_context_len = args.max_context_len
56 | self.max_response_len = args.max_response_len
57 | wandb_project = "baseline-PPO"
58 | wandb_run_name = f"{args.model_type}"
59 | wandb.init(project=wandb_project, name=wandb_run_name)
60 |
61 |
62 | def print_trainable_parameters(self, model):
63 | """
64 | Prints the number of trainable parameters in the model.
65 | """
66 | trainable_params = 0
67 | all_param = 0
68 | for _, param in model.named_parameters():
69 | all_param += param.numel()
70 | if param.requires_grad:
71 | trainable_params += param.numel()
72 | print(
73 | f"trainable params: {trainable_params} || all params: {all_param} || trainables%: {100 * trainable_params / all_param}"
74 | )
75 |
76 | def formatting_func(self, examples):
77 | input_text = examples["prompt"]
78 | examples["query"] = self.tokenizer.encode(input_text, return_tensors='pt').squeeze(0)
79 |
80 |         max_context_len = self.max_context_len
81 |         max_response_len = self.max_response_len
82 | while len(examples["query"]) > max_context_len:
83 | examples["query"] = examples["query"][-max_context_len:]
84 |
85 |
86 | examples['response'] = self.tokenizer.encode(examples["response"], return_tensors='pt').squeeze(0)
87 | if len(examples['response']) > max_response_len:
88 |             examples['response'] = examples['response'][:max_response_len]
89 | examples["label"] = torch.tensor(eval(examples["reward"])[-1], dtype=torch.float16)
90 | return examples
91 |
92 | def train(self, epochs: int = 1):
93 | base_dir = os.path.join('ckpts/', f'baseline-ppo_'+str(int(time.time())))
94 |
95 | batch_steps = 0
96 | for epoch in range(epochs):
97 | print(f"==========================Epoch {epoch}==========================")
98 |
99 | for batch_id, batch in tqdm(enumerate(self.ppo_trainer.dataloader)):
100 | batch_steps += 1
101 | query_tensors, response_tensors = batch['query'], batch['response']
102 | rewards = batch['label']
103 | stats = self.ppo_trainer.step(query_tensors, response_tensors, rewards)
104 | self.ppo_trainer.log_stats(stats, batch, rewards, columns_to_log=[])
105 |
106 | if batch_steps % 100 == 0:
107 | os.makedirs(base_dir, exist_ok=True)
108 | self.ppo_trainer.save_pretrained(os.path.join(base_dir, f'batch-{batch_steps}'))
109 | os.makedirs(base_dir, exist_ok=True)
110 | self.ppo_trainer.save_pretrained(os.path.join(base_dir, f'epoch-{epoch}'))
111 |
112 |
113 | def run(self):
114 | set_seed(2024)
115 |
116 | with open(self.config_path, 'r') as config_f:
117 | config = json.load(config_f)
118 |
119 | self.tokenizer = AutoTokenizer.from_pretrained(self.model_path,
120 | device_map= {"": Accelerator().process_index})
121 | dataset = load_dataset('csv', data_files=self.data_file, delimiter='\t')
122 |
123 | peft_kwargs = config.get('peft_kwargs', {})
124 | peft_config = LoraConfig(**peft_kwargs)
125 |
126 | formatted_dataset = dataset.map(self.formatting_func, batched=False, load_from_cache_file=False)
127 | formatted_dataset.set_format(type="torch")
128 | train_dataset = formatted_dataset["train"]
129 |
130 | ppo_kwargs = config.get('ppo_kwargs', {})
131 | ppo_config = PPOConfig(**ppo_kwargs)
132 |
133 | model = AutoModelForCausalLMWithValueHead.from_pretrained(
134 | self.model_path,
135 | low_cpu_mem_usage=True,
136 | device_map="auto",
137 | peft_config=peft_config,
138 | torch_dtype=torch.bfloat16,
139 | )
140 |
141 | self.print_trainable_parameters(model)
142 |
143 | def collator(data):
144 | return dict((key, [d[key] for d in data]) for key in data[0])
145 |
146 | if self.tokenizer.pad_token is None:
147 | self.tokenizer.pad_token = self.tokenizer.eos_token
148 | model.config.pad_token_id = model.config.eos_token_id
149 |
150 | self.ppo_trainer = PPOTrainer(
151 | config=ppo_config,
152 | model=model,
153 | dataset=train_dataset,
154 | tokenizer=self.tokenizer,
155 | data_collator=collator
156 | )
157 |
158 |         self.train(epochs=self.epochs)
159 |
160 |
161 | if __name__ == "__main__":
162 | args = PPOTrain.parse_args()
163 |     ppo_train = PPOTrain(args)
164 |     ppo_train.run()
--------------------------------------------------------------------------------
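Note: the JSON passed via --config_path supplies two dictionaries, "peft_kwargs" (forwarded to LoraConfig) and "ppo_kwargs" (forwarded to PPOConfig). An illustrative sketch with assumed values; the repository's actual configs live under config/ and may differ:

import json

example_config = {
    "peft_kwargs": {"r": 16, "lora_alpha": 16, "lora_dropout": 0.05,
                    "target_modules": ["q_proj", "v_proj"]},
    "ppo_kwargs":  {"learning_rate": 1e-5, "batch_size": 8,
                    "mini_batch_size": 2, "gradient_accumulation_steps": 4},
}
with open("ppo_config_example.json", "w") as f:
    json.dump(example_config, f, indent=4)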
/src/reward/annotation_with_gpt.py:
--------------------------------------------------------------------------------
1 | from src.reward.evaluators.evaluator import ProcessRewardEvaluator
2 | from stabletoolbench.toolbench.tooleval.evaluators import load_registered_automatic_evaluator
3 | import os
4 | import json
5 | import random
6 | from concurrent.futures import ThreadPoolExecutor,as_completed
7 | import argparse
8 | from tqdm import tqdm
9 | from stabletoolbench.toolbench.tooleval.utils import get_steps
10 | import backoff
11 |
12 | abs_dir = os.path.split(__file__)[0]
13 |
14 | def parse_args():
15 | parser = argparse.ArgumentParser()
16 | parser.add_argument('--converted_answer_path', type=str, default="", required=True, help='converted answer path')
17 | parser.add_argument('--save_path', type=str, default="", required=False, help='result save path')
18 | parser.add_argument('--reference_model', type=str, default="", required=False, help='model predictions path')
19 | parser.add_argument('--reference_path', type=str, default=None, required=False, help='reference path')
20 | parser.add_argument('--task_num', type=int, default=None, required=False, help='task num')
21 | parser.add_argument('--evaluator', type=str, default="gpt-4-turbo-2024-04-09", required=False, help='which evaluator to use.')
22 | parser.add_argument('--max_eval_threads', type=int, default=30, required=False, help='max threads nums')
23 | parser.add_argument('--evaluate_times', type=int, default=4, required=False, help='how many times to predict with the evaluator for each solution path.')
24 | parser.add_argument('--test_set', nargs='+', default=['G1_instruction'], help='test set name')
25 | parser.add_argument('--overwrite', action='store_true', help='whether to overwrite the existing result file')
26 | return parser.parse_args()
27 |
28 | if __name__ == "__main__":
29 | args = parse_args()
30 | evaluators = [load_registered_automatic_evaluator(evaluator_name=args.evaluator, evaluators_cfg_path=os.path.join(abs_dir,'evaluators')) for _ in range(args.max_eval_threads)]
31 | @backoff.on_exception(backoff.expo, Exception, max_time=15)
32 | def compute_process_reward(query_id, example, evaluate_time):
33 | global evaluators
34 | evaluator = random.choice(evaluators)
35 | answer_steps, answer_steps_list, final_step = get_steps(example)
36 | succeed_tool_calling_list, contributions, answer_status = evaluator.evaluate_process_reward(
37 | {
38 | 'query':example['query'],
39 | 'available_tools':example['available_tools'],
40 | },
41 | answer_steps_list[:-1],
42 | example['answer'],
43 | )
44 | process_reward = {
45 | "succeed_tool_calling": succeed_tool_calling_list,
46 | "contributions": contributions,
47 | }
48 | return query_id, process_reward, answer_status, evaluate_time
49 |
50 | reference_model = args.reference_model
51 | output_list = []
52 |
53 | for test_set in args.test_set:
54 |
55 | save_file = f"{args.save_path}/{test_set}.json"
56 | if args.task_num:
57 | save_file = f"{args.save_path}/{test_set}_{args.task_num}.json"
58 |
59 | reference_path = f"{args.converted_answer_path}/{test_set}.json"
60 | reference_examples = json.load(open(reference_path, "r"))
61 | if args.task_num:
62 | reference_examples = {k:reference_examples[k] for k in list(reference_examples.keys())[:args.task_num]}
63 |
64 | if os.path.exists(save_file) and not args.overwrite:
65 | old_existed_ids = list(json.load(open(save_file, "r")).keys())
66 | old_label_cnt = json.load(open(save_file, "r"))
67 | existed_ids = []
68 | label_cnt = {}
69 | for query_id in old_existed_ids:
70 | ans = old_label_cnt[query_id]
71 | if len(ans['process_reward'].keys()) == args.evaluate_times:
72 | existed_ids.append(query_id)
73 | label_cnt[query_id] = ans
74 | else:
75 | existed_ids = []
76 | label_cnt = {}
77 |
78 | with ThreadPoolExecutor(args.max_eval_threads) as pool:
79 | future = []
80 |
81 | for query_id in reference_examples:
82 | if query_id in existed_ids:
83 | continue
84 | for i in range(args.evaluate_times):
85 | example = reference_examples[query_id]
86 | future.append(pool.submit(
87 | compute_process_reward,
88 | query_id,
89 | example,
90 | evaluate_time=i
91 | ))
92 |
93 | for thd in tqdm(as_completed(future),total=len(future),ncols=100):
94 | query_id, process_reward, is_solved, evaluate_time = thd.result()
95 | example = reference_examples[query_id]
96 | query = example["query"]
97 | tool_names = []
98 | for tool_dict in example["available_tools"]:
99 | tool_name = tool_dict["function"]["name"]
100 | tool_names.append(tool_name)
101 | answer_steps, answer_steps_list, final_step = get_steps(example)
102 | if query_id not in label_cnt:
103 | label_cnt[query_id] = {}
104 | label_cnt[query_id]["query"] = query
105 | label_cnt[query_id]["tool_names"] = tool_names
106 | label_cnt[query_id]["answer_steps"] = answer_steps_list[:-1]
107 | # label_cnt[query_id]["mid_steps_reward"] = mid_steps_reward # parsed
108 | if 'process_reward' not in label_cnt[query_id]:
109 | label_cnt[query_id]["process_reward"] = {}
110 | label_cnt[query_id]["process_reward"][evaluate_time] = process_reward
111 | label_cnt[query_id]["final_step"] = final_step
112 |
113 | if 'is_solved' not in label_cnt[query_id]:
114 | label_cnt[query_id]["is_solved"] = {}
115 | label_cnt[query_id]["is_solved"][evaluate_time] = str(is_solved)
116 | # print("========== Finish and Dump into json file===========", query_id, is_solved, evaluate_time)
117 |
118 | json.dump(label_cnt, open(save_file, "w"), ensure_ascii=False, indent=4)
119 |
120 | json.dump(label_cnt, open(save_file, "w"), ensure_ascii=False, indent=4)
121 |
--------------------------------------------------------------------------------
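Note: an illustrative shape of one record in the per-test-set JSON written above (field names follow the loop; the values are hypothetical; integer evaluate_time keys become strings once dumped to JSON):

example_record = {
    "query": "Find ...",
    "tool_names": ["tool_a", "tool_b"],
    "answer_steps": ["Step 1: ...", "Step 2: ..."],
    "process_reward": {
        "0": {"succeed_tool_calling": [1, 1], "contributions": [4, 5]},
    },
    "final_step": "Final Answer: ...",
    "is_solved": {"0": "AnswerStatus.Solved"},
}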
/src/reward/evaluators/evaluator.py:
--------------------------------------------------------------------------------
1 | from typing import Dict, Any
2 |
3 | from stabletoolbench.toolbench.tooleval.evaluators.registered_cls.utils import register_evaluator
4 | from stabletoolbench.toolbench.tooleval.evaluators.registered_cls.rtl import ReinforceToolLearningEvaluator
5 |
6 | from enum import Enum
7 |
8 | class AnswerStatus(Enum):
9 | Unsure = "Unsure"
10 | Unsolved = "Unsolved"
11 | Solved = "Solved"
12 |
13 | @register_evaluator
14 | class ProcessRewardEvaluator(ReinforceToolLearningEvaluator):
15 | def evaluate_process_reward(self,
16 | task_description:Dict,
17 | mid_steps,
18 | answer:Dict[Any,Any]):
19 | ret = self.function_call(
20 | 'evaluate_process_reward',
21 | {
22 | 'query': task_description['query'],
23 | 'mid_steps': mid_steps,
24 | 'final_answer':answer['final_answer'],
25 | }
26 | )
27 | answer_status = AnswerStatus(ret['final_answer_status'])
28 | return ret['succeed_tool_calling'], ret['contribution_to_final_answer'], answer_status
29 |
--------------------------------------------------------------------------------
/src/reward/evaluators/gpt-4-turbo-2024-04-09/config.yaml:
--------------------------------------------------------------------------------
1 | evaluator_name: "gpt-4-turbo-2024-04-09"
2 | registered_cls_name: "ProcessRewardEvaluator"
3 | prompt_template: "template.txt"
4 | fn_completions: "normalized_openai_completions"
5 | apis_json: "your/path/to/api_pool.json"
6 | completions_kwargs:
7 | model: "gpt-4-turbo-2024-04-09"
8 | max_tokens: 1000
9 | temperature: 0
10 | timeout: 10
11 | functions:
12 | - name: "evaluate_process_reward"
13 | description: "Evaluate the entire task-solving process, including tool calls, the contribution of each intermediate step to the final answer, and the status of the final answer."
14 | parameters:
15 | type: "object"
16 | properties:
17 | succeed_tool_calling:
18 | type: "array"
19 | description: "Provide a binary score (0 or 1) indicating whether **each intermediate step** successfully called the tool."
20 | items:
21 | type: "number"
22 | description: "0 for unsuccessful tool calls, 1 for successful tool calls"
23 | contribution_to_final_answer:
24 | type: "array"
25 | description: "Provide a score (0 to 5) to assess how much **each intermediate step** contributed to the final answer."
26 | items:
27 | type: "number"
28 | description: "0 indicates no contribution, and 5 indicates maximum contribution."
29 | final_answer_status:
30 | type: "string"
31 | enum: ["Unsure", "Unsolved", "Solved"]
32 | description: "Indicate the status of the final answer. Choose from: 'Unsure', 'Unsolved', or 'Solved'."
33 | required: ["succeed_tool_calling", "contribution_to_final_answer", "final_answer_status"]
34 |
35 | - name: "check_answer_status"
36 |       description: "Parse the JSON answer with layered nodes and return the answer_status of the answer"
37 | parameters:
38 | type: "object"
39 | properties:
40 | answer_status:
41 | type: "string"
42 | enum: ["Unsure","Unsolved","Solved"]
43 | required: ["answer_status"]
44 | fn_completion_parser: "index_parser"
45 | batch_size: 1
46 |
--------------------------------------------------------------------------------
/src/reward/evaluators/gpt-4-turbo-2024-04-09/template.txt:
--------------------------------------------------------------------------------
1 |
2 | evaluate_process_reward
3 |
4 | Query:
5 | {query}
6 |
7 | Intermediate Steps:
8 | {mid_steps}
9 |
10 | Final Answer:
11 | {final_answer}
12 |
13 | Based on the query, intermediate steps, and final answer, evaluate the entire task-solving process using the following criteria:
14 |
15 | 1. **Successful Tool Calling**: For each intermediate step, indicate whether a tool was successfully called, with a score of 0 (no) or 1 (yes).
16 | 2. **Contribution to Final Answer**: Rate the contribution of each intermediate step to the final answer on a scale of 0 to 5.
17 | 3. **Final Answer Status**: Determine the final answer status as 'Solved', 'Unsure', or 'Unsolved'.
18 |
19 | Please call the `evaluate_process_reward` function to return your evaluation.
20 |
21 |
--------------------------------------------------------------------------------
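Note: given the prompt above, the evaluator model is expected to respond through the evaluate_process_reward function defined in config.yaml; an illustrative argument payload (the scores are hypothetical) looks like:

expected_call_arguments = {
    "succeed_tool_calling": [1, 0, 1],           # one 0/1 flag per intermediate step
    "contribution_to_final_answer": [3, 0, 5],   # one 0-5 score per intermediate step
    "final_answer_status": "Solved",             # "Unsure" | "Unsolved" | "Solved"
}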
/src/reward/openai_key.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "api_key": "",
4 | "api_base": ""
5 | }
6 | ]
--------------------------------------------------------------------------------
/stabletoolbench/config.yml:
--------------------------------------------------------------------------------
1 | api_key:
2 | api_base:
3 | toolbench_key:
4 | tool_root_dir: server/tools
--------------------------------------------------------------------------------
/stabletoolbench/server/config.yml:
--------------------------------------------------------------------------------
1 | api_key:
2 | api_base:
3 | model:
4 | temperature: 0
5 | toolbench_url:
6 | rapidapi_key:
7 | tools_folder: "./tools"
8 | cache_folder: "./tool_response_cache"
9 | is_save: true
10 | port: 8081
11 |
--------------------------------------------------------------------------------
/stabletoolbench/server/requirements.txt:
--------------------------------------------------------------------------------
1 | fastapi==0.110.0
2 | openai
3 | pydantic==1.10.11
4 | PyYAML==6.0.1
5 | Requests==2.31.0
6 | slowapi==0.1.9
7 | tenacity==8.2.2
8 | uvicorn==0.28.0
9 |
--------------------------------------------------------------------------------
/stabletoolbench/server/utils.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | categories = [
4 | "Sports",
5 | "Finance",
6 | "Data",
7 | "Entertainment",
8 | "Travel",
9 | "Location",
10 | "Science",
11 | "Food",
12 | "Transportation",
13 | "Music",
14 | "Business",
15 | "Visual Recognition",
16 | "Tools",
17 | "Text Analysis",
18 | "Weather",
19 | "Gaming",
20 | "SMS",
21 | "Events",
22 | "Health and Fitness",
23 | "Payments",
24 | "Financial",
25 | "Translation",
26 | "Storage",
27 | "Logistics",
28 | "Database",
29 | "Search",
30 | "Reward",
31 | "Mapping",
32 | "Artificial%20Intelligence%2FMachine%20Learning",
33 | "Email",
34 | "News, Media",
35 | "Video, Images",
36 | "eCommerce",
37 | "Medical",
38 | "Devices",
39 | "Business Software",
40 | "Advertising",
41 | "Education",
42 | "Media",
43 | "Social",
44 | "Commerce",
45 | "Communication",
46 | "Other",
47 | "Monitoring",
48 | "Energy",
49 | "Jobs",
50 | "Movies",
51 | "Cryptography",
52 | "Cybersecurity"
53 | ]
54 |
55 | def standardize_category(category):
56 | save_category = category.replace(" ", "_").replace(",", "_").replace("/", "_")
57 | while " " in save_category or "," in save_category:
58 | save_category = save_category.replace(" ", "_").replace(",", "_")
59 | save_category = save_category.replace("__", "_")
60 | return save_category
61 |
62 | def standardize(string):
63 | res = re.compile("[^\\u4e00-\\u9fa5^a-z^A-Z^0-9^_]")
64 | string = res.sub("_", string)
65 | string = re.sub(r"(_)\1+","_", string).lower()
66 | while True:
67 | if len(string) == 0:
68 | return string
69 | if string[0] == "_":
70 | string = string[1:]
71 | else:
72 | break
73 | while True:
74 | if len(string) == 0:
75 | return string
76 | if string[-1] == "_":
77 | string = string[:-1]
78 | else:
79 | break
80 | if string[0].isdigit():
81 | string = "get_" + string
82 | return string
83 |
84 | def change_name(name):
85 | change_list = ["from", "class", "return", "false", "true", "id", "and"]
86 | if name in change_list:
87 | name = "is_" + name
88 | return name
89 |
--------------------------------------------------------------------------------
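Note: a quick illustrative check of the name-normalization helpers above (run from stabletoolbench/server/ so that utils.py is importable):

from utils import standardize, standardize_category, change_name

print(standardize_category("News, Media"))   # News_Media
print(standardize("Get User-Info (v2)"))     # get_user_info_v2
print(change_name("id"))                     # is_id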
/stabletoolbench/solvable_queries/test_query_ids/G1_category.json:
--------------------------------------------------------------------------------
1 | {
2 | "28": 0,
3 | "29": 0,
4 | "1198": 0,
5 | "1301": 0,
6 | "4153": 0,
7 | "4155": 0,
8 | "4266": 0,
9 | "4273": 0,
10 | "4286": 0,
11 | "4328": 0,
12 | "4343": 0,
13 | "4366": 0,
14 | "4415": 0,
15 | "4424": 0,
16 | "4428": 0,
17 | "4465": 0,
18 | "4471": 0,
19 | "6504": 0,
20 | "6511": 0,
21 | "6521": 0,
22 | "9604": 0,
23 | "9661": 0,
24 | "9679": 0,
25 | "9708": 0,
26 | "9719": 0,
27 | "12535": 0,
28 | "12642": 0,
29 | "12671": 0,
30 | "12688": 0,
31 | "12744": 0,
32 | "12759": 0,
33 | "12770": 0,
34 | "12788": 0,
35 | "12790": 0,
36 | "12805": 0,
37 | "12819": 0,
38 | "12875": 0,
39 | "12884": 0,
40 | "12954": 0,
41 | "15137": 0,
42 | "18268": 0,
43 | "18286": 0,
44 | "18319": 0,
45 | "18337": 0,
46 | "21257": 0,
47 | "21313": 0,
48 | "21400": 0,
49 | "21447": 0,
50 | "21453": 0,
51 | "21477": 0,
52 | "21518": 0,
53 | "23486": 0,
54 | "25341": 0,
55 | "25344": 0,
56 | "25348": 0,
57 | "26577": 0,
58 | "26613": 0,
59 | "26661": 0,
60 | "26698": 0,
61 | "26701": 0,
62 | "29592": 0,
63 | "29647": 0,
64 | "29653": 0,
65 | "29719": 0,
66 | "29724": 0,
67 | "29746": 0,
68 | "29778": 0,
69 | "29816": 0,
70 | "29824": 0,
71 | "29844": 0,
72 | "29859": 0,
73 | "29917": 0,
74 | "34747": 0,
75 | "34773": 0,
76 | "34774": 0,
77 | "34811": 0,
78 | "37847": 0,
79 | "37876": 0,
80 | "38008": 0,
81 | "38021": 0,
82 | "38028": 0,
83 | "38045": 0,
84 | "38099": 0,
85 | "38125": 0,
86 | "41983": 0,
87 | "43312": 0,
88 | "43364": 0,
89 | "43375": 0,
90 | "46424": 0,
91 | "46455": 0,
92 | "46528": 0,
93 | "46662": 0,
94 | "46676": 0,
95 | "46688": 0,
96 | "46760": 0,
97 | "51809": 0,
98 | "51817": 0,
99 | "54484": 0,
100 | "54640": 0,
101 | "54658": 0,
102 | "54697": 0,
103 | "54801": 0,
104 | "54839": 0,
105 | "54844": 0,
106 | "56809": 0,
107 | "59862": 0,
108 | "59890": 0,
109 | "59905": 0,
110 | "62870": 0,
111 | "62960": 0,
112 | "63066": 0,
113 | "63151": 0,
114 | "65185": 0,
115 | "65190": 0,
116 | "67087": 0,
117 | "67089": 0,
118 | "68407": 0,
119 | "68448": 0,
120 | "68470": 0,
121 | "68553": 0,
122 | "71583": 0,
123 | "71638": 0,
124 | "71685": 0,
125 | "71692": 0,
126 | "71741": 0,
127 | "71801": 0,
128 | "71823": 0,
129 | "77144": 0,
130 | "77171": 0,
131 | "77200": 0,
132 | "77208": 0,
133 | "77247": 0,
134 | "77261": 0,
135 | "80170": 0,
136 | "80175": 0,
137 | "80248": 0,
138 | "80298": 0,
139 | "80491": 0,
140 | "80500": 0,
141 | "80504": 0,
142 | "80519": 0,
143 | "82434": 0,
144 | "84935": 0,
145 | "84974": 0,
146 | "86069": 0,
147 | "86231": 0,
148 | "86251": 0,
149 | "86288": 0,
150 | "86297": 0,
151 | "86335": 0,
152 | "86488": 0,
153 | "86502": 0,
154 | "86535": 0
155 | }
--------------------------------------------------------------------------------
/stabletoolbench/solvable_queries/test_query_ids/G1_instruction.json:
--------------------------------------------------------------------------------
1 | {
2 | "588": 0,
3 | "608": 0,
4 | "1073": 0,
5 | "1572": 0,
6 | "1856": 0,
7 | "2121": 0,
8 | "2144": 0,
9 | "2213": 0,
10 | "2354": 0,
11 | "2399": 0,
12 | "3308": 0,
13 | "3510": 0,
14 | "3723": 0,
15 | "3922": 0,
16 | "4505": 0,
17 | "5116": 0,
18 | "5810": 0,
19 | "5965": 0,
20 | "6618": 0,
21 | "6736": 0,
22 | "6959": 0,
23 | "7043": 0,
24 | "7497": 0,
25 | "7658": 0,
26 | "7989": 0,
27 | "8025": 0,
28 | "9921": 0,
29 | "9984": 0,
30 | "10160": 0,
31 | "10770": 0,
32 | "11653": 0,
33 | "11686": 0,
34 | "12204": 0,
35 | "13095": 0,
36 | "14714": 0,
37 | "15511": 0,
38 | "16196": 0,
39 | "16970": 0,
40 | "17038": 0,
41 | "17223": 0,
42 | "17952": 0,
43 | "20704": 0,
44 | "21596": 0,
45 | "22781": 0,
46 | "22937": 0,
47 | "23163": 0,
48 | "23248": 0,
49 | "23982": 0,
50 | "24146": 0,
51 | "24810": 0,
52 | "25052": 0,
53 | "25658": 0,
54 | "26063": 0,
55 | "26752": 0,
56 | "26892": 0,
57 | "27847": 0,
58 | "28751": 0,
59 | "29059": 0,
60 | "29291": 0,
61 | "29322": 0,
62 | "31117": 0,
63 | "31267": 0,
64 | "31402": 0,
65 | "32001": 0,
66 | "32285": 0,
67 | "32309": 0,
68 | "32617": 0,
69 | "32652": 0,
70 | "32807": 0,
71 | "33112": 0,
72 | "33330": 0,
73 | "33889": 0,
74 | "34266": 0,
75 | "34823": 0,
76 | "35112": 0,
77 | "36068": 0,
78 | "36197": 0,
79 | "36717": 0,
80 | "37421": 0,
81 | "38494": 0,
82 | "40019": 0,
83 | "40054": 0,
84 | "40436": 0,
85 | "40699": 0,
86 | "41389": 0,
87 | "41444": 0,
88 | "41806": 0,
89 | "42351": 0,
90 | "43269": 0,
91 | "43821": 0,
92 | "44482": 0,
93 | "44533": 0,
94 | "44619": 0,
95 | "44774": 0,
96 | "45490": 0,
97 | "45775": 0,
98 | "46403": 0,
99 | "47301": 0,
100 | "47838": 0,
101 | "48059": 0,
102 | "49267": 0,
103 | "49991": 0,
104 | "51043": 0,
105 | "52534": 0,
106 | "52734": 0,
107 | "55223": 0,
108 | "55323": 0,
109 | "55489": 0,
110 | "55721": 0,
111 | "56226": 0,
112 | "56236": 0,
113 | "56666": 0,
114 | "58096": 0,
115 | "58949": 0,
116 | "59266": 0,
117 | "59954": 0,
118 | "60837": 0,
119 | "60936": 0,
120 | "61654": 0,
121 | "62012": 0,
122 | "62757": 0,
123 | "63730": 0,
124 | "63962": 0,
125 | "65637": 0,
126 | "66018": 0,
127 | "67007": 0,
128 | "67522": 0,
129 | "67966": 0,
130 | "68221": 0,
131 | "68327": 0,
132 | "68335": 0,
133 | "69206": 0,
134 | "70610": 0,
135 | "71402": 0,
136 | "72373": 0,
137 | "72659": 0,
138 | "73529": 0,
139 | "73762": 0,
140 | "74322": 0,
141 | "75338": 0,
142 | "75390": 0,
143 | "76554": 0,
144 | "76957": 0,
145 | "77471": 0,
146 | "77514": 0,
147 | "77855": 0,
148 | "78406": 0,
149 | "79053": 0,
150 | "79620": 0,
151 | "80884": 0,
152 | "81195": 0,
153 | "81581": 0,
154 | "82314": 0,
155 | "82701": 0,
156 | "83742": 0,
157 | "83819": 0,
158 | "83950": 0,
159 | "84845": 0,
160 | "85152": 0,
161 | "86084": 0,
162 | "86143": 0,
163 | "87632": 0,
164 | "88193": 0
165 | }
--------------------------------------------------------------------------------
/stabletoolbench/solvable_queries/test_query_ids/G1_tool.json:
--------------------------------------------------------------------------------
1 | {
2 | "394": 0,
3 | "692": 0,
4 | "1617": 0,
5 | "2412": 0,
6 | "2513": 0,
7 | "2701": 0,
8 | "3007": 0,
9 | "3215": 0,
10 | "3221": 0,
11 | "3287": 0,
12 | "5085": 0,
13 | "6677": 0,
14 | "7474": 0,
15 | "7903": 0,
16 | "7971": 0,
17 | "8129": 0,
18 | "8443": 0,
19 | "8655": 0,
20 | "8722": 0,
21 | "9039": 0,
22 | "9238": 0,
23 | "9792": 0,
24 | "9956": 0,
25 | "10221": 0,
26 | "10277": 0,
27 | "11924": 0,
28 | "13495": 0,
29 | "13497": 0,
30 | "13499": 0,
31 | "13537": 0,
32 | "13826": 0,
33 | "14198": 0,
34 | "15058": 0,
35 | "15335": 0,
36 | "15931": 0,
37 | "16133": 0,
38 | "16700": 0,
39 | "17978": 0,
40 | "18761": 0,
41 | "19662": 0,
42 | "19696": 0,
43 | "20358": 0,
44 | "21785": 0,
45 | "22077": 0,
46 | "22514": 0,
47 | "24777": 0,
48 | "25164": 0,
49 | "25483": 0,
50 | "25687": 0,
51 | "26542": 0,
52 | "26820": 0,
53 | "26961": 0,
54 | "27819": 0,
55 | "28028": 0,
56 | "28229": 0,
57 | "28240": 0,
58 | "28788": 0,
59 | "30660": 0,
60 | "31708": 0,
61 | "32177": 0,
62 | "33971": 0,
63 | "34211": 0,
64 | "34696": 0,
65 | "34946": 0,
66 | "35056": 0,
67 | "35382": 0,
68 | "36378": 0,
69 | "36687": 0,
70 | "37553": 0,
71 | "38414": 0,
72 | "38551": 0,
73 | "39392": 0,
74 | "39393": 0,
75 | "42077": 0,
76 | "42348": 0,
77 | "42934": 0,
78 | "43110": 0,
79 | "43557": 0,
80 | "43585": 0,
81 | "43933": 0,
82 | "44066": 0,
83 | "44793": 0,
84 | "44845": 0,
85 | "45370": 0,
86 | "45371": 0,
87 | "45418": 0,
88 | "45422": 0,
89 | "45533": 0,
90 | "46409": 0,
91 | "46413": 0,
92 | "47032": 0,
93 | "48480": 0,
94 | "48483": 0,
95 | "48950": 0,
96 | "49173": 0,
97 | "49529": 0,
98 | "49531": 0,
99 | "49830": 0,
100 | "50984": 0,
101 | "51600": 0,
102 | "52332": 0,
103 | "53120": 0,
104 | "53924": 0,
105 | "53959": 0,
106 | "54421": 0,
107 | "55589": 0,
108 | "56049": 0,
109 | "56495": 0,
110 | "58412": 0,
111 | "58705": 0,
112 | "58826": 0,
113 | "64662": 0,
114 | "65119": 0,
115 | "65125": 0,
116 | "65425": 0,
117 | "65584": 0,
118 | "65624": 0,
119 | "65673": 0,
120 | "66052": 0,
121 | "66927": 0,
122 | "68228": 0,
123 | "69319": 0,
124 | "69540": 0,
125 | "69717": 0,
126 | "69972": 0,
127 | "69973": 0,
128 | "70158": 0,
129 | "70359": 0,
130 | "70672": 0,
131 | "70835": 0,
132 | "72543": 0,
133 | "73151": 0,
134 | "73587": 0,
135 | "73739": 0,
136 | "74709": 0,
137 | "74989": 0,
138 | "75659": 0,
139 | "76706": 0,
140 | "76740": 0,
141 | "76966": 0,
142 | "77375": 0,
143 | "77908": 0,
144 | "78490": 0,
145 | "78791": 0,
146 | "78994": 0,
147 | "79741": 0,
148 | "81549": 0,
149 | "83931": 0,
150 | "85155": 0,
151 | "85562": 0,
152 | "85582": 0,
153 | "85759": 0,
154 | "86105": 0,
155 | "86735": 0,
156 | "87540": 0,
157 | "87616": 0,
158 | "87714": 0,
159 | "88197": 0
160 | }
--------------------------------------------------------------------------------
/stabletoolbench/solvable_queries/test_query_ids/G2_category.json:
--------------------------------------------------------------------------------
1 | {
2 | "43": 0,
3 | "61": 0,
4 | "75": 0,
5 | "83": 0,
6 | "3432": 0,
7 | "3442": 0,
8 | "3456": 0,
9 | "3463": 0,
10 | "3482": 0,
11 | "3494": 0,
12 | "3534": 0,
13 | "3558": 0,
14 | "3609": 0,
15 | "3640": 0,
16 | "3645": 0,
17 | "3652": 0,
18 | "3672": 0,
19 | "3786": 0,
20 | "3843": 0,
21 | "3929": 0,
22 | "3942": 0,
23 | "3990": 0,
24 | "4006": 0,
25 | "4031": 0,
26 | "4095": 0,
27 | "4176": 0,
28 | "4271": 0,
29 | "13338": 0,
30 | "13354": 0,
31 | "13384": 0,
32 | "13385": 0,
33 | "13487": 0,
34 | "13517": 0,
35 | "13533": 0,
36 | "13555": 0,
37 | "13559": 0,
38 | "13586": 0,
39 | "13592": 0,
40 | "13639": 0,
41 | "13699": 0,
42 | "13745": 0,
43 | "13778": 0,
44 | "13795": 0,
45 | "13838": 0,
46 | "13951": 0,
47 | "14036": 0,
48 | "14117": 0,
49 | "14161": 0,
50 | "14185": 0,
51 | "14333": 0,
52 | "14384": 0,
53 | "14400": 0,
54 | "14533": 0,
55 | "14595": 0,
56 | "14605": 0,
57 | "14628": 0,
58 | "14732": 0,
59 | "14802": 0,
60 | "29606": 0,
61 | "29701": 0,
62 | "33046": 0,
63 | "33055": 0,
64 | "33156": 0,
65 | "33171": 0,
66 | "33255": 0,
67 | "33263": 0,
68 | "33271": 0,
69 | "33295": 0,
70 | "33431": 0,
71 | "33457": 0,
72 | "33481": 0,
73 | "33632": 0,
74 | "33716": 0,
75 | "42534": 0,
76 | "42547": 0,
77 | "42608": 0,
78 | "42635": 0,
79 | "42649": 0,
80 | "42701": 0,
81 | "42708": 0,
82 | "42729": 0,
83 | "42748": 0,
84 | "42882": 0,
85 | "42885": 0,
86 | "42957": 0,
87 | "43070": 0,
88 | "43076": 0,
89 | "43102": 0,
90 | "43200": 0,
91 | "43201": 0,
92 | "43230": 0,
93 | "43258": 0,
94 | "43316": 0,
95 | "43368": 0,
96 | "43505": 0,
97 | "43612": 0,
98 | "43663": 0,
99 | "43713": 0,
100 | "43724": 0,
101 | "43994": 0,
102 | "44010": 0,
103 | "44040": 0,
104 | "50937": 0,
105 | "62159": 0,
106 | "62261": 0,
107 | "71363": 0,
108 | "71501": 0,
109 | "71675": 0,
110 | "71756": 0,
111 | "71980": 0,
112 | "72000": 0,
113 | "72004": 0,
114 | "72040": 0,
115 | "72118": 0,
116 | "72271": 0,
117 | "72274": 0,
118 | "72357": 0,
119 | "72406": 0,
120 | "72458": 0,
121 | "72585": 0,
122 | "72618": 0,
123 | "72827": 0,
124 | "79652": 0,
125 | "79681": 0
126 | }
--------------------------------------------------------------------------------
/stabletoolbench/solvable_queries/test_query_ids/G2_instruction.json:
--------------------------------------------------------------------------------
1 | {
2 | "1643": 0,
3 | "4746": 0,
4 | "5744": 0,
5 | "7257": 0,
6 | "9834": 0,
7 | "9957": 0,
8 | "9959": 0,
9 | "10097": 0,
10 | "10941": 0,
11 | "11627": 0,
12 | "11820": 0,
13 | "12034": 0,
14 | "12142": 0,
15 | "12507": 0,
16 | "12509": 0,
17 | "12634": 0,
18 | "12742": 0,
19 | "12773": 0,
20 | "12894": 0,
21 | "12961": 0,
22 | "12974": 0,
23 | "15067": 0,
24 | "15439": 0,
25 | "15929": 0,
26 | "17233": 0,
27 | "17864": 0,
28 | "19186": 0,
29 | "19850": 0,
30 | "22262": 0,
31 | "24131": 0,
32 | "25866": 0,
33 | "26341": 0,
34 | "26837": 0,
35 | "27543": 0,
36 | "29044": 0,
37 | "29499": 0,
38 | "30246": 0,
39 | "30501": 0,
40 | "34056": 0,
41 | "34437": 0,
42 | "34667": 0,
43 | "34980": 0,
44 | "35139": 0,
45 | "36115": 0,
46 | "37074": 0,
47 | "38666": 0,
48 | "44321": 0,
49 | "45688": 0,
50 | "47748": 0,
51 | "48039": 0,
52 | "48770": 0,
53 | "49308": 0,
54 | "50058": 0,
55 | "50406": 0,
56 | "50656": 0,
57 | "50658": 0,
58 | "51289": 0,
59 | "52115": 0,
60 | "54151": 0,
61 | "54246": 0,
62 | "54739": 0,
63 | "54775": 0,
64 | "54793": 0,
65 | "55251": 0,
66 | "55671": 0,
67 | "56101": 0,
68 | "56133": 0,
69 | "56155": 0,
70 | "56266": 0,
71 | "62997": 0,
72 | "63490": 0,
73 | "65457": 0,
74 | "65468": 0,
75 | "65521": 0,
76 | "65607": 0,
77 | "67514": 0,
78 | "67887": 0,
79 | "67969": 0,
80 | "68308": 0,
81 | "69637": 0,
82 | "70369": 0,
83 | "70435": 0,
84 | "70543": 0,
85 | "73783": 0,
86 | "73991": 0,
87 | "75279": 0,
88 | "75958": 0,
89 | "76230": 0,
90 | "76512": 0,
91 | "78631": 0,
92 | "78838": 0,
93 | "79476": 0,
94 | "79633": 0,
95 | "79640": 0,
96 | "79644": 0,
97 | "79645": 0,
98 | "81337": 0,
99 | "83220": 0,
100 | "83236": 0,
101 | "84074": 0,
102 | "84585": 0,
103 | "84593": 0,
104 | "85051": 0,
105 | "85129": 0,
106 | "86555": 0,
107 | "87064": 0
108 | }
--------------------------------------------------------------------------------
/stabletoolbench/solvable_queries/test_query_ids/G3_instruction.json:
--------------------------------------------------------------------------------
1 | {
2 | "455": 0,
3 | "456": 0,
4 | "457": 0,
5 | "459": 0,
6 | "460": 0,
7 | "1983": 0,
8 | "1984": 0,
9 | "1985": 0,
10 | "1989": 0,
11 | "1991": 0,
12 | "5863": 0,
13 | "5864": 0,
14 | "5865": 0,
15 | "8031": 0,
16 | "8032": 0,
17 | "8034": 0,
18 | "8334": 0,
19 | "8335": 0,
20 | "8337": 0,
21 | "9341": 0,
22 | "9343": 0,
23 | "9344": 0,
24 | "9345": 0,
25 | "9346": 0,
26 | "9349": 0,
27 | "10898": 0,
28 | "11644": 0,
29 | "11645": 0,
30 | "11647": 0,
31 | "11648": 0,
32 | "11649": 0,
33 | "11650": 0,
34 | "13773": 0,
35 | "13774": 0,
36 | "13777": 0,
37 | "13779": 0,
38 | "13780": 0,
39 | "14485": 0,
40 | "14489": 0,
41 | "14938": 0,
42 | "14950": 0,
43 | "18978": 0,
44 | "18979": 0,
45 | "18980": 0,
46 | "18982": 0,
47 | "18984": 0,
48 | "18987": 0,
49 | "18988": 0,
50 | "18990": 0,
51 | "18992": 0,
52 | "19272": 0,
53 | "19274": 0,
54 | "19281": 0,
55 | "20022": 0,
56 | "20024": 0,
57 | "20026": 0,
58 | "20027": 0,
59 | "20028": 0,
60 | "20029": 0,
61 | "20030": 0,
62 | "21682": 0
63 | }
--------------------------------------------------------------------------------
/stabletoolbench/toolbench/inference/Algorithms/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yuyq18/StepTool/121a6acd11a899d947a5d3fa7269cf76624c1df3/stabletoolbench/toolbench/inference/Algorithms/__init__.py
--------------------------------------------------------------------------------
/stabletoolbench/toolbench/inference/Algorithms/base_search.py:
--------------------------------------------------------------------------------
1 | from Downstream_tasks.base_env import base_env
2 |
3 | class base_search_method:
4 | """For the base tree search method, you need to support the following functions"""
5 |
6 | def __init__(self,llm,io_func: base_env, process_id=0, callbacks = None):
7 | """Args:
8 | llm: The interface of the LLM
9 | io_func (base_env): Interface to the environment.
10 | process_id (int, optional): In multi-process annotation, this identifies the process. Defaults to 0.
11 | callbacks (optional): Callback handlers invoked during the search. Defaults to None.
12 | """
13 | pass
14 |
15 | def to_json(self,answer=False,process=True):
16 | '''
17 | Return a JSON object.
18 | If answer=True, it must contain the following field for answer annotation.
19 | If process=True, it must provide the full information of the tree-search process.
20 |
21 | "answer_generation": {
22 | "valid_data": bool,
23 | "final_answer": string,
24 | "finish_type": enum["give_up","give_answer"],
25 | "train_messages": [ [openAI-message] ],
26 | }
27 | '''
28 | raise NotImplementedError
29 |
30 | def start(self, **args):
31 | """This is the entry point of the searching process"""
32 | raise NotImplementedError
33 |
34 |
--------------------------------------------------------------------------------
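A minimal sketch (not part of the repository) of the `answer_generation` payload that a concrete search method is expected to expose via `to_json(answer=True)`, following the docstring contract above; all field values are illustrative placeholders.

```python
import json

# Illustrative only: the shape a concrete search method is expected to return
# from to_json(answer=True), per the docstring contract above.
example_to_json_output = {
    "answer_generation": {
        "valid_data": True,                  # whether a usable final answer exists
        "final_answer": "Illustrative final answer text.",
        "finish_type": "give_answer",        # or "give_up"
        "train_messages": [                  # one list of OpenAI-style messages per answer path
            [
                {"role": "system", "content": "..."},
                {"role": "user", "content": "..."},
                {"role": "assistant", "content": "..."},
            ]
        ],
    }
}

if __name__ == "__main__":
    print(json.dumps(example_to_json_output, indent=2))
```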
/stabletoolbench/toolbench/inference/Downstream_tasks/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yuyq18/StepTool/121a6acd11a899d947a5d3fa7269cf76624c1df3/stabletoolbench/toolbench/inference/Downstream_tasks/__init__.py
--------------------------------------------------------------------------------
/stabletoolbench/toolbench/inference/Downstream_tasks/base_env.py:
--------------------------------------------------------------------------------
1 | class base_env:
2 |
3 | def __init__(self):
4 | self.task_description = ""
5 | self.input_description = ""
6 | self.tool_names = []
7 | self.functions = []
8 |
9 | def restart(self):
10 | '''
11 | Restart the environment
12 | '''
13 | raise NotImplementedError
14 |
15 | def get_score(self):
16 | '''
17 | Get the value of the current state
18 | A placeholder for searching in oracle mode; it is not actually used (and the true value is impossible to obtain)
19 | '''
20 | raise NotImplementedError
21 |
22 | def step(self, action, input_str):
23 | '''
24 | Perform an interaction in natural language mode
25 | return value (output str, status code)
26 | '''
27 | raise NotImplementedError
28 |
29 | def check_success(self):
30 | '''
31 | Returns 1 if successful, otherwise returns 0
32 | '''
33 | raise NotImplementedError
34 |
35 | def to_json(self):
36 | raise NotImplementedError
--------------------------------------------------------------------------------
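A toy environment (not part of the repository) that mirrors the `base_env` interface above; it is written standalone so it runs without the toolbench package on the path, and the status code `3` for a finished step is only an illustrative choice, not a value defined by `base_env`.

```python
# Toy environment mirroring the base_env interface above; illustrative only.
class EchoEnv:
    def __init__(self):
        self.task_description = "Echo the user's input back as the final answer."
        self.input_description = "Any text."
        self.tool_names = ["Finish"]
        self.functions = []
        self._finished = False

    def restart(self):
        self._finished = False

    def step(self, action, input_str):
        # Returns (observation string, status code); 3 is used here as an
        # illustrative "finished" status, not a value defined by base_env.
        if action == "Finish":
            self._finished = True
            return input_str, 3
        return f"Unknown action: {action}", 0

    def check_success(self):
        return 1 if self._finished else 0

    def to_json(self):
        return {"task_description": self.task_description, "finished": self._finished}


if __name__ == "__main__":
    env = EchoEnv()
    print(env.step("Finish", "hello"))  # ('hello', 3)
    print(env.check_success())          # 1
```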
/stabletoolbench/toolbench/inference/LLM/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yuyq18/StepTool/121a6acd11a899d947a5d3fa7269cf76624c1df3/stabletoolbench/toolbench/inference/LLM/__init__.py
--------------------------------------------------------------------------------
/stabletoolbench/toolbench/inference/LLM/base_io.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | def base_io(input_str):
4 | pass
--------------------------------------------------------------------------------
/stabletoolbench/toolbench/inference/LLM/retriever.py:
--------------------------------------------------------------------------------
1 | import time
2 | import pandas as pd
3 | from sentence_transformers import SentenceTransformer, util
4 | import json
5 | import re
6 | import os, torch
7 | from toolbench.utils import standardize, standardize_category, change_name, process_retrieval_ducoment
8 |
9 |
10 | class ToolRetriever:
11 | def __init__(self, corpus_tsv_path = "", model_path=""):
12 | self.corpus_tsv_path = corpus_tsv_path
13 | self.model_path = model_path
14 | self.model_name = model_path.split('/')[-1]
15 | self.corpus, self.corpus2tool = self.build_retrieval_corpus()
16 | self.embedder = self.build_retrieval_embedder()
17 | self.corpus_embeddings = self.build_corpus_embeddings()
18 |
19 | def build_retrieval_corpus(self):
20 | print("Building corpus...")
21 | documents_df = pd.read_csv(self.corpus_tsv_path, sep='\t')
22 | corpus, corpus2tool = process_retrieval_ducoment(documents_df)
23 | corpus_ids = list(corpus.keys())
24 | corpus = [corpus[cid] for cid in corpus_ids]
25 | return corpus, corpus2tool
26 |
27 | def build_retrieval_embedder(self):
28 | print("Building embedder...")
29 | embedder = SentenceTransformer(self.model_path)
30 | return embedder
31 |
32 | def build_corpus_embeddings(self):
33 | print("Building corpus embeddings with embedder...")
34 | embedding_save_path = self.corpus_tsv_path.replace('.tsv', f'_{self.model_name}_embeddings.pt')
35 | if os.path.exists(embedding_save_path):
36 | print("Loading pre-computed corpus embeddings...")
37 | corpus_embeddings = torch.load(embedding_save_path)
38 | return corpus_embeddings
39 | print("Computing corpus embeddings...")
40 | corpus_embeddings = self.embedder.encode(self.corpus, convert_to_tensor=True)
41 |
42 | torch.save(corpus_embeddings, embedding_save_path)
43 | return corpus_embeddings
44 |
45 | def retrieving(self, query, top_k=5, excluded_tools={}):
46 | print("Retrieving...")
47 | start = time.time()
48 | query_embedding = self.embedder.encode(query, convert_to_tensor=True)
49 | hits = util.semantic_search(query_embedding, self.corpus_embeddings, top_k=10*top_k, score_function=util.cos_sim)
50 | retrieved_tools = []
51 | for rank, hit in enumerate(hits[0]):
52 | # import pdb; pdb.set_trace()
53 | try:
54 | category, tool_name, api_name = self.corpus2tool[self.corpus[hit['corpus_id']]].split('[SEP]')
55 | except:
56 | print(self.corpus2tool[self.corpus[hit['corpus_id']]])
57 | import pdb; pdb.set_trace()
58 | category = standardize_category(category)
59 | tool_name = standardize(tool_name) # standardizing
60 | api_name = change_name(standardize(api_name)) # standardizing
61 | if category in excluded_tools:
62 | if tool_name in excluded_tools[category]:
63 | top_k += 1
64 | continue
65 | tmp_dict = {
66 | "category": category,
67 | "tool_name": tool_name,
68 | "api_name": api_name
69 | }
70 | retrieved_tools.append(tmp_dict)
71 | return retrieved_tools
--------------------------------------------------------------------------------
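A usage sketch for `ToolRetriever`. Assumptions: the corpus TSV path and the retriever checkpoint name below are hypothetical placeholders, `toolbench` is importable from the repository root, and a sentence-transformers model is available locally or downloadable.

```python
# Hypothetical paths; a real corpus TSV and retriever checkpoint are required.
from toolbench.inference.LLM.retriever import ToolRetriever

retriever = ToolRetriever(
    corpus_tsv_path="data/retrieval/corpus.tsv",
    model_path="ToolBench/ToolBench_IR_bert_based_uncased",
)

# Returns a list of {"category", "tool_name", "api_name"} dicts.
tools = retriever.retrieving("I want tomorrow's weather forecast for Paris", top_k=5)
for tool in tools:
    print(tool["category"], tool["tool_name"], tool["api_name"])
```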
/stabletoolbench/toolbench/inference/LLM_rank/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yuyq18/StepTool/121a6acd11a899d947a5d3fa7269cf76624c1df3/stabletoolbench/toolbench/inference/LLM_rank/__init__.py
--------------------------------------------------------------------------------
/stabletoolbench/toolbench/inference/LLM_rank/rank_candidate.py:
--------------------------------------------------------------------------------
1 | '''
2 | Evaluate the score of a query corresponding to different candidates
3 | '''
4 |
5 | from Prompts.rank_prompts import LLM_PAIRWISE_RANK_SUBFIX_SYSTEM_PROMPT, LLM_PAIRWISE_RANK_USER_PROMPT
6 | import random
7 | from Tree.Tree import tree_node
8 |
9 |
10 | def rank2symmetry(llm_interface, LLM_rank_args, cand1,cand2):
11 | '''
12 | Use the LLM to compare two candidates. Because the result can depend on presentation order, each candidate is compared once in the first position (A vs. B and B vs. A).
13 | '''
14 | single_rank_func = LLM_rank_args["rank_func"]
15 | score = [0,0]
16 | bigger1,query_count1, total_tokens1 = single_rank_func(llm_interface, LLM_rank_args, cand1,cand2)
17 | score[1 - bigger1] += 1
18 | bigger2,query_count2, total_tokens2 = single_rank_func(llm_interface, LLM_rank_args, cand2,cand1)
19 | score[bigger2] += 1
20 | if score[0] > score[1]:
21 | return 1 , query_count1 + query_count2, total_tokens1 + total_tokens2
22 | elif score[0] < score[1]:
23 | return -1, query_count1 + query_count2, total_tokens1 + total_tokens2
24 | else:
25 | return 0, query_count1 + query_count2, total_tokens1 + total_tokens2
26 |
27 |
28 |
29 | def rank2_subfix(llm_interface,LLM_rank_args, cand1,cand2):
30 | '''
31 | Assumes that the two candidates share a long common prefix
32 | '''
33 | anscestor_interesction = tree_node.find_ancestor_intersection(cand1,cand2)
34 | assert anscestor_interesction != None
35 | intersect_trice = anscestor_interesction.get_former_trice_from_this_node(end_node=None)
36 | trice_1 = cand1.get_former_trice_from_this_node(end_node=anscestor_interesction)
37 | trice_2 = cand2.get_former_trice_from_this_node(end_node=anscestor_interesction)
38 |
39 | system_message = LLM_PAIRWISE_RANK_SUBFIX_SYSTEM_PROMPT
40 | system_message = system_message.replace("{task_description}", LLM_rank_args["task_description"])
41 | system_message = system_message.replace("{intersect_trice}", intersect_trice)
42 | system_message = system_message.replace("{candidate_A}",trice_1)
43 | system_message = system_message.replace("{candidate_B}",trice_2)
44 | llm_interface.change_messages([{"role":"system","content":system_message},
45 | {"role":"user","content":LLM_PAIRWISE_RANK_USER_PROMPT},
46 | ])
47 | output,error_code, total_tokens = llm_interface.parse(functions=LLM_rank_args["functions"],function_call="none",process_id=LLM_rank_args["process_id"])
48 | if output["content"].strip().lower()[-1] == "a":
49 | return 1, 1, total_tokens
50 | else:
51 | return 0, 1, total_tokens
52 |
53 | def sum_based_rankn(llm_interface,LLM_rank_args, candidates):
54 | '''
55 | Compare all candidate pairs pairwise, sum the points for each candidate, and choose the one with the highest total
56 | '''
57 | total_querys = 0
58 | total_tokens = 0
59 | scores = [0]*len(candidates)
60 | for i in range(len(candidates)-1):
61 | for j in range(i+1,len(candidates)):
62 | pairwise_rank,query_count,rank2_tokens = rank2symmetry(llm_interface,LLM_rank_args, candidates[i],candidates[j])
63 | total_querys += query_count
64 | total_tokens += rank2_tokens
65 | if pairwise_rank > 0:
66 | scores[i] += 1
67 | elif pairwise_rank < 0:
68 | scores[j] += 1
69 | else:
70 | scores[i] += 0.5
71 | scores[j] += 0.5
72 | return scores, total_querys, total_tokens
73 |
74 |
75 |
76 | if __name__ == "__main__":
77 | random.seed(42)
78 | # candidates = [
79 | # "234",
80 | # "66.5",
81 | # "77.1",
82 | # "88.967",
83 | # "pi",
84 | # # "e",
85 | # # "ln(2)"
86 | # ]
87 | candidates = [
88 | "77.1",
89 | "88.967",
90 | "pi",
91 | "66.5",
92 | "234",
93 | "ln(2)"
94 | ]
95 | '''
96 | starting_delta:
97 | 50 -> 42.85%
98 | 100 -> 35.99%
99 | 150 -> 29.66%
100 | 200 -> 24.03%
101 | '''
102 |
--------------------------------------------------------------------------------
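A small demo of the pairwise scoring in `sum_based_rankn`, using a mock `rank_func` instead of an LLM. It assumes the script is run from `stabletoolbench/toolbench/inference` so the module's own top-level imports (`Prompts`, `Tree`) resolve; the mock comparison on numbers is purely illustrative.

```python
# Run from stabletoolbench/toolbench/inference so Prompts/Tree imports resolve.
from LLM_rank.rank_candidate import sum_based_rankn


def mock_rank_func(llm_interface, rank_args, cand_a, cand_b):
    # Mimics rank2_subfix's return convention:
    # (1 if the first candidate is preferred else 0, query_count, total_tokens).
    return (1 if cand_a > cand_b else 0), 1, 0


candidates = [3, 9, 5]
scores, n_queries, n_tokens = sum_based_rankn(
    llm_interface=None,
    LLM_rank_args={"rank_func": mock_rank_func},
    candidates=candidates,
)
print(scores)     # the largest value (9) collects the most pairwise points
print(n_queries)  # 6: two comparisons per pair (both orders) x 3 pairs
```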
/stabletoolbench/toolbench/inference/Prompts/ReAct_prompts.py:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | FORMAT_INSTRUCTIONS_SYSTEM_FUNCTION = """You are AutoGPT, you can use many tools (functions) to do the following task.
5 | First I will give you the task description, and your task starts.
6 | At each step, you need to give your thought to analyze the status now and what to do next, with a function call to actually execute your step.
7 | After the call, you will get the call result, and you are now in a new state.
8 | Then you will analyze your status now, then decide what to do next...
9 | After many (Thought-call) pairs, you finally perform the task, then you can give your final answer.
10 | Remember:
11 | 1. The state change is irreversible, you can't go back to one of the former states. If you want to restart the task, say "I give up and restart".
12 | 2. All thoughts should be short, at most 5 sentences.
13 | 3. You can make more than one try, so if your plan is to continuously try some conditions, you can try one of the conditions per try.
14 | Let's Begin!
15 | Task description: {task_description}"""
16 |
17 | FORMAT_INSTRUCTIONS_USER_FUNCTION = """
18 | {input_description}
19 | Begin!
20 | """
21 |
22 | FORMAT_INSTRUCTIONS_SYSTEM_FUNCTION_ZEROSHOT = """Answer the following questions as best you can. Specifically, you have access to the following APIs:
23 |
24 | {func_str}
25 |
26 | Use the following format:
27 | Thought: you should always think about what to do
28 | Action: the action to take, should be one of {func_list}
29 | Action Input: the input to the action
30 | End Action
31 |
32 | Begin! Remember: (1) Follow the format, i.e.,
33 | Thought:
34 | Action:
35 | Action Input:
36 | End Action
37 | (2)The Action: MUST be one of the following:{func_list}
38 | (3)If you believe that you have obtained enough information (which can be judged from the history observations) that can answer the task, please call:
39 | Action: Finish
40 | Action Input: {{"return_type": "give_answer", "final_answer": your answer string}}.
41 | Question: {question}
42 |
43 | Here are the history actions and observations:
44 | """
45 |
--------------------------------------------------------------------------------
/stabletoolbench/toolbench/inference/Prompts/Tree_search_prompts.py:
--------------------------------------------------------------------------------
1 | DIVERSITY_PROMPT='''This is not the first time you have tried this task; all previous trials failed.
2 | Before you generate your thought for this state, I will first show you your previous actions for this state, and then you must generate actions that are different from all of them. Here are some previous action candidates:
3 | {previous_candidate}
4 | Remember you are now in the intermediate state of a trial; you will first analyze the current state and the previous action candidates, then take actions that are different from all the previous ones.'''
5 |
6 |
7 |
--------------------------------------------------------------------------------
/stabletoolbench/toolbench/inference/Prompts/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yuyq18/StepTool/121a6acd11a899d947a5d3fa7269cf76624c1df3/stabletoolbench/toolbench/inference/Prompts/__init__.py
--------------------------------------------------------------------------------
/stabletoolbench/toolbench/inference/Prompts/rank_prompts.py:
--------------------------------------------------------------------------------
1 |
2 | LLM_PAIRWISE_RANK_SUBFIX_SYSTEM_PROMPT = '''
3 | You are value-GPT, an expert at judging which trial is better, that is, which trial is closer to solving the task.
4 | All candidates try to solve this task with some function calls:
5 | *******************************
6 | {{TASK_DESCRIPTION}}
7 | {task_description}
8 | {{END_TASK_DESCRIPTION}}
9 | *******************************
10 | First, all candidates do the following things:
11 | {intersect_trice}
12 | After that, there are two candidates A and B, they do different things:
13 | *******************************
14 | {{CANDIDATE_A_START}}
15 | {candidate_A}
16 | {{CANDIDATE_A_END}}
17 | *******************************
18 | {{CANDIDATE_B_START}}
19 | {candidate_B}
20 | {{CANDIDATE_B_END}}
21 | Which try do you think is more helpful for solving the task?
22 | '''
23 |
24 |
25 |
26 |
27 | LLM_PAIRWISE_RANK_USER_PROMPT = '''
28 | Tell me which candidate is better in ONE Word: "A" or "B":'''
--------------------------------------------------------------------------------
/stabletoolbench/toolbench/inference/Tree/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yuyq18/StepTool/121a6acd11a899d947a5d3fa7269cf76624c1df3/stabletoolbench/toolbench/inference/Tree/__init__.py
--------------------------------------------------------------------------------
/stabletoolbench/toolbench/inference/callbacks/ServerEventCallback.py:
--------------------------------------------------------------------------------
1 | from typing import Any, Dict, List, Union
2 | import queue
3 | class ServerEventCallback():
4 | """Base callback handler"""
5 |
6 | def __init__(self, queue: queue.Queue, *args, **kwargs):
7 | super().__init__(*args, **kwargs)
8 | self.queue = queue
9 | self.llm_block_id = 0
10 | self.tool_block_id = 0
11 | self.tool_descriptions = {}
12 |
13 | def add_to_queue(self, method_name: str, block_id, **kwargs: Any):
14 | data = {
15 | "method_name": method_name,
16 | "block_id": block_id,
17 | }
18 | data.update(kwargs)
19 | self.queue.put(data)
20 |
21 | def on_tool_retrieval_start(self):
22 | # tools should be of the form
23 | # {tool_name, tool_desc}
24 | self.add_to_queue(
25 | "on_tool_retrieval_start",
26 | "recommendation-1",
27 | )
28 | print("on_tool_retrieval_start method called")
29 |
30 | def on_tool_retrieval_end(self, tools):
31 | # tool should be of the form
32 | # {tool_name, tool_desc}
33 | self.add_to_queue(
34 | "on_tool_retrieval_end",
35 | "recommendation-1",
36 | recommendations=tools
37 | )
38 | self.tool_descriptions = {
39 | tool["name"]: tool for tool in tools
40 | }
41 | print("on_tool_retrieval_end method called")
42 | def on_request_start(self, user_input: str, method: str) -> Any:
43 | self.tool_block_id = 0
44 | self.llm_block_id = 0
45 | self.add_to_queue(
46 | "on_request_start",
47 | block_id="start",
48 | user_input=user_input,
49 | method=method
50 | )
51 | def on_request_end(self, outputs: str, chain: List[Any]):
52 | self.add_to_queue(
53 | "on_request_end",
54 | block_id="end",
55 | output=outputs,
56 | chain=chain
57 | )
58 | def on_request_error(self, error: str):
59 | self.add_to_queue(
60 | "on_request_error",
61 | block_id="error",
62 | error=error
63 | )
64 |
65 | # keep
66 | def on_chain_start(self, inputs: str, depth: int) -> Any:
67 | """Run when chain starts running."""
68 | print("on_chain_start method called")
69 | self.llm_block_id += 1
70 | block_id = "llm-" + str(self.llm_block_id)
71 | self.add_to_queue(
72 | "on_chain_start",
73 | block_id=block_id,
74 | messages=inputs,
75 | depth=depth
76 | )
77 | return block_id
78 |
79 | # this one needs the block_id memorized
80 | def on_chain_end(self, block_id: str, depth: int) -> Any:
81 | self.add_to_queue(
82 | "on_chain_end",
83 | block_id=block_id,
84 | # output=output,
85 | depth=depth
86 | )
87 | print("on_chain_end method called")
88 |
89 | def on_chain_error(self, error: Union[Exception, KeyboardInterrupt], **kwargs: Any) -> Any:
90 | method_name = "on_chain_error"
91 | self.add_to_queue(method_name, error=error, **kwargs)
92 | print("on_chain_error method called")
93 |
94 | def on_llm_start(
95 | self, messages: str, depth: int
96 | ) -> Any:
97 | """Run when LLM starts running."""
98 | self.add_to_queue(
99 | "on_llm_start",
100 | block_id="llm-" + str(self.llm_block_id),
101 | messages=messages,
102 | depth=depth
103 | )
104 | print("on_llm_start method called")
105 |
106 | def on_llm_new_token(self, token: str, **kwargs: Any) -> Any:
107 | """Run on new LLM token. Only available when streaming is enabled."""
108 | method_name = "on_llm_new_token"
109 | self.add_to_queue(method_name, token=token, **kwargs)
110 | print("on_llm_new_token method called")
111 |
112 | def on_llm_end(self, response: str, depth: int) -> Any:
113 | """Run when LLM ends running."""
114 | self.add_to_queue(
115 | "on_llm_end",
116 | block_id="llm-" + str(self.llm_block_id),
117 | response=response,
118 | depth=depth
119 | )
120 | print("on_llm_end method called")
121 |
122 | def on_llm_error(self, error: Union[Exception, KeyboardInterrupt]) -> Any:
123 | """Run when LLM errors."""
124 | self.add_to_queue(
125 | "on_llm_error",
126 | block_id="llm-" + str(self.llm_block_id),
127 | message=str(error),
128 | error=error
129 | )
130 | print("on_llm_error method called")
131 |
132 | def on_agent_action(self, action, action_input, depth: int) -> str:
133 | self.tool_block_id += 1
134 | block_id="tool-" + str(self.tool_block_id)
135 | self.add_to_queue(
136 | "on_agent_action",
137 | block_id=block_id,
138 | action=action,
139 | action_input = action_input,
140 | depth=depth
141 | )
142 | print("on_agent_action method called")
143 | return block_id
144 |
145 | def on_tool_start(self, tool_name: str, tool_input: str, depth: int) -> Any:
146 | method_name = "on_tool_start"
147 | tool_description = "Tool not found in tool descriptions"
148 | if tool_name in self.tool_descriptions:
149 | tool_description = self.tool_descriptions[tool_name]
150 | else:
151 | print(self.tool_descriptions)
152 | print("Key", tool_name, "not found in tool descriptions")
153 | self.add_to_queue(
154 | method_name,
155 | block_id="tool-" + str(self.tool_block_id),
156 | tool_name=tool_name,
157 | tool_description=tool_description,
158 | tool_input=tool_input,
159 | depth=depth
160 | )
161 | print("on_tool_start method called")
162 |
163 | def on_tool_end(self, output: str, status:int, depth: int) -> Any:
164 | method_name = "on_tool_end"
165 | self.add_to_queue(
166 | method_name,
167 | block_id="tool-" + str(self.tool_block_id),
168 | output=output,
169 | status= status,
170 | depth=depth
171 | )
172 | print("on_tool_end method called")
173 |
174 | def on_tool_error(self, error: Union[Exception, KeyboardInterrupt]) -> Any:
175 | method_name = "on_tool_error"
176 | self.add_to_queue(
177 | method_name,
178 | error=error
179 | )
180 | print("on_tool_error method called")
181 |
182 | def on_agent_end(self, block_id:str, depth: int):
183 | self.add_to_queue(
184 | "on_agent_end",
185 | block_id=block_id,
186 | depth=depth
187 | )
188 | print("on_agent_end method called")
--------------------------------------------------------------------------------
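A minimal usage sketch (assuming the `toolbench` package is importable) showing how the callback pushes event dicts into a queue that a server can drain; the event sequence below is only an illustration of the lifecycle hooks.

```python
import queue

from toolbench.inference.callbacks.ServerEventCallback import ServerEventCallback

q = queue.Queue()
cb = ServerEventCallback(q)

# Emit a few lifecycle events, as the inference loop would.
block_id = cb.on_chain_start(inputs="user query", depth=0)
cb.on_llm_start(messages=[{"role": "user", "content": "hi"}], depth=1)
cb.on_llm_end(response="hello", depth=1)
cb.on_chain_end(block_id, depth=0)

# Each entry emitted above is a dict with at least method_name and block_id.
while not q.empty():
    event = q.get()
    print(event["method_name"], event["block_id"])
```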
/stabletoolbench/toolbench/inference/qa_pipeline.py:
--------------------------------------------------------------------------------
1 | '''
2 | Closed-domain QA Pipeline
3 | '''
4 |
5 | import argparse, os
6 | import yaml
7 | from toolbench.inference.Downstream_tasks.rapidapi import pipeline_runner
8 |
9 |
10 | if __name__ == "__main__":
11 |
12 | parser = argparse.ArgumentParser()
13 | parser.add_argument('--backbone_model', type=str, default="toolllama", required=False, help='chatgpt_function or davinci or toolllama')
14 | parser.add_argument('--chatgpt_model', type=str, default="gpt-4-turbo-2024-04-09", required=False, help='gpt-3.5-turbo or gpt-4')
15 | # parser.add_argument('--base_url', type=str, default="https://api.openai.com/v1", required=False, help='openai api url')
16 | # parser.add_argument('--openai_key', type=str, default="", required=False, help='openai key for chatgpt_function or davinci model')
17 | parser.add_argument('--config_file', type=str, default='config.yml', help='Api configuration file')
18 | parser.add_argument('--model_path', type=str, default="your_model_path/", required=False, help='')
19 | # parser.add_argument('--tool_root_dir', type=str, default="your_tools_path/", required=True, help='')
20 | parser.add_argument("--lora", action="store_true", help="Load lora model or not.")
21 | parser.add_argument('--lora_path', type=str, default="your_lora_path if lora", required=False, help='')
22 | parser.add_argument('--max_observation_length', type=int, default=1024, required=False, help='maximum observation length')
23 | parser.add_argument('--max_source_sequence_length', type=int, default=4096, required=False, help='original maximum model sequence length')
24 | parser.add_argument('--max_sequence_length', type=int, default=8192, required=False, help='maximum model sequence length')
25 | parser.add_argument('--observ_compress_method', type=str, default="truncate", choices=["truncate", "filter", "random"], required=False, help='observation compress method')
26 | parser.add_argument('--method', type=str, default="CoT@1", required=False, help='method for answer generation: CoT@n,Reflexion@n,BFS,DFS,UCT_vote')
27 | parser.add_argument('--input_query_file', type=str, default="", required=False, help='input path')
28 | parser.add_argument('--output_answer_file', type=str, default="",required=False, help='output path')
29 | # parser.add_argument('--toolbench_key', type=str, default="",required=False, help='your toolbench key to request rapidapi service')
30 | parser.add_argument('--rapidapi_key', type=str, default="",required=False, help='your rapidapi key to request rapidapi service')
31 | parser.add_argument('--use_rapidapi_key', action="store_true", help="To use customized rapidapi service or not.")
32 | parser.add_argument('--api_customization', action="store_true", help="To use customized api or not.")
33 |
34 | args = parser.parse_args()
35 |
36 | CONFIG = yaml.load(open(args.config_file, 'r'), Loader=yaml.FullLoader)
37 | os.environ["OPENAI_API_BASE"] = CONFIG['api_base']
38 | os.environ["OPENAI_KEY"] = CONFIG['api_key']
39 | os.environ["TOOLBENCH_KEY"] = CONFIG['toolbench_key']
40 | os.environ["TOOL_ROOT_DIR"] = CONFIG['tool_root_dir']
41 |
42 | pipeline_runner = pipeline_runner(args)
43 | pipeline_runner.run()
44 |
45 |
--------------------------------------------------------------------------------
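The pipeline reads `api_base`, `api_key`, `toolbench_key`, and `tool_root_dir` from the YAML file passed via `--config_file`. A sketch of such a config follows; all values, including the tool directory path, are hypothetical placeholders.

```python
# Sketch of the config.yml keys qa_pipeline.py expects; values are placeholders.
import yaml

example_config = """
api_base: https://api.openai.com/v1
api_key: sk-your-openai-key
toolbench_key: your-toolbench-key
tool_root_dir: data/toolenv/tools
"""

cfg = yaml.safe_load(example_config)
print(sorted(cfg))  # ['api_base', 'api_key', 'tool_root_dir', 'toolbench_key']
```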
/stabletoolbench/toolbench/inference/qa_pipeline_multithread.py:
--------------------------------------------------------------------------------
1 | '''
2 | Closed-domain QA Pipeline
3 | '''
4 |
5 | import argparse, os
6 | import yaml
7 | from toolbench.inference.Downstream_tasks.rapidapi_multithread import pipeline_runner
8 |
9 |
10 | if __name__ == "__main__":
11 |
12 | parser = argparse.ArgumentParser()
13 | parser.add_argument('--backbone_model', type=str, default="toolllama", required=False, help='chatgpt_function or davinci or toolllama')
14 | parser.add_argument('--chatgpt_model', type=str, default="gpt-4-turbo-2024-04-09", required=False, help='gpt-3.5-turbo or gpt-4')
15 | # parser.add_argument('--base_url', type=str, default="https://api.openai.com/v1", required=False, help='openai api url')
16 | # parser.add_argument('--openai_key', type=str, default="", required=False, help='openai key for chatgpt_function or davinci model')
17 | parser.add_argument('--config_file', type=str, default='config.yml', help='Api configuration file')
18 | parser.add_argument('--model_path', type=str, default="your_model_path/", required=False, help='')
19 | # parser.add_argument('--tool_root_dir', type=str, default="your_tools_path/", required=True, help='')
20 | parser.add_argument("--lora", action="store_true", help="Load lora model or not.")
21 | parser.add_argument('--lora_path', type=str, default="your_lora_path if lora", required=False, help='')
22 | parser.add_argument('--max_observation_length', type=int, default=1024, required=False, help='maximum observation length')
23 | parser.add_argument('--max_source_sequence_length', type=int, default=4096, required=False, help='original maximum model sequence length')
24 | parser.add_argument('--max_sequence_length', type=int, default=8192, required=False, help='maximum model sequence length')
25 | parser.add_argument('--single_chain_max_step', type=int, default=12, required=False, help='maximum step for single chain')
26 | parser.add_argument('--max_query_count', type=int, default=30, required=False, help='maximum query count')
27 | parser.add_argument('--observ_compress_method', type=str, default="truncate", choices=["truncate", "filter", "random"], required=False, help='observation compress method')
28 | parser.add_argument('--method', type=str, default="CoT@1", required=False, help='method for answer generation: CoT@n,Reflexion@n,BFS,DFS,UCT_vote')
29 | parser.add_argument('--input_query_file', type=str, default="", required=False, help='input path')
30 | parser.add_argument('--output_answer_file', type=str, default="",required=False, help='output path')
31 | # parser.add_argument('--toolbench_key', type=str, default="",required=False, help='your toolbench key to request rapidapi service')
32 | parser.add_argument('--rapidapi_key', type=str, default="",required=False, help='your rapidapi key to request rapidapi service')
33 | parser.add_argument('--use_rapidapi_key', action="store_true", help="To use customized rapidapi service or not.")
34 | parser.add_argument('--api_customization', action="store_true", help="To use customized api or not.")
35 | parser.add_argument('--num_thread', type=int, default=1, required=False, help='number of threads')
36 | parser.add_argument('--disable_tqdm', action="store_true", help="disable tqdm or not.")
37 | parser.add_argument('--overwrite', action='store_true', help='overwrite existing runs')
38 | parser.add_argument('--easy_tool', action='store_true', help='use easy tool baseline or not')
39 |
40 | args = parser.parse_args()
41 | if args.overwrite:
42 | os.system(f"rm -rf {args.output_answer_file}")
43 |
44 | CONFIG = yaml.load(open(args.config_file, 'r'), Loader=yaml.FullLoader)
45 | os.environ["OPENAI_API_BASE"] = CONFIG['api_base']
46 | os.environ["OPENAI_KEY"] = CONFIG['api_key']
47 | os.environ["TOOLBENCH_KEY"] = CONFIG['toolbench_key']
48 | os.environ["TOOL_ROOT_DIR"] = CONFIG['tool_root_dir']
49 |
50 | pipeline_runner = pipeline_runner(args)
51 | pipeline_runner.run()
52 |
53 |
--------------------------------------------------------------------------------
/stabletoolbench/toolbench/inference/qa_pipeline_open_domain.py:
--------------------------------------------------------------------------------
1 | '''
2 | Open-domain QA Pipeline
3 | '''
4 | import argparse
5 | from toolbench.inference.Downstream_tasks.rapidapi import pipeline_runner
6 |
7 |
8 | if __name__ == "__main__":
9 |
10 | parser = argparse.ArgumentParser()
11 | parser.add_argument('--corpus_tsv_path', type=str, default="your_retrival_corpus_path/", required=False, help='')
12 | parser.add_argument('--retrieval_model_path', type=str, default="your_model_path/", required=False, help='')
13 | parser.add_argument('--retrieved_api_nums', type=int, default=5, required=False, help='')
14 | parser.add_argument('--backbone_model', type=str, default="toolllama", required=False, help='chatgpt_function or davinci or toolllama')
15 | parser.add_argument('--chatgpt_model', type=str, default="gpt-4-turbo-2024-04-09", required=False, help='gpt-3.5-turbo or gpt-4')
16 | parser.add_argument('--base_url', type=str, default="https://api.openai.com/v1", required=False, help='openai api url')
17 | parser.add_argument('--openai_key', type=str, default="", required=False, help='openai key for chatgpt_function or davinci model')
18 | parser.add_argument('--model_path', type=str, default="your_model_path/", required=False, help='')
19 | parser.add_argument('--tool_root_dir', type=str, default="your_tools_path/", required=True, help='')
20 | parser.add_argument("--lora", action="store_true", help="Load lora model or not.")
21 | parser.add_argument('--lora_path', type=str, default="your_lora_path if lora", required=False, help='')
22 | parser.add_argument('--max_observation_length', type=int, default=1024, required=False, help='maximum observation length')
23 | parser.add_argument('--max_source_sequence_length', type=int, default=4096, required=False, help='original maximum model sequence length')
24 | parser.add_argument('--max_sequence_length', type=int, default=8192, required=False, help='maximum model sequence length')
25 | parser.add_argument('--observ_compress_method', type=str, default="truncate", choices=["truncate", "filter", "random"], required=False, help='observation compress method')
26 | parser.add_argument('--method', type=str, default="CoT@1", required=False, help='method for answer generation: CoT@n,Reflexion@n,BFS,DFS,UCT_vote')
27 | parser.add_argument('--input_query_file', type=str, default="", required=False, help='input path')
28 | parser.add_argument('--output_answer_file', type=str, default="",required=False, help='output path')
29 | parser.add_argument('--toolbench_key', type=str, default="",required=False, help='your toolbench key to request rapidapi service')
30 | parser.add_argument('--rapidapi_key', type=str, default="",required=False, help='your rapidapi key to request rapidapi service')
31 | parser.add_argument('--use_rapidapi_key', action="store_true", help="To use customized rapidapi service or not.")
32 | parser.add_argument('--api_customization', action="store_true", help="To use customized api or not. NOT SUPPORTED currently under open domain setting.")
33 |
34 | args = parser.parse_args()
35 |
36 | pipeline_runner = pipeline_runner(args, add_retrieval=True)
37 | pipeline_runner.run()
38 |
--------------------------------------------------------------------------------
/stabletoolbench/toolbench/model/__init__.py:
--------------------------------------------------------------------------------
1 | from toolbench.model.model_adapter import (
2 | load_model,
3 | get_conversation_template,
4 | add_model_args,
5 | )
6 |
--------------------------------------------------------------------------------
/stabletoolbench/toolbench/model/apply_delta.py:
--------------------------------------------------------------------------------
1 | """
2 | Apply the delta weights on top of a base model.
3 |
4 | Usage:
5 | python3 -m fastchat.model.apply_delta --base ~/model_weights/llama-7b --target ~/model_weights/vicuna-7b --delta lmsys/vicuna-7b-delta-v1.1
6 | """
7 | import argparse
8 | import gc
9 | import glob
10 | import json
11 | import os
12 | import shutil
13 | import tempfile
14 |
15 | from huggingface_hub import snapshot_download
16 | import torch
17 | from torch import nn
18 | from tqdm import tqdm
19 | from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
20 |
21 |
22 | GB = 1 << 30
23 |
24 |
25 | def split_files(model_path, tmp_path, split_size):
26 | if not os.path.exists(model_path):
27 | model_path = snapshot_download(repo_id=model_path)
28 | if not os.path.exists(tmp_path):
29 | os.makedirs(tmp_path)
30 |
31 | file_pattern = os.path.join(model_path, "pytorch_model-*.bin")
32 | files = glob.glob(file_pattern)
33 |
34 | part = 0
35 | try:
36 | for file_path in tqdm(files):
37 | state_dict = torch.load(file_path)
38 | new_state_dict = {}
39 |
40 | current_size = 0
41 | for name, param in state_dict.items():
42 | param_size = param.numel() * param.element_size()
43 |
44 | if current_size + param_size > split_size:
45 | new_file_name = f"pytorch_model-{part}.bin"
46 | new_file_path = os.path.join(tmp_path, new_file_name)
47 | torch.save(new_state_dict, new_file_path)
48 | current_size = 0
49 | new_state_dict = None
50 | gc.collect()
51 | new_state_dict = {}
52 | part += 1
53 |
54 | new_state_dict[name] = param
55 | current_size += param_size
56 |
57 | new_file_name = f"pytorch_model-{part}.bin"
58 | new_file_path = os.path.join(tmp_path, new_file_name)
59 | torch.save(new_state_dict, new_file_path)
60 | new_state_dict = None
61 | gc.collect()
62 | new_state_dict = {}
63 | part += 1
64 | except Exception as e:
65 | print(f"An error occurred during split_files: {e}")
66 | shutil.rmtree(tmp_path)
67 | raise
68 |
69 |
70 | def apply_delta_low_cpu_mem(base_model_path, target_model_path, delta_path):
71 | delta_tokenizer = AutoTokenizer.from_pretrained(delta_path, use_fast=False)
72 | delta_config = AutoConfig.from_pretrained(delta_path)
73 |
74 | if os.path.exists(target_model_path):
75 | shutil.rmtree(target_model_path)
76 | os.makedirs(target_model_path)
77 |
78 | split_size = 4 * GB
79 |
80 | with tempfile.TemporaryDirectory() as tmp_base_path, tempfile.TemporaryDirectory() as tmp_delta_path:
81 | print(f"Split files for the base model to {tmp_base_path}")
82 | split_files(base_model_path, tmp_base_path, split_size)
83 | print(f"Split files for the delta weights to {tmp_delta_path}")
84 | split_files(delta_path, tmp_delta_path, split_size)
85 |
86 | base_pattern = os.path.join(tmp_base_path, "pytorch_model-*.bin")
87 | base_files = glob.glob(base_pattern)
88 | delta_pattern = os.path.join(tmp_delta_path, "pytorch_model-*.bin")
89 | delta_files = glob.glob(delta_pattern)
90 | delta_state_dict = torch.load(delta_files[0])
91 |
92 | print("Applying the delta")
93 | weight_map = {}
94 | total_size = 0
95 |
96 | for i, base_file in tqdm(enumerate(base_files)):
97 | state_dict = torch.load(base_file)
98 | file_name = f"pytorch_model-{i}.bin"
99 | for name, param in state_dict.items():
100 | if name not in delta_state_dict:
101 | for delta_file in delta_files:
102 | delta_state_dict = torch.load(delta_file)
103 | gc.collect()
104 | if name in delta_state_dict:
105 | break
106 |
107 | state_dict[name] += delta_state_dict[name]
108 | weight_map[name] = file_name
109 | total_size += param.numel() * param.element_size()
110 | gc.collect()
111 | torch.save(state_dict, os.path.join(target_model_path, file_name))
112 |
113 | with open(
114 | os.path.join(target_model_path, "pytorch_model.bin.index.json"), "w"
115 | ) as f:
116 | json.dump(
117 | {"weight_map": weight_map, "metadata": {"total_size": total_size}}, f
118 | )
119 |
120 | print(f"Saving the target model to {target_model_path}")
121 | delta_tokenizer.save_pretrained(target_model_path)
122 | delta_config.save_pretrained(target_model_path)
123 |
124 |
125 | def apply_delta(base_model_path, target_model_path, delta_path):
126 | print(f"Loading the delta weights from {delta_path}")
127 | delta_tokenizer = AutoTokenizer.from_pretrained(delta_path, use_fast=False)
128 | delta = AutoModelForCausalLM.from_pretrained(
129 | delta_path, torch_dtype=torch.float16, low_cpu_mem_usage=True
130 | )
131 |
132 | print(f"Loading the base model from {base_model_path}")
133 | base = AutoModelForCausalLM.from_pretrained(
134 | base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True
135 | )
136 |
137 | print("Applying the delta")
138 | for name, param in tqdm(base.state_dict().items(), desc="Applying delta"):
139 | assert name in delta.state_dict()
140 | param.data += delta.state_dict()[name]
141 |
142 | print(f"Saving the target model to {target_model_path}")
143 | base.save_pretrained(target_model_path)
144 | delta_tokenizer.save_pretrained(target_model_path)
145 |
146 |
147 | if __name__ == "__main__":
148 | parser = argparse.ArgumentParser()
149 | parser.add_argument("--base-model-path", type=str, required=True)
150 | parser.add_argument("--target-model-path", type=str, required=True)
151 | parser.add_argument("--delta-path", type=str, required=True)
152 | parser.add_argument(
153 | "--low-cpu-mem",
154 | action="store_true",
155 | help="Lower the cpu memory usage. This will split large files and use "
156 | "disk as swap to reduce the memory usage below 10GB.",
157 | )
158 | args = parser.parse_args()
159 |
160 | if args.low_cpu_mem:
161 | apply_delta_low_cpu_mem(
162 | args.base_model_path, args.target_model_path, args.delta_path
163 | )
164 | else:
165 | apply_delta(args.base_model_path, args.target_model_path, args.delta_path)
166 |
--------------------------------------------------------------------------------
/stabletoolbench/toolbench/model/compression.py:
--------------------------------------------------------------------------------
1 | import dataclasses
2 | import glob, os  # glob is needed by load_compress_model below
3 |
4 | import torch
5 | import torch.nn as nn
6 | from torch.nn import functional as F
7 | from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
8 |
9 |
10 | @dataclasses.dataclass
11 | class CompressionConfig:
12 | """Group-wise quantization."""
13 |
14 | num_bits: int
15 | group_size: int
16 | group_dim: int
17 | symmetric: bool
18 | enabled: bool = True
19 |
20 |
21 | default_compression_config = CompressionConfig(
22 | num_bits=8, group_size=256, group_dim=1, symmetric=True, enabled=True
23 | )
24 |
25 |
26 | class CLinear(nn.Module):
27 | """Compressed Linear Layer."""
28 |
29 | def __init__(self, weight=None, bias=None, device=None):
30 | super().__init__()
31 | self.weight = weight
32 | self.bias = bias
33 |
34 | def forward(self, input):
35 | return F.linear(input.to(self.weight.dtype), self.weight, self.bias)
36 |
37 |
38 | def compress_module(module, target_device):
39 | for name, child in module.named_children():
40 | if isinstance(child, nn.Linear):
41 | setattr(
42 | module,
43 | name,
44 | CLinear(child.weight, child.bias, target_device),
45 | )
46 | compress_module(child, target_device)
47 |
48 |
49 | def get_compressed_list(module, prefix=""):
50 | compressed_list = []
51 | for name, child in module.named_children():
52 | if isinstance(child, nn.Linear):
53 | full_name = f"{prefix}.{name}.weight" if prefix else f"{name}.weight"
54 | compressed_list.append(full_name)
55 | compressed_list.extend(
56 | get_compressed_list(child, full_name)
57 | )
58 | return compressed_list
59 |
60 |
61 | def apply_compressed_weight(module, compressed_state_dict, target_device, prefix=""):
62 | for name, child in module.named_children():
63 | if isinstance(child, nn.Linear):
64 | full_name = f"{prefix}.{name}.weight" if prefix else f"{name}.weight"
65 | setattr(
66 | module,
67 | name,
68 | CLinear(
69 | compressed_state_dict[full_name], child.bias, target_device
70 | ),
71 | )
72 | apply_compressed_weight(child, compressed_state_dict, target_device, full_name)
73 |
74 |
75 | def load_compress_model(model_path, device, torch_dtype):
76 | # partially load model
77 | tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
78 | base_pattern = os.path.join(model_path, "pytorch_model-*.bin")
79 | files = glob.glob(base_pattern)
80 |
81 | config = AutoConfig.from_pretrained(
82 | model_path, low_cpu_mem_usage=True, torch_dtype=torch_dtype
83 | )
84 | model = AutoModelForCausalLM.from_config(config)
85 | linear_weights = get_compressed_list(model)
86 |
87 | compressed_state_dict = {}
88 |
89 | for filename in files:
90 | tmp_state_dict = torch.load(filename)
91 | for name in tmp_state_dict:
92 | if name in linear_weights:
93 | tensor = tmp_state_dict[name].to(device).data.to(torch_dtype)
94 | compressed_state_dict[name] = compress(
95 | tensor, default_compression_config
96 | )
97 | else:
98 | compressed_state_dict[name] = tmp_state_dict[name].to(device)
99 | tmp_state_dict[name] = None
100 | tensor = None
101 | torch.cuda.empty_cache()
102 |
103 | for name, param in model.named_parameters():
104 | if name not in linear_weights:
105 | param.data = compressed_state_dict[name]
106 | apply_compressed_weight(model, compressed_state_dict, device)
107 |
108 | model.to(device)
109 |
110 | return model, tokenizer
111 |
112 |
113 | def compress(tensor, config):
114 | """Simulate group-wise quantization."""
115 | if not config.enabled:
116 | return tensor
117 |
118 | group_size, num_bits, group_dim, symmetric = (
119 | config.group_size,
120 | config.num_bits,
121 | config.group_dim,
122 | config.symmetric,
123 | )
124 | assert num_bits <= 8
125 |
126 | original_shape = tensor.shape
127 | num_groups = (original_shape[group_dim] + group_size - 1) // group_size
128 | new_shape = (
129 | original_shape[:group_dim]
130 | + (num_groups, group_size)
131 | + original_shape[group_dim + 1 :]
132 | )
133 |
134 | # Pad
135 | pad_len = group_size - original_shape[group_dim] % group_size
136 | if pad_len != 0:
137 | pad_shape = (
138 | original_shape[:group_dim] + (pad_len,) + original_shape[group_dim + 1 :]
139 | )
140 | tensor = torch.cat(
141 | [tensor, torch.zeros(pad_shape, dtype=tensor.dtype, device=tensor.device)],
142 | dim=group_dim,
143 | )
144 | data = tensor.view(new_shape)
145 |
146 | # Quantize
147 | if symmetric:
148 | B = 2 ** (num_bits - 1) - 1
149 | scale = B / torch.max(data.abs(), dim=group_dim + 1, keepdim=True)[0]
150 | data = data * scale
151 | data = data.clamp_(-B, B).round_().to(torch.int8)
152 | return data, scale, original_shape
153 | else:
154 | B = 2**num_bits - 1
155 | mn = torch.min(data, dim=group_dim + 1, keepdim=True)[0]
156 | mx = torch.max(data, dim=group_dim + 1, keepdim=True)[0]
157 |
158 | scale = B / (mx - mn)
159 | data = data - mn
160 | data *= scale
161 |
162 | data = data.clamp_(0, B).round_().to(torch.uint8)
163 | return data, mn, scale, original_shape
164 |
165 |
166 | def decompress(packed_data, config):
167 | """Simulate group-wise dequantization."""
168 | if not config.enabled:
169 | return packed_data
170 |
171 | group_size, num_bits, group_dim, symmetric = (
172 | config.group_size,
173 | config.num_bits,
174 | config.group_dim,
175 | config.symmetric,
176 | )
177 |
178 | # Dequantize
179 | if symmetric:
180 | data, scale, original_shape = packed_data
181 | data = data / scale
182 | else:
183 | data, mn, scale, original_shape = packed_data
184 | data = data / scale
185 | data += mn
186 |
187 | # Unpad
188 | pad_len = group_size - original_shape[group_dim] % group_size
189 | if pad_len:
190 | padded_original_shape = (
191 | original_shape[:group_dim]
192 | + (original_shape[group_dim] + pad_len,)
193 | + original_shape[group_dim + 1 :]
194 | )
195 | data = data.reshape(padded_original_shape)
196 | indices = [slice(0, x) for x in original_shape]
197 | return data[indices].contiguous()
198 | else:
199 | return data.view(original_shape)
200 |
--------------------------------------------------------------------------------
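A round-trip sketch for the group-wise quantization helpers above (assuming `torch` and `transformers` are installed, since `compression.py` imports them at module level); the tensor shape and group size are arbitrary illustrative choices.

```python
import torch

from toolbench.model.compression import CompressionConfig, compress, decompress

# 8-bit symmetric quantization over groups of 4 elements along dim 1.
config = CompressionConfig(num_bits=8, group_size=4, group_dim=1, symmetric=True, enabled=True)

x = torch.randn(2, 10)
packed = compress(x, config)        # (int8 data, per-group scale, original shape)
x_hat = decompress(packed, config)  # dequantized approximation of x

print(x_hat.shape)                     # torch.Size([2, 10])
print((x - x_hat).abs().max().item())  # small quantization error
```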
/stabletoolbench/toolbench/model/make_delta.py:
--------------------------------------------------------------------------------
1 | """
2 | Make the delta weights by subtracting base weights.
3 |
4 | Usage:
5 | python3 -m fastchat.model.make_delta --base ~/model_weights/llama-13b --target ~/model_weights/vicuna-13b --delta ~/model_weights/vicuna-13b-delta --hub-repo-id lmsys/vicuna-13b-delta-v1.1
6 | """
7 | import argparse
8 |
9 | import torch
10 | from tqdm import tqdm
11 | from transformers import AutoTokenizer, AutoModelForCausalLM
12 |
13 |
14 | def make_delta(base_model_path, target_model_path, delta_path):
15 | print(f"Loading the base model from {base_model_path}")
16 | base = AutoModelForCausalLM.from_pretrained(
17 | base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True
18 | )
19 |
20 | print(f"Loading the target model from {target_model_path}")
21 | target = AutoModelForCausalLM.from_pretrained(
22 | target_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True
23 | )
24 | target_tokenizer = AutoTokenizer.from_pretrained(target_model_path, use_fast=False)
25 |
26 | print("Calculating the delta")
27 | for name, param in tqdm(target.state_dict().items(), desc="Calculating delta"):
28 | assert name in base.state_dict()
29 | param.data -= base.state_dict()[name]
30 |
31 | print(f"Saving the delta to {delta_path}")
32 | if args.hub_repo_id:
33 | kwargs = {"push_to_hub": True, "repo_id": args.hub_repo_id}
34 | else:
35 | kwargs = {}
36 | target.save_pretrained(delta_path, **kwargs)
37 | target_tokenizer.save_pretrained(delta_path, **kwargs)
38 |
39 |
40 | if __name__ == "__main__":
41 | parser = argparse.ArgumentParser()
42 | parser.add_argument("--base-model-path", type=str, required=True)
43 | parser.add_argument("--target-model-path", type=str, required=True)
44 | parser.add_argument("--delta-path", type=str, required=True)
45 | parser.add_argument("--hub-repo-id", type=str)
46 | args = parser.parse_args()
47 |
48 | make_delta(args.base_model_path, args.target_model_path, args.delta_path)
49 |
--------------------------------------------------------------------------------
/stabletoolbench/toolbench/tooleval/README_ZH.md:
--------------------------------------------------------------------------------
1 |
2 | # 🛠️Tool Eval🤖
3 |
4 |
5 | By fine-tuning LLaMA on ToolBench, we obtain **ToolLLaMA**. Considering that human evaluation is very time-consuming, we follow [AlpacaEval](https://tatsu-lab.github.io/alpaca_eval/) to develop an efficient automatic evaluator, **ToolEval**, which contains two evaluation metrics:
6 |
7 | - **Pass rate**: the proportion of instructions successfully completed within a limited number of OpenAI API calls.
8 |
9 | - **Preference**: measured by comparing two answers (action sequences) for a given instruction. We pre-define a set of criteria for better answers, organized as prompts for ChatGPT. We provide the evaluator with the test instruction and two candidate answers and obtain its preference. Each answer pair is evaluated multiple times to improve reliability. We then compute the **win rate** (the percentage of times a method is preferred by the evaluator). See our paper for details.
10 |
11 | To validate the reliability of the ChatGPT evaluator for pass rate and win rate, we sample from four different methods (ChatGPT+ReACT, ChatGPT+DFSDT, ToolLLaMA+DFSDT and GPT4+DFSDT) to obtain solution pairs for 300 test instructions per method. We then ask human annotators to label the pass rate of ChatGPT+DFSDT, ToolLLaMA+DFSDT and GPT4+DFSDT, and the win rate between ChatGPT+ReACT and ChatGPT+DFSDT.
12 |
13 | Our ChatGPT evaluator achieves **87.1%** agreement with human annotators on pass rate and **80.3%** agreement on win rate. This shows that our evaluator produces judgments very similar to humans and can be regarded as a reliable evaluator that simulates human evaluation of pass rate and win rate.
14 | Please refer to our paper for more details on ToolEval.
15 |
16 | ## 🚀Usage
17 |
18 | ### Install
19 | Install Package (python>=3.9)
20 | ```bash
21 | pip install -r requirements.txt
22 | ```
23 |
24 | ### Evaluation
25 | *To reproduce the results, simply download our `reproduction_data.zip` from [Google Drive](https://drive.google.com/drive/folders/1yBUQ732mPu-KclJnuQELEhtKakdXFc3J), unzip it and place `reproduction_data` under `ToolBench/data/`; you can then skip the data preparation step.*
26 | - Data preparation. To evaluate your own model and method with ToolEval, first prepare all model predictions for the six test subsets. Create a directory named after your model and method, e.g. `chatgpt_cot`, then put the predictions for each test set under it. The directory should be structured as follows:
27 | ```
28 | ├── /chatgpt_cot/
29 | │ ├── /G1_instruction/
30 | │ │ ├── /10160_CoT@1.json
31 | │ │ └── ...
32 | │ ├── /G1_tool/
33 | │ │ ├── /10221_CoT@1.json
34 | │ │ └── ...
35 | │ ├── ...
36 | │ ├── /G3_instruction/
37 | │ │ ├── /10221_CoT@1.json
38 | │ │ └── ...
39 | ```
40 |
41 | Then preprocess the model predictions:
42 |
43 | ```bash
44 | export RAW_ANSWER_PATH=../../data/reproduction_data/model_predictions/
45 | export CONVERTED_ANSWER_PATH=../../data/reproduction_data/model_predictions_converted/
46 | export MODEL_NAME=chatgpt_cot
47 | export METHOD=CoT
48 | mkdir ${CONVERTED_ANSWER_PATH}/${MODEL_NAME}
49 | for test_set in G1_instruction G1_category G1_tool G2_category G2_instruction G3_instruction
50 | do
51 | answer_dir=${RAW_ANSWER_PATH}/${MODEL_NAME}/${test_set}
52 | output_file=${CONVERTED_ANSWER_PATH}/${MODEL_NAME}/${test_set}.json
53 | python convert_to_answer_format.py\
54 | --answer_dir ${answer_dir} \
55 | --method ${METHOD} \
56 | --output ${output_file}
57 | done
58 | ```
59 | Afterwards, check whether the preprocessed JSON files for the test sets exist under `${CONVERTED_ANSWER_PATH}/${MODEL_NAME}`. If they do, you are ready to run the evaluation below. If not, check whether something is wrong with the model predictions.
60 |
61 | - OpenAI Key
62 | Prepare your OpenAI key to build our evaluator. The key needs to be stored in a JSON file, e.g. `path/to/your/openai_key_json_file.json`:
63 | ```json
64 | [
65 | {
66 | "username": "your_user_name",
67 | "passwd": "your_password",
68 | "api_key": "your_openai_key",
69 | "organization": "your_organization"
70 | },
71 | ...
72 | ]
73 | ```
74 | - Pass rate.
75 | ```bash
76 | export CONVERTED_ANSWER_PATH=../../data/reproduction_data/model_predictions_converted/
77 | export SAVE_PATH=pass_rate_results
78 | export CANDIDATE_MODEL=chatgpt_cot
79 | export API_POOL_FILE=path/to/your/openai_key_json_file.json
80 |
81 | python eval_pass_rate.py \
82 | --converted_answer_path ${CONVERTED_ANSWER_PATH} \
83 | --save_path ${SAVE_PATH} \
84 | --reference_model ${CANDIDATE_MODEL} \
85 | --test_ids ../../data/test_query_ids/ \
86 | --max_eval_threads 20 \
87 | --evaluate_times 4
88 |
89 | ```
90 |
91 | The result files will be saved to ${SAVE_PATH}.
92 |
93 | - Win rate. The following example takes ChatGPT-ReACT as the reference model and GPT4-ReACT as the candidate model. Note that you first need to obtain the pass rate results of both models, then run the following command to evaluate the win rate of GPT4-ReACT:
94 | ```bash
95 | export CONVERTED_ANSWER_PATH=../../data/reproduction_data/model_predictions_converted/
96 | export SAVE_PATH=preference_results
97 | export PASS_TARE_PATH=pass_rate_results
98 | export REFERENCE_MODEL=chatgpt_cot
99 | export CANDIDATE_MODEL=gpt-4-0613_cot
100 | export API_POOL_FILE=path/to/your/openai_key_json_file.json
101 |
102 | python eval_preference.py \
103 | --converted_answer_path ${CONVERTED_ANSWER_PATH} \
104 | --reference_model ${REFERENCE_MODEL} \
105 | --output_model ${CANDIDATE_MODEL} \
106 | --test_ids ../../data/test_query_ids/ \
107 | --save_path ${SAVE_PATH} \
108 | --pass_rate_result_path ${PASS_TARE_PATH} \
109 | --max_eval_threads 20 \
110 | --use_pass_rate true \
111 | --evaluate_times 4
112 | ```
113 |
114 | The result files will be saved to ${SAVE_PATH}.
115 |
116 | ### Evaluating new methods
117 | To evaluate methods other than ReACT and DFSDT, follow the data preparation steps above to prepare your preprocessed answer data. The preprocessed answers must follow this JSON format:
118 |
119 | ```json
120 | [
121 | {
122 | "method":"method name",
123 | "total_steps": int, // a integer count total steps in answer details
124 | "final_answer": "final answer from the method",
125 | "answer_details":[{
126 | "role":"node role, can be system, user, assistant and tool",
127 | "message":"message for the node",
128 | "next":[//next steps, can have multiple elements if the node have multiple candidates.
129 | {
130 | "role":"",
131 | "message":"",
132 | "next":[...]
133 | },
134 | ...//more candidates
135 | ]
136 | }]
137 | }
138 | ... // more answers for the given query in the test data
139 | ]
140 | ```
141 |
142 |
143 | ### Updating the leaderboard
144 |
145 | If you want to upload your model's results to the [ToolEval Leaderboard](https://openbmb.github.io/ToolBench/), please organize your result files in the format above and send them to us (urtoolbench@gmail.com) or open a pull request.
146 | We will run the evaluation scripts to update the results and add your model to the leaderboard.
147 |
148 |
149 | ### Creating a new automatic evaluator
150 | If you want to create a new automatic evaluator, follow the steps below:
151 | 1. Create a configuration directory for your evaluator under `toolbench/tooleval/evaluators`, named after your evaluator. Add a `config.yaml` file and a `template.txt` file to it. See `toolbench/tooleval/evaluators/tooleval_gpt-3.5-turbo_normalized` for a reference configuration.
152 | 2. Create your evaluator class and implement the `fn_completions` function in the folder `toolbench/tooleval/evaluators/registered_cls`, or use one of our predefined classes such as `OpenAINormalizedEvaluator`.
153 | Then set the `registered_cls_name` field in the configuration file to the name of that class.
154 | Here is an example:
155 | ```Python
156 | from evaluators import register_evaluator,BaseEvaluator
157 | from typing import Dict,List
158 |
159 | @register_evaluator
160 | class MyEvaluator(BaseEvaluator):
161 | def __init__(self,config):
162 | super().__init__(
163 | fn_completions=self.fn_completions,
164 | )
165 | # set your configures here
166 |
167 | def fn_completions(self,query:Dict,answers:List[Dict])->int:
168 | # implement your evaluator here
169 | # return the index of the preferred answer
170 | return 0
171 | ```
172 | Here `register_evaluator` is a decorator used to register the evaluator, and `BaseEvaluator` is a base class providing the basic evaluator functionality.
173 | 3. Test the performance of the evaluator by running the script `evaluators_comparison.py`.
174 |
--------------------------------------------------------------------------------
/stabletoolbench/toolbench/tooleval/ToolBench.code-workspace:
--------------------------------------------------------------------------------
1 | {
2 | "folders": [
3 | {
4 | "path": "../../../ToolBench"
5 | },
6 | {
7 | "path": "../.."
8 | },
9 | {
10 | "path": "../../../STC/RapidAPI-Server"
11 | }
12 | ],
13 | "settings": {
14 | "git.ignoreLimitWarning": true
15 | }
16 | }
--------------------------------------------------------------------------------
/stabletoolbench/toolbench/tooleval/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yuyq18/StepTool/121a6acd11a899d947a5d3fa7269cf76624c1df3/stabletoolbench/toolbench/tooleval/__init__.py
--------------------------------------------------------------------------------
/stabletoolbench/toolbench/tooleval/automatic_eval_sample.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import time
4 | from concurrent.futures import ThreadPoolExecutor,as_completed
5 | from tqdm import tqdm
6 | import numpy as np
7 | import argparse
8 | import random
9 | from evaluation import UserEvaluation,BaseToolMethod
10 | from evaluators import load_registered_automatic_evaluator
11 | from typing import List,Dict,Callable
12 | import pandas as pd
13 |
14 | abs_dir = os.path.split(__file__)[0]
15 |
16 |
17 | def parse_args():
18 | parser = argparse.ArgumentParser()
19 | parser.add_argument('--output',default=os.path.join(abs_dir,'dataset','test.json'),help='where to store the method output.')
20 |     parser.add_argument('--method',default='unknown',help='the name of the method.')
21 | parser.add_argument('--ref_method',default='gpt-3.5-turbo_CoT',help='what the reference method is')
22 |     parser.add_argument('--ref_output',default=os.path.join(abs_dir,'dataset','ref_sample.json'),help='where the reference answer is stored')
23 | parser.add_argument('--evaluators_cfg_path',default=os.path.join(abs_dir,'evaluators'),help='where the evaluators config files are stored')
24 | parser.add_argument('--evaluator',default='tooleval_gpt-3.5-turbo_normalized',help='which evaluator to use')
25 | parser.add_argument('--max_eval_threads',default=16,type=int,help='how many threads to use for evaluation')
26 |     parser.add_argument('--evalset',default='default_evalset',help='which evaluation dataset to use')
27 | parser.add_argument('--eval_server_address',default='http://localhost:8000',help='the address of the evaluation server')
28 |     parser.add_argument('--use_existed_output',default=False,action='store_true',help='whether to use the existing output')
29 |
30 | return parser.parse_args()
31 |
32 |
33 | ## !!define your method here !!
34 | class SampleMethod(BaseToolMethod):
35 | def __init__(self):
36 | super().__init__()
37 | def forward(self,query:str,tools:List[Dict],tool_func:Callable)->Dict:
38 | return {}
39 | def convert_result_to_dict(self,result):
40 | return {
41 | 'method': 'sample',
42 | 'total_steps': 0,
43 | 'final_answer': '',
44 | 'answer_details': []
45 | }
46 |
47 | if __name__=='__main__':
48 | args = parse_args()
49 |
50 | exec_generating_method_outputs = True
51 | if os.path.exists(args.output):
52 | print('Output file {} already exists!'.format(args.output))
53 | if args.use_existed_output:
54 | exec_generating_method_outputs = False
55 | else:
56 | print('Overwrite? (y/n)')
57 | exec_generating_method_outputs = input()=='y'
58 |
59 | if exec_generating_method_outputs:
60 | ## change the SampleMethod to your method
61 | usereval = UserEvaluation(SampleMethod(),args.eval_server_address,args.evalset)
62 | print('Generating method outputs...')
63 | results = usereval.run()
64 | print('Saving method outputs...')
65 | with open(args.output,'w') as f:
66 | json.dump(results,f)
67 | else:
68 | print('Use existed output.')
69 | results = json.load(open(args.output))
70 |
71 | print('Loading reference answer for evaluation...')
72 | try:
73 | ref_output = json.load(open(args.ref_output))
74 | except:
75 | raise Exception('Cannot load reference answer from {}\n Please Download before evaluation!'.format(args.ref_output))
76 |
77 | print('Loading automatic evaluators...')
78 | evaluators = [load_registered_automatic_evaluator(vars(args)) for _ in range(args.max_eval_threads)]
79 |
80 | def get_preference(qid,query,tools,ref_ans,ans,):
81 | global evaluators
82 | evaluator = random.choice(evaluators)
83 | ret = evaluator.annotate_preference(
84 | query,
85 | tools,
86 | [ref_ans,ans])
87 | return qid,ret
88 | def get_most_preferred(d:list)->np.ndarray:
89 | if np.iterable(d):
90 | d = np.asanyarray(d)
91 | bins = np.bincount(d)
92 | max_val = np.max(bins)
93 | argmax = np.where(max_val==bins)[0]
94 | return argmax
95 | else:
96 | return np.asarray([d])
97 |
98 | print('Evaluating...')
99 | prefer_dict = {}
100 | with ThreadPoolExecutor(args.max_eval_threads) as pool:
101 | future = []
102 | for qid in ref_output.keys():
103 | try:
104 | future.append(pool.submit(
105 | get_preference,
106 | qid,
107 | ref_output[qid]['query'],
108 | ref_output[qid]['available_tools'],
109 | ref_output[qid]['answer'],
110 | results[qid]['answer']
111 | ))
112 | except KeyError as e:
113 | print('Warning : Missing answer for query {} in answer file! '.format(e))
114 |
115 | for thd in tqdm(as_completed(future),total=len(future),ncols=100):
116 | qid,preference = thd.result()
117 | prefer_dict[qid] = get_most_preferred(preference)[0]
118 |
119 | prefer = list(prefer_dict.values())
120 |
121 | prefer = np.array(prefer)
122 | df = pd.DataFrame.from_dict([{
123 | 'Method':args.method,
124 | 'Win Rate':prefer.mean(),
125 | 'Std Error':np.std(prefer)/np.sqrt(len(prefer))
126 | }])
127 | print('###### Leaderboard vs {} ######'.format(args.ref_method))
128 | print(df)
129 | save_file = os.path.join(abs_dir,'results',args.evalset,args.method)
130 | os.makedirs(save_file,exist_ok=True)
131 | df.to_csv(os.path.join(save_file,'win.csv'))
132 |
--------------------------------------------------------------------------------
/stabletoolbench/toolbench/tooleval/convert_answers.py:
--------------------------------------------------------------------------------
1 | from convert_to_answer_format import process_invalid_data,process_valid_data
2 | import json
3 | from glob import glob
4 | import os
5 |
6 | save_dir = 'path/to/save/dir'
7 |
8 | groups_dirs = ['path/to/dataset/eval/result/folders']
9 |
10 | for groups_dir in groups_dirs:
11 | method = os.path.split(groups_dir)[1]
12 | print(method)
13 | groups_save_dir = os.path.join(save_dir,method)
14 | os.makedirs(groups_save_dir,exist_ok=True)
15 | groups = [os.path.split(g)[1] for g in glob(groups_dir+'/*')]
16 | full_answer = {}
17 | for g in groups:
18 | print(g)
19 | answer_dict = {}
20 | files = glob(os.path.join(groups_dir,g,'*.json'))
21 | for file in files:
22 | qid = os.path.split(file)[1].split('_')[0]
23 | try:
24 | data = json.load(open(file))
25 | except:
26 | print('Read error: ',file)
27 | continue
28 | if not data['answer_generation']['valid_data']:
29 | answer_dict[qid] = process_invalid_data(method,data)
30 | else:
31 | answer_dict[qid] = process_valid_data(method,data['answer_generation'])
32 | json.dump(answer_dict,open(os.path.join(groups_save_dir,f'{g}.json'),'w'))
33 | full_answer.update(answer_dict)
34 | # json.dump(full_answer,open(os.path.join(groups_save_dir,f'fullanswer.json'),'w'))
--------------------------------------------------------------------------------
/stabletoolbench/toolbench/tooleval/dataset/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yuyq18/StepTool/121a6acd11a899d947a5d3fa7269cf76624c1df3/stabletoolbench/toolbench/tooleval/dataset/__init__.py
--------------------------------------------------------------------------------
/stabletoolbench/toolbench/tooleval/eval_process_reward.py:
--------------------------------------------------------------------------------
1 | from evaluators import load_registered_automatic_evaluator
2 | import os
3 | import json
4 | import csv
5 | from evaluators.registered_cls.rtl import AnswerStatus, TaskStatus, AnswerPass
6 | import random
7 | from concurrent.futures import ThreadPoolExecutor,as_completed
8 | import argparse
9 | from tqdm import tqdm
10 | import numpy as np
11 | from utils import test_sets, get_steps
12 | import backoff
13 |
14 | abs_dir = os.path.split(__file__)[0]
15 |
16 | def parse_args():
17 | parser = argparse.ArgumentParser()
18 | parser.add_argument('--converted_answer_path', type=str, default="", required=True, help='converted answer path')
19 | parser.add_argument('--save_path', type=str, default="", required=False, help='result save path')
20 |     parser.add_argument('--reference_model', type=str, default="", required=False, help='reference model name')
21 |     parser.add_argument('--reference_path', type=str, default=None, required=False, help='reference path')
22 |     parser.add_argument('--test_ids', type=str, default="", required=True, help='path to the test query ids')
23 | parser.add_argument('--task_num', type=int, default=None, required=False, help='task num')
24 | parser.add_argument('--evaluator', type=str, default="tooleval_gpt-3.5-turbo_default", required=False, help='which evaluator to use.')
25 | parser.add_argument('--max_eval_threads', type=int, default=30, required=False, help='max threads nums')
26 | parser.add_argument('--evaluate_times', type=int, default=4, required=False, help='how many times to predict with the evaluator for each solution path.')
27 | parser.add_argument('--test_set', nargs='+', default=['G1_instruction'], help='test set name')
28 | parser.add_argument('--overwrite', action='store_true', help='whether to overwrite the existing result file')
29 | return parser.parse_args()
30 |
31 | if __name__ == "__main__":
32 | args = parse_args()
33 | evaluators = [load_registered_automatic_evaluator(evaluator_name=args.evaluator, evaluators_cfg_path=os.path.join(abs_dir,'evaluators')) for _ in range(args.max_eval_threads)]
34 |
35 | @backoff.on_exception(backoff.expo, Exception, max_time=15)
36 | def compute_process_reward(query_id, example, evaluate_time):
37 | global evaluators
38 | evaluator = random.choice(evaluators)
39 | answer_steps, answer_steps_list, final_step = get_steps(example)
40 |
41 | succeed_tool_calling_list, contributions, answer_status = evaluator.evaluate_process_reward(
42 | {
43 | 'query':example['query'],
44 | 'available_tools':example['available_tools'],
45 | },
46 | answer_steps_list[:-1],
47 | example['answer'],
48 | )
49 | process_reward = {
50 | "succeed_tool_calling": succeed_tool_calling_list,
51 | "contributions": contributions,
52 | }
53 | return query_id, process_reward, answer_status, evaluate_time
54 |
55 | reference_model = args.reference_model
56 | output_list = []
57 |
58 | for test_set in args.test_set:
59 |
60 | save_file = f"{args.save_path}/{test_set}.json"
61 | if args.task_num:
62 | save_file = f"{args.save_path}/{test_set}_{args.task_num}.json"
63 |
64 | reference_path = f"{args.converted_answer_path}/{test_set}.json"
65 | reference_examples = json.load(open(reference_path, "r"))
66 | if args.task_num:
67 | reference_examples = {k:reference_examples[k] for k in list(reference_examples.keys())[:args.task_num]}
68 | if os.path.exists(save_file) and not args.overwrite:
69 | old_existed_ids = list(json.load(open(save_file, "r")).keys())
70 | old_label_cnt = json.load(open(save_file, "r"))
71 | existed_ids = []
72 | label_cnt = {}
73 | for query_id in old_existed_ids:
74 | ans = old_label_cnt[query_id]
75 | if len(ans['process_reward'].keys()) == args.evaluate_times:
76 | existed_ids.append(query_id)
77 | label_cnt[query_id] = ans
78 | else:
79 | existed_ids = []
80 | label_cnt = {}
81 |
82 | with ThreadPoolExecutor(args.max_eval_threads) as pool:
83 | future = []
84 |
85 | for query_id in reference_examples:
86 | if query_id in existed_ids:
87 | continue
88 | for i in range(args.evaluate_times):
89 | example = reference_examples[query_id]
90 | future.append(pool.submit(
91 | compute_process_reward,
92 | query_id,
93 | example,
94 | evaluate_time=i
95 | ))
96 |
97 | for thd in tqdm(as_completed(future),total=len(future),ncols=100):
98 | query_id, process_reward, is_solved, evaluate_time = thd.result()
99 | example = reference_examples[query_id]
100 | query = example["query"]
101 | tool_names = []
102 | for tool_dict in example["available_tools"]:
103 | tool_name = tool_dict["function"]["name"]
104 | tool_names.append(tool_name)
105 | answer_steps, answer_steps_list, final_step = get_steps(example)
106 | if query_id not in label_cnt:
107 | label_cnt[query_id] = {}
108 | label_cnt[query_id]["query"] = query
109 | label_cnt[query_id]["tool_names"] = tool_names
110 | label_cnt[query_id]["answer_steps"] = answer_steps_list[:-1]
111 | # label_cnt[query_id]["mid_steps_reward"] = mid_steps_reward # parsed
112 | if 'process_reward' not in label_cnt[query_id]:
113 | label_cnt[query_id]["process_reward"] = {}
114 | label_cnt[query_id]["process_reward"][evaluate_time] = process_reward
115 | label_cnt[query_id]["final_step"] = final_step
116 |
117 | if 'is_solved' not in label_cnt[query_id]:
118 | label_cnt[query_id]["is_solved"] = {}
119 | label_cnt[query_id]["is_solved"][evaluate_time] = str(is_solved)
120 | # print("========== Finish and Dump into json file===========", query_id, is_solved, evaluate_time)
121 |
122 | json.dump(label_cnt, open(save_file, "w"), ensure_ascii=False, indent=4)
123 |
124 | json.dump(label_cnt, open(save_file, "w"), ensure_ascii=False, indent=4)
125 |
--------------------------------------------------------------------------------
/stabletoolbench/toolbench/tooleval/evaluation/__init__.py:
--------------------------------------------------------------------------------
1 | from .usereval import UserEvaluation
2 | from .methodcls import BaseToolMethod
3 | from .dataclass import ExecutionGraph,ExecutionNode,DirectedEdge
--------------------------------------------------------------------------------
/stabletoolbench/toolbench/tooleval/evaluation/methodcls.py:
--------------------------------------------------------------------------------
1 | from typing import Dict, List,Callable
2 |
3 | class BaseToolMethod:
4 | def __init__(self):
5 | pass
6 | def convert_result_to_dict(self,result):
7 | '''Return Format
8 | --------
9 | {
10 | 'method': 'method name',
11 | 'total_steps': int,
12 | 'final_answer': 'answer',
13 | 'answer_details': [{
14 | "role": "system",
15 | "message": "",
16 | "next": [
17 | {
18 | "role": "user",
19 | "message": "I am planning ...",
20 | "next": [
21 | {
22 | "role": "tool",
23 | "message": "{'name': 'Finish', 'arguments': '{\\n \"return_type\": \"give_answer\",\\n \"final_answer\": \"I encountere...",
24 | "next": []
25 | }
26 | ]
27 | }
28 | ]
29 | }]
30 | }
31 |
32 | '''
33 | pass
34 | def forward(self,query:str,tools:List[Dict],tool_func:Callable)->Dict:
35 | pass
36 |
37 | def __call__(self,query:str,tools:List[Dict],tool_func:Callable)->Dict:
38 | result = self.forward(query,tools,tool_func)
39 | return self.convert_result_to_dict(result)
40 |
41 |
--------------------------------------------------------------------------------
/stabletoolbench/toolbench/tooleval/evaluation/usereval.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from tqdm import tqdm
3 | from typing import Union, Dict, List, Optional,Tuple
4 | from .methodcls import BaseToolMethod
5 | from .dataclass import *
6 | import json
7 |
8 | class UserEvaluation:
9 | def __init__(self,
10 | method:BaseToolMethod,
11 | eval_server_addr='http://localhost:8000',
12 | evalset='eval20230718'):
13 | self.eval_server_addr = eval_server_addr
14 | self.evalset = evalset
15 | self.method = method
16 | res = requests.post(self.eval_server_addr+'/neweval',json=self.evalset)
17 | if res.status_code != 200:
18 | raise Exception('Failed to obtain new evaluation id! Error: '+res.text)
19 | ret = res.json()
20 | self.eval_id = ret['evaluation_id']
21 | self.len = ret['len']
22 |
23 | def get_new_question(self)->Tuple[str,List]:
24 | res = requests.post(self.eval_server_addr+'/next_question',json=self.eval_id)
25 | if res.status_code == 204:
26 | raise EvalCompleted()
27 | if res.status_code != 200:
28 | raise Exception('Failed to obtain new question!')
29 |
30 | self.question = Question(**res.json())
31 | self.tool_name_to_id = {}
32 | tools = [tool.model_dump() for tool in self.question.available_tools]
33 | for tool in tools:
34 | self.tool_name_to_id[tool['name']] = tool.pop('tid')
35 |
36 |
37 | return self.question.query,tools
38 | def tool_func(self,tool_name:str,tool_args:str)->requests.Response:
39 | tid = self.tool_name_to_id[tool_name]
40 | # res = requests.post(self.eval_server_addr+'/api',json={
41 | # 'evaluation_id':self.eval_id,
42 | # 'tool_id':tid,
43 | # 'tool_args':tool_args
44 | # })
45 | res = requests.post(self.eval_server_addr+'/rapidapi',json={
46 | 'evaluation_id':self.eval_id,
47 | 'tool_id':tid,
48 | 'tool_args':tool_args
49 | })
50 |
51 | return res
52 | def _forward(self,query:str,tools:List[Dict])->Dict:
53 | method_ret = self.method(query,tools,self.tool_func)
54 |
55 | return self.question.qid,{
56 | 'query':query,
57 | 'available_tools':tools,
58 | 'answer':method_ret
59 | }
60 |
61 |
62 | def run(self)->Dict:
63 | results = {}
64 | for _ in tqdm(range(self.len),ncols=100):
65 | try:
66 | qid,ret = self._forward(*self.get_new_question())
67 | except EvalCompleted:
68 | return results
69 | results[qid] = ret
70 | return results
71 |
--------------------------------------------------------------------------------
/stabletoolbench/toolbench/tooleval/evaluators/__init__.py:
--------------------------------------------------------------------------------
1 | from .registered_cls import BaseEvaluator,register_evaluator,get_evaluator_cls
2 |
3 | __all__=['register_evaluator','get_evaluator_cls','BaseEvaluator','load_registered_automatic_evaluator']
4 |
5 |
6 |
7 | def load_registered_automatic_evaluator(config:dict={},evaluator_name=None,evaluators_cfg_path=None)->BaseEvaluator:
8 | import os
9 | import yaml
10 |
11 | evaluator_name = config['evaluator'] if evaluator_name is None else evaluator_name
12 | cfg_path = config['evaluators_cfg_path'] if evaluators_cfg_path is None else evaluators_cfg_path
13 | cfg_path = os.path.join(cfg_path,evaluator_name)
14 |
15 | cls_name = yaml.load(open(os.path.join(cfg_path,'config.yaml')),Loader=yaml.FullLoader)['registered_cls_name']
16 |
17 | evaluator:BaseEvaluator = get_evaluator_cls(cls_name)(cfg_path)
18 | return evaluator
--------------------------------------------------------------------------------
/stabletoolbench/toolbench/tooleval/evaluators/registered_cls/__init__.py:
--------------------------------------------------------------------------------
1 | from .base import BaseEvaluator
2 | from .utils import register_evaluator,get_evaluator_cls
3 |
4 | __all__ = ['register_evaluator','get_evaluator_cls','BaseEvaluator']
5 |
6 | import os
7 | import importlib
8 | current_dir = os.path.dirname(__file__)
9 |
10 | for item in os.listdir(current_dir):
11 | item_path = os.path.join(current_dir, item)
12 |
13 | if os.path.isfile(item_path) and item != '__init__.py' and item.endswith('.py'):
14 | module_name = item[:-3]
15 |
16 | full_module_path = f"{__name__}.{module_name}"
17 |
18 | imported_module = importlib.import_module(full_module_path)
19 |
20 | globals()[module_name] = imported_module
21 |
--------------------------------------------------------------------------------
/stabletoolbench/toolbench/tooleval/evaluators/registered_cls/base.py:
--------------------------------------------------------------------------------
1 | import random
2 | from typing import List, Union, Dict, Any, Callable
3 | import os
4 | import yaml
5 | from .utils import register_evaluator
6 |
7 | def process_answer(answer: Dict):
8 | answer['final_answer'] = answer['final_answer'][:1000]
9 | answer['answer_details'] = answer['answer_details'][:3000]
10 | # breakpoint()
11 | answer.pop('method', None)
12 | return answer
13 |
14 |
15 | def process_tools(tools: List[Dict]):
16 | for tool in tools:
17 | tool.pop('description', None)
18 | tool.pop('parameters', None)
19 | return tools
20 |
21 | @register_evaluator
22 | class BaseEvaluator:
23 | """Base class for evaluators.
24 |
25 | Attributes:
26 | ----------
27 | fn_completions : Callable[[Dict,List[Dict]],int]
28 | The completion function of the evaluator, used to get annotated results.
29 |         This function should take two arguments: `task_description`:Dict and `answers`:List[Dict], and return an int standing for the index of the best answer.
30 |
31 | Functions:
32 | ---------
33 | annotate_preference : Callable
34 | Annotate and return the index of the preferred answer.
35 |
36 | """
37 | def __init__(self,
38 | fn_completions: Callable[[Dict,List[Dict]],int] = None,
39 | *args,
40 | **kwargs):
41 | self.fn_completions = fn_completions
42 | def annotate_preference(self,
43 | query: str,
44 | available_tools: List[Dict[Any, Any]],
45 | answers:List[Dict],
46 | multisample=False,
47 | sample_n=4,
48 | task_status=None,
49 | answer_statuss=[None, None]) -> Union[List[int], int]:
50 | """Annotate and return the index of the preferred answer.
51 |
52 | For given query, available tools, and two answers, return the index of the preferred answer by calling function `fn_completions` of the evaluator.
53 |
54 | Parameters:
55 | ----------
56 | query : str
57 | The query of the task.
58 | available_tools : List[Dict[Any, Any]]
59 | The list of available tools for the task. The specific format of the tool is defined in `tooleval/evaluation/dataclass.py`
60 | answers : List[Dict]
61 | The list of answers for comparison.
62 | multisample : bool, optional
63 | Whether to use multisample to get the preference. If True, the function will return a list of preferences, otherwise return a single preference.
64 | sample_n : int, optional
65 | The number of samples to get the preference.
66 |
67 | Returns:
68 | -------
69 | preference : Union[List[int], int]
70 | The index of the preferred answer. If `multisample` is True, return a list of preferences, otherwise return a single preference.
71 |
72 | Raise:
73 | -----
74 |
75 | """
76 | answers_processed = [process_answer(ans) for ans in answers]
77 | available_tools = process_tools(available_tools)
78 |
79 | def shuffle_run() -> int:
80 | indexs = list(range(len(answers_processed)))
81 | random.shuffle(indexs)
82 |
83 | answers_projected = [answers_processed[idx] for idx in indexs]
84 | # breakpoint()
85 | preferred_index = self.fn_completions(
86 | {
87 | 'query':query,
88 | 'available_tools':available_tools,
89 | },
90 | answers_projected,
91 | task_status,
92 | answer_statuss
93 | )
94 | if preferred_index in indexs:
95 | return indexs.index(preferred_index)
96 | raise ValueError(f'Preferred index {preferred_index} is invalid!')
97 |
98 | if not multisample:
99 | return shuffle_run()
100 | else:
101 | prefers = [shuffle_run() for _ in range(sample_n)]
102 | return prefers
103 |
104 | @register_evaluator
105 | class ToolEvalEvaluator(BaseEvaluator):
106 | """ToolEval common evaluator class.
107 |
108 | Attributes:
109 | ----------
110 | cfg_path : str
111 |         A path storing the configuration of the evaluator.
112 |
113 |
114 | """
115 | def __init__(self,
116 | cfg_path: str = None,
117 | ):
118 | eval_config = yaml.load(open(os.path.join(cfg_path,'config.yaml')),Loader=yaml.FullLoader)
119 | template = open(os.path.join(cfg_path,eval_config['prompt_template'])).read()
120 |
121 | super().__init__(
122 | fn_completions=getattr(self,eval_config['fn_completions'])
123 | )
124 | self.eval_config = eval_config
125 | self.template = template
--------------------------------------------------------------------------------
/stabletoolbench/toolbench/tooleval/evaluators/registered_cls/utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | from typing import List,Dict
4 | import requests
5 | from tenacity import retry, wait_random_exponential, stop_after_attempt
6 |
7 | from openai import OpenAI
8 | import random
9 |
10 | __registered_evaluators__ = {}
11 |
12 | def register_evaluator(cls):
13 | """
14 | Decorator function to register classes with the registered_evaluators list.
15 | """
16 | __registered_evaluators__[cls.__name__] = cls
17 | return cls
18 |
19 | def get_evaluator_cls(clsname):
20 | """
21 | Return the evaluator class with the given name.
22 | """
23 |     try:
24 |         return __registered_evaluators__[clsname]
25 |     except KeyError:
26 |         raise ModuleNotFoundError('Cannot find evaluator class {}'.format(clsname))
27 |
28 |
29 | class OpenaiPoolRequest:
30 | def __init__(self, pool_json_file=None):
31 |         self.pool:List[Dict] = []
32 |         __pool_file = pool_json_file
33 |         if os.environ.get('API_POOL_FILE',None) is not None:
34 |             __pool_file = os.environ.get('API_POOL_FILE')
35 |         # guard against a missing pool file so keys can still come from environment variables below
36 |         if __pool_file is not None and os.path.exists(__pool_file):
37 |             self.pool = json.load(open(__pool_file))
38 |         self.now_pos = random.randint(-1, len(self.pool))
39 | # print(__pool_file)
40 | if os.environ.get('OPENAI_KEY',None) is not None:
41 | self.pool.append({
42 | 'api_key':os.environ.get('OPENAI_KEY'),
43 | 'api_base':os.environ.get('OPENAI_API_BASE',None),
44 | 'organization':os.environ.get('OPENAI_ORG',None),
45 | 'api_type':os.environ.get('OPENAI_TYPE',None),
46 | 'api_version':os.environ.get('OPENAI_VER',None)
47 | })
48 |
49 | # @retry(wait=wait_random_exponential(multiplier=1, max=30), stop=stop_after_attempt(10),reraise=True)
50 | def request(self,messages,**kwargs):
51 | self.now_pos = (self.now_pos + 1) % len(self.pool)
52 | key_pos = self.now_pos
53 | item = self.pool[key_pos]
54 | # print(len(self.pool))
55 | api_key = item['api_key']
56 | api_base = item.get('api_base', None)
57 | client = OpenAI(api_key=api_key,base_url=api_base)
58 | response = client.chat.completions.create(messages=messages,**kwargs)
59 | return response
60 |
61 | def __call__(self,messages,**kwargs):
62 | return self.request(messages,**kwargs)
63 |
--------------------------------------------------------------------------------
/stabletoolbench/toolbench/tooleval/evaluators/tooleval_gpt-3.5-turbo_default/config.yaml:
--------------------------------------------------------------------------------
1 | evaluator_name: "tooleval_gpt-3.5-turbo_default"
2 | registered_cls_name: "ReinforceToolLearningEvaluator"
3 | prompt_template: "template.txt"
4 | fn_completions: "normalized_openai_completions"
5 | apis_json: "your/path/to/api_pool.json"
6 | completions_kwargs:
7 | model: "gpt-3.5-turbo-16k"
8 | max_tokens: 1000
9 | temperature: 0
10 | timeout: 10
11 | functions:
12 | - name: "evaluate_process_reward"
13 | description: "Evaluate the entire task-solving process, including tool calls, the contribution of each intermediate step to the final answer, and the status of the final answer."
14 | parameters:
15 | type: "object"
16 | properties:
17 | succeed_tool_calling:
18 | type: "array"
19 | description: "Provide a binary score (0 or 1) indicating whether **each intermediate step** successfully called the tool."
20 | items:
21 | type: "number"
22 | description: "0 for unsuccessful tool calls, 1 for successful tool calls"
23 | contribution_to_final_answer:
24 | type: "array"
25 | description: "Provide a score (0 to 5) to assess how much **each intermediate step** contributed to the final answer."
26 | items:
27 | type: "number"
28 | description: "0 indicates no contribution, and 5 indicates maximum contribution."
29 | final_answer_status:
30 | type: "string"
31 | enum: ["Unsure", "Unsolved", "Solved"]
32 | description: "Indicate the status of the final answer. Choose from: 'Unsure', 'Unsolved', or 'Solved'."
33 | required: ["succeed_tool_calling", "contribution_to_final_answer", "final_answer_status"]
34 |
35 | - name: "check_answer_status"
36 |     description: "Parse the json answer with layered nodes and return the answer_status about the answer"
37 | parameters:
38 | type: "object"
39 | properties:
40 | answer_status:
41 | type: "string"
42 | enum: ["Unsure","Unsolved","Solved"]
43 | required: ["answer_status"]
44 | - name: "parse_answer_status"
45 |     description: "Parse the json answer with layered nodes and return the answer_status about the answer"
46 | parameters:
47 | type: "object"
48 | properties:
49 | answer_status:
50 | type: "string"
51 | enum: ["Unsure","Unsolved","Solved"]
52 | required: ["answer_status"]
53 | - name: "check_task_solvable"
54 | description: "Parse the task description and return the task_status about the task"
55 | parameters:
56 | type: "object"
57 | properties:
58 | task_status:
59 | type: "string"
60 | enum: ["Unsure","Unsolvable","Solvable"]
61 | required: ["task_status"]
62 | - name: "select_better_answer"
63 | description: "Select the better answer with a comprehensive investigation on given aspects. You should ignore the impact of the order of candidate answers."
64 | parameters:
65 | type: "object"
66 | properties:
67 | index:
68 | type: "number"
69 | description: "The `index` value in the selected better answer."
70 | required: ["index"]
71 | fn_completion_parser: "index_parser"
72 | batch_size: 1
73 |
--------------------------------------------------------------------------------
/stabletoolbench/toolbench/tooleval/evaluators/tooleval_gpt-3.5-turbo_default/template.txt:
--------------------------------------------------------------------------------
1 |
2 | evaluate_process_reward
3 |
4 | Query:
5 | {query}
6 |
7 | Intermediate Steps:
8 | {mid_steps}
9 |
10 | Final Answer:
11 | {final_answer}
12 |
13 | Based on the query, intermediate steps, and final answer, evaluate the entire task-solving process using the following criteria:
14 |
15 | 1. **Successful Tool Calling**: For each intermediate step, indicate whether a tool was successfully called, with a score of 0 (no) or 1 (yes).
16 | 2. **Contribution to Final Answer**: Rate the contribution of each intermediate step to the final answer on a scale of 0 to 5.
17 | 3. **Final Answer Status**: Determine the final answer status as 'Solved', 'Unsure', or 'Unsolved'.
18 |
19 | Please call the `evaluate_process_reward` function to return your evaluation.
20 |
21 |
22 |
23 |
24 |
25 | check_answer_status
26 |
27 | Given the query and answer, you need to give the `answer_status` of the answer by following these rules:
28 | 1. If the answer is a sorry message or not a positive/straight response for the given query, return "Unsolved".
29 | 2. If the answer is a positive/straight response for the given query, you have to check further.
30 | 2.1 If the answer is not sufficient to determine whether it solves the query or not, return "Unsure".
31 | 2.2 If you are confident that the answer is sufficient to determine whether it solves the query or not, return "Solved" or "Unsolved".
32 |
33 | Query:
34 | {query}
35 | Answer:
36 | {answer}
37 |
38 | Now give your reason in "content" and `answer_status` of JSON to `check_answer_status`.
39 |
40 |
41 |
42 |
43 | parse_answer_status
44 |
45 | Given the query and the corresponding execution detail of an answer, you need to give the `answer_status` of the answer by following these rules:
46 | 1. If all 'tool' nodes' messages indicate that errors happened, return "Unsolved"
47 | 2. If you find the information in the "final_answer" is not true/valid according to the messages in 'tool' nodes, return "Unsolved"
48 | 3. If you are unable to verify the authenticity and validity of the information, return "Unsure"
49 | 4. If any 'tool' node in the chain contains a successful function call and those calls indeed solve the query, return "Solved"
50 |
51 | Query:
52 | {query}
53 | Answer:
54 | {answer}
55 |
56 | Now you are requested to give reason in "content" and `answer_status` of JSON to `parse_answer_status`.
57 |
58 |
59 |
60 |
61 | check_task_solvable
62 |
63 | Please check whether the given task solvable with following rules:
64 | 1. If the `query` provide invalid information (e.g. invalid email address or phone number), return "Unsolvable"
65 | 2. If the `query` needs more information to solve (e.g. the target restaurant name in a navigation task), return "Unsolvable"
66 | 3. If you are unable to draw a conclusion, return "Unsure"
67 | 4. If the currently `available_tools` are enough to solve the query, return "Solvable"
68 |
69 | Task:
70 | {task}
71 |
72 | Now give your reason in "content" and `task_status` of JSON to `check_task_solvable`.
73 |
74 |
75 |
76 |
77 |
78 |
79 | select_better_answer
80 |
81 | Query:
82 | {query}
83 |
84 | Answer_0:
85 | {answer_0}
86 |
87 | Answer_1:
88 | {answer_1}
89 |
90 | Given above query and answers in JSON format, you must follow the rules to select the relatively better answer and give the index of the answer **(0 for Answer_0, 1 for Answer_1)**:
91 | 1. Compare the value of "final_answer" in following aspects:
92 | - Informative: whether it contains all necessary information to reply to the query.
93 | - Factuality: whether it accurately describes what has been done, and what failed in the end.
94 |     - Reasoning: If the answer does not solve the query, whether it gives a detailed and accurate reason for failure.
95 | 2. If you cannot determine yet, compare the value of "answer_details" in following aspects:
96 | - Tool calling costs: calculating the percentage of failed and replicated tools calling.
97 | - Running costs: calculating the total tokens T used in execution.
98 | - Milestone: calculating the milestone(fixed subtasks) reached in execution.
99 |     - Exploration: whether it tries potentially useful tools in execution. Just count the number of successful tool calls with different tools/arguments in execution.
100 |
101 | If you have made your decision, call `select_better_answer`; otherwise, if you cannot decide, select a random answer.
102 |
103 |
--------------------------------------------------------------------------------
/stabletoolbench/toolbench/tooleval/evaluators/tooleval_gpt-3.5-turbo_fn/config.yaml:
--------------------------------------------------------------------------------
1 | evaluator_name: "tooleval_gpt-3.5-turbo_fn"
2 | registered_cls_name: "OpenAIEvaluator"
3 | prompt_template: "template.txt"
4 | fn_completions: "openai_completions"
5 | apis_json: "your/path/to/api_pool.json"
6 | completions_kwargs:
7 | model: "gpt-3.5-turbo-16k"
8 | max_tokens: 100
9 | temperature: 0
10 | timeout: 10
11 | function_call:
12 | name: "choose_preference"
13 | functions:
14 | - name: "choose_preference"
15 | description: "Choose the preferred answer for the query within all given answers."
16 | parameters:
17 | type: "object"
18 | properties:
19 | preference:
20 | type: "number"
21 | description: "The index of the preferred answer in all given answers."
22 | required: [ "preference" ]
23 | fn_completion_parser: "index_parser"
24 | batch_size: 1
25 |
--------------------------------------------------------------------------------
/stabletoolbench/toolbench/tooleval/evaluators/tooleval_gpt-3.5-turbo_fn/template.txt:
--------------------------------------------------------------------------------
1 |
2 | system
3 | You are a helpful annotator, that help user to annotate data.
4 |
5 |
6 | user
7 | Given a task description and candidate answers, I want you to choose one preferred answer based on the rules. To do so, I will give you the task description that was given to the models, and the candidate answers in a list to choose from. To choose the preferred answer, you need to first analyse the answers based on the rules, then give the index number of the preferred answer of JSON to `choose_preference`.
8 |
9 | Here are the preference rules:
10 | 1. if both answers give a non-empty `final_answer`, check whether the given `final_answer` solves the given query.
11 | 1.1 if both answers solve the query, choose the one with smaller `total_steps`.
12 | 1.1.1 if `total_steps` are the same, choose the answer with better `final_answer` quality.
13 | 1.2 if one answer solves the query while the other does not, choose the answer that solves the query.
14 | 1.3 if both answers failed, check the `answer_details` to choose one considering the following preferences:
15 | 1.3.1 check `response` and prefer more successful tool calling.
16 | 1.3.2 check `name` and prefer more varied tool usage.
17 | 1.3.3 prefer smaller `total_steps`.
18 | 2. if one gives a non-empty `final_answer` while the other does not, choose the one that gives a `final_answer`.
19 | 3. if both failed to give a non-empty `final_answer`, follow 1.3 to choose the one with better `answer_details`.
20 |
21 | Here is the task description in JSON format:
22 | {task_description}
23 |
24 | Here are the candidate answers in JSON format:
25 | {answers}
26 |
27 | Now choose the preferred answer by analysing results and the rules given, return the index in range [0,1].
28 |
--------------------------------------------------------------------------------
/stabletoolbench/toolbench/tooleval/evaluators/tooleval_gpt-3.5-turbo_normalized/config.yaml:
--------------------------------------------------------------------------------
1 | evaluator_name: "tooleval_gpt-3.5-turbo_normalized"
2 | registered_cls_name: "OpenAINormalizedEvaluator"
3 | prompt_template: "template.txt"
4 | fn_completions: "normalized_openai_completions"
5 | apis_json: "your/path/to/api_pool.json"
6 | completions_kwargs:
7 | model: "gpt-3.5-turbo-16k"
8 | max_tokens: 100
9 | temperature: 0
10 | timeout: 10
11 | functions:
12 | - name: "parse_answer_details"
13 |     description: "Parse the json answer with layered nodes and return the information about the answer"
14 | parameters:
15 | type: "object"
16 | properties:
17 | succeed_tool_calling:
18 | type: "number"
19 | description: "Give the number of times that the 'tool' nodes' message is called successfully without any errors in the response"
20 | used_tool_types:
21 | type: "number"
22 | description: "Give the number of different 'name' in 'tool' nodes' message"
23 | required: [ "succeed_tool_calling", "used_tool_types"]
24 | - name: "select_best_final_answer"
25 | description: "For given query, select the best answer in answers list and return the index of the best answer"
26 | parameters:
27 | type: "object"
28 | properties:
29 | best_answer_index:
30 | type: "number"
31 | description: "The index of the best answer in the answer list, start from 0"
32 | required: [ "best_answer_index"]
33 | - name: "check_solve_query"
34 | description: "Check whether the given answer solve the given query, return true or false"
35 | parameters:
36 | type: "object"
37 | properties:
38 | is_solved:
39 | type: "boolean"
40 | description: "true if solved and false if not"
41 | required: ["is_solved"]
42 | fn_completion_parser: "index_parser"
43 | batch_size: 1
44 |
--------------------------------------------------------------------------------
/stabletoolbench/toolbench/tooleval/evaluators/tooleval_gpt-3.5-turbo_normalized/template.txt:
--------------------------------------------------------------------------------
1 |
2 | parse_answer_details
3 |
4 | Giving answer details in the following JSON format:
5 | {answer_details}
6 |
7 | I want you to parse the answer details and give the information of JSON to `parse_answer_details`. Now parse the answer.
8 |
9 |
10 |
11 | select_best_final_answer
12 |
13 | For query {query}, you have the following answers in JSON format:
14 | {final_answers}
15 |
16 | I want you to select the best answer from the above answers and give the index of the answer of JSON to `select_best_final_answer`. Now select the best answer.
17 |
18 |
19 |
20 | check_solve_query
21 |
22 | Please check whether the answer solves the query or not.
23 | Query:
24 | {query}
25 |
26 | Answer:
27 | {final_answer}
28 |
29 | Now give your judgment of JSON to `check_solve_query`, remember do not be too strict.
30 |
31 |
--------------------------------------------------------------------------------
/stabletoolbench/toolbench/tooleval/evaluators_comparison.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import json
3 | from concurrent.futures import ThreadPoolExecutor,as_completed
4 | from tqdm import tqdm
5 | from evaluators import load_registered_automatic_evaluator
6 | import os
7 | import numpy as np
8 | import copy
9 | from typing import List
10 | from scipy.stats import pearsonr,spearmanr
11 | import random
12 | random.seed(42)
13 |
14 | abs_dir = os.path.split(__file__)[0]
15 | annotated_data = json.load(open(os.path.join(abs_dir,'dataset/human_cross_annotated_data.json')))
16 | NUM_WORKERS=16
17 |
18 | def get_most_preferred(d:list)->np.ndarray:
19 | if np.iterable(d):
20 | d = np.asanyarray(d)
21 | bins = np.bincount(d)
22 | max_val = np.max(bins)
23 | argmax = np.where(max_val==bins)[0]
24 | return argmax
25 | else:
26 | return np.asarray([d])
27 |
28 | def agreement_score(x,ref:list)->float:
29 | majority_x = get_most_preferred(x)
30 | majority_ref = get_most_preferred(ref)
31 | score_unit = 1/len(majority_x)/len(majority_ref)
32 | score = 0.0
33 | for x in majority_x:
34 | if x in majority_ref:
35 | score += score_unit
36 | return score
37 | def get_correlation(x,y):
38 | x= np.asarray(x)
39 | y = np.asarray(y)
40 | x = x+1
41 | y = y+1
42 | if np.var(x)==0 or np.var(y)==0:
43 | return float(random.choice(get_most_preferred(x))==random.choice(get_most_preferred(y)))
44 | return pearsonr(x,y)[0]
45 |
46 | def test_on_annotated_data(evaluator_cfg)->List[List[int]]:
47 | evaluators = [load_registered_automatic_evaluator(evaluator_cfg) for _ in range(NUM_WORKERS)]
48 | def get_preference(idx):
49 | data = annotated_data[idx]
50 | def process_tools(tools:list):
51 | for tool in tools:
52 | tool.pop('description',None)
53 | tool.pop('parameters',None)
54 | return tools
55 |
56 | tools = process_tools(data['available_tools'])
57 | ret = evaluators[idx%NUM_WORKERS].annotate_preference(
58 | data['query'],
59 | tools,
60 | data['answers'],multisample=True)
61 | return idx,ret
62 | prefer_dict = {}
63 | with ThreadPoolExecutor(NUM_WORKERS) as pool:
64 | # future = [pool.submit(get_preference,idx) for idx in range(100)]
65 | future = [pool.submit(get_preference,idx) for idx in range(len(annotated_data))]
66 | for thd in tqdm(as_completed(future),total=len(future),ncols=100):
67 | if thd.exception() is not None:
68 | pool.shutdown(cancel_futures=True)
69 | raise thd.exception()
70 | exit(-1)
71 | idx,preference = thd.result()
72 | prefer_dict[idx] = preference
73 | prefer = [prefer_dict[idx] for idx in range(len(future))]
74 | return prefer
75 |
76 | def get_popped_and_rest(d:list,index:int):
77 | l = copy.deepcopy(d)
78 | popped = l.pop(index)
79 | return popped,l
80 |
81 | def calculate_human_performance():
82 | human_agreement = []
83 | variance = []
84 | for data in annotated_data:
85 | agreement_scores = [
86 | agreement_score(*get_popped_and_rest(data['preference'],idx))
87 | for idx in range(len(data['preference']))
88 | ]
89 | human_agreement.append(np.mean(agreement_scores))
90 | variance.append(np.var([1-agreement_scores[idx] for idx in range(len(agreement_scores))]))
91 |
92 |
93 | return {
94 | 'human_agreement':np.mean(human_agreement),
95 | 'bias':0,
96 | 'variance':np.mean(variance)
97 | }
98 |
99 |
100 |
101 | def calculate_evaluator_performance(evaluator_preference,human_preference):
102 | human_agreement = []
103 | bias = []
104 | variance = []
105 | assert len(evaluator_preference)==len(human_preference),'length of evaluator_preference and human_preference should be the same!'
106 | correlation = []
107 | for idx in range(len(evaluator_preference)):
108 | human_pref = human_preference[idx]
109 | evaluator_pref = evaluator_preference[idx]
110 |
111 | human_agreement.append([
112 | agreement_score(pref,human_pref) for pref in evaluator_pref
113 | ])
114 | bias.append(
115 | 1 - agreement_score(human_pref,evaluator_pref)
116 | )
117 | variance.append(
118 | np.var([1-score for score in human_agreement[-1]])
119 | )
120 | correlation.append(get_correlation(human_pref,evaluator_pref))
121 |
122 | return{
123 | 'correlation': np.mean(correlation),
124 | 'human_agreement':np.mean(np.mean(human_agreement,axis=1)),
125 | 'bias':np.mean(bias),
126 | 'variance':np.mean(variance)
127 | }
128 |
129 | if __name__=='__main__':
130 | evaluators = ['tooleval_gpt-3.5-turbo_normalized',]
131 | human_perference = [
132 | data['preference'] for data in annotated_data
133 | ]
134 |
135 | evaluator_performance = [calculate_human_performance()]
136 | for evaluator in evaluators:
137 | if not os.path.exists(os.path.join(abs_dir,'dataset',f'performance_{evaluator}.npy')):
138 | evaluator_cfg = {
139 | 'evaluators_cfg_path':os.path.join(abs_dir,'evaluators'),
140 | 'evaluator':evaluator
141 | }
142 | evaluator_perference = test_on_annotated_data(evaluator_cfg)
143 | np.save(os.path.join(abs_dir,'dataset',f'performance_{evaluator}.npy'),evaluator_perference)
144 |
145 | evaluator_perference = np.load(os.path.join(abs_dir,'dataset',f'performance_{evaluator}.npy'),allow_pickle=True)
146 | performance = calculate_evaluator_performance(evaluator_perference,human_perference)
147 | print(performance)
148 | evaluator_performance.append(performance)
149 |
150 | df = pd.DataFrame(evaluator_performance,index=['human']+evaluators)
151 | df.to_csv(os.path.join(abs_dir,'dataset','evaluator_performance.csv'))
152 | print(df)
--------------------------------------------------------------------------------
/stabletoolbench/toolbench/tooleval/requirements.txt:
--------------------------------------------------------------------------------
1 | tqdm
2 | numpy
3 | pandas
4 | pydantic
5 | tenacity
6 | openai
7 | pyyaml
--------------------------------------------------------------------------------
/stabletoolbench/toolbench/tooleval/results/default_evalset/DFS/win.csv:
--------------------------------------------------------------------------------
1 | ,Method,Win Rate,Std Error
2 | 0,DFS,,
3 |
--------------------------------------------------------------------------------
/stabletoolbench/toolbench/tooleval/results/leaderboard###default_evalset###tooleval_gpt-3.5-turbo_normalized###ChatGPT-DFSDT.csv:
--------------------------------------------------------------------------------
1 | Method,WinRate,G1_instruction_WinRate,G1_tool_WinRate,G1_category_WinRate,G2_instruction_WinRate,G2_category_WinRate,G3_instruction_WinRate
2 | GPT4-DFSDT,70.4,60,71.5,67,79.5,77.5,71
3 | GPT4-ReACT,64.4,53.5,50,53.5,67,72,47
4 | ChatGPT-DFSDT,64.3,54.5,65,60.5,75,71.5,62
5 | ToolLLaMA-DFSDT-Retriever,63.1,64,64,60.5,81.5,68.5,65
6 | ToolLLaMA-DFSDT,60,57,61,62,77,77,66
7 | ChatGPT-ReACT,50,41.5,44,44.5,42.5,46.5,22
8 | Text-Davinci-003-DFSDT,46.3,43.5,44,46,37,42,46
9 | Claude-2-DFSDT,43.5,20.5,31,18.5,17,20.5,28
10 | Claude-2-ReACT,34.4,5.5,3.5,5.5,6,6,14
11 | Text-Davinci-003-ReACT,33.2,12,20,20,8.5,14.5,24
--------------------------------------------------------------------------------
/stabletoolbench/toolbench/tooleval/results/leaderboard###default_evalset###tooleval_gpt-3.5-turbo_normalized###gpt-3.5-turbo_CoT.csv:
--------------------------------------------------------------------------------
1 | Method,WinRate,StdError,G1_tool_WinRate,G2_instruction_WinRate,G1_category_WinRate,G1_instruction_WinRate,G2_category_WinRate,G3_instruction_WinRate,G1_tool_StdError,G2_instruction_StdError,G1_category_StdError,G1_instruction_StdError,G2_category_StdError,G3_instruction_StdError
2 | llama-65B-finetuned-5k_CoT,0.675,0.0191213231759729,0.55,0.74,0.55,0.67,0.8,0.74,0.049749371855331,0.0438634243989226,0.049749371855331,0.0470212717820349,0.04,0.0438634243989226
3 | llama-65B-finetuned-1k_CoT,0.666110183639399,0.0192690903060015,0.49,0.696969696969697,0.53,0.66,0.86,0.76,0.0499899989997999,0.0461883428464987,0.0499099188538711,0.047370877129308,0.0346987031457949,0.0427083130081252
4 | llama-65B-finetuned-300_CoT,0.5383333333333333,0.0203523362932267,0.41,0.66,0.43,0.51,0.65,0.57,0.0491833305094317,0.047370877129308,0.0495075751779462,0.0499899989997999,0.0476969600708472,0.0495075751779462
5 | gpt-3.5-turbo_CoT,0.5,0.0,0.5,0.5,0.5,0.5,0.5,0.5,0.0,0.0,0.0,0.0,0.0,0.0
6 |
--------------------------------------------------------------------------------
/stabletoolbench/toolbench/utils.py:
--------------------------------------------------------------------------------
1 | import json
2 | import re
3 | import torch
4 | import transformers
5 | import transformers.models.llama.modeling_llama
6 | from functools import partial
7 |
8 |
9 | def process_system_message(system_message, functions):
10 | assert "with a function call to actually excute your step." in system_message
11 |     # We find that following the ReACT format and merging the thought node and the function call node makes it easier for the model to learn to integrate the action input json string into its prediction than to learn to predict a json string directly.
12 | system_message = system_message.replace("with a function call to actually excute your step.", "with a function call to actually excute your step. Your output should follow this format:\nThought:\nAction\nAction Input:\n")
13 | # add all the function dicts in the prompt.
14 | system_message = system_message + "\nSpecifically, you have access to the following APIs: " + str(functions)
15 | return system_message
16 |
17 | def get_gpu_memory(max_gpus=None):
18 | """Get available memory for each GPU."""
19 | gpu_memory = []
20 | num_gpus = (
21 | torch.cuda.device_count()
22 | if max_gpus is None
23 | else min(max_gpus, torch.cuda.device_count())
24 | )
25 |
26 | for gpu_id in range(num_gpus):
27 | with torch.cuda.device(gpu_id):
28 | device = torch.cuda.current_device()
29 | gpu_properties = torch.cuda.get_device_properties(device)
30 | total_memory = gpu_properties.total_memory / (1024**3)
31 | allocated_memory = torch.cuda.memory_allocated() / (1024**3)
32 | available_memory = total_memory - allocated_memory
33 | gpu_memory.append(available_memory)
34 | return gpu_memory
35 |
36 |
37 | def standardize_category(category):
38 | save_category = category.replace(" ", "_").replace(",", "_").replace("/", "_")
39 | while " " in save_category or "," in save_category:
40 | save_category = save_category.replace(" ", "_").replace(",", "_")
41 | save_category = save_category.replace("__", "_")
42 | return save_category
43 |
44 | def standardize(string):
45 | res = re.compile("[^\\u4e00-\\u9fa5^a-z^A-Z^0-9^_]")
46 | string = res.sub("_", string)
47 | string = re.sub(r"(_)\1+","_", string).lower()
48 | while True:
49 | if len(string) == 0:
50 | return string
51 | if string[0] == "_":
52 | string = string[1:]
53 | else:
54 | break
55 | while True:
56 | if len(string) == 0:
57 | return string
58 | if string[-1] == "_":
59 | string = string[:-1]
60 | else:
61 | break
62 | if string[0].isdigit():
63 | string = "get_" + string
64 | return string
65 |
66 | def change_name(name):
67 | change_list = ["from", "class", "return", "false", "true", "id", "and"]
68 | if name in change_list:
69 | name = "is_" + name
70 | return name
71 |
72 | # code adapted from https://huggingface.co/kaiokendev/superhot-13b-8k-no-rlhf-test/blob/main/llama_rope_scaled_monkey_patch.py
73 | class CondenseRotaryEmbedding(torch.nn.Module):
74 | def __init__(self, dim, ratio, max_position_embeddings=2048, base=10000, device=None):
75 | super().__init__()
76 | inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float().to(device) / dim))
77 | self.register_buffer("inv_freq", inv_freq)
78 |
79 | # Build here to make `torch.jit.trace` work.
80 | self.ratio = ratio
81 | max_position_embeddings *= ratio
82 | print(f"Condensing Positional embeddings from {max_position_embeddings} to {max_position_embeddings // ratio}")
83 | self.max_seq_len_cached = max_position_embeddings
84 | t = torch.arange(self.max_seq_len_cached, device=self.inv_freq.device, dtype=self.inv_freq.dtype) / ratio
85 | freqs = torch.einsum("i,j->ij", t, self.inv_freq)
86 | # Different from paper, but it uses a different permutation in order to obtain the same calculation
87 | emb = torch.cat((freqs, freqs), dim=-1)
88 | dtype = torch.get_default_dtype()
89 | self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
90 | self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)
91 |
92 | def forward(self, x, seq_len=None):
93 | # x: [bs, num_attention_heads, seq_len, head_size]
94 | # This `if` block is unlikely to be run after we build sin/cos in `__init__`. Keep the logic here just in case.
95 | if seq_len > self.max_seq_len_cached:
96 | self.max_seq_len_cached = seq_len
97 | t = torch.arange(self.max_seq_len_cached, device=x.device, dtype=self.inv_freq.dtype) / self.ratio
98 | freqs = torch.einsum("i,j->ij", t, self.inv_freq)
99 | # Different from paper, but it uses a different permutation in order to obtain the same calculation
100 | emb = torch.cat((freqs, freqs), dim=-1).to(x.device)
101 | self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(x.dtype), persistent=False)
102 | self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(x.dtype), persistent=False)
103 | return (
104 | self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
105 | self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
106 | )
107 |
108 | def replace_llama_with_condense(ratio):
109 | transformers.models.llama.modeling_llama.LlamaRotaryEmbedding = partial(CondenseRotaryEmbedding, ratio=ratio)
110 |
111 |
112 | def process_retrieval_ducoment(documents_df):
113 | ir_corpus = {}
114 | corpus2tool = {}
115 | for row in documents_df.itertuples():
116 | doc = json.loads(row.document_content)
117 | ir_corpus[row.docid] = (doc.get('category_name', '') or '') + ', ' + \
118 | (doc.get('tool_name', '') or '') + ', ' + \
119 | (doc.get('api_name', '') or '') + ', ' + \
120 | (doc.get('api_description', '') or '') + \
121 | ', required_params: ' + json.dumps(doc.get('required_parameters', '')) + \
122 | ', optional_params: ' + json.dumps(doc.get('optional_parameters', '')) + \
123 | ', return_schema: ' + json.dumps(doc.get('template_response', ''))
124 | corpus2tool[(doc.get('category_name', '') or '') + ', ' + \
125 | (doc.get('tool_name', '') or '') + ', ' + \
126 | (doc.get('api_name', '') or '') + ', ' + \
127 | (doc.get('api_description', '') or '') + \
128 | ', required_params: ' + json.dumps(doc.get('required_parameters', '')) + \
129 | ', optional_params: ' + json.dumps(doc.get('optional_parameters', '')) + \
130 | ', return_schema: ' + json.dumps(doc.get('template_response', ''))] = doc['category_name'] + '[SEP]' + doc['tool_name'] + '[SEP]' + doc['api_name']
131 | return ir_corpus, corpus2tool
132 |
--------------------------------------------------------------------------------