├── .gitignore ├── LICENSE ├── README.md ├── README_ZH.md ├── assets ├── .DS_Store ├── ToolLLaMA-logo.png ├── ToolLLaMA.png ├── answer_anno.png ├── comparison.png ├── instructiongeneration.png ├── overview.png ├── paper.pdf ├── performance.png └── toolbench-demo.mp4 ├── data_example ├── answer │ ├── .DS_Store │ ├── G1_answer │ │ ├── 10_ChatGPT_DFS_woFilter_w2.json │ │ ├── 11_ChatGPT_DFS_woFilter_w2.json │ │ ├── 57_ChatGPT_DFS_woFilter_w2.json │ │ ├── 59_ChatGPT_DFS_woFilter_w2.json │ │ └── 69_ChatGPT_DFS_woFilter_w2.json │ ├── G2_answer │ │ ├── 102_ChatGPT_DFS_woFilter_w2.json │ │ ├── 10_ChatGPT_DFS_woFilter_w2.json │ │ ├── 119_ChatGPT_DFS_woFilter_w2.json │ │ ├── 127_ChatGPT_DFS_woFilter_w2.json │ │ └── 52_ChatGPT_DFS_woFilter_w2.json │ └── G3_answer │ │ ├── 13_ChatGPT_DFS_woFilter_w2.json │ │ ├── 15_ChatGPT_DFS_woFilter_w2.json │ │ ├── 21_ChatGPT_DFS_woFilter_w2.json │ │ ├── 3_ChatGPT_DFS_woFilter_w2.json │ │ └── 8_ChatGPT_DFS_woFilter_w2.json ├── instruction │ ├── G1_query.json │ ├── G2_query.json │ ├── G3_query.json │ ├── inference_query_demo.json │ └── inference_query_demo_open_domain.json └── toolenv │ ├── .DS_Store │ ├── response_examples │ ├── .DS_Store │ ├── Advertising │ │ ├── privatepublicapi.json │ │ ├── publicapitestinginbox.json │ │ └── putreq.json │ ├── Artificial_Intelligence_Machine_Learning │ │ ├── speech_recognition.json │ │ ├── stable_diffusion_v2.json │ │ └── starpredictai_ai_powered_text_review_star_predictor.json │ ├── Business │ │ ├── abuse_ip_check.json │ │ ├── acopaer.json │ │ └── acrosuite_oauther.json │ ├── Commerce │ │ ├── ado_stock.json │ │ ├── aliexpress_true_api.json │ │ └── aliexpress_unofficial.json │ └── Gaming │ │ ├── csgo_matches_and_tournaments.json │ │ ├── demo_project_v2.json │ │ └── dice_roll_simulator.json │ └── tools │ ├── .DS_Store │ ├── Advertising │ ├── .DS_Store │ ├── bog_boi_api │ │ └── api.py │ ├── bty690warped │ │ └── api.py │ └── buy_gmail_accounts │ │ └── api.py │ ├── Artificial_Intelligence_Machine_Learning │ ├── .DS_Store │ ├── b2b_sales_forecasting │ │ └── api.py │ ├── bard │ │ └── api.py │ └── bard_api │ │ └── api.py │ ├── Business │ ├── .DS_Store │ ├── contacts_api │ │ └── api.py │ ├── contus_mirrorfly │ │ └── api.py │ └── crime_rate │ │ └── api.py │ ├── Commerce │ ├── .DS_Store │ ├── amazon_data_scapper │ │ └── api.py │ ├── api_shopping │ │ └── api.py │ └── codeepy_vispox │ │ └── api.py │ └── Music │ ├── .DS_Store │ ├── genius_song_lyrics │ └── api.py │ ├── getsongs │ └── api.py │ └── kooed │ └── api.py ├── docs └── index.html ├── ds_configs ├── stage2.json └── stage3.json ├── preprocess ├── preprocess_retriever_data.py └── preprocess_toolllama_data.py ├── requirements.txt ├── scripts ├── inference_chatgpt_pipeline.sh ├── inference_chatgpt_pipeline_w_rapidapi_key.sh ├── inference_davinci_pipeline.sh ├── inference_toolllama_lora_pipeline.sh ├── inference_toolllama_lora_pipeline_open_domain.sh ├── inference_toolllama_pipeline.sh ├── preprocess_retriever_data.sh ├── preprocess_toolllama_data.sh ├── train_retriever.sh ├── train_toolllama.sh └── train_toolllama_lora.sh └── toolbench ├── inference ├── Algorithms │ ├── DFS.py │ ├── __init__.py │ ├── base_search.py │ └── single_chain.py ├── Downstream_tasks │ ├── __init__.py │ ├── base_env.py │ └── rapidapi.py ├── LLM │ ├── __init__.py │ ├── base_io.py │ ├── chatgpt_function_model.py │ ├── davinci_model.py │ ├── llama_model.py │ ├── retriever.py │ ├── tool_llama_lora_model.py │ └── tool_llama_model.py ├── LLM_rank │ ├── __init__.py │ └── rank_candidate.py ├── Prompts │ ├── ReAct_prompts.py 
│ ├── Tree_search_prompts.py │ ├── __init__.py │ └── rank_prompts.py ├── Tree │ ├── Tree.py │ └── __init__.py ├── callbacks │ └── ServerEventCallback.py ├── qa_pipeline.py ├── qa_pipeline_open_domain.py ├── server.py ├── toolbench_server.py └── utils.py ├── model ├── __init__.py ├── apply_delta.py ├── compression.py ├── make_delta.py └── model_adapter.py ├── retrieval ├── api_evaluator.py ├── inference_example.py └── train.py ├── tool_conversation.py ├── tooleval ├── README.md ├── README_ZH.md ├── __init__.py ├── automatic_eval_sample.py ├── convert_answers.py ├── convert_to_answer_format.py ├── dataset │ └── __init__.py ├── eval_and_update_leaderboard.py ├── eval_pass_rate.py ├── eval_preference.py ├── evaluation │ ├── __init__.py │ ├── dataclass.py │ ├── methodcls.py │ └── usereval.py ├── evaluators │ ├── __init__.py │ ├── registered_cls │ │ ├── __init__.py │ │ ├── base.py │ │ ├── rtl.py │ │ ├── tooleval.py │ │ └── utils.py │ ├── tooleval_gpt-3.5-turbo_default │ │ ├── config.yaml │ │ └── template.txt │ ├── tooleval_gpt-3.5-turbo_fn │ │ ├── config.yaml │ │ └── template.txt │ └── tooleval_gpt-3.5-turbo_normalized │ │ ├── config.yaml │ │ └── template.txt ├── evaluators_comparison.py ├── requirements.txt ├── results │ ├── default_evalset │ │ ├── DFS │ │ │ └── win.csv │ │ └── gpt-3.5-turbo_CoT │ │ │ ├── G1_category.json │ │ │ ├── G1_instruction.json │ │ │ ├── G1_tool.json │ │ │ ├── G2_category.json │ │ │ ├── G2_instruction.json │ │ │ └── G3_instruction.json │ ├── leaderboard###default_evalset###tooleval_gpt-3.5-turbo_normalized###ChatGPT-DFSDT.csv │ └── leaderboard###default_evalset###tooleval_gpt-3.5-turbo_normalized###gpt-3.5-turbo_CoT.csv ├── run_convert_answer.sh ├── run_pass_rate.sh ├── run_preference.sh └── utils.py ├── train ├── llama_condense_monkey_patch.py ├── llama_flash_attn_monkey_patch.py ├── train.py ├── train_lora.py └── train_mem.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | data/ 2 | data.zip 3 | *.DS_store 4 | 5 | __MACOSX/ 6 | 7 | run.bash 8 | 9 | *.pyc 10 | **/__pycache__ 11 | -------------------------------------------------------------------------------- /assets/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/ToolBench/d56fdd89faf8c91fa135090b212bb9057ee5cfc2/assets/.DS_Store -------------------------------------------------------------------------------- /assets/ToolLLaMA-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/ToolBench/d56fdd89faf8c91fa135090b212bb9057ee5cfc2/assets/ToolLLaMA-logo.png -------------------------------------------------------------------------------- /assets/ToolLLaMA.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/ToolBench/d56fdd89faf8c91fa135090b212bb9057ee5cfc2/assets/ToolLLaMA.png -------------------------------------------------------------------------------- /assets/answer_anno.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/ToolBench/d56fdd89faf8c91fa135090b212bb9057ee5cfc2/assets/answer_anno.png -------------------------------------------------------------------------------- /assets/comparison.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/OpenBMB/ToolBench/d56fdd89faf8c91fa135090b212bb9057ee5cfc2/assets/comparison.png -------------------------------------------------------------------------------- /assets/instructiongeneration.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/ToolBench/d56fdd89faf8c91fa135090b212bb9057ee5cfc2/assets/instructiongeneration.png -------------------------------------------------------------------------------- /assets/overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/ToolBench/d56fdd89faf8c91fa135090b212bb9057ee5cfc2/assets/overview.png -------------------------------------------------------------------------------- /assets/paper.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/ToolBench/d56fdd89faf8c91fa135090b212bb9057ee5cfc2/assets/paper.pdf -------------------------------------------------------------------------------- /assets/performance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/ToolBench/d56fdd89faf8c91fa135090b212bb9057ee5cfc2/assets/performance.png -------------------------------------------------------------------------------- /assets/toolbench-demo.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/ToolBench/d56fdd89faf8c91fa135090b212bb9057ee5cfc2/assets/toolbench-demo.mp4 -------------------------------------------------------------------------------- /data_example/answer/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/ToolBench/d56fdd89faf8c91fa135090b212bb9057ee5cfc2/data_example/answer/.DS_Store -------------------------------------------------------------------------------- /data_example/instruction/inference_query_demo.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "query": "I'm planning a surprise party for my best friend, and I want to include meaningful quotes in the decorations. Can you provide me with random love, success, and motivation quotes? It would be great to have quotes that can celebrate love, success, and inspire everyone at the party. Thank you so much for your help!", 4 | "query_id": 82217, 5 | "api_list": [ 6 | { 7 | "category_name": "Social", 8 | "tool_name": "Olato Quotes", 9 | "api_name": "Love Quote" 10 | }, 11 | { 12 | "category_name": "Social", 13 | "tool_name": "Olato Quotes", 14 | "api_name": "Success Quote" 15 | }, 16 | { 17 | "category_name": "Social", 18 | "tool_name": "Olato Quotes", 19 | "api_name": "Motivation Quote" 20 | } 21 | ] 22 | } 23 | ] -------------------------------------------------------------------------------- /data_example/instruction/inference_query_demo_open_domain.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "query": "I'm planning a surprise party for my best friend, and I want to include meaningful quotes in the decorations. Can you provide me with random love, success, and motivation quotes? It would be great to have quotes that can celebrate love, success, and inspire everyone at the party. 
Thank you so much for your help!", 4 | "query_id": 82217 5 | } 6 | ] -------------------------------------------------------------------------------- /data_example/toolenv/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/ToolBench/d56fdd89faf8c91fa135090b212bb9057ee5cfc2/data_example/toolenv/.DS_Store -------------------------------------------------------------------------------- /data_example/toolenv/response_examples/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/ToolBench/d56fdd89faf8c91fa135090b212bb9057ee5cfc2/data_example/toolenv/response_examples/.DS_Store -------------------------------------------------------------------------------- /data_example/toolenv/response_examples/Advertising/privatepublicapi.json: -------------------------------------------------------------------------------- 1 | { 2 | "api_list": [ 3 | { 4 | "name": "getPetById", 5 | "schema": { 6 | "code": "int", 7 | "message": "str" 8 | } 9 | }, 10 | { 11 | "name": "getUserByName", 12 | "schema": { 13 | "username": "str", 14 | "firstName": "str", 15 | "lastName": "str", 16 | "email": "str" 17 | } 18 | }, 19 | { 20 | "name": "getInventory", 21 | "schema": { 22 | "Sale": "int", 23 | "totvs": "int", 24 | "sold": "int", 25 | "available": "int", 26 | "pending": "int", 27 | "cat": "int", 28 | "dgdfgdf": "int" 29 | } 30 | }, 31 | { 32 | "name": "getOrderById", 33 | "schema": { 34 | "code": "int", 35 | "message": "str" 36 | } 37 | } 38 | ] 39 | } -------------------------------------------------------------------------------- /data_example/toolenv/response_examples/Advertising/publicapitestinginbox.json: -------------------------------------------------------------------------------- 1 | { 2 | "api_list": [ 3 | { 4 | "name": "getPetById", 5 | "schema": { 6 | "code": "int", 7 | "message": "str" 8 | } 9 | }, 10 | { 11 | "name": "getInventory", 12 | "schema": { 13 | "Sale": "int", 14 | "totvs": "int", 15 | "sold": "int", 16 | "pending": "int", 17 | "available": "int", 18 | "peric": "int", 19 | "not available": "int" 20 | } 21 | }, 22 | { 23 | "name": "getOrderById", 24 | "schema": { 25 | "code": "int", 26 | "message": "str" 27 | } 28 | } 29 | ] 30 | } -------------------------------------------------------------------------------- /data_example/toolenv/response_examples/Advertising/putreq.json: -------------------------------------------------------------------------------- 1 | { 2 | "api_list": [] 3 | } -------------------------------------------------------------------------------- /data_example/toolenv/response_examples/Artificial_Intelligence_Machine_Learning/speech_recognition.json: -------------------------------------------------------------------------------- 1 | { 2 | "api_list": [ 3 | { 4 | "name": "languages", 5 | "schema": { 6 | "languages": [ 7 | { 8 | "name": "str", 9 | "code": "str" 10 | } 11 | ] 12 | } 13 | } 14 | ] 15 | } -------------------------------------------------------------------------------- /data_example/toolenv/response_examples/Artificial_Intelligence_Machine_Learning/stable_diffusion_v2.json: -------------------------------------------------------------------------------- 1 | { 2 | "api_list": [] 3 | } -------------------------------------------------------------------------------- /data_example/toolenv/response_examples/Artificial_Intelligence_Machine_Learning/starpredictai_ai_powered_text_review_star_predictor.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "api_list": [] 3 | } -------------------------------------------------------------------------------- /data_example/toolenv/response_examples/Business/abuse_ip_check.json: -------------------------------------------------------------------------------- 1 | { 2 | "api_list": [ 3 | { 4 | "name": "CHECK Endpoint", 5 | "schema": { 6 | "data": { 7 | "ipAddress": "str", 8 | "isPublic": "bool", 9 | "ipVersion": "int", 10 | "isWhitelisted": "bool", 11 | "abuseConfidenceScore": "int", 12 | "countryCode": "str", 13 | "usageType": "str", 14 | "isp": "str", 15 | "domain": "str", 16 | "isTor": "bool", 17 | "totalReports": "int", 18 | "lastReportedAt": "str" 19 | } 20 | } 21 | } 22 | ] 23 | } -------------------------------------------------------------------------------- /data_example/toolenv/response_examples/Business/acopaer.json: -------------------------------------------------------------------------------- 1 | { 2 | "api_list": [] 3 | } -------------------------------------------------------------------------------- /data_example/toolenv/response_examples/Business/acrosuite_oauther.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/ToolBench/d56fdd89faf8c91fa135090b212bb9057ee5cfc2/data_example/toolenv/response_examples/Business/acrosuite_oauther.json -------------------------------------------------------------------------------- /data_example/toolenv/response_examples/Commerce/ado_stock.json: -------------------------------------------------------------------------------- 1 | { 2 | "api_list": [] 3 | } -------------------------------------------------------------------------------- /data_example/toolenv/response_examples/Commerce/aliexpress_true_api.json: -------------------------------------------------------------------------------- 1 | { 2 | "api_list": [ 3 | { 4 | "name": "Hot Products", 5 | "schema": { 6 | "app_sale_price": "str", 7 | "commission_rate": "str", 8 | "discount": "str", 9 | "evaluate_rate": "str", 10 | "first_level_category_name": "str", 11 | "hot_product_commission_rate": "str", 12 | "lastest_volume": "str", 13 | "original_price": "str", 14 | "product_detail_url": "str", 15 | "product_id": "str", 16 | "product_main_image_url": "str", 17 | "product_title": "str", 18 | "promotion_link": "str", 19 | "relevant_market_commission_rate": "str", 20 | "sale_price": "str", 21 | "second_level_category_name": "str", 22 | "shop_url": "str", 23 | "target_app_sale_price": "str", 24 | "target_original_price": "str", 25 | "target_sale_price": "str" 26 | } 27 | }, 28 | { 29 | "name": "Product by ID", 30 | "schema": { 31 | "app_sale_price": "str", 32 | "app_sale_price_currency": "str", 33 | "discount": "str", 34 | "first_level_category_id": "str", 35 | "first_level_category_name": "str", 36 | "lastest_volume": "str", 37 | "original_price": "str", 38 | "original_price_currency": "str", 39 | "product_detail_url": "str", 40 | "product_id": "str", 41 | "product_main_image_url": "str", 42 | "product_title": "str", 43 | "sale_price": "str", 44 | "sale_price_currency": "str", 45 | "second_level_category_id": "str", 46 | "second_level_category_name": "str", 47 | "shop_id": "str", 48 | "shop_url": "str" 49 | } 50 | } 51 | ] 52 | } -------------------------------------------------------------------------------- /data_example/toolenv/response_examples/Commerce/aliexpress_unofficial.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "api_list": [ 3 | { 4 | "name": "/feedbacks/{id}", 5 | "schema": { 6 | "message": "str" 7 | } 8 | }, 9 | { 10 | "name": "/shipping/{id}", 11 | "schema": { 12 | "message": "str" 13 | } 14 | }, 15 | { 16 | "name": "/product/{id}", 17 | "schema": { 18 | "message": "str" 19 | } 20 | } 21 | ] 22 | } -------------------------------------------------------------------------------- /data_example/toolenv/response_examples/Gaming/csgo_matches_and_tournaments.json: -------------------------------------------------------------------------------- 1 | { 2 | "api_list": [ 3 | { 4 | "name": "Played", 5 | "schema": { 6 | "data": [ 7 | { 8 | "location": "str", 9 | "team_won": { 10 | "title": "str" 11 | }, 12 | "team_lose_country": { 13 | "title": "str" 14 | }, 15 | "match_kind": { 16 | "title": "str" 17 | }, 18 | "score_lose": "str", 19 | "team_won_country": { 20 | "title": "str" 21 | }, 22 | "score_won": "str", 23 | "event": { 24 | "title": "str" 25 | }, 26 | "stars": "str" 27 | } 28 | ], 29 | "meta": { 30 | "limit": "str", 31 | "page": "str", 32 | "total": "str", 33 | "pages": "str" 34 | } 35 | } 36 | }, 37 | { 38 | "name": "Upcoming", 39 | "schema": { 40 | "data": [ 41 | { 42 | "match_kind": { 43 | "title": "str" 44 | }, 45 | "event": { 46 | "title": "str" 47 | }, 48 | "stars": "str", 49 | "play_at": "str", 50 | "team1": { 51 | "title": "str" 52 | }, 53 | "team1_country": { 54 | "title": "str" 55 | }, 56 | "team2": { 57 | "title": "str" 58 | }, 59 | "team2_country": { 60 | "title": "str" 61 | } 62 | } 63 | ], 64 | "meta": { 65 | "total": "str" 66 | } 67 | } 68 | } 69 | ] 70 | } -------------------------------------------------------------------------------- /data_example/toolenv/response_examples/Gaming/demo_project_v2.json: -------------------------------------------------------------------------------- 1 | { 2 | "api_list": [] 3 | } -------------------------------------------------------------------------------- /data_example/toolenv/response_examples/Gaming/dice_roll_simulator.json: -------------------------------------------------------------------------------- 1 | { 2 | "api_list": [ 3 | { 4 | "name": "Regular dice rolls", 5 | "schema": { 6 | "result": [ 7 | { 8 | "roll": "empty list", 9 | "subtotal": "int" 10 | } 11 | ], 12 | "total": "int" 13 | } 14 | }, 15 | { 16 | "name": "Custom dice rolls", 17 | "schema": { 18 | "result": [ 19 | { 20 | "roll": "empty list", 21 | "subtotal": "int" 22 | } 23 | ], 24 | "total": "int" 25 | } 26 | } 27 | ] 28 | } -------------------------------------------------------------------------------- /data_example/toolenv/tools/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/ToolBench/d56fdd89faf8c91fa135090b212bb9057ee5cfc2/data_example/toolenv/tools/.DS_Store -------------------------------------------------------------------------------- /data_example/toolenv/tools/Advertising/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/ToolBench/d56fdd89faf8c91fa135090b212bb9057ee5cfc2/data_example/toolenv/tools/Advertising/.DS_Store -------------------------------------------------------------------------------- /data_example/toolenv/tools/Advertising/bog_boi_api/api.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | from datetime import date, 
datetime, timedelta 4 | import os 5 | 6 | from typing import Optional, Dict, Union, List 7 | 8 | 9 | def lorem(toolbench_rapidapi_key: str='088440d910mshef857391f2fc461p17ae9ejsnaebc918926ff'): 10 | """ 11 | "Lorem" 12 | 13 | """ 14 | url = f"https://bog-boi-api.p.rapidapi.com/" 15 | querystring = {} 16 | 17 | headers = { 18 | "X-RapidAPI-Key": toolbench_rapidapi_key, 19 | "X-RapidAPI-Host": "bog-boi-api.p.rapidapi.com" 20 | } 21 | 22 | 23 | response = requests.get(url, headers=headers, params=querystring) 24 | try: 25 | observation = response.json() 26 | except: 27 | observation = response.text 28 | return observation 29 | 30 | -------------------------------------------------------------------------------- /data_example/toolenv/tools/Advertising/bty690warped/api.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | from datetime import date, datetime, timedelta 4 | import os 5 | 6 | from typing import Optional, Dict, Union, List 7 | 8 | 9 | def bty690_warped(bty690warped: str=None, toolbench_rapidapi_key: str='088440d910mshef857391f2fc461p17ae9ejsnaebc918926ff'): 10 | """ 11 | "https://www.warped-mirror.com/ bty690 warped là trang web được nhà cái Bty690 Bsports ủy quyền phát triển khai thác thị trường cá cược trực tuyến tại Việt Nam. Truy cập warped-mirror.com để lấy link đăng ký bty690.com - bsport thể thao miễn phí và nhận khuyến mãi chơi cá độ online ngay." 12 | 13 | """ 14 | url = f"https://bty690warped.p.rapidapi.com/bty690warped" 15 | querystring = {} 16 | if bty690warped: 17 | querystring['bty690warped'] = bty690warped 18 | 19 | headers = { 20 | "X-RapidAPI-Key": toolbench_rapidapi_key, 21 | "X-RapidAPI-Host": "bty690warped.p.rapidapi.com" 22 | } 23 | 24 | 25 | response = requests.get(url, headers=headers, params=querystring) 26 | try: 27 | observation = response.json() 28 | except: 29 | observation = response.text 30 | return observation 31 | 32 | -------------------------------------------------------------------------------- /data_example/toolenv/tools/Advertising/buy_gmail_accounts/api.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | from datetime import date, datetime, timedelta 4 | import os 5 | 6 | from typing import Optional, Dict, Union, List 7 | 8 | 9 | def buy_gmail_accounts_old_new_verified_instant_delivery(toolbench_rapidapi_key: str='088440d910mshef857391f2fc461p17ae9ejsnaebc918926ff'): 10 | """ 11 | "Buy and sell old PVA Gmail accounts in bulk from Storegmail at super low price with 100% money back guarantee. 
You can buy bulk Gmail accounts or JUST ONE" 12 | 13 | """ 14 | url = f"https://buy-gmail-accounts.p.rapidapi.com/" 15 | querystring = {} 16 | 17 | headers = { 18 | "X-RapidAPI-Key": toolbench_rapidapi_key, 19 | "X-RapidAPI-Host": "buy-gmail-accounts.p.rapidapi.com" 20 | } 21 | 22 | 23 | response = requests.get(url, headers=headers, params=querystring) 24 | try: 25 | observation = response.json() 26 | except: 27 | observation = response.text 28 | return observation 29 | 30 | -------------------------------------------------------------------------------- /data_example/toolenv/tools/Artificial_Intelligence_Machine_Learning/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/ToolBench/d56fdd89faf8c91fa135090b212bb9057ee5cfc2/data_example/toolenv/tools/Artificial_Intelligence_Machine_Learning/.DS_Store -------------------------------------------------------------------------------- /data_example/toolenv/tools/Artificial_Intelligence_Machine_Learning/b2b_sales_forecasting/api.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | from datetime import date, datetime, timedelta 4 | import os 5 | 6 | from typing import Optional, Dict, Union, List 7 | 8 | 9 | def description_of_machine_learning_model_parameters(model_id: str, toolbench_rapidapi_key: str='088440d910mshef857391f2fc461p17ae9ejsnaebc918926ff'): 10 | """ 11 | "API returns a general description of ML model, like Classification Accuracy, list of allowed qualitative sales attributes and their values. Only those values are allowed when describing opportunity." 12 | 13 | """ 14 | url = f"https://b2b-sales-forecasting.p.rapidapi.com/model/{model_id}" 15 | querystring = {} 16 | 17 | headers = { 18 | "X-RapidAPI-Key": toolbench_rapidapi_key, 19 | "X-RapidAPI-Host": "b2b-sales-forecasting.p.rapidapi.com" 20 | } 21 | 22 | 23 | response = requests.get(url, headers=headers, params=querystring) 24 | try: 25 | observation = response.json() 26 | except: 27 | observation = response.text 28 | return observation 29 | 30 | -------------------------------------------------------------------------------- /data_example/toolenv/tools/Artificial_Intelligence_Machine_Learning/bard/api.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | from datetime import date, datetime, timedelta 4 | import os 5 | 6 | from typing import Optional, Dict, Union, List 7 | 8 | 9 | def ask(question: str, bard_secure_1psid_cookie_value: str, toolbench_rapidapi_key: str='088440d910mshef857391f2fc461p17ae9ejsnaebc918926ff'): 10 | """ 11 | "https://i.ibb.co/5WHmCQ8/Screenshot-2023-05-30-231728-1.png" 12 | bard_secure_1psid_cookie_value: A string representing your bard __Secure-1PSID cookie (You can get your __Secure-1PSID cookie by simply accessing Developer Consolle and search for __Secure-1PSID cookie Name https://i.ibb.co/5WHmCQ8/Screenshot-2023-05-30-231728-1.png ) 13 | 14 | """ 15 | url = f"https://bard1.p.rapidapi.com/ask" 16 | querystring = {'question': question, 'bard___Secure-1PSID_cookie_value': bard_secure_1psid_cookie_value, } 17 | 18 | headers = { 19 | "X-RapidAPI-Key": toolbench_rapidapi_key, 20 | "X-RapidAPI-Host": "bard1.p.rapidapi.com" 21 | } 22 | 23 | 24 | response = requests.get(url, headers=headers, params=querystring) 25 | try: 26 | observation = response.json() 27 | except: 28 | observation = response.text 29 | return observation 30 | 31 | 
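For context, the generated tool wrappers under data_example/toolenv/tools/ share the calling convention seen in bard/api.py above: each RapidAPI endpoint becomes a plain Python function whose keyword arguments are the API parameters, toolbench_rapidapi_key falls back to a demo key, and the function returns the parsed JSON response (or the raw response text if JSON decoding fails). Below is a minimal sketch of invoking such a wrapper directly; it is not part of the original file, it assumes it is run from the repository root, and the question and cookie values are placeholders.

import importlib.util

# Load the generated wrapper from its file path (the tool folders ship without __init__.py).
spec = importlib.util.spec_from_file_location(
    "bard_tool",
    "data_example/toolenv/tools/Artificial_Intelligence_Machine_Learning/bard/api.py",
)
bard_tool = importlib.util.module_from_spec(spec)
spec.loader.exec_module(bard_tool)

# Call the wrapper with placeholder values; a real __Secure-1PSID cookie is needed for a useful reply.
observation = bard_tool.ask(
    question="What is the capital of France?",
    bard_secure_1psid_cookie_value="<your __Secure-1PSID cookie value>",
)
print(observation)  # parsed JSON on success, raw response text otherwise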
-------------------------------------------------------------------------------- /data_example/toolenv/tools/Artificial_Intelligence_Machine_Learning/bard_api/api.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | from datetime import date, datetime, timedelta 4 | import os 5 | 6 | from typing import Optional, Dict, Union, List 7 | 8 | 9 | def ask(bard_secure_1psid_cookie_value: str, question: str, toolbench_rapidapi_key: str='088440d910mshef857391f2fc461p17ae9ejsnaebc918926ff'): 10 | """ 11 | "https://i.ibb.co/5WHmCQ8/Screenshot-2023-05-30-231728-1.png" 12 | bard_secure_1psid_cookie_value: A string representing your bard __Secure-1PSID cookie (You can get your __Secure-1PSID cookie by simply accessing Developer Consolle and search for __Secure-1PSID cookie Name https://i.ibb.co/5WHmCQ8/Screenshot-2023-05-30-231728-1.png ) 13 | question: A string representing the question your would like to ask 14 | 15 | """ 16 | url = f"https://bard-api.p.rapidapi.com/ask" 17 | querystring = {'bard___Secure-1PSID_cookie_value': bard_secure_1psid_cookie_value, 'question': question, } 18 | 19 | headers = { 20 | "X-RapidAPI-Key": toolbench_rapidapi_key, 21 | "X-RapidAPI-Host": "bard-api.p.rapidapi.com" 22 | } 23 | 24 | 25 | response = requests.get(url, headers=headers, params=querystring) 26 | try: 27 | observation = response.json() 28 | except: 29 | observation = response.text 30 | return observation 31 | 32 | -------------------------------------------------------------------------------- /data_example/toolenv/tools/Business/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/ToolBench/d56fdd89faf8c91fa135090b212bb9057ee5cfc2/data_example/toolenv/tools/Business/.DS_Store -------------------------------------------------------------------------------- /data_example/toolenv/tools/Business/contacts_api/api.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | from datetime import date, datetime, timedelta 4 | import os 5 | 6 | from typing import Optional, Dict, Union, List 7 | 8 | 9 | def single_contact(is_id: str, toolbench_rapidapi_key: str='088440d910mshef857391f2fc461p17ae9ejsnaebc918926ff'): 10 | """ 11 | "Get a single contact" 12 | 13 | """ 14 | url = f"https://contacts-api.p.rapidapi.com/{is_id}" 15 | querystring = {} 16 | 17 | headers = { 18 | "X-RapidAPI-Key": toolbench_rapidapi_key, 19 | "X-RapidAPI-Host": "contacts-api.p.rapidapi.com" 20 | } 21 | 22 | 23 | response = requests.get(url, headers=headers, params=querystring) 24 | try: 25 | observation = response.json() 26 | except: 27 | observation = response.text 28 | return observation 29 | 30 | def home(toolbench_rapidapi_key: str='088440d910mshef857391f2fc461p17ae9ejsnaebc918926ff'): 31 | """ 32 | "Get all contacts" 33 | 34 | """ 35 | url = f"https://contacts-api.p.rapidapi.com/" 36 | querystring = {} 37 | 38 | headers = { 39 | "X-RapidAPI-Key": toolbench_rapidapi_key, 40 | "X-RapidAPI-Host": "contacts-api.p.rapidapi.com" 41 | } 42 | 43 | 44 | response = requests.get(url, headers=headers, params=querystring) 45 | try: 46 | observation = response.json() 47 | except: 48 | observation = response.text 49 | return observation 50 | 51 | -------------------------------------------------------------------------------- /data_example/toolenv/tools/Business/contus_mirrorfly/api.py: 
-------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | from datetime import date, datetime, timedelta 4 | import os 5 | 6 | from typing import Optional, Dict, Union, List 7 | 8 | 9 | def sample(toolbench_rapidapi_key: str='088440d910mshef857391f2fc461p17ae9ejsnaebc918926ff'): 10 | """ 11 | "Sample" 12 | 13 | """ 14 | url = f"https://contus-mirrorfly.p.rapidapi.com/" 15 | querystring = {} 16 | 17 | headers = { 18 | "X-RapidAPI-Key": toolbench_rapidapi_key, 19 | "X-RapidAPI-Host": "contus-mirrorfly.p.rapidapi.com" 20 | } 21 | 22 | 23 | response = requests.get(url, headers=headers, params=querystring) 24 | try: 25 | observation = response.json() 26 | except: 27 | observation = response.text 28 | return observation 29 | 30 | -------------------------------------------------------------------------------- /data_example/toolenv/tools/Business/crime_rate/api.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | from datetime import date, datetime, timedelta 4 | import os 5 | 6 | from typing import Optional, Dict, Union, List 7 | 8 | 9 | def getproductbyslug(toolbench_rapidapi_key: str='088440d910mshef857391f2fc461p17ae9ejsnaebc918926ff'): 10 | """ 11 | " " 12 | 13 | """ 14 | url = f"https://crime-rate1.p.rapidapi.com/product/getProduct/consequatur-id-quod-vel-et-accusantium-suscipit-praesentium-architecto-optio" 15 | querystring = {} 16 | 17 | headers = { 18 | "X-RapidAPI-Key": toolbench_rapidapi_key, 19 | "X-RapidAPI-Host": "crime-rate1.p.rapidapi.com" 20 | } 21 | 22 | 23 | response = requests.get(url, headers=headers, params=querystring) 24 | try: 25 | observation = response.json() 26 | except: 27 | observation = response.text 28 | return observation 29 | 30 | def getproducts(sort_field: str, sort_direction: str, search: str='Velit', per_page: int=21, toolbench_rapidapi_key: str='088440d910mshef857391f2fc461p17ae9ejsnaebc918926ff'): 31 | """ 32 | " " 33 | 34 | """ 35 | url = f"https://crime-rate1.p.rapidapi.com/product" 36 | querystring = {'sort_field': sort_field, 'sort_direction': sort_direction, } 37 | if search: 38 | querystring['search'] = search 39 | if per_page: 40 | querystring['per_page'] = per_page 41 | 42 | headers = { 43 | "X-RapidAPI-Key": toolbench_rapidapi_key, 44 | "X-RapidAPI-Host": "crime-rate1.p.rapidapi.com" 45 | } 46 | 47 | 48 | response = requests.get(url, headers=headers, params=querystring) 49 | try: 50 | observation = response.json() 51 | except: 52 | observation = response.text 53 | return observation 54 | 55 | -------------------------------------------------------------------------------- /data_example/toolenv/tools/Commerce/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/ToolBench/d56fdd89faf8c91fa135090b212bb9057ee5cfc2/data_example/toolenv/tools/Commerce/.DS_Store -------------------------------------------------------------------------------- /data_example/toolenv/tools/Commerce/amazon_data_scapper/api.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | from datetime import date, datetime, timedelta 4 | import os 5 | 6 | from typing import Optional, Dict, Union, List 7 | 8 | 9 | def get_product_offers(productid: str, api_key: str, toolbench_rapidapi_key: str='088440d910mshef857391f2fc461p17ae9ejsnaebc918926ff'): 10 | """ 11 | "Get the offers on specific product using 
Product ID" 12 | 13 | """ 14 | url = f"https://amazon-data-scapper.p.rapidapi.com/products/{productid}/offers" 15 | querystring = {'api_key': api_key, } 16 | 17 | headers = { 18 | "X-RapidAPI-Key": toolbench_rapidapi_key, 19 | "X-RapidAPI-Host": "amazon-data-scapper.p.rapidapi.com" 20 | } 21 | 22 | 23 | response = requests.get(url, headers=headers, params=querystring) 24 | try: 25 | observation = response.json() 26 | except: 27 | observation = response.text 28 | return observation 29 | 30 | -------------------------------------------------------------------------------- /data_example/toolenv/tools/Commerce/api_shopping/api.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | from datetime import date, datetime, timedelta 4 | import os 5 | 6 | from typing import Optional, Dict, Union, List 7 | 8 | 9 | def get_prices_of_tomatoes(toolbench_rapidapi_key: str='088440d910mshef857391f2fc461p17ae9ejsnaebc918926ff'): 10 | """ 11 | "This one will get prices from tomatoes" 12 | 13 | """ 14 | url = f"https://api-shopping.p.rapidapi.com/shopping?item=tomatoes" 15 | querystring = {} 16 | 17 | headers = { 18 | "X-RapidAPI-Key": toolbench_rapidapi_key, 19 | "X-RapidAPI-Host": "api-shopping.p.rapidapi.com" 20 | } 21 | 22 | 23 | response = requests.get(url, headers=headers, params=querystring) 24 | try: 25 | observation = response.json() 26 | except: 27 | observation = response.text 28 | return observation 29 | 30 | def get_prices_of_bananas(toolbench_rapidapi_key: str='088440d910mshef857391f2fc461p17ae9ejsnaebc918926ff'): 31 | """ 32 | "On this option, you can get a JSON file with the prices of bananas from TESCO" 33 | 34 | """ 35 | url = f"https://api-shopping.p.rapidapi.com/shopping?item=banana" 36 | querystring = {} 37 | 38 | headers = { 39 | "X-RapidAPI-Key": toolbench_rapidapi_key, 40 | "X-RapidAPI-Host": "api-shopping.p.rapidapi.com" 41 | } 42 | 43 | 44 | response = requests.get(url, headers=headers, params=querystring) 45 | try: 46 | observation = response.json() 47 | except: 48 | observation = response.text 49 | return observation 50 | 51 | -------------------------------------------------------------------------------- /data_example/toolenv/tools/Commerce/codeepy_vispox/api.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | from datetime import date, datetime, timedelta 4 | import os 5 | 6 | from typing import Optional, Dict, Union, List 7 | 8 | 9 | def get_company_sponsor_list(start: int, format: str, size: int, s: str=None, t: str=None, co: str=None, ci: str=None, n: str=None, st: str=None, toolbench_rapidapi_key: str='088440d910mshef857391f2fc461p17ae9ejsnaebc918926ff'): 10 | """ 11 | "" 12 | start: The start index for the sponsor company in the list 13 | format: The returned list format. The default supported is JSON 14 | size: Size for the list result 15 | s: Generic search parameter. It searches in every field of name, city, county, tier, and subtier. If this 's' parameter is used, parameter 'n', 'ci', 'co', 't', 'st' will be ignored. 
16 | t: Parameter for the company's visa tier (Tier 2 or Tier 5) 17 | co: Parameter for the company's county name 18 | ci: Parameter for the company's city name 19 | n: Parameter for the company's name 20 | st: Parameter for the company's visa sub-tier 21 | 22 | """ 23 | url = f"https://vispox.p.rapidapi.com/sponsor/" 24 | querystring = {'start': start, 'format': format, 'size': size, } 25 | if s: 26 | querystring['s'] = s 27 | if t: 28 | querystring['t'] = t 29 | if co: 30 | querystring['co'] = co 31 | if ci: 32 | querystring['ci'] = ci 33 | if n: 34 | querystring['n'] = n 35 | if st: 36 | querystring['st'] = st 37 | 38 | headers = { 39 | "X-RapidAPI-Key": toolbench_rapidapi_key, 40 | "X-RapidAPI-Host": "vispox.p.rapidapi.com" 41 | } 42 | 43 | 44 | response = requests.get(url, headers=headers, params=querystring) 45 | try: 46 | observation = response.json() 47 | except: 48 | observation = response.text 49 | return observation 50 | 51 | -------------------------------------------------------------------------------- /data_example/toolenv/tools/Music/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/ToolBench/d56fdd89faf8c91fa135090b212bb9057ee5cfc2/data_example/toolenv/tools/Music/.DS_Store -------------------------------------------------------------------------------- /data_example/toolenv/tools/Music/getsongs/api.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | from datetime import date, datetime, timedelta 4 | import os 5 | 6 | from typing import Optional, Dict, Union, List 7 | 8 | 9 | def get_top_songs(toolbench_rapidapi_key: str='088440d910mshef857391f2fc461p17ae9ejsnaebc918926ff'): 10 | """ 11 | "tujhe dekha toh as of now" 12 | 13 | """ 14 | url = f"https://getsongs.p.rapidapi.com/getTopSongs" 15 | querystring = {} 16 | 17 | headers = { 18 | "X-RapidAPI-Key": toolbench_rapidapi_key, 19 | "X-RapidAPI-Host": "getsongs.p.rapidapi.com" 20 | } 21 | 22 | 23 | response = requests.get(url, headers=headers, params=querystring) 24 | try: 25 | observation = response.json() 26 | except: 27 | observation = response.text 28 | return observation 29 | 30 | -------------------------------------------------------------------------------- /data_example/toolenv/tools/Music/kooed/api.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | from datetime import date, datetime, timedelta 4 | import os 5 | 6 | from typing import Optional, Dict, Union, List 7 | 8 | 9 | def kooed_endpoint_copy(kooed: str=None, toolbench_rapidapi_key: str='088440d910mshef857391f2fc461p17ae9ejsnaebc918926ff'): 10 | """ 11 | "Kooed Radio Stations end point" 12 | 13 | """ 14 | url = f"https://kooed.p.rapidapi.com/" 15 | querystring = {} 16 | if kooed: 17 | querystring['Kooed'] = kooed 18 | 19 | headers = { 20 | "X-RapidAPI-Key": toolbench_rapidapi_key, 21 | "X-RapidAPI-Host": "kooed.p.rapidapi.com" 22 | } 23 | 24 | 25 | response = requests.get(url, headers=headers, params=querystring) 26 | try: 27 | observation = response.json() 28 | except: 29 | observation = response.text 30 | return observation 31 | 32 | def kooed_endpoint(kooed: str=None, toolbench_rapidapi_key: str='088440d910mshef857391f2fc461p17ae9ejsnaebc918926ff'): 33 | """ 34 | "Kooed Radio Stations end point" 35 | 36 | """ 37 | url = f"https://kooed.p.rapidapi.com/" 38 | querystring = {} 39 | if kooed: 40 | querystring['Kooed'] = kooed 41 | 42 
| headers = {
43 | "X-RapidAPI-Key": toolbench_rapidapi_key,
44 | "X-RapidAPI-Host": "kooed.p.rapidapi.com"
45 | }
46 |
47 |
48 | response = requests.get(url, headers=headers, params=querystring)
49 | try:
50 | observation = response.json()
51 | except:
52 | observation = response.text
53 | return observation
54 |
55 |
--------------------------------------------------------------------------------
/ds_configs/stage2.json:
--------------------------------------------------------------------------------
1 | {
2 | "bfloat16": {
3 | "enabled": "auto"
4 | },
5 | "fp16": {
6 | "enabled": "auto",
7 | "loss_scale": 0,
8 | "loss_scale_window": 1000,
9 | "initial_scale_power": 16,
10 | "hysteresis": 2,
11 | "min_loss_scale": 1
12 | },
13 | "optimizer": {
14 | "type": "AdamW",
15 | "params": {
16 | "lr": "auto",
17 | "betas": "auto",
18 | "eps": "auto",
19 | "weight_decay": "auto"
20 | }
21 | },
22 | "scheduler": {
23 | "type": "WarmupLR",
24 | "params": {
25 | "warmup_min_lr": "auto",
26 | "warmup_max_lr": "auto",
27 | "warmup_num_steps": "auto"
28 | }
29 | },
30 | "zero_optimization": {
31 | "stage": 2,
32 | "offload_optimizer": {
33 | "device": "cpu",
34 | "pin_memory": true
35 | },
36 | "allgather_partitions": true,
37 | "allgather_bucket_size": 2e8,
38 | "overlap_comm": true,
39 | "reduce_scatter": true,
40 | "reduce_bucket_size": 2e8,
41 | "contiguous_gradients": true
42 | },
43 | "gradient_accumulation_steps": "auto",
44 | "gradient_clipping": "auto",
45 | "train_batch_size": "auto",
46 | "train_micro_batch_size_per_gpu": "auto",
47 | "steps_per_print": 1e5
48 | }
--------------------------------------------------------------------------------
/ds_configs/stage3.json:
--------------------------------------------------------------------------------
1 | {
2 | "bfloat16": {
3 | "enabled": false
4 | },
5 | "fp16": {
6 | "enabled": "auto",
7 | "loss_scale": 0,
8 | "loss_scale_window": 1000,
9 | "initial_scale_power": 16,
10 | "hysteresis": 2,
11 | "min_loss_scale": 1
12 | },
13 | "optimizer": {
14 | "type": "AdamW",
15 | "params": {
16 | "lr": "auto",
17 | "betas": "auto",
18 | "eps": "auto",
19 | "weight_decay": "auto"
20 | }
21 | },
22 | "scheduler": {
23 | "type": "WarmupLR",
24 | "params": {
25 | "warmup_min_lr": "auto",
26 | "warmup_max_lr": "auto",
27 | "warmup_num_steps": "auto"
28 | }
29 | },
30 | "zero_optimization": {
31 | "stage": 3,
32 | "offload_optimizer": {
33 | "device": "cpu",
34 | "pin_memory": true
35 | },
36 | "offload_param": {
37 | "device": "cpu",
38 | "pin_memory": true
39 | },
40 | "overlap_comm": true,
41 | "contiguous_gradients": true,
42 | "sub_group_size": 1e9,
43 | "reduce_bucket_size": "auto",
44 | "stage3_prefetch_bucket_size": "auto",
45 | "stage3_param_persistence_threshold": "auto",
46 | "stage3_max_live_parameters": 1e9,
47 | "stage3_max_reuse_distance": 1e9,
48 | "stage3_gather_fp16_weights_on_model_save": true
49 | },
50 | "gradient_accumulation_steps": "auto",
51 | "gradient_clipping": "auto",
52 | "steps_per_print": 1e5,
53 | "train_batch_size": "auto",
54 | "train_micro_batch_size_per_gpu": "auto",
55 | "wall_clock_breakdown": false
56 | }
--------------------------------------------------------------------------------
/preprocess/preprocess_retriever_data.py:
--------------------------------------------------------------------------------
1 |
2 | import json
3 | import argparse
4 | import os
5 | from tqdm import tqdm
6 | import pandas as pd
7 | from sklearn.utils import shuffle
8 |
9 |
10 | # Create the argument parser and add arguments
11 | parser = argparse.ArgumentParser()
12 |
parser.add_argument('--output_dir', type=str, default="", required=True, help='The directory to output the split files')
13 | parser.add_argument('--query_file', type=str, default="", required=True, help='The name of the query file')
14 | parser.add_argument('--index_file', type=str, default="", required=True, help='The name of the index file')
15 | parser.add_argument('--dataset_name', type=str, default="", required=True, help='The name of the output dataset')
16 |
17 | # Parse the command-line arguments
18 | args = parser.parse_args()
19 |
20 | ### For dataset split ###
21 |
22 | # Read the query file
23 | with open(args.query_file, 'r') as f:
24 | query_data = json.load(f)
25 |
26 | # Read the index file
27 | with open(args.index_file, 'r') as f:
28 | test_index_data = json.load(f)
29 |
30 | # Build a set of the test-set query indices for fast membership checks
31 | test_index_set = set(map(int, test_index_data.keys()))
32 |
33 | # Initialize the train and test set lists
34 | query_train = []
35 | query_test = []
36 |
37 | # Assign each item in query_data to the train or test set according to its index
38 | for index, item in tqdm(enumerate(query_data)):
39 | if "query_id" in item:
40 | index = item["query_id"]
41 | if index in test_index_set:
42 | query_test.append(item)
43 | else:
44 | query_train.append(item)
45 |
46 |
47 | os.makedirs(args.output_dir, exist_ok=True)
48 | # Build the output file names
49 | output_file_base = args.dataset_name
50 | train_file = f"{args.output_dir}/train.json"
51 | test_file = f"{args.output_dir}/test.json"
52 |
53 | # Write the train and test sets to their corresponding JSON files
54 | with open(train_file, 'w') as f:
55 | json.dump(query_train, f)
56 |
57 | with open(test_file, 'w') as f:
58 | json.dump(query_test, f)
59 |
60 |
61 | ### For dataset preprocess ###
62 |
63 | doc_id_map = {} # Create a mapping from doc to doc_id
64 | query_id_map = {} # Create a mapping from query to query_id
65 |
66 | documents = []
67 | train_pairs = []
68 | test_pairs = []
69 |
70 | def process_data(data, pairs):
71 | for doc in tqdm(data):
72 | for api in doc['api_list']:
73 | document_content = api
74 | api_identity = [api['tool_name'], api['api_name']]
75 | doc_id = doc_id_map.setdefault(json.dumps(document_content), len(doc_id_map) + 1)
76 | documents.append([doc_id, json.dumps(document_content)])
77 |
78 | # Check if the current API is in the relevant APIs
79 | if api_identity in doc['relevant APIs']:
80 | query = doc['query']
81 | if isinstance(query, list):
82 | query = query[0] # a few instances are stored as a list
83 | query_id = query_id_map.setdefault(query, len(query_id_map) + 1)
84 | pairs.append(([query_id, query], [query_id, 0, doc_id, 1]))
85 |
86 | process_data(query_train, train_pairs)
87 | process_data(query_test, test_pairs)
88 |
89 | # Shuffle the data using the shuffle function
90 | train_pairs = shuffle(train_pairs, random_state=42)
91 | test_pairs = shuffle(test_pairs, random_state=42)
92 |
93 | # Split the shuffled data into queries and labels
94 | train_queries, train_labels = zip(*train_pairs)
95 | test_queries, test_labels = zip(*test_pairs)
96 |
97 | documents_df = pd.DataFrame(documents, columns=['docid', 'document_content'])
98 | train_queries_df = pd.DataFrame(train_queries, columns=['qid', 'query_text'])
99 | train_labels_df = pd.DataFrame(train_labels, columns=['qid', 'useless', 'docid', 'label'])
100 | test_queries_df = pd.DataFrame(test_queries, columns=['qid', 'query_text'])
101 | test_labels_df = pd.DataFrame(test_labels, columns=['qid', 'useless', 'docid', 'label'])
102 |
103 |
104 |
105 | # Save as .tsv and .txt files
106 | documents_df.to_csv(args.output_dir + '/corpus.tsv', sep='\t', index=False)
107 | train_queries_df.to_csv(args.output_dir +
'/train.query.txt', sep='\t', index=False, header=False) 108 | test_queries_df.to_csv(args.output_dir + '/test.query.txt', sep='\t', index=False, header=False) 109 | train_labels_df.to_csv(args.output_dir + '/qrels.train.tsv', sep='\t', index=False, header=False) 110 | test_labels_df.to_csv(args.output_dir + '/qrels.test.tsv', sep='\t', index=False, header=False) -------------------------------------------------------------------------------- /preprocess/preprocess_toolllama_data.py: -------------------------------------------------------------------------------- 1 | """ 2 | Data preprocessing 3 | """ 4 | import argparse 5 | import json 6 | import os 7 | import random 8 | from toolbench.utils import process_system_message 9 | random.seed(0) 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--tool_data_dir', type=str, default="", required=True, help='Original tool data path.') 12 | parser.add_argument('--output_file', type=str, default="", required=True, help='Preprocessed tool data output path.') 13 | parser.add_argument('--method', type=str, default="DFS_woFilter_w2", choices=["CoT@1", "DFS_woFilter_w2"], required=False, help='The method of data.') 14 | 15 | 16 | def preprocess_rapidapi(tool_data_dir, method, output_file): 17 | def process_assistant_reply(message_dict: dict) -> str: 18 | content = message_dict["content"] 19 | if "function_call" in message_dict: 20 | function_call = message_dict["function_call"] 21 | reply = function_call # the whole dict containing action name and action input as target. 22 | elif content is not None: 23 | reply = content 24 | else: 25 | print(f"Wrong assistant reply: {message_dict}") 26 | return "" 27 | return reply 28 | 29 | def append_list(instances_list: list) -> list: 30 | return_list = [] 31 | for instances in instances_list: 32 | return_list.extend(instances) 33 | return return_list 34 | 35 | print(f"Preprocessing data from {tool_data_dir} into {output_file}") 36 | out_list = [] 37 | for data_file in os.listdir(os.path.join(tool_data_dir)): 38 | tmp_instances = [] 39 | if method not in data_file: 40 | continue 41 | data_dict = json.load(open(os.path.join(tool_data_dir, data_file), "r")) 42 | answer_generation = data_dict["answer_generation"] 43 | is_valid = answer_generation["valid_data"] 44 | if not is_valid: 45 | continue 46 | train_messages = answer_generation["train_messages"] 47 | query = answer_generation["query"] 48 | functions = answer_generation["function"] 49 | for train_message in train_messages: 50 | conversations = [] 51 | cur_react = "" 52 | for message_id, message_dict in enumerate(train_message): 53 | role = message_dict["role"] 54 | content = message_dict["content"] 55 | if role == "assistant": 56 | inputs = process_assistant_reply(message_dict) 57 | 58 | # process the last assistant message as target 59 | if message_id + 1 == len(train_message): 60 | if "function_call" not in message_dict: 61 | cur_react = "" 62 | break 63 | else: 64 | if cur_react == "": 65 | cur_react += "\nThought: " 66 | action = inputs["name"] 67 | action_input = inputs["arguments"] 68 | cur_react += f"\nAction: {action}" 69 | cur_react += f"\nAction Input: {action_input}" 70 | conversations.append({ 71 | "from": role, 72 | "value": cur_react 73 | }) 74 | cur_react = "" 75 | tmp_dict = { 76 | "id": f"Step {str(message_id)}: {query}", 77 | "conversations":conversations 78 | } 79 | tmp_instances.append(tmp_dict) 80 | break 81 | 82 | # process the former assistant messages into history conversations 83 | else: 84 | if "function_call" not in 
message_dict: 85 | cur_react += f"\nThought: {inputs}" 86 | continue 87 | else: 88 | if cur_react == "": 89 | cur_react += "\nThought: " 90 | action = inputs["name"] 91 | action_input = inputs["arguments"] 92 | cur_react += f"\nAction: {action}" 93 | cur_react += f"\nAction Input: {action_input}" 94 | conversations.append({ 95 | "from": role, 96 | "value": cur_react 97 | }) 98 | cur_react = "" 99 | else: 100 | if role == "system": 101 | inputs = process_system_message(content, functions) 102 | else: 103 | inputs = content 104 | conversations.append({ 105 | "from": role, 106 | "value": inputs 107 | }) 108 | cur_react = "" 109 | out_list.append(tmp_instances) 110 | out_list = append_list(out_list) 111 | json.dump(out_list, open(output_file,"w"), indent=4, ensure_ascii=False) 112 | print("Preprocessing done.") 113 | 114 | if __name__=='__main__': 115 | args = parser.parse_args() 116 | preprocess_rapidapi(args.tool_data_dir, args.method, args.output_file) 117 | 118 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate==0.20.3 2 | fastapi==0.95.1 3 | gradio==3.23.0 4 | httpx==0.24.0 5 | markdown-it-py==2.2.0 6 | numpy==1.24.3 7 | prompt-toolkit==3.0.38 8 | pydantic==1.10.7 9 | requests==2.30.0 10 | rich==13.3.5 11 | rouge==1.0.1 12 | sentencepiece==0.1.99 13 | shortuuid==1.0.11 14 | tiktoken==0.4.0 15 | tokenizers==0.13.3 16 | torch>=1.12.0 17 | transformers==4.28.1 18 | uvicorn==0.22.0 19 | bitsandbytes==0.38.1 20 | peft==0.3.0 21 | langchain==0.0.229 22 | deepspeed==0.9.2 23 | sentence_transformers==2.2.2 24 | tensorboard 25 | openai 26 | scipy 27 | termcolor 28 | flask 29 | flask_cors 30 | sentence_transformers -------------------------------------------------------------------------------- /scripts/inference_chatgpt_pipeline.sh: -------------------------------------------------------------------------------- 1 | export TOOLBENCH_KEY="" 2 | export OUTPUT_DIR="data/answer/chatgpt_dfs" 3 | export OPENAI_KEY="" 4 | export PYTHONPATH=./ 5 | 6 | mkdir $OUTPUT_DIR 7 | python toolbench/inference/qa_pipeline.py \ 8 | --tool_root_dir data/toolenv/tools/ \ 9 | --backbone_model chatgpt_function \ 10 | --openai_key $OPENAI_KEY \ 11 | --max_observation_length 1024 \ 12 | --method DFS_woFilter_w2 \ 13 | --input_query_file data/instruction/inference_query_demo.json \ 14 | --output_answer_file $OUTPUT_DIR \ 15 | --toolbench_key $TOOLBENCH_KEY 16 | 17 | -------------------------------------------------------------------------------- /scripts/inference_chatgpt_pipeline_w_rapidapi_key.sh: -------------------------------------------------------------------------------- 1 | export RAPIDAPI_KEY="" 2 | export OUTPUT_DIR="data/answer/chatgpt_dfs" 3 | export OPENAI_KEY="" 4 | export PYTHONPATH=./ 5 | 6 | mkdir $OUTPUT_DIR 7 | python toolbench/inference/qa_pipeline.py \ 8 | --tool_root_dir data/toolenv/tools/ \ 9 | --backbone_model chatgpt_function \ 10 | --openai_key $OPENAI_KEY \ 11 | --max_observation_length 1024 \ 12 | --method DFS_woFilter_w2 \ 13 | --input_query_file data/instruction/inference_query_demo.json \ 14 | --output_answer_file $OUTPUT_DIR \ 15 | --rapidapi_key $RAPIDAPI_KEY \ 16 | --use_rapidapi_key 17 | 18 | -------------------------------------------------------------------------------- /scripts/inference_davinci_pipeline.sh: -------------------------------------------------------------------------------- 1 | export TOOLBENCH_KEY="" 2 | export 
OUTPUT_DIR="data/answer/davinci_dfs" 3 | export OPENAI_KEY="" 4 | 5 | mkdir $OUTPUT_DIR 6 | python toolbench/inference/qa_pipeline.py \ 7 | --tool_root_dir data/toolenv/tools/ \ 8 | --backbone_model davinci \ 9 | --openai_key $OPENAI_KEY \ 10 | --max_observation_length 1024 \ 11 | --method DFS_woFilter_w2 \ 12 | --input_query_file data/instruction/inference_query_demo.json \ 13 | --output_answer_file $OUTPUT_DIR \ 14 | --toolbench_key $TOOLBENCH_KEY 15 | 16 | -------------------------------------------------------------------------------- /scripts/inference_toolllama_lora_pipeline.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0 2 | export TOOLBENCH_KEY="" 3 | export OUTPUT_DIR="data/toolllama/pipeline_answer/toolllama_lora" 4 | export PYTHONPATH=./ 5 | 6 | mkdir $OUTPUT_DIR 7 | python toolbench/inference/qa_pipeline.py \ 8 | --tool_root_dir data/toolenv/tools/ \ 9 | --backbone_model toolllama \ 10 | --model_path huggyllama/llama-7b \ 11 | --lora \ 12 | --lora_path toolllama_lora \ 13 | --max_observation_length 1024 \ 14 | --observ_compress_method truncate \ 15 | --method DFS_woFilter_w2 \ 16 | --input_query_file data/instruction/inference_query_demo.json \ 17 | --output_answer_file $OUTPUT_DIR \ 18 | --toolbench_key $TOOLBENCH_KEY 19 | 20 | -------------------------------------------------------------------------------- /scripts/inference_toolllama_lora_pipeline_open_domain.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0 2 | export TOOLBENCH_KEY="" 3 | export OUTPUT_DIR="data/answer/toolllama_lora_open_domain" 4 | export PYTHONPATH=./ 5 | 6 | mkdir $OUTPUT_DIR 7 | python toolbench/inference/qa_pipeline_open_domain.py \ 8 | --tool_root_dir data/toolenv/tools/ \ 9 | --corpus_tsv_path data/retrieval/G1/corpus.tsv \ 10 | --retrieval_model_path retrieval_model \ 11 | --retrieved_api_nums 5 \ 12 | --backbone_model toolllama \ 13 | --model_path huggyllama/llama-7b \ 14 | --lora \ 15 | --lora_path toolllama_lora \ 16 | --max_observation_length 1024 \ 17 | --observ_compress_method truncate \ 18 | --method DFS_woFilter_w2 \ 19 | --input_query_file data/instruction/inference_query_demo_open_domain.json \ 20 | --output_answer_file $OUTPUT_DIR \ 21 | --toolbench_key $TOOLBENCH_KEY 22 | 23 | -------------------------------------------------------------------------------- /scripts/inference_toolllama_pipeline.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0 2 | export TOOLBENCH_KEY="" 3 | export OUTPUT_DIR="data/answer/toolllama_dfs" 4 | export PYTHONPATH=./ 5 | 6 | mkdir $OUTPUT_DIR 7 | python toolbench/inference/qa_pipeline.py \ 8 | --tool_root_dir data/toolenv/tools/ \ 9 | --backbone_model toolllama \ 10 | --model_path toolllama \ 11 | --max_observation_length 1024 \ 12 | --observ_compress_method truncate \ 13 | --method DFS_woFilter_w2 \ 14 | --input_query_file data/instruction/inference_query_demo.json \ 15 | --output_answer_file $OUTPUT_DIR \ 16 | --toolbench_key $TOOLBENCH_KEY 17 | 18 | -------------------------------------------------------------------------------- /scripts/preprocess_retriever_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export PYTHONPATH=./ 4 | export QUERY_FILE="data/instruction/G1_query.json" 5 | export INDEX_FILE="data/test_query_ids/G1_instruction_test_query_ids.json" 6 | export DATASET_NAME="G1" 7 
| export OUTPUT_DIR="data/retrieval/G1" 8 | 9 | # data_preprocess.py will process files and output to the specified directory 10 | python preprocess/preprocess_retriever_data.py \ 11 | --query_file $QUERY_FILE \ 12 | --index_file $INDEX_FILE \ 13 | --dataset_name $DATASET_NAME \ 14 | --output_dir $OUTPUT_DIR 15 | -------------------------------------------------------------------------------- /scripts/preprocess_toolllama_data.sh: -------------------------------------------------------------------------------- 1 | export PYTHONPATH=./ 2 | export TOOL_DATA_DIR="data/answer/G1_answer/" 3 | export METHOD="DFS_woFilter_w2" 4 | export OUTPUT_FILE="data/answer/toolllama_G1_dfs.json" 5 | 6 | python preprocess/preprocess_toolllama_data.py \ 7 | --tool_data_dir $TOOL_DATA_DIR \ 8 | --method $METHOD \ 9 | --output_file $OUTPUT_FILE -------------------------------------------------------------------------------- /scripts/train_retriever.sh: -------------------------------------------------------------------------------- 1 | export PYTHONPATH=./ 2 | export DATA_DIR="data/retrieval/G1/" 3 | export MODEL_NAME="bert-base-uncased" 4 | export OUTPUT_PATH="retriever_model" 5 | 6 | python toolbench/retrieval/train.py \ 7 | --data_path $DATA_DIR \ 8 | --model_name $MODEL_NAME \ 9 | --output_path $OUTPUT_PATH \ 10 | --num_epochs 5 \ 11 | --train_batch_size 32 \ 12 | --learning_rate 2e-5 \ 13 | --warmup_steps 500 \ 14 | --max_seq_length 256 -------------------------------------------------------------------------------- /scripts/train_toolllama.sh: -------------------------------------------------------------------------------- 1 | export PYTHONPATH=./ 2 | export CUDA_VISIBLE_DEVICES=0,1 3 | 4 | torchrun --nproc_per_node=2 --master_port=20001 toolbench/train/train_mem.py \ 5 | --model_name_or_path huggyllama/llama-7b \ 6 | --data_path data/toolllama_G123_dfs_train.json \ 7 | --eval_data_path data/toolllama_G123_dfs_eval.json \ 8 | --conv_template tool-llama-single-round \ 9 | --bf16 True \ 10 | --output_dir toolllama \ 11 | --num_train_epochs 2 \ 12 | --per_device_train_batch_size 2 \ 13 | --per_device_eval_batch_size 2 \ 14 | --gradient_accumulation_steps 8 \ 15 | --evaluation_strategy "epoch" \ 16 | --prediction_loss_only \ 17 | --save_strategy "epoch" \ 18 | --save_total_limit 8 \ 19 | --learning_rate 5e-5 \ 20 | --weight_decay 0. 
\ 21 | --warmup_ratio 0.04 \ 22 | --lr_scheduler_type "cosine" \ 23 | --logging_steps 1 \ 24 | --fsdp "full_shard auto_wrap" \ 25 | --fsdp_transformer_layer_cls_to_wrap 'LlamaDecoderLayer' \ 26 | --tf32 True \ 27 | --source_model_max_length 2048 \ 28 | --model_max_length 8192 \ 29 | --gradient_checkpointing True \ 30 | --lazy_preprocess True \ 31 | --report_to none 32 | -------------------------------------------------------------------------------- /scripts/train_toolllama_lora.sh: -------------------------------------------------------------------------------- 1 | export PYTHONPATH=./ 2 | deepspeed --master_port=20001 toolbench/train/train_lora.py \ 3 | --model_name_or_path huggyllama/llama-7b \ 4 | --data_path data/toolllama_G123_dfs_train.json \ 5 | --eval_data_path data/toolllama_G123_dfs_eval.json \ 6 | --conv_template tool-llama-single-round \ 7 | --bf16 True \ 8 | --output_dir toolllama_lora \ 9 | --num_train_epochs 5 \ 10 | --per_device_train_batch_size 4 \ 11 | --per_device_eval_batch_size 2 \ 12 | --gradient_accumulation_steps 2 \ 13 | --evaluation_strategy "epoch" \ 14 | --prediction_loss_only \ 15 | --save_strategy "epoch" \ 16 | --save_total_limit 8 \ 17 | --learning_rate 5e-5 \ 18 | --weight_decay 0. \ 19 | --warmup_ratio 0.04 \ 20 | --lr_scheduler_type "cosine" \ 21 | --logging_steps 1 \ 22 | --source_model_max_length 2048 \ 23 | --model_max_length 8192 \ 24 | --gradient_checkpointing True \ 25 | --lazy_preprocess True \ 26 | --deepspeed ds_configs/stage2.json \ 27 | --report_to none 28 | -------------------------------------------------------------------------------- /toolbench/inference/Algorithms/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/ToolBench/d56fdd89faf8c91fa135090b212bb9057ee5cfc2/toolbench/inference/Algorithms/__init__.py -------------------------------------------------------------------------------- /toolbench/inference/Algorithms/base_search.py: -------------------------------------------------------------------------------- 1 | from Downstream_tasks.base_env import base_env 2 | 3 | class base_search_method: 4 | """For the base tree search method, you need to support the following functions""" 5 | 6 | def __init__(self,llm,io_func: base_env, process_id=0, callbacks = None): 7 | """Args: 8 | llm: The interface of the LLM 9 | io_func(base_env): Interface to the environment, 10 | process_id (int, optional): In multiprocessing annotation, this describes the process id. Defaults to 0. 11 | callbacks (_type_, optional): _description_. Defaults to None. 12 | """ 13 | pass 14 | 15 | def to_json(self,answer=False,process=True): 16 | ''' 17 | return a json object, 18 | If "answer" = True. must have the following field to make answer annotation 19 | If "process" = True. 
You need provide the full information of the tree searching process 20 | 21 | "answer_generation": { 22 | "valid_data": bool, 23 | "final_answer": string, 24 | "finish_type": enum["give_up","give_answer"] 25 | "train_messages": [ [openAI-message] ], 26 | } 27 | ''' 28 | raise NotImplementedError 29 | 30 | def start(self, **args): 31 | """This is the entry point of the searching process""" 32 | raise NotImplementedError 33 | 34 | -------------------------------------------------------------------------------- /toolbench/inference/Downstream_tasks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/ToolBench/d56fdd89faf8c91fa135090b212bb9057ee5cfc2/toolbench/inference/Downstream_tasks/__init__.py -------------------------------------------------------------------------------- /toolbench/inference/Downstream_tasks/base_env.py: -------------------------------------------------------------------------------- 1 | class base_env: 2 | 3 | def __init__(self): 4 | self.task_description = "" 5 | self.input_description = "" 6 | self.tool_names = [] 7 | self.functions = [] 8 | 9 | def restart(self): 10 | ''' 11 | Restrat the environment 12 | ''' 13 | raise NotImplementedError 14 | 15 | def get_score(self): 16 | ''' 17 | Get the value of the current state 18 | A fake function, used to search in oracle mode, which is not actually used (and impossible to obtain) 19 | ''' 20 | raise NotImplementedError 21 | 22 | def step(self, action, input_str): 23 | ''' 24 | Perform an interaction in natural language mode 25 | return value (output str, status code) 26 | ''' 27 | raise NotImplementedError 28 | 29 | def check_success(self): 30 | ''' 31 | Returns 1 if successful, otherwise returns 0 32 | ''' 33 | raise NotImplementedError 34 | 35 | def to_json(self): 36 | raise NotImplementedError -------------------------------------------------------------------------------- /toolbench/inference/LLM/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/ToolBench/d56fdd89faf8c91fa135090b212bb9057ee5cfc2/toolbench/inference/LLM/__init__.py -------------------------------------------------------------------------------- /toolbench/inference/LLM/base_io.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | def base_io(input_str): 4 | pass -------------------------------------------------------------------------------- /toolbench/inference/LLM/chatgpt_function_model.py: -------------------------------------------------------------------------------- 1 | import json 2 | import openai 3 | from tenacity import retry, wait_random_exponential, stop_after_attempt 4 | from termcolor import colored 5 | import time 6 | import random 7 | 8 | 9 | @retry(wait=wait_random_exponential(min=1, max=40), stop=stop_after_attempt(3)) 10 | def chat_completion_request(key, messages, functions=None,function_call=None,key_pos=None, model="gpt-3.5-turbo-16k-0613",stop=None,process_id=0, **args): 11 | use_messages = [] 12 | for message in messages: 13 | if not("valid" in message.keys() and message["valid"] == False): 14 | use_messages.append(message) 15 | 16 | json_data = { 17 | "model": model, 18 | "messages": use_messages, 19 | "max_tokens": 1024, 20 | "frequency_penalty": 0, 21 | "presence_penalty": 0, 22 | **args 23 | } 24 | if stop is not None: 25 | json_data.update({"stop": stop}) 26 | if functions is not None: 27 | 
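# The two updates below attach the OpenAI function-calling fields to the request:
# "functions" carries the JSON schemas of the candidate APIs, and "function_call"
# (when supplied, e.g. "none" from the pairwise ranker) controls whether the model
# may or must emit a function call.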
json_data.update({"functions": functions}) 28 | if function_call is not None: 29 | json_data.update({"function_call": function_call}) 30 | 31 | try: 32 | if model == "gpt-3.5-turbo-16k-0613": 33 | openai.api_key = key 34 | else: 35 | raise NotImplementedError 36 | openai_response = openai.ChatCompletion.create( 37 | **json_data, 38 | ) 39 | json_data = json.loads(str(openai_response)) 40 | return json_data 41 | 42 | except Exception as e: 43 | print("Unable to generate ChatCompletion response") 44 | print(f"OpenAI calling Exception: {e}") 45 | return e 46 | 47 | class ChatGPTFunction: 48 | def __init__(self, model="gpt-3.5-turbo-16k-0613", openai_key=""): 49 | self.model = model 50 | self.conversation_history = [] 51 | self.openai_key = openai_key 52 | self.time = time.time() 53 | self.TRY_TIME = 6 54 | 55 | def add_message(self, message): 56 | self.conversation_history.append(message) 57 | 58 | def change_messages(self,messages): 59 | self.conversation_history = messages 60 | 61 | def display_conversation(self, detailed=False): 62 | role_to_color = { 63 | "system": "red", 64 | "user": "green", 65 | "assistant": "blue", 66 | "function": "magenta", 67 | } 68 | print("before_print"+"*"*50) 69 | for message in self.conversation_history: 70 | print_obj = f"{message['role']}: {message['content']} " 71 | if "function_call" in message.keys(): 72 | print_obj = print_obj + f"function_call: {message['function_call']}" 73 | print_obj += "" 74 | print( 75 | colored( 76 | print_obj, 77 | role_to_color[message["role"]], 78 | ) 79 | ) 80 | print("end_print"+"*"*50) 81 | 82 | def parse(self,functions,process_id,key_pos=None,**args): 83 | self.time = time.time() 84 | conversation_history = self.conversation_history 85 | for _ in range(self.TRY_TIME): 86 | if _ != 0: 87 | time.sleep(15) 88 | if functions != []: 89 | json_data = chat_completion_request( 90 | self.openai_key, conversation_history, functions=functions,process_id=process_id, key_pos=key_pos,**args 91 | ) 92 | else: 93 | json_data = chat_completion_request( 94 | self.openai_key, conversation_history,process_id=process_id,key_pos=key_pos, **args 95 | ) 96 | try: 97 | total_tokens = json_data['usage']['total_tokens'] 98 | message = json_data["choices"][0]["message"] 99 | if process_id == 0: 100 | print(f"[process({process_id})]total tokens: {json_data['usage']['total_tokens']}") 101 | 102 | if "function_call" in message.keys() and "." in message["function_call"]["name"]: 103 | message["function_call"]["name"] = message["function_call"]["name"].split(".")[-1] 104 | 105 | return message, 0, total_tokens 106 | except BaseException as e: 107 | print(f"[process({process_id})]Parsing Exception: {repr(e)}. Try again.") 108 | if json_data is not None: 109 | print(f"[process({process_id})]OpenAI return: {json_data}") 110 | 111 | 112 | return {"role": "assistant", "content": str(json_data)}, -1, 0 113 | 114 | if __name__ == "__main__": 115 | llm = ChatGPTFunction() 116 | prompt = '''下面这句英文可能有语病,能不能把语病都改掉? 117 | If you think you get the result which can answer the task, call this function to give the final answer. Or, if you think you can't handle the task from this status, call this function to restart. Remember: you should ALWAYS call this function at the end of your try, and the final answer is the ONLY part that will be showed to user, so final answer should contain enough information. 
118 | 没语病的形式: 119 | ''' 120 | messages = [ 121 | {"role":"system","content":""}, 122 | {"role":"user","content":prompt}, 123 | ] 124 | llm.change_messages(messages) 125 | output,error_code,token_usage = llm.parse(functions=[],process_id=0) 126 | print(output) 127 | -------------------------------------------------------------------------------- /toolbench/inference/LLM/davinci_model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | from typing import Optional, List, Mapping, Any 4 | from termcolor import colored 5 | import json 6 | import random 7 | import openai 8 | from typing import Optional 9 | from toolbench.model.model_adapter import get_conversation_template 10 | from toolbench.inference.utils import SimpleChatIO, react_parser 11 | from toolbench.inference.Prompts.ReAct_prompts import FORMAT_INSTRUCTIONS_SYSTEM_FUNCTION_ZEROSHOT 12 | 13 | 14 | class Davinci: 15 | def __init__(self, model="text-davinci-003", openai_key="") -> None: 16 | super().__init__() 17 | self.model = model 18 | self.openai_key = openai_key 19 | self.chatio = SimpleChatIO() 20 | 21 | def prediction(self, prompt: str, stop: Optional[List[str]] = None) -> str: 22 | max_try = 10 23 | while True: 24 | openai.api_key = self.openai_key 25 | try: 26 | response = openai.Completion.create( 27 | engine=self.model, 28 | prompt=prompt, 29 | temperature=0.5, 30 | max_tokens=512, 31 | top_p=1, 32 | frequency_penalty=0, 33 | presence_penalty=0, 34 | stop="End Action" 35 | ) 36 | result = response['choices'][0]['text'].strip() 37 | break 38 | except Exception as e: 39 | print(e) 40 | max_try -= 1 41 | if max_try < 0: 42 | result = "Exceed max retry times. Please check your davinci api calling." 43 | break 44 | return result, response["usage"] 45 | 46 | def add_message(self, message): 47 | self.conversation_history.append(message) 48 | 49 | def change_messages(self,messages): 50 | self.conversation_history = messages 51 | 52 | def display_conversation(self, detailed=False): 53 | role_to_color = { 54 | "system": "red", 55 | "user": "green", 56 | "assistant": "blue", 57 | "function": "magenta", 58 | } 59 | print("before_print"+"*"*50) 60 | for message in self.conversation_history: 61 | print_obj = f"{message['role']}: {message['content']} " 62 | if "function_call" in message.keys(): 63 | print_obj = print_obj + f"function_call: {message['function_call']}" 64 | print_obj += "" 65 | print( 66 | colored( 67 | print_obj, 68 | role_to_color[message["role"]], 69 | ) 70 | ) 71 | print("end_print"+"*"*50) 72 | 73 | def parse(self,functions,process_id,**args): 74 | conv = get_conversation_template("tool-llama-single-round") 75 | roles = {"system": conv.roles[0], "user": conv.roles[1], "function": conv.roles[2], "assistant": conv.roles[3]} 76 | conversation_history = self.conversation_history 77 | question = '' 78 | for message in conversation_history: 79 | role = roles[message['role']] 80 | content = message['content'] 81 | if role == "User": 82 | question = content 83 | break 84 | func_str = "" 85 | func_list = [] 86 | for function_dict in functions: 87 | param_str = "" 88 | api_name = function_dict["name"] 89 | func_list.append(api_name) 90 | if "Finish" in api_name: 91 | param_str = f'"return_type": string, "final_answer": string, ' 92 | api_desc = "If you believe that you have obtained a result that can answer the task, please call this function to provide the final answer. 
ALWAYS call this function at the end of your attempt to answer the question finally." 93 | func_str += f"{api_name}: {api_desc}. Your input should be a json (args json schema): {param_str} The Action to trigger this API should be {api_name} and the input parameters should be a json dict string. Pay attention to the type of parameters.\n\n" 94 | else: 95 | api_desc = function_dict["description"][function_dict["description"].find("The description of this function is: ")+len("The description of this function is: "):] 96 | for param_name in function_dict["parameters"]["properties"]: 97 | data_type = function_dict["parameters"]["properties"][param_name]["type"] 98 | param_str += f'"{param_name}": {data_type}, ' 99 | param_str = "{{" + param_str + "}}" 100 | func_str += f"{api_name}: {api_desc}. Your input should be a json (args json schema): {param_str} The Action to trigger this API should be {api_name} and the input parameters should be a json dict string. Pay attention to the type of parameters.\n\n" 101 | func_list = str(func_list) 102 | prompt = FORMAT_INSTRUCTIONS_SYSTEM_FUNCTION_ZEROSHOT.replace("{func_str}", func_str).replace("{func_list}", func_list).replace("{func_list}", func_list).replace("{question}", question) 103 | prompt = prompt.replace("{{", "{").replace("}}", "}") 104 | for message in conversation_history: 105 | role = roles[message['role']] 106 | content = message['content'] 107 | if role == "Assistant": 108 | prompt += f"\n{content}\n" 109 | elif role == "Function": 110 | prompt += f"Observation: {content}\n" 111 | if functions != []: 112 | predictions, usage = self.prediction(prompt) 113 | else: 114 | predictions, usage = self.prediction(prompt) 115 | 116 | # react format prediction 117 | thought, action, action_input = react_parser(predictions) 118 | message = { 119 | "role": "assistant", 120 | "content": thought, 121 | "function_call": { 122 | "name": action, 123 | "arguments": action_input 124 | } 125 | } 126 | return message, 0, usage["total_tokens"] 127 | 128 | 129 | if __name__ == "__main__": 130 | llm = Davinci() 131 | result = llm.prediction("How old are you?") 132 | print(result) -------------------------------------------------------------------------------- /toolbench/inference/LLM/llama_model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | from typing import Optional, List, Mapping, Any 4 | from transformers import AutoTokenizer, AutoModelForCausalLM 5 | from termcolor import colored 6 | import time 7 | from typing import Optional 8 | from transformers import ( 9 | AutoTokenizer, 10 | AutoModelForCausalLM 11 | ) 12 | from toolbench.utils import process_system_message 13 | from toolbench.model.model_adapter import get_conversation_template 14 | from toolbench.inference.utils import SimpleChatIO, generate_stream, react_parser 15 | 16 | 17 | class LlamaModel: 18 | def __init__(self, model_name_or_path: str, template:str="tool-llama-single-round", device: str="cuda", cpu_offloading: bool=False, max_sequence_length: int=2048) -> None: 19 | super().__init__() 20 | self.model_name = model_name_or_path 21 | self.template = template 22 | self.max_sequence_length = max_sequence_length 23 | self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=False, model_max_length=self.max_sequence_length) 24 | self.model = AutoModelForCausalLM.from_pretrained( 25 | model_name_or_path, low_cpu_mem_usage=True 26 | ) 27 | if self.tokenizer.pad_token_id == None: 28 | 
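# LLaMA tokenizers typically ship without a pad token, so special-token
# placeholders are registered here and the embedding matrix is resized to
# match the enlarged vocabulary before generation.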
self.tokenizer.add_special_tokens({"bos_token": "", "eos_token": "", "pad_token": ""}) 29 | self.model.resize_token_embeddings(len(self.tokenizer)) 30 | self.use_gpu = (True if device == "cuda" else False) 31 | if (device == "cuda" and not cpu_offloading) or device == "mps": 32 | self.model.to(device) 33 | self.chatio = SimpleChatIO() 34 | 35 | def prediction(self, prompt: str, stop: Optional[List[str]] = None) -> str: 36 | gen_params = { 37 | "model": "", 38 | "prompt": prompt, 39 | "temperature": 0.5, 40 | "max_new_tokens": 512, 41 | "stop": "", 42 | "stop_token_ids": None, 43 | "echo": False 44 | } 45 | generate_stream_func = generate_stream 46 | output_stream = generate_stream_func(self.model, self.tokenizer, gen_params, "cuda", self.max_sequence_length, force_generate=True) 47 | outputs = self.chatio.return_output(output_stream) 48 | prediction = outputs.strip() 49 | return prediction 50 | 51 | def add_message(self, message): 52 | self.conversation_history.append(message) 53 | 54 | def change_messages(self,messages): 55 | self.conversation_history = messages 56 | 57 | def display_conversation(self, detailed=False): 58 | role_to_color = { 59 | "system": "red", 60 | "user": "green", 61 | "assistant": "blue", 62 | "function": "magenta", 63 | } 64 | print("before_print"+"*"*50) 65 | for message in self.conversation_history: 66 | print_obj = f"{message['role']}: {message['content']} " 67 | if "function_call" in message.keys(): 68 | print_obj = print_obj + f"function_call: {message['function_call']}" 69 | print_obj += "" 70 | print( 71 | colored( 72 | print_obj, 73 | role_to_color[message["role"]], 74 | ) 75 | ) 76 | print("end_print"+"*"*50) 77 | 78 | def parse(self,functions,process_id,**args): 79 | conv = get_conversation_template(self.template) 80 | if self.template == "tool-llama": 81 | roles = {"human": conv.roles[0], "gpt": conv.roles[1]} 82 | elif self.template == "tool-llama-single-round" or self.template == "tool-llama-multi-rounds": 83 | roles = {"system": conv.roles[0], "user": conv.roles[1], "function": conv.roles[2], "assistant": conv.roles[3]} 84 | 85 | self.time = time.time() 86 | conversation_history = self.conversation_history 87 | prompt = '' 88 | for message in conversation_history: 89 | role = roles[message['role']] 90 | content = message['content'] 91 | if role == "System" and functions != []: 92 | content = process_system_message(content, functions) 93 | prompt += f"{role}: {content}\n" 94 | prompt += "Assistant:\n" 95 | if functions != []: 96 | predictions = self.prediction(prompt) 97 | else: 98 | predictions = self.prediction(prompt) 99 | 100 | decoded_token_len = len(self.tokenizer(predictions)) 101 | if process_id == 0: 102 | print(f"[process({process_id})]total tokens: {decoded_token_len}") 103 | 104 | thought, action, action_input = react_parser(predictions) 105 | if len(thought.strip()) > 1: 106 | print(thought) 107 | # input() 108 | message = { 109 | "role": "assistant", 110 | "content": thought, 111 | "function_call": { 112 | "name": action, 113 | "arguments": action_input 114 | } 115 | } 116 | return message, 0, decoded_token_len 117 | 118 | 119 | if __name__ == "__main__": 120 | # can accept all huggingface LlamaModel family 121 | llm = LlamaModel("decapoda-research/llama-7b-hf") 122 | messages = [ 123 | {'role': 'system', 'content': '''You are AutoGPT, you can use many tools(functions) to do 124 | the following task.\nFirst I will give you the task description, and your task start.\nAt each step, you need to give your thought to analyze the status now and 
what to do next, with a function call to actually excute your step.\nAfter the call, you will get the call result, and you are now in a new state.\nThen you will analyze your status now, then decide what to do next...\nAfter many (Thought-call) pairs, you finally perform the task, then you can give your finial answer.\nRemember: \n1.the state change is , you can\'t go 125 | back to the former state, if you want to restart the task, say "I give up and restart".\n2.All the thought is short, at most in 5 sentence.\nLet\'s Begin!\nTask description: Use numbers and basic arithmetic operations (+ - * /) to obtain exactly one number=24. Each 126 | step, you are only allowed to choose two of the left numbers to obtain a new number. For example, you can combine [3,13,9,7] as 7*9 - 3*13 = 24.\nRemember:\n1.all of the number must be used , and must be used ONCE. So Only when left numbers is exact 24, you will win. So you don\'t succeed when left number = [24, 5]. You succeed when left number = [24]. \n2.all the try takes exactly 3 steps, look 127 | at the input format'''}, 128 | {'role': 'user', 'content': '\nThe real task input is: [1, 2, 4, 7]\nBegin!\n'} 129 | ] 130 | functions = [{'name': 'play_24', 'description': '''make your current conbine with the format "x operation y = z (left: aaa) " like "1+2=3, (left: 3 5 7)", then I will tell you whether you win. This is the ONLY way 131 | to interact with the game, and the total process of a input use 3 steps of call, each step you can only combine 2 of the left numbers, so the count of left numbers decrease from 4 to 1''','parameters':{'type': 'object', 'properties':{}}}]#, 'parameters': {'type': 'object', 'properties': {'input': {'type': 'string', 'description': 'describe what number you want to conbine, and how to conbine.'}}, 'required': ['input']}}] 132 | 133 | llm.change_messages(messages) 134 | output = llm.parse(functions=functions) 135 | print(output) -------------------------------------------------------------------------------- /toolbench/inference/LLM/retriever.py: -------------------------------------------------------------------------------- 1 | import time 2 | import pandas as pd 3 | from sentence_transformers import SentenceTransformer, util 4 | import json 5 | import re 6 | from toolbench.utils import standardize, standardize_category, change_name, process_retrieval_ducoment 7 | 8 | 9 | class ToolRetriever: 10 | def __init__(self, corpus_tsv_path = "", model_path=""): 11 | self.corpus_tsv_path = corpus_tsv_path 12 | self.model_path = model_path 13 | self.corpus, self.corpus2tool = self.build_retrieval_corpus() 14 | self.embedder = self.build_retrieval_embedder() 15 | self.corpus_embeddings = self.build_corpus_embeddings() 16 | 17 | def build_retrieval_corpus(self): 18 | print("Building corpus...") 19 | documents_df = pd.read_csv(self.corpus_tsv_path, sep='\t') 20 | corpus, corpus2tool = process_retrieval_ducoment(documents_df) 21 | corpus_ids = list(corpus.keys()) 22 | corpus = [corpus[cid] for cid in corpus_ids] 23 | return corpus, corpus2tool 24 | 25 | def build_retrieval_embedder(self): 26 | print("Building embedder...") 27 | embedder = SentenceTransformer(self.model_path) 28 | return embedder 29 | 30 | def build_corpus_embeddings(self): 31 | print("Building corpus embeddings with embedder...") 32 | corpus_embeddings = self.embedder.encode(self.corpus, convert_to_tensor=True) 33 | return corpus_embeddings 34 | 35 | def retrieving(self, query, top_k=5, excluded_tools={}): 36 | print("Retrieving...") 37 | start = time.time() 38 | 
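# Dense retrieval step: the query is embedded with the same SentenceTransformer
# used for the corpus, util.semantic_search returns the 10*top_k most similar
# documents by cosine similarity, and the loop below maps each hit back to its
# (category, tool_name, api_name) triple while skipping tools listed in
# excluded_tools.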
query_embedding = self.embedder.encode(query, convert_to_tensor=True) 39 | hits = util.semantic_search(query_embedding, self.corpus_embeddings, top_k=10*top_k, score_function=util.cos_sim) 40 | retrieved_tools = [] 41 | for rank, hit in enumerate(hits[0]): 42 | category, tool_name, api_name = self.corpus2tool[self.corpus[hit['corpus_id']]].split('\t') 43 | category = standardize_category(category) 44 | tool_name = standardize(tool_name) # standardizing 45 | api_name = change_name(standardize(api_name)) # standardizing 46 | if category in excluded_tools: 47 | if tool_name in excluded_tools[category]: 48 | top_k += 1 49 | continue 50 | tmp_dict = { 51 | "category": category, 52 | "tool_name": tool_name, 53 | "api_name": api_name 54 | } 55 | retrieved_tools.append(tmp_dict) 56 | return retrieved_tools -------------------------------------------------------------------------------- /toolbench/inference/LLM/tool_llama_lora_model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | import time 4 | from termcolor import colored 5 | from typing import Optional, List 6 | from peft import PeftModel 7 | import torch 8 | from typing import Optional 9 | import torch 10 | from transformers import ( 11 | AutoTokenizer, 12 | LlamaForCausalLM, 13 | ) 14 | from toolbench.utils import process_system_message 15 | from toolbench.model.model_adapter import get_conversation_template 16 | from toolbench.inference.utils import SimpleChatIO, generate_stream, react_parser 17 | 18 | 19 | class ToolLLaMALoRA: 20 | def __init__( 21 | self, 22 | base_name_or_path: str, 23 | model_name_or_path: str, 24 | template:str="tool-llama-single-round", 25 | device: str="cuda", 26 | cpu_offloading: bool=False, 27 | load_8bit: bool=False, 28 | max_sequence_length: int=8192 29 | ) -> None: 30 | super().__init__() 31 | self.model_name = model_name_or_path 32 | self.template = template 33 | self.max_sequence_length = max_sequence_length 34 | self.tokenizer = AutoTokenizer.from_pretrained(base_name_or_path, use_fast=False, model_max_length=self.max_sequence_length, padding_side="right") 35 | model = LlamaForCausalLM.from_pretrained( 36 | base_name_or_path, 37 | load_in_8bit=load_8bit, 38 | torch_dtype=torch.float16, 39 | device_map="auto" 40 | ) 41 | self.model = PeftModel.from_pretrained( 42 | model, 43 | model_name_or_path, 44 | torch_dtype=torch.float16, 45 | ) 46 | self.tokenizer.pad_token = self.tokenizer.unk_token 47 | 48 | self.use_gpu = (True if device == "cuda" else False) 49 | if (device == "cuda" and not cpu_offloading) or device == "mps": 50 | self.model.to(device) 51 | self.chatio = SimpleChatIO() 52 | 53 | def prediction(self, prompt: str, stop: Optional[List[str]] = None) -> str: 54 | gen_params = { 55 | "model": "", 56 | "prompt": prompt, 57 | "temperature": 0.5, 58 | "max_new_tokens": 512, 59 | "stop": "", 60 | "stop_token_ids": None, 61 | "echo": False 62 | } 63 | generate_stream_func = generate_stream 64 | output_stream = generate_stream_func(self.model, self.tokenizer, gen_params, "cuda", self.max_sequence_length, force_generate=True) 65 | outputs = self.chatio.return_output(output_stream) 66 | prediction = outputs.strip() 67 | return prediction 68 | 69 | def add_message(self, message): 70 | self.conversation_history.append(message) 71 | 72 | def change_messages(self,messages): 73 | self.conversation_history = messages 74 | 75 | def display_conversation(self, detailed=False): 76 | role_to_color = { 77 | "system": "red", 78 | "user": 
"green", 79 | "assistant": "blue", 80 | "function": "magenta", 81 | } 82 | print("before_print"+"*"*50) 83 | for message in self.conversation_history: 84 | print_obj = f"{message['role']}: {message['content']} " 85 | if "function_call" in message.keys(): 86 | print_obj = print_obj + f"function_call: {message['function_call']}" 87 | print_obj += "" 88 | print( 89 | colored( 90 | print_obj, 91 | role_to_color[message["role"]], 92 | ) 93 | ) 94 | print("end_print"+"*"*50) 95 | 96 | def parse(self,functions,process_id,**args): 97 | conv = get_conversation_template(self.template) 98 | if self.template == "tool-llama": 99 | roles = {"human": conv.roles[0], "gpt": conv.roles[1]} 100 | elif self.template == "tool-llama-single-round" or self.template == "tool-llama-multi-rounds": 101 | roles = {"system": conv.roles[0], "user": conv.roles[1], "function": conv.roles[2], "assistant": conv.roles[3]} 102 | 103 | self.time = time.time() 104 | conversation_history = self.conversation_history 105 | prompt = '' 106 | for message in conversation_history: 107 | role = roles[message['role']] 108 | content = message['content'] 109 | if role == "System" and functions != []: 110 | content = process_system_message(content, functions) 111 | prompt += f"{role}: {content}\n" 112 | prompt += "Assistant:\n" 113 | if functions != []: 114 | predictions = self.prediction(prompt) 115 | else: 116 | predictions = self.prediction(prompt) 117 | 118 | decoded_token_len = len(self.tokenizer(predictions)) 119 | if process_id == 0: 120 | print(f"[process({process_id})]total tokens: {decoded_token_len}") 121 | 122 | # react format prediction 123 | thought, action, action_input = react_parser(predictions) 124 | message = { 125 | "role": "assistant", 126 | "content": thought, 127 | "function_call": { 128 | "name": action, 129 | "arguments": action_input 130 | } 131 | } 132 | return message, 0, decoded_token_len 133 | 134 | 135 | if __name__ == "__main__": 136 | # can accept all huggingface LlamaModel family 137 | llm = ToolLLaMALoRA("decapoda-research/llama-7b-hf") 138 | messages = [ 139 | {'role': 'system', 'content': '''You are AutoGPT, you can use many tools(functions) to do 140 | the following task.\nFirst I will give you the task description, and your task start.\nAt each step, you need to give your thought to analyze the status now and what to do next, with a function call to actually excute your step.\nAfter the call, you will get the call result, and you are now in a new state.\nThen you will analyze your status now, then decide what to do next...\nAfter many (Thought-call) pairs, you finally perform the task, then you can give your finial answer.\nRemember: \n1.the state change is , you can\'t go 141 | back to the former state, if you want to restart the task, say "I give up and restart".\n2.All the thought is short, at most in 5 sentence.\nLet\'s Begin!\nTask description: Use numbers and basic arithmetic operations (+ - * /) to obtain exactly one number=24. Each 142 | step, you are only allowed to choose two of the left numbers to obtain a new number. For example, you can combine [3,13,9,7] as 7*9 - 3*13 = 24.\nRemember:\n1.all of the number must be used , and must be used ONCE. So Only when left numbers is exact 24, you will win. So you don\'t succeed when left number = [24, 5]. You succeed when left number = [24]. 
\n2.all the try takes exactly 3 steps, look 143 | at the input format'''}, 144 | {'role': 'user', 'content': '\nThe real task input is: [1, 2, 4, 7]\nBegin!\n'} 145 | ] 146 | functions = [{'name': 'play_24', 'description': '''make your current conbine with the format "x operation y = z (left: aaa) " like "1+2=3, (left: 3 5 7)", then I will tell you whether you win. This is the ONLY way 147 | to interact with the game, and the total process of a input use 3 steps of call, each step you can only combine 2 of the left numbers, so the count of left numbers decrease from 4 to 1''','parameters':{'type': 'object', 'properties':{}}}]#, 'parameters': {'type': 'object', 'properties': {'input': {'type': 'string', 'description': 'describe what number you want to conbine, and how to conbine.'}}, 'required': ['input']}}] 148 | 149 | llm.change_messages(messages) 150 | output = llm.parse(functions=functions) 151 | print(output) -------------------------------------------------------------------------------- /toolbench/inference/LLM/tool_llama_model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | import time 4 | from termcolor import colored 5 | from typing import Optional, List 6 | import torch 7 | from typing import Optional 8 | import torch 9 | from transformers import ( 10 | AutoTokenizer, 11 | AutoModelForCausalLM, 12 | ) 13 | from toolbench.utils import process_system_message 14 | from toolbench.model.model_adapter import get_conversation_template 15 | from toolbench.inference.utils import SimpleChatIO, generate_stream, react_parser 16 | 17 | 18 | class ToolLLaMA: 19 | def __init__( 20 | self, 21 | model_name_or_path: str, 22 | template:str="tool-llama-single-round", 23 | device: str="cuda", 24 | cpu_offloading: bool=False, 25 | max_sequence_length: int=8192 26 | ) -> None: 27 | super().__init__() 28 | self.model_name = model_name_or_path 29 | self.template = template 30 | self.max_sequence_length = max_sequence_length 31 | self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=False, model_max_length=self.max_sequence_length) 32 | self.model = AutoModelForCausalLM.from_pretrained( 33 | model_name_or_path, low_cpu_mem_usage=True 34 | ) 35 | if self.tokenizer.pad_token_id == None: 36 | self.tokenizer.add_special_tokens({"bos_token": "", "eos_token": "", "pad_token": ""}) 37 | self.model.resize_token_embeddings(len(self.tokenizer)) 38 | self.use_gpu = (True if device == "cuda" else False) 39 | if (device == "cuda" and not cpu_offloading) or device == "mps": 40 | self.model.to(device) 41 | self.chatio = SimpleChatIO() 42 | 43 | def prediction(self, prompt: str, stop: Optional[List[str]] = None) -> str: 44 | with torch.no_grad(): 45 | gen_params = { 46 | "model": "", 47 | "prompt": prompt, 48 | "temperature": 0.5, 49 | "max_new_tokens": 512, 50 | "stop": "", 51 | "stop_token_ids": None, 52 | "echo": False 53 | } 54 | generate_stream_func = generate_stream 55 | output_stream = generate_stream_func(self.model, self.tokenizer, gen_params, "cuda", self.max_sequence_length, force_generate=True) 56 | outputs = self.chatio.return_output(output_stream) 57 | prediction = outputs.strip() 58 | return prediction 59 | 60 | def add_message(self, message): 61 | self.conversation_history.append(message) 62 | 63 | def change_messages(self,messages): 64 | self.conversation_history = messages 65 | 66 | def display_conversation(self, detailed=False): 67 | role_to_color = { 68 | "system": "red", 69 | "user": "green", 
70 | "assistant": "blue", 71 | "function": "magenta", 72 | } 73 | print("before_print"+"*"*50) 74 | for message in self.conversation_history: 75 | print_obj = f"{message['role']}: {message['content']} " 76 | if "function_call" in message.keys(): 77 | print_obj = print_obj + f"function_call: {message['function_call']}" 78 | print_obj += "" 79 | print( 80 | colored( 81 | print_obj, 82 | role_to_color[message["role"]], 83 | ) 84 | ) 85 | print("end_print"+"*"*50) 86 | 87 | def parse(self, functions, process_id, **args): 88 | conv = get_conversation_template(self.template) 89 | if self.template == "tool-llama": 90 | roles = {"human": conv.roles[0], "gpt": conv.roles[1]} 91 | elif self.template == "tool-llama-single-round" or self.template == "tool-llama-multi-rounds": 92 | roles = {"system": conv.roles[0], "user": conv.roles[1], "function": conv.roles[2], "assistant": conv.roles[3]} 93 | 94 | self.time = time.time() 95 | conversation_history = self.conversation_history 96 | prompt = '' 97 | for message in conversation_history: 98 | role = roles[message['role']] 99 | content = message['content'] 100 | if role == "System" and functions != []: 101 | content = process_system_message(content, functions) 102 | prompt += f"{role}: {content}\n" 103 | prompt += "Assistant:\n" 104 | 105 | if functions != []: 106 | predictions = self.prediction(prompt) 107 | else: 108 | predictions = self.prediction(prompt) 109 | 110 | decoded_token_len = len(self.tokenizer(predictions)) 111 | if process_id == 0: 112 | print(f"[process({process_id})]total tokens: {decoded_token_len}") 113 | 114 | # react format prediction 115 | thought, action, action_input = react_parser(predictions) 116 | message = { 117 | "role": "assistant", 118 | "content": thought, 119 | "function_call": { 120 | "name": action, 121 | "arguments": action_input 122 | } 123 | } 124 | return message, 0, decoded_token_len 125 | 126 | 127 | if __name__ == "__main__": 128 | # can accept all huggingface LlamaModel family 129 | llm = ToolLLaMA("decapoda-research/llama-7b-hf") 130 | messages = [ 131 | {'role': 'system', 'content': '''You are AutoGPT, you can use many tools(functions) to do 132 | the following task.\nFirst I will give you the task description, and your task start.\nAt each step, you need to give your thought to analyze the status now and what to do next, with a function call to actually excute your step.\nAfter the call, you will get the call result, and you are now in a new state.\nThen you will analyze your status now, then decide what to do next...\nAfter many (Thought-call) pairs, you finally perform the task, then you can give your finial answer.\nRemember: \n1.the state change is , you can\'t go 133 | back to the former state, if you want to restart the task, say "I give up and restart".\n2.All the thought is short, at most in 5 sentence.\nLet\'s Begin!\nTask description: Use numbers and basic arithmetic operations (+ - * /) to obtain exactly one number=24. Each 134 | step, you are only allowed to choose two of the left numbers to obtain a new number. For example, you can combine [3,13,9,7] as 7*9 - 3*13 = 24.\nRemember:\n1.all of the number must be used , and must be used ONCE. So Only when left numbers is exact 24, you will win. So you don\'t succeed when left number = [24, 5]. You succeed when left number = [24]. 
\n2.all the try takes exactly 3 steps, look 135 | at the input format'''}, 136 | {'role': 'user', 'content': '\nThe real task input is: [1, 2, 4, 7]\nBegin!\n'} 137 | ] 138 | functions = [{'name': 'play_24', 'description': '''make your current conbine with the format "x operation y = z (left: aaa) " like "1+2=3, (left: 3 5 7)", then I will tell you whether you win. This is the ONLY way 139 | to interact with the game, and the total process of a input use 3 steps of call, each step you can only combine 2 of the left numbers, so the count of left numbers decrease from 4 to 1''','parameters':{'type': 'object', 'properties':{}}}]#, 'parameters': {'type': 'object', 'properties': {'input': {'type': 'string', 'description': 'describe what number you want to conbine, and how to conbine.'}}, 'required': ['input']}}] 140 | 141 | llm.change_messages(messages) 142 | output = llm.parse(functions=functions) 143 | print(output) -------------------------------------------------------------------------------- /toolbench/inference/LLM_rank/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/ToolBench/d56fdd89faf8c91fa135090b212bb9057ee5cfc2/toolbench/inference/LLM_rank/__init__.py -------------------------------------------------------------------------------- /toolbench/inference/LLM_rank/rank_candidate.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Evaluate the score of a query corresponding to different candidates 3 | ''' 4 | 5 | from Prompts.rank_prompts import LLM_PAIRWISE_RANK_SUBFIX_SYSTEM_PROMPT, LLM_PAIRWISE_RANK_USER_PROMPT 6 | import random 7 | from Tree.Tree import tree_node 8 | 9 | 10 | def rank2symmetry(llm_interface, LLM_rank_args, cand1,cand2): 11 | ''' 12 | Use llm to compare the height, due to the sequence, you need to compare each of the two in the front 13 | ''' 14 | single_rank_func = LLM_rank_args["rank_func"] 15 | score = [0,0] 16 | bigger1,query_count1, total_tokens1 = single_rank_func(llm_interface, LLM_rank_args, cand1,cand2) 17 | score[1 - bigger1] += 1 18 | bigger2,query_count2, total_tokens2 = single_rank_func(llm_interface, LLM_rank_args, cand2,cand1) 19 | score[bigger2] += 1 20 | if score[0] > score[1]: 21 | return 1 , query_count1 + query_count2, total_tokens1 + total_tokens2 22 | elif score[0] < score[1]: 23 | return -1, query_count1 + query_count2, total_tokens1 + total_tokens2 24 | else: 25 | return 0, query_count1 + query_count2, total_tokens1 + total_tokens2 26 | 27 | 28 | 29 | def rank2_subfix(llm_interface,LLM_rank_args, cand1,cand2): 30 | ''' 31 | Assumed that the two candidates have a long common prefix 32 | ''' 33 | anscestor_interesction = tree_node.find_ancestor_intersection(cand1,cand2) 34 | assert anscestor_interesction != None 35 | intersect_trice = anscestor_interesction.get_former_trice_from_this_node(end_node=None) 36 | trice_1 = cand1.get_former_trice_from_this_node(end_node=anscestor_interesction) 37 | trice_2 = cand2.get_former_trice_from_this_node(end_node=anscestor_interesction) 38 | 39 | system_message = LLM_PAIRWISE_RANK_SUBFIX_SYSTEM_PROMPT 40 | system_message = system_message.replace("{task_description}", LLM_rank_args["task_description"]) 41 | system_message = system_message.replace("{intersect_trice}", intersect_trice) 42 | system_message = system_message.replace("{candidate_A}",trice_1) 43 | system_message = system_message.replace("{candidate_B}",trice_2) 44 | 
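# The filled-in pairwise prompt becomes the system message and is followed by the
# short user instruction asking for a one-word verdict; the reply's last character
# ("a" versus anything else) decides which candidate trajectory is ranked higher.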
llm_interface.change_messages([{"role":"system","content":system_message}, 45 | {"role":"user","content":LLM_PAIRWISE_RANK_USER_PROMPT}, 46 | ]) 47 | output,error_code, total_tokens = llm_interface.parse(functions=LLM_rank_args["functions"],function_call="none",process_id=LLM_rank_args["process_id"]) 48 | if output["content"].strip().lower()[-1] == "a": 49 | return 1, 1, total_tokens 50 | else: 51 | return 0, 1, total_tokens 52 | 53 | def sum_based_rankn(llm_interface,LLM_rank_args, candidates): 54 | ''' 55 | All pairs are sorted pairwise, sum the total points, and choose the best 56 | ''' 57 | total_querys = 0 58 | total_tokens = 0 59 | scores = [0]*len(candidates) 60 | for i in range(len(candidates)-1): 61 | for j in range(i+1,len(candidates)): 62 | pairwise_rank,query_count,rank2_tokens = rank2symmetry(llm_interface,LLM_rank_args, candidates[i],candidates[j]) 63 | total_querys += query_count 64 | total_tokens += rank2_tokens 65 | if pairwise_rank > 0: 66 | scores[i] += 1 67 | elif pairwise_rank < 0: 68 | scores[j] += 1 69 | else: 70 | scores[i] += 0.5 71 | scores[j] += 0.5 72 | return scores, total_querys, total_tokens 73 | 74 | 75 | 76 | if __name__ == "__main__": 77 | random.seed(42) 78 | # candidates = [ 79 | # "234", 80 | # "66.5", 81 | # "77.1", 82 | # "88.967", 83 | # "pi", 84 | # # "e", 85 | # # "ln(2)" 86 | # ] 87 | candidates = [ 88 | "77.1", 89 | "88.967", 90 | "pi", 91 | "66.5", 92 | "234", 93 | "ln(2)" 94 | ] 95 | ''' 96 | starting_delta: 97 | 50 -> 42.85% 98 | 100 -> 35.99% 99 | 150 -> 29.66% 100 | 200 -> 24.03% 101 | ''' 102 | -------------------------------------------------------------------------------- /toolbench/inference/Prompts/ReAct_prompts.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | FORMAT_INSTRUCTIONS_SYSTEM_FUNCTION = """You are AutoGPT, you can use many tools(functions) to do the following task. 5 | First I will give you the task description, and your task start. 6 | At each step, you need to give your thought to analyze the status now and what to do next, with a function call to actually excute your step. 7 | After the call, you will get the call result, and you are now in a new state. 8 | Then you will analyze your status now, then decide what to do next... 9 | After many (Thought-call) pairs, you finally perform the task, then you can give your finial answer. 10 | Remember: 11 | 1.the state change is irreversible, you can't go back to one of the former state, if you want to restart the task, say "I give up and restart". 12 | 2.All the thought is short, at most in 5 sentence. 13 | 3.You can do more then one trys, so if your plan is to continusly try some conditions, you can do one of the conditions per try. 14 | Let's Begin! 15 | Task description: {task_description}""" 16 | 17 | FORMAT_INSTRUCTIONS_USER_FUNCTION = """ 18 | {input_description} 19 | Begin! 20 | """ 21 | 22 | FORMAT_INSTRUCTIONS_SYSTEM_FUNCTION_ZEROSHOT = """Answer the following questions as best you can. Specifically, you have access to the following APIs: 23 | 24 | {func_str} 25 | 26 | Use the following format: 27 | Thought: you should always think about what to do 28 | Action: the action to take, should be one of {func_list} 29 | Action Input: the input to the action 30 | End Action 31 | 32 | Begin! 
Remember: (1) Follow the format, i.e, 33 | Thought: 34 | Action: 35 | Action Input: 36 | End Action 37 | (2)The Action: MUST be one of the following:{func_list} 38 | (3)If you believe that you have obtained enough information (which can be judge from the history observations) that can answer the task, please call: 39 | Action: Finish 40 | Action Input: {{"return_type": "give_answer", "final_answer": your answer string}}. 41 | Question: {question} 42 | 43 | Here are the history actions and observations: 44 | """ 45 | -------------------------------------------------------------------------------- /toolbench/inference/Prompts/Tree_search_prompts.py: -------------------------------------------------------------------------------- 1 | DIVERSITY_PROMPT='''This is not the first time you try this task, all previous trails failed. 2 | Before you generate my thought for this state, I will first show you your previous actions for this state, and then you must generate actions that is different from all of them. Here are some previous actions candidates: 3 | {previous_candidate} 4 | Remember you are now in the intermediate state of a trail, you will first analyze the now state and previous action candidates, then make actions that is different from all the previous.''' 5 | 6 | 7 | -------------------------------------------------------------------------------- /toolbench/inference/Prompts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/ToolBench/d56fdd89faf8c91fa135090b212bb9057ee5cfc2/toolbench/inference/Prompts/__init__.py -------------------------------------------------------------------------------- /toolbench/inference/Prompts/rank_prompts.py: -------------------------------------------------------------------------------- 1 | 2 | LLM_PAIRWISE_RANK_SUBFIX_SYSTEM_PROMPT = ''' 3 | You are value-GPT, which is an expert of defining which trail is better, which trail is more close to solving the task. 4 | All candidate tries to solve this task with some funciton calls: 5 | ******************************* 6 | {{TASK_DESCRIPTION}} 7 | {task_description} 8 | {{END_TASK_DESCRIPTION}} 9 | ******************************* 10 | First, all candidate do the following things: 11 | {intersect_trice} 12 | After that, there are two candidates A and B, they do different things: 13 | ******************************* 14 | {{CANDIDATE_A_START}} 15 | {candidate_A} 16 | {{CANDIDATE_A_END}} 17 | ******************************* 18 | {{CANDIDATE_B_START}} 19 | {candidate_B} 20 | {{CANDIDATE_B_END}} 21 | Which try do you think is more helpful to solving the task? 
22 | ''' 23 | 24 | 25 | 26 | 27 | LLM_PAIRWISE_RANK_USER_PROMPT = ''' 28 | Tell me which candidate is better in ONE Word: "A" or "B":''' -------------------------------------------------------------------------------- /toolbench/inference/Tree/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/ToolBench/d56fdd89faf8c91fa135090b212bb9057ee5cfc2/toolbench/inference/Tree/__init__.py -------------------------------------------------------------------------------- /toolbench/inference/callbacks/ServerEventCallback.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict, List, Union 2 | import queue 3 | class ServerEventCallback(): 4 | """Base callback handler""" 5 | 6 | def __init__(self, queue: queue.Queue, *args, **kwargs): 7 | super().__init__(*args, **kwargs) 8 | self.queue = queue 9 | self.llm_block_id = 0 10 | self.tool_block_id = 0 11 | self.tool_descriptions = {} 12 | 13 | def add_to_queue(self, method_name: str, block_id, **kwargs: Any): 14 | data = { 15 | "method_name": method_name, 16 | "block_id": block_id, 17 | } 18 | data.update(kwargs) 19 | self.queue.put(data) 20 | 21 | def on_tool_retrieval_start(self): 22 | # tools should be of the form 23 | # {tool_name, tool_desc} 24 | self.add_to_queue( 25 | "on_tool_retrieval_start", 26 | "recommendation-1", 27 | ) 28 | print("on_tool_retrieval_start method called") 29 | 30 | def on_tool_retrieval_end(self, tools): 31 | # tool should be of the form 32 | # {tool_name, tool_desc} 33 | self.add_to_queue( 34 | "on_tool_retrieval_end", 35 | "recommendation-1", 36 | recommendations=tools 37 | ) 38 | self.tool_descriptions = { 39 | tool["name"]: tool for tool in tools 40 | } 41 | print("on_tool_retrieval_end method called") 42 | def on_request_start(self, user_input: str, method: str) -> Any: 43 | self.tool_block_id = 0 44 | self.llm_block_id = 0 45 | self.add_to_queue( 46 | "on_request_start", 47 | block_id="start", 48 | user_input=user_input, 49 | method=method 50 | ) 51 | def on_request_end(self, outputs: str, chain: List[Any]): 52 | self.add_to_queue( 53 | "on_request_end", 54 | block_id="end", 55 | output=outputs, 56 | chain=chain 57 | ) 58 | def on_request_error(self, error: str): 59 | self.add_to_queue( 60 | "on_request_error", 61 | block_id="error", 62 | error=error 63 | ) 64 | 65 | # keep 66 | def on_chain_start(self, inputs: str, depth: int) -> Any: 67 | """Run when chain starts running.""" 68 | print("on_chain_start method called") 69 | self.llm_block_id += 1 70 | block_id = "llm-" + str(self.llm_block_id) 71 | self.add_to_queue( 72 | "on_chain_start", 73 | block_id=block_id, 74 | messages=inputs, 75 | depth=depth 76 | ) 77 | return block_id 78 | 79 | # this one needs the block_id memorized 80 | def on_chain_end(self, block_id: str, depth: int) -> Any: 81 | self.add_to_queue( 82 | "on_chain_end", 83 | block_id=block_id, 84 | # output=output, 85 | depth=depth 86 | ) 87 | print("on_chain_end method called") 88 | 89 | def on_chain_error(self, error: Union[Exception, KeyboardInterrupt], **kwargs: Any) -> Any: 90 | method_name = "on_chain_error" 91 | self.add_to_queue(method_name, error=error, **kwargs) 92 | print("on_chain_error method called") 93 | 94 | def on_llm_start( 95 | self, messages: str, depth: int 96 | ) -> Any: 97 | """Run when LLM starts running.""" 98 | self.add_to_queue( 99 | "on_llm_start", 100 | block_id="llm-" + str(self.llm_block_id), 101 | messages=messages, 102 | depth=depth 
103 | ) 104 | print("on_llm_start method called") 105 | 106 | def on_llm_new_token(self, token: str, **kwargs: Any) -> Any: 107 | """Run on new LLM token. Only available when streaming is enabled.""" 108 | method_name = "on_llm_new_token" 109 | self.add_to_queue(method_name, token=token, **kwargs) 110 | print("on_llm_new_token method called") 111 | 112 | def on_llm_end(self, response: str, depth: int) -> Any: 113 | """Run when LLM ends running.""" 114 | self.add_to_queue( 115 | "on_llm_end", 116 | block_id="llm-" + str(self.llm_block_id), 117 | response=response, 118 | depth=depth 119 | ) 120 | print("on_llm_end method called") 121 | 122 | def on_llm_error(self, error: Union[Exception, KeyboardInterrupt]) -> Any: 123 | """Run when LLM errors.""" 124 | self.add_to_queue( 125 | "on_llm_error", 126 | block_id="llm-" + str(self.llm_block_id), 127 | message=str(error), 128 | error=error 129 | ) 130 | print("on_llm_error method called") 131 | 132 | def on_agent_action(self, action, action_input, depth: int) -> str: 133 | self.tool_block_id += 1 134 | block_id="tool-" + str(self.tool_block_id) 135 | self.add_to_queue( 136 | "on_agent_action", 137 | block_id=block_id, 138 | action=action, 139 | action_input = action_input, 140 | depth=depth 141 | ) 142 | print("on_agent_action method called") 143 | return block_id 144 | 145 | def on_tool_start(self, tool_name: str, tool_input: str, depth: int) -> Any: 146 | method_name = "on_tool_start" 147 | tool_description = "Tool not found in tool descriptions" 148 | if tool_name in self.tool_descriptions: 149 | tool_description = self.tool_descriptions[tool_name] 150 | else: 151 | print(self.tool_descriptions) 152 | print("Key", tool_name, "not found in tool descriptions") 153 | self.add_to_queue( 154 | method_name, 155 | block_id="tool-" + str(self.tool_block_id), 156 | tool_name=tool_name, 157 | tool_description=tool_description, 158 | tool_input=tool_input, 159 | depth=depth 160 | ) 161 | print("on_tool_start method called") 162 | 163 | def on_tool_end(self, output: str, status:int, depth: int) -> Any: 164 | method_name = "on_tool_end" 165 | self.add_to_queue( 166 | method_name, 167 | block_id="tool-" + str(self.tool_block_id), 168 | output=output, 169 | status= status, 170 | depth=depth 171 | ) 172 | print("on_tool_end method called") 173 | 174 | def on_tool_error(self, error: Union[Exception, KeyboardInterrupt]) -> Any: 175 | method_name = "on_tool_error" 176 | self.add_to_queue( 177 | method_name, 178 | error=error 179 | ) 180 | print("on_tool_error method called") 181 | 182 | def on_agent_end(self, block_id:str, depth: int): 183 | self.add_to_queue( 184 | "on_agent_end", 185 | block_id=block_id, 186 | depth=depth 187 | ) 188 | print("on_agent_end method called") -------------------------------------------------------------------------------- /toolbench/inference/qa_pipeline.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Close-domain QA Pipeline 3 | ''' 4 | 5 | import argparse 6 | from toolbench.inference.Downstream_tasks.rapidapi import pipeline_runner 7 | 8 | 9 | if __name__ == "__main__": 10 | 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument('--backbone_model', type=str, default="toolllama", required=False, help='chatgpt_function or davinci or toolllama') 13 | parser.add_argument('--openai_key', type=str, default="", required=False, help='openai key for chatgpt_function or davinci model') 14 | parser.add_argument('--model_path', type=str, default="your_model_path/", required=False, 
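# model_path is the trained ToolLLaMA checkpoint for the toolllama backbone, or
# the base model (e.g. huggyllama/llama-7b) when --lora/--lora_path supply the
# adapter weights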
help='') 15 | parser.add_argument('--tool_root_dir', type=str, default="your_tools_path/", required=True, help='') 16 | parser.add_argument("--lora", action="store_true", help="Load lora model or not.") 17 | parser.add_argument('--lora_path', type=str, default="your_lora_path if lora", required=False, help='') 18 | parser.add_argument('--max_observation_length', type=int, default=1024, required=False, help='maximum observation length') 19 | parser.add_argument('--max_source_sequence_length', type=int, default=4096, required=False, help='original maximum model sequence length') 20 | parser.add_argument('--max_sequence_length', type=int, default=8192, required=False, help='maximum model sequence length') 21 | parser.add_argument('--observ_compress_method', type=str, default="truncate", choices=["truncate", "filter", "random"], required=False, help='observation compress method') 22 | parser.add_argument('--method', type=str, default="CoT@1", required=False, help='method for answer generation: CoT@n,Reflexion@n,BFS,DFS,UCT_vote') 23 | parser.add_argument('--input_query_file', type=str, default="", required=False, help='input path') 24 | parser.add_argument('--output_answer_file', type=str, default="",required=False, help='output path') 25 | parser.add_argument('--toolbench_key', type=str, default="",required=False, help='your toolbench key to request rapidapi service') 26 | parser.add_argument('--rapidapi_key', type=str, default="",required=False, help='your rapidapi key to request rapidapi service') 27 | parser.add_argument('--use_rapidapi_key', action="store_true", help="To use customized rapidapi service or not.") 28 | parser.add_argument('--api_customization', action="store_true", help="To use customized api or not.") 29 | 30 | args = parser.parse_args() 31 | 32 | pipeline_runner = pipeline_runner(args) 33 | pipeline_runner.run() 34 | 35 | -------------------------------------------------------------------------------- /toolbench/inference/qa_pipeline_open_domain.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Open-domain QA Pipeline 3 | ''' 4 | import argparse 5 | from toolbench.inference.Downstream_tasks.rapidapi import pipeline_runner 6 | 7 | 8 | if __name__ == "__main__": 9 | 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--corpus_tsv_path', type=str, default="your_retrival_corpus_path/", required=False, help='') 12 | parser.add_argument('--retrieval_model_path', type=str, default="your_model_path/", required=False, help='') 13 | parser.add_argument('--retrieved_api_nums', type=int, default=5, required=False, help='') 14 | parser.add_argument('--backbone_model', type=str, default="toolllama", required=False, help='chatgpt_function or davinci or toolllama') 15 | parser.add_argument('--openai_key', type=str, default="", required=False, help='openai key for chatgpt_function or davinci model') 16 | parser.add_argument('--model_path', type=str, default="your_model_path/", required=False, help='') 17 | parser.add_argument('--tool_root_dir', type=str, default="your_tools_path/", required=True, help='') 18 | parser.add_argument("--lora", action="store_true", help="Load lora model or not.") 19 | parser.add_argument('--lora_path', type=str, default="your_lora_path if lora", required=False, help='') 20 | parser.add_argument('--max_observation_length', type=int, default=1024, required=False, help='maximum observation length') 21 | parser.add_argument('--max_source_sequence_length', type=int, default=4096, required=False, 
help='original maximum model sequence length') 22 | parser.add_argument('--max_sequence_length', type=int, default=8192, required=False, help='maximum model sequence length') 23 | parser.add_argument('--observ_compress_method', type=str, default="truncate", choices=["truncate", "filter", "random"], required=False, help='maximum observation length') 24 | parser.add_argument('--method', type=str, default="CoT@1", required=False, help='method for answer generation: CoT@n,Reflexion@n,BFS,DFS,UCT_vote') 25 | parser.add_argument('--input_query_file', type=str, default="", required=False, help='input path') 26 | parser.add_argument('--output_answer_file', type=str, default="",required=False, help='output path') 27 | parser.add_argument('--toolbench_key', type=str, default="",required=False, help='your toolbench key to request rapidapi service') 28 | parser.add_argument('--rapidapi_key', type=str, default="",required=False, help='your rapidapi key to request rapidapi service') 29 | parser.add_argument('--use_rapidapi_key', action="store_true", help="To use customized rapidapi service or not.") 30 | parser.add_argument('--api_customization', action="store_true", help="To use customized api or not. NOT SUPPORTED currently under open domain setting.") 31 | 32 | args = parser.parse_args() 33 | 34 | pipeline_runner = pipeline_runner(args, add_retrieval=True) 35 | pipeline_runner.run() 36 | -------------------------------------------------------------------------------- /toolbench/inference/toolbench_server.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, Response, stream_with_context, request 2 | from flask_cors import CORS, cross_origin 3 | from callbacks.ServerEventCallback import ServerEventCallback 4 | import argparse 5 | from toolbench.inference.Downstream_tasks.rapidapi import pipeline_runner 6 | import subprocess 7 | import concurrent.futures 8 | import json 9 | import signal 10 | import time 11 | from queue import Queue 12 | import copy 13 | import time 14 | app = Flask(__name__) 15 | cors = CORS(app) 16 | 17 | 18 | class Model: 19 | def __init__(self, gpu=0): 20 | self.inuse = False 21 | print("Initializing...") 22 | starting_time = time.time() 23 | self.args = self.get_args() 24 | self.pipeline = pipeline_runner(self.args, add_retrieval=False, server=True) 25 | print("Loading model...") 26 | self.llm = self.pipeline.get_backbone_model() 27 | print("Model loaded in {} seconds".format(time.time() - starting_time)) 28 | starting_time = time.time() 29 | print("Loading retriever...") 30 | self.retriever = self.pipeline.get_retriever() 31 | print("Retriever loaded in {} seconds".format(time.time() - starting_time)) 32 | self.query_id = 0 33 | # self.process_num = self.args.process_num 34 | 35 | self.queue = Queue() 36 | self.callback = ServerEventCallback(self.queue) 37 | self.occupied = False 38 | 39 | print("Server ready") 40 | 41 | def run_pipeline(self, user_input, method, top_k): 42 | # empty the queue 43 | while not self.queue.empty(): 44 | self.queue.get() 45 | self.query_id += 1 46 | temp_args = copy.deepcopy(self.args) 47 | temp_args.retrieved_api_nums = top_k 48 | temp_args.method = method 49 | data_dict = { 50 | "query": user_input, 51 | } 52 | self.pipeline.run_single_task( 53 | method=method, 54 | backbone_model=self.llm, 55 | query_id=self.query_id, 56 | data_dict=data_dict, 57 | output_dir_path=self.args.output_answer_file, 58 | retriever=self.retriever, 59 | args=temp_args, 60 | tool_des=None, 61 | 
callbacks=[self.callback] 62 | ) 63 | 64 | def get_queue(self): 65 | while not self.queue.empty(): 66 | yield self.queue.get() 67 | 68 | def get_args(self): 69 | parser = argparse.ArgumentParser() 70 | parser.add_argument('--corpus_tsv_path', type=str, default="your_retrival_corpus_path/", required=False, 71 | help='') 72 | parser.add_argument('--retrieval_model_path', type=str, default="your_model_path/", required=False, help='') 73 | parser.add_argument('--retrieved_api_nums', type=int, default=5, required=False, help='') 74 | parser.add_argument('--backbone_model', type=str, default="toolllama", required=False, 75 | help='chatgpt_function or davinci or toolllama') 76 | parser.add_argument('--openai_key', type=str, default="", required=False, 77 | help='openai key for chatgpt_function or davinci model') 78 | parser.add_argument('--model_path', type=str, default="your_model_path/", required=True, help='') 79 | parser.add_argument('--tool_root_dir', type=str, default="your_tools_path/", required=True, help='') 80 | parser.add_argument("--lora", action="store_true", help="Load lora model or not.") 81 | parser.add_argument('--lora_path', type=str, default="your_lora_path if lora", required=False, help='') 82 | parser.add_argument('--max_observation_length', type=int, default=1024, required=False, 83 | help='maximum observation length') 84 | parser.add_argument('--observ_compress_method', type=str, default="truncate", choices=["truncate", "filter", "random"], 85 | required=False, help='observation compress method') 86 | parser.add_argument('--method', type=str, default="CoT@1", required=False, 87 | help='method for answer generation: CoT@n,Reflexion@n,BFS,DFS,UCT_vote') 88 | parser.add_argument('--input_query_file', type=str, default="", required=False, help='input path') 89 | parser.add_argument('--output_answer_file', type=str, default="", required=False, help='output path') 90 | parser.add_argument('--toolbench_key', type=str, default="", required=False, help='your toolbench key') 91 | parser.add_argument('--rapidapi_key', type=str, default="",required=False, help='your rapidapi key to request rapidapi service') 92 | parser.add_argument('--use_rapidapi_key', action="store_true", help="To use customized rapidapi service or not.") 93 | parser.add_argument('--api_customization', action="store_true", help="To use customized api or not.") 94 | 95 | args = parser.parse_args() 96 | return args 97 | 98 | model = Model() 99 | 100 | 101 | @app.route('/stream', methods=['GET', 'POST']) 102 | @cross_origin() 103 | def stream(): 104 | data = json.loads(request.data) 105 | user_input = data["text"] 106 | top_k = data["top_k"] 107 | method = data["method"] 108 | print("Called stream") 109 | global model 110 | 111 | def generate(model): 112 | print("Called generate") 113 | if model.inuse: 114 | # send 409 error 115 | return Response(json.dumps({ 116 | "method_name": "error", 117 | "error": "Model in use" 118 | }), status=409, mimetype='application/json') 119 | return 120 | model.inuse = True 121 | 122 | # run model.run_agent in the background 123 | with concurrent.futures.ThreadPoolExecutor() as executor: 124 | 125 | future = executor.submit(model.run_pipeline, user_input, method, top_k) 126 | # keep waiting for the queue to be empty 127 | while True: 128 | if model.queue.empty(): 129 | if future.done(): 130 | print("Finished with future") 131 | break 132 | time.sleep(0.01) 133 | continue 134 | else: 135 | obj = model.queue.get() 136 | if obj["method_name"] == "unknown": continue 137 | if 
obj["method_name"] == "on_request_end": 138 | yield json.dumps(obj) 139 | break 140 | 141 | try: 142 | yield json.dumps(obj) + "\n" 143 | except Exception as e: 144 | model.inuse = False 145 | print(obj) 146 | print(e) 147 | 148 | try: 149 | future.result() 150 | except Exception as e: 151 | model.inuse = False 152 | print(e) 153 | 154 | model.inuse = False 155 | return 156 | 157 | return Response(stream_with_context(generate(model))) 158 | 159 | @app.route('/methods', methods=['GET']) 160 | @cross_origin() 161 | def methods(): 162 | # return a list of available methods 163 | return Response(json.dumps({ 164 | { 165 | "methods": ["DFS_woFilter_w2"] 166 | } 167 | }), status=200, mimetype='application/json') 168 | 169 | def handle_keyboard_interrupt(signal, frame): 170 | global model 171 | exit(0) 172 | 173 | signal.signal(signal.SIGINT, handle_keyboard_interrupt) 174 | 175 | if __name__ == '__main__': 176 | app.run(use_reloader=False, host="0.0.0.0", debug=True, port=5000) 177 | -------------------------------------------------------------------------------- /toolbench/model/__init__.py: -------------------------------------------------------------------------------- 1 | from toolbench.model.model_adapter import ( 2 | load_model, 3 | get_conversation_template, 4 | add_model_args, 5 | ) 6 | -------------------------------------------------------------------------------- /toolbench/model/apply_delta.py: -------------------------------------------------------------------------------- 1 | """ 2 | Apply the delta weights on top of a base model. 3 | 4 | Usage: 5 | python3 -m fastchat.model.apply_delta --base ~/model_weights/llama-7b --target ~/model_weights/vicuna-7b --delta lmsys/vicuna-7b-delta-v1.1 6 | """ 7 | import argparse 8 | import gc 9 | import glob 10 | import json 11 | import os 12 | import shutil 13 | import tempfile 14 | 15 | from huggingface_hub import snapshot_download 16 | import torch 17 | from torch import nn 18 | from tqdm import tqdm 19 | from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig 20 | 21 | 22 | GB = 1 << 30 23 | 24 | 25 | def split_files(model_path, tmp_path, split_size): 26 | if not os.path.exists(model_path): 27 | model_path = snapshot_download(repo_id=model_path) 28 | if not os.path.exists(tmp_path): 29 | os.makedirs(tmp_path) 30 | 31 | file_pattern = os.path.join(model_path, "pytorch_model-*.bin") 32 | files = glob.glob(file_pattern) 33 | 34 | part = 0 35 | try: 36 | for file_path in tqdm(files): 37 | state_dict = torch.load(file_path) 38 | new_state_dict = {} 39 | 40 | current_size = 0 41 | for name, param in state_dict.items(): 42 | param_size = param.numel() * param.element_size() 43 | 44 | if current_size + param_size > split_size: 45 | new_file_name = f"pytorch_model-{part}.bin" 46 | new_file_path = os.path.join(tmp_path, new_file_name) 47 | torch.save(new_state_dict, new_file_path) 48 | current_size = 0 49 | new_state_dict = None 50 | gc.collect() 51 | new_state_dict = {} 52 | part += 1 53 | 54 | new_state_dict[name] = param 55 | current_size += param_size 56 | 57 | new_file_name = f"pytorch_model-{part}.bin" 58 | new_file_path = os.path.join(tmp_path, new_file_name) 59 | torch.save(new_state_dict, new_file_path) 60 | new_state_dict = None 61 | gc.collect() 62 | new_state_dict = {} 63 | part += 1 64 | except Exception as e: 65 | print(f"An error occurred during split_files: {e}") 66 | shutil.rmtree(tmp_path) 67 | raise 68 | 69 | 70 | def apply_delta_low_cpu_mem(base_model_path, target_model_path, delta_path): 71 | delta_tokenizer = 
AutoTokenizer.from_pretrained(delta_path, use_fast=False) 72 | delta_config = AutoConfig.from_pretrained(delta_path) 73 | 74 | if os.path.exists(target_model_path): 75 | shutil.rmtree(target_model_path) 76 | os.makedirs(target_model_path) 77 | 78 | split_size = 4 * GB 79 | 80 | with tempfile.TemporaryDirectory() as tmp_base_path, tempfile.TemporaryDirectory() as tmp_delta_path: 81 | print(f"Split files for the base model to {tmp_base_path}") 82 | split_files(base_model_path, tmp_base_path, split_size) 83 | print(f"Split files for the delta weights to {tmp_delta_path}") 84 | split_files(delta_path, tmp_delta_path, split_size) 85 | 86 | base_pattern = os.path.join(tmp_base_path, "pytorch_model-*.bin") 87 | base_files = glob.glob(base_pattern) 88 | delta_pattern = os.path.join(tmp_delta_path, "pytorch_model-*.bin") 89 | delta_files = glob.glob(delta_pattern) 90 | delta_state_dict = torch.load(delta_files[0]) 91 | 92 | print("Applying the delta") 93 | weight_map = {} 94 | total_size = 0 95 | 96 | for i, base_file in tqdm(enumerate(base_files)): 97 | state_dict = torch.load(base_file) 98 | file_name = f"pytorch_model-{i}.bin" 99 | for name, param in state_dict.items(): 100 | if name not in delta_state_dict: 101 | for delta_file in delta_files: 102 | delta_state_dict = torch.load(delta_file) 103 | gc.collect() 104 | if name in delta_state_dict: 105 | break 106 | 107 | state_dict[name] += delta_state_dict[name] 108 | weight_map[name] = file_name 109 | total_size += param.numel() * param.element_size() 110 | gc.collect() 111 | torch.save(state_dict, os.path.join(target_model_path, file_name)) 112 | 113 | with open( 114 | os.path.join(target_model_path, "pytorch_model.bin.index.json"), "w" 115 | ) as f: 116 | json.dump( 117 | {"weight_map": weight_map, "metadata": {"total_size": total_size}}, f 118 | ) 119 | 120 | print(f"Saving the target model to {target_model_path}") 121 | delta_tokenizer.save_pretrained(target_model_path) 122 | delta_config.save_pretrained(target_model_path) 123 | 124 | 125 | def apply_delta(base_model_path, target_model_path, delta_path): 126 | print(f"Loading the delta weights from {delta_path}") 127 | delta_tokenizer = AutoTokenizer.from_pretrained(delta_path, use_fast=False) 128 | delta = AutoModelForCausalLM.from_pretrained( 129 | delta_path, torch_dtype=torch.float16, low_cpu_mem_usage=True 130 | ) 131 | 132 | print(f"Loading the base model from {base_model_path}") 133 | base = AutoModelForCausalLM.from_pretrained( 134 | base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True 135 | ) 136 | 137 | print("Applying the delta") 138 | for name, param in tqdm(base.state_dict().items(), desc="Applying delta"): 139 | assert name in delta.state_dict() 140 | param.data += delta.state_dict()[name] 141 | 142 | print(f"Saving the target model to {target_model_path}") 143 | base.save_pretrained(target_model_path) 144 | delta_tokenizer.save_pretrained(target_model_path) 145 | 146 | 147 | if __name__ == "__main__": 148 | parser = argparse.ArgumentParser() 149 | parser.add_argument("--base-model-path", type=str, required=True) 150 | parser.add_argument("--target-model-path", type=str, required=True) 151 | parser.add_argument("--delta-path", type=str, required=True) 152 | parser.add_argument( 153 | "--low-cpu-mem", 154 | action="store_true", 155 | help="Lower the cpu memory usage. 
This will split large files and use " 156 | "disk as swap to reduce the memory usage below 10GB.", 157 | ) 158 | args = parser.parse_args() 159 | 160 | if args.low_cpu_mem: 161 | apply_delta_low_cpu_mem( 162 | args.base_model_path, args.target_model_path, args.delta_path 163 | ) 164 | else: 165 | apply_delta(args.base_model_path, args.target_model_path, args.delta_path) 166 | -------------------------------------------------------------------------------- /toolbench/model/compression.py: -------------------------------------------------------------------------------- 1 | import dataclasses 2 | import os 3 | 4 | import torch 5 | import torch.nn as nn 6 | from torch.nn import functional as F 7 | from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig 8 | 9 | 10 | @dataclasses.dataclass 11 | class CompressionConfig: 12 | """Group-wise quantization.""" 13 | 14 | num_bits: int 15 | group_size: int 16 | group_dim: int 17 | symmetric: bool 18 | enabled: bool = True 19 | 20 | 21 | default_compression_config = CompressionConfig( 22 | num_bits=8, group_size=256, group_dim=1, symmetric=True, enabled=True 23 | ) 24 | 25 | 26 | class CLinear(nn.Module): 27 | """Compressed Linear Layer.""" 28 | 29 | def __init__(self, weight=None, bias=None, device=None): 30 | super().__init__() 31 | self.weight = weight 32 | self.bias = bias 33 | 34 | def forward(self, input): 35 | return F.linear(input.to(self.weight.dtype), self.weight, self.bias) 36 | 37 | 38 | def compress_module(module, target_device): 39 | for name, child in module.named_children(): 40 | if isinstance(child, nn.Linear): 41 | setattr( 42 | module, 43 | name, 44 | CLinear(child.weight, child.bias, target_device), 45 | ) 46 | compress_module(child, target_device) 47 | 48 | 49 | def get_compressed_list(module, prefix=""): 50 | compressed_list = [] 51 | for name, child in module.named_children(): 52 | if isinstance(child, nn.Linear): 53 | full_name = f"{prefix}.{name}.weight" if prefix else f"{name}.weight" 54 | compressed_list.append(full_name) 55 | compressed_list.extend( 56 | get_compressed_list(child, full_name) 57 | ) 58 | return compressed_list 59 | 60 | 61 | def apply_compressed_weight(module, compressed_state_dict, target_device, prefix=""): 62 | for name, child in module.named_children(): 63 | if isinstance(child, nn.Linear): 64 | full_name = f"{prefix}.{name}.weight" if prefix else f"{name}.weight" 65 | setattr( 66 | module, 67 | name, 68 | CLinear( 69 | compressed_state_dict[full_name], child.bias, target_device 70 | ), 71 | ) 72 | apply_compressed_weight(child, compressed_state_dict, target_device, full_name) 73 | 74 | 75 | def load_compress_model(model_path, device, torch_dtype): 76 | # partially load model 77 | tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False) 78 | base_pattern = os.path.join(model_path, "pytorch_model-*.bin") 79 | files = glob.glob(base_pattern) 80 | 81 | config = AutoConfig.from_pretrained( 82 | model_path, low_cpu_mem_usage=True, torch_dtype=torch_dtype 83 | ) 84 | model = AutoModelForCausalLM.from_config(config) 85 | linear_weights = get_compressed_list(model) 86 | 87 | compressed_state_dict = {} 88 | 89 | for filename in files: 90 | tmp_state_dict = torch.load(filename) 91 | for name in tmp_state_dict: 92 | if name in linear_weights: 93 | tensor = tmp_state_dict[name].to(device).data.to(torch_dtype) 94 | compressed_state_dict[name] = compress( 95 | tensor, default_compression_config 96 | ) 97 | else: 98 | compressed_state_dict[name] = tmp_state_dict[name].to(device) 99 | 
tmp_state_dict[name] = None 100 | tensor = None 101 | torch.cuda.empty_cache() 102 | 103 | for name, param in model.named_parameters(): 104 | if name not in linear_weights: 105 | param.data = compressed_state_dict[name] 106 | apply_compressed_weight(model, compressed_state_dict, device) 107 | 108 | model.to(device) 109 | 110 | return model, tokenizer 111 | 112 | 113 | def compress(tensor, config): 114 | """Simulate group-wise quantization.""" 115 | if not config.enabled: 116 | return tensor 117 | 118 | group_size, num_bits, group_dim, symmetric = ( 119 | config.group_size, 120 | config.num_bits, 121 | config.group_dim, 122 | config.symmetric, 123 | ) 124 | assert num_bits <= 8 125 | 126 | original_shape = tensor.shape 127 | num_groups = (original_shape[group_dim] + group_size - 1) // group_size 128 | new_shape = ( 129 | original_shape[:group_dim] 130 | + (num_groups, group_size) 131 | + original_shape[group_dim + 1 :] 132 | ) 133 | 134 | # Pad 135 | pad_len = group_size - original_shape[group_dim] % group_size 136 | if pad_len != 0: 137 | pad_shape = ( 138 | original_shape[:group_dim] + (pad_len,) + original_shape[group_dim + 1 :] 139 | ) 140 | tensor = torch.cat( 141 | [tensor, torch.zeros(pad_shape, dtype=tensor.dtype, device=tensor.device)], 142 | dim=group_dim, 143 | ) 144 | data = tensor.view(new_shape) 145 | 146 | # Quantize 147 | if symmetric: 148 | B = 2 ** (num_bits - 1) - 1 149 | scale = B / torch.max(data.abs(), dim=group_dim + 1, keepdim=True)[0] 150 | data = data * scale 151 | data = data.clamp_(-B, B).round_().to(torch.int8) 152 | return data, scale, original_shape 153 | else: 154 | B = 2**num_bits - 1 155 | mn = torch.min(data, dim=group_dim + 1, keepdim=True)[0] 156 | mx = torch.max(data, dim=group_dim + 1, keepdim=True)[0] 157 | 158 | scale = B / (mx - mn) 159 | data = data - mn 160 | data *= scale 161 | 162 | data = data.clamp_(0, B).round_().to(torch.uint8) 163 | return data, mn, scale, original_shape 164 | 165 | 166 | def decompress(packed_data, config): 167 | """Simulate group-wise dequantization.""" 168 | if not config.enabled: 169 | return packed_data 170 | 171 | group_size, num_bits, group_dim, symmetric = ( 172 | config.group_size, 173 | config.num_bits, 174 | config.group_dim, 175 | config.symmetric, 176 | ) 177 | 178 | # Dequantize 179 | if symmetric: 180 | data, scale, original_shape = packed_data 181 | data = data / scale 182 | else: 183 | data, mn, scale, original_shape = packed_data 184 | data = data / scale 185 | data += mn 186 | 187 | # Unpad 188 | pad_len = group_size - original_shape[group_dim] % group_size 189 | if pad_len: 190 | padded_original_shape = ( 191 | original_shape[:group_dim] 192 | + (original_shape[group_dim] + pad_len,) 193 | + original_shape[group_dim + 1 :] 194 | ) 195 | data = data.reshape(padded_original_shape) 196 | indices = [slice(0, x) for x in original_shape] 197 | return data[indices].contiguous() 198 | else: 199 | return data.view(original_shape) 200 | -------------------------------------------------------------------------------- /toolbench/model/make_delta.py: -------------------------------------------------------------------------------- 1 | """ 2 | Make the delta weights by subtracting base weights. 
3 | 4 | Usage: 5 | python3 -m fastchat.model.make_delta --base ~/model_weights/llama-13b --target ~/model_weights/vicuna-13b --delta ~/model_weights/vicuna-13b-delta --hub-repo-id lmsys/vicuna-13b-delta-v1.1 6 | """ 7 | import argparse 8 | 9 | import torch 10 | from tqdm import tqdm 11 | from transformers import AutoTokenizer, AutoModelForCausalLM 12 | 13 | 14 | def make_delta(base_model_path, target_model_path, delta_path): 15 | print(f"Loading the base model from {base_model_path}") 16 | base = AutoModelForCausalLM.from_pretrained( 17 | base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True 18 | ) 19 | 20 | print(f"Loading the target model from {target_model_path}") 21 | target = AutoModelForCausalLM.from_pretrained( 22 | target_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True 23 | ) 24 | target_tokenizer = AutoTokenizer.from_pretrained(target_model_path, use_fast=False) 25 | 26 | print("Calculating the delta") 27 | for name, param in tqdm(target.state_dict().items(), desc="Calculating delta"): 28 | assert name in base.state_dict() 29 | param.data -= base.state_dict()[name] 30 | 31 | print(f"Saving the delta to {delta_path}") 32 | if args.hub_repo_id: 33 | kwargs = {"push_to_hub": True, "repo_id": args.hub_repo_id} 34 | else: 35 | kwargs = {} 36 | target.save_pretrained(delta_path, **kwargs) 37 | target_tokenizer.save_pretrained(delta_path, **kwargs) 38 | 39 | 40 | if __name__ == "__main__": 41 | parser = argparse.ArgumentParser() 42 | parser.add_argument("--base-model-path", type=str, required=True) 43 | parser.add_argument("--target-model-path", type=str, required=True) 44 | parser.add_argument("--delta-path", type=str, required=True) 45 | parser.add_argument("--hub-repo-id", type=str) 46 | args = parser.parse_args() 47 | 48 | make_delta(args.base_model_path, args.target_model_path, args.delta_path) 49 | -------------------------------------------------------------------------------- /toolbench/retrieval/inference_example.py: -------------------------------------------------------------------------------- 1 | from sentence_transformers import SentenceTransformer, util 2 | import json 3 | import pandas as pd 4 | from collections import defaultdict 5 | import torch 6 | from tqdm import tqdm 7 | import argparse 8 | import os 9 | 10 | # 创建参数解析器并添加参数 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument('model_path', type=str, required=True, help='Your trained model path') 13 | parser.add_argument('dataset_path', help='The processed dataset files path') 14 | 15 | # 解析命令行参数 16 | args = parser.parse_args() 17 | 18 | # Check if a GPU is available and if not, use a CPU 19 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 20 | 21 | model_path = args.model_path 22 | 23 | # Load the trained model 24 | model = SentenceTransformer(model_path).to(device) 25 | 26 | # Load test data 27 | documents_df = pd.read_csv(os.path.join(args.dataset_path, 'corpus.tsv'), sep='\t') 28 | test_queries_df = pd.read_csv(os.path.join(args.dataset_path, 'test.query.txt'), sep='\t', names=['qid', 'query_text']) 29 | test_labels_df = pd.read_csv(os.path.join(args.dataset_path, 'qrels.test.tsv'), sep='\t', names=['qid', 'useless', 'docid', 'label']) 30 | 31 | # Create mappings, get 'tool_name' and 'api_name' from the document_content 32 | ir_corpus = {row.docid: (json.loads(row.document_content)['tool_name'], json.loads(row.document_content)['api_name']) for _, row in documents_df.iterrows()} 33 | ir_test_queries = {row.qid: row.query_text for _, row in 
test_queries_df.iterrows()} 34 | 35 | # Create query-doc mapping from the test set 36 | ir_relevant_docs = defaultdict(list) 37 | for _, row in test_labels_df.iterrows(): 38 | ir_relevant_docs[row.qid].append(row.docid) 39 | 40 | # Convert queries and documents to embeddings 41 | test_query_embeddings = model.encode(list(ir_test_queries.values()), convert_to_tensor=True).to(device) 42 | corpus_embeddings = model.encode(list(map(' '.join, ir_corpus.values())), convert_to_tensor=True).to(device) 43 | 44 | # Compute cosine similarity between queries and documents 45 | cos_scores = util.pytorch_cos_sim(test_query_embeddings, corpus_embeddings) 46 | 47 | # Get the top_k most similar documents for each query 48 | top_k = 5 49 | top_results = {} 50 | for query_index, (query_id, query) in enumerate(ir_test_queries.items()): 51 | relevant_docs_indices = cos_scores[query_index].topk(top_k).indices 52 | relevant_docs_scores = cos_scores[query_index].topk(top_k).values 53 | relevant_docs = [(list(ir_corpus.keys())[index], list(ir_corpus.values())[index]) for index in relevant_docs_indices] 54 | relevant_docs_with_scores = {str((doc_id, tool_name_api_name)): {'score': float(score)} for (doc_id, tool_name_api_name), score in zip(relevant_docs, relevant_docs_scores)} 55 | 56 | # Count the number of successful matches 57 | matches = len(set([doc_id for doc_id, _ in relevant_docs]) & set(ir_relevant_docs[query_id])) 58 | 59 | # Save query, original docs, top 5 docs with scores, and successful match count 60 | top_results[query] = { 61 | 'original_docs': [' '.join(ir_corpus[doc_id]) for doc_id in ir_relevant_docs[query_id]], 62 | 'top_docs': relevant_docs_with_scores, 63 | 'successful_matches': matches 64 | } 65 | 66 | # Save the results to a json file 67 | with open('top5_results_with_matches.json', 'w') as f: 68 | json.dump(top_results, f, indent=4) -------------------------------------------------------------------------------- /toolbench/retrieval/train.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import json 4 | import pandas as pd 5 | from datetime import datetime 6 | import torch 7 | import torch.nn as nn 8 | from sentence_transformers import SentenceTransformer, models, InputExample, losses, LoggingHandler 9 | from torch.utils.data import DataLoader 10 | from torch.utils.tensorboard import SummaryWriter 11 | from api_evaluator import APIEvaluator 12 | import argparse 13 | import os 14 | from toolbench.utils import process_retrieval_ducoment 15 | 16 | import os 17 | 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument("--data_path", default=None, type=str, required=True, 20 | help="The input data dir. 
Should contain the .tsv files for the task.") 21 | parser.add_argument("--model_name", default=None, type=str, required=True, 22 | help="The base model name.") 23 | parser.add_argument("--output_path", default=None, type=str, required=True, 24 | help="The base path where the model output will be saved.") 25 | parser.add_argument("--num_epochs", default=5, type=int, required=True, 26 | help="Train epochs.") 27 | parser.add_argument("--train_batch_size", default=32, type=int, required=True, 28 | help="Train batch size.") 29 | parser.add_argument("--learning_rate", default=2e-5, type=float, required=True, 30 | help="Learning rate.") 31 | parser.add_argument("--warmup_steps", default=500, type=float, required=True, 32 | help="Warmup steps.") 33 | parser.add_argument("--max_seq_length", default=256, type=int, required=True, 34 | help="Max sequence length.") 35 | args = parser.parse_args() 36 | 37 | logging.basicConfig(format='%(asctime)s - %(message)s', 38 | datefmt='%Y-%m-%d %H:%M:%S', 39 | level=logging.INFO, 40 | handlers=[LoggingHandler()]) 41 | logger = logging.getLogger(__name__) 42 | 43 | torch.manual_seed(42) 44 | torch.cuda.manual_seed(42) 45 | 46 | num_epochs = args.num_epochs 47 | train_batch_size = args.train_batch_size 48 | lr = args.learning_rate 49 | warmup_steps = args.warmup_steps 50 | data_path = args.data_path 51 | output_path = args.output_path 52 | os.makedirs(output_path, exist_ok=True) 53 | 54 | model_save_path = os.path.join(output_path, datetime.now().strftime("%Y-%m-%d_%H-%M-%S")) 55 | os.makedirs(model_save_path, exist_ok=True) 56 | 57 | tensorboard_name = 'name_desc' 58 | logs_writer = SummaryWriter(os.path.join(output_path, 'tensorboard', tensorboard_name)) 59 | 60 | 61 | def log_callback_st(train_ix, global_step, training_steps, current_lr, loss_value): 62 | logs_writer.add_scalar('train_loss', loss_value, global_step) 63 | logs_writer.add_scalar('lr', current_lr[0], global_step) 64 | 65 | 66 | # Model definition 67 | word_embedding_model = models.Transformer(args.model_name, max_seq_length=args.max_seq_length) 68 | pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension()) 69 | model = SentenceTransformer(modules=[word_embedding_model, pooling_model]) 70 | 71 | ir_train_queries = {} 72 | ir_test_queries = {} 73 | ir_relevant_docs = {} 74 | train_samples = [] 75 | 76 | documents_df = pd.read_csv(os.path.join(data_path, 'corpus.tsv'), sep='\t') 77 | ir_corpus, _ = process_retrieval_ducoment(documents_df) 78 | 79 | train_queries_df = pd.read_csv(os.path.join(data_path, 'train.query.txt'), sep='\t', names=['qid', 'query']) 80 | for row in train_queries_df.itertuples(): 81 | ir_train_queries[row.qid] = row.query 82 | train_queries_df = pd.read_csv(os.path.join(data_path, 'test.query.txt'), sep='\t', names=['qid', 'query']) 83 | for row in train_queries_df.itertuples(): 84 | ir_test_queries[row.qid] = row.query 85 | 86 | labels_df = pd.read_csv(os.path.join(data_path, 'qrels.train.tsv'), sep='\t', names=['qid', 'useless', 'docid', 'label']) 87 | for row in labels_df.itertuples(): 88 | sample = InputExample(texts=[ir_train_queries[row.qid], ir_corpus[row.docid]], label=row.label) 89 | train_samples.append(sample) 90 | labels_df = pd.read_csv(os.path.join(data_path, 'qrels.test.tsv'), sep='\t', names=['qid', 'useless', 'docid', 'label']) 91 | for row in labels_df.itertuples(): 92 | ir_relevant_docs.setdefault(row.qid, set()).add(row.docid) 93 | 94 | train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size, 
pin_memory=True) 95 | train_loss = losses.MultipleNegativesRankingLoss(model) 96 | ir_evaluator = APIEvaluator(ir_test_queries, ir_corpus, ir_relevant_docs) 97 | 98 | # You may need to modify the .fit() method to ensure all data is moved to the correct device during parallel computations 99 | 100 | model.fit(train_objectives=[(train_dataloader, train_loss)], 101 | evaluator=ir_evaluator, 102 | epochs=num_epochs, 103 | warmup_steps=warmup_steps, 104 | optimizer_params={'lr': lr}, 105 | output_path=model_save_path 106 | ) 107 | 108 | 109 | -------------------------------------------------------------------------------- /toolbench/tooleval/README_ZH.md: -------------------------------------------------------------------------------- 1 |
2 | 🛠️Tool Eval🤖 3 |
4 | 5 | 通过在ToolBench上对LLaMA进行微调,我们得到了**ToolLLaMA**。考虑到人工评估非常耗时,我们借鉴[AlpacaEval](https://tatsu-lab.github.io/alpaca_eval/)开发了一个高效的机器自动评估**ToolEval**,其中包含两个评估指标: 6 | 7 | - **通过率**:计算在有限的OpenAI API调用次数内成功完成指令的比例。 8 | 9 | - **偏好**:通过比较给定指令的两个答案(动作序列)来衡量。我们预先定义了一组更好答案的标准,这些标准被组织成ChatGPT的提示。我们向评估器提供测试指令和两个候选答案,并获得其偏好。我们对每个答案对进行多次评估以提高系统的可靠性。然后,我们计算**优胜率**(被评估器选择为更优的百分比。有关详细信息,请参阅我们的论文。 10 | 11 | 为了验证ChatGPT评估器在通过率和胜率方面的可靠性,我们从四种不同的方法(ChatGPT+ReACT,ChatGPT+DFSDT,ToolLLaMA+DFSDT和GPT4+DFSDT)中进行采样,为每种方法的300个测试指令获取解决方案对。然后,我们请人类标注ChatGPT+DFSDT,ToolLLaMA+DFSDT和GPT4+DFSDT的通过率,以及ChatGPT+ReACT和ChatGPT+DFSDT之间的胜率。 12 | 13 | 我们的ChatGPT评估器在通过率方面与人类标注者具有高达**87.1%**的一致性,在胜率方面具有**80.3%**的一致性。这个结果表明,我们的评估器生成的评估结果与人类非常相似,并且可以视为在通过率和胜率上模拟人类评估的可靠评估器。 14 | 有关ToolEval的更多细节,请参阅我们的论文。 15 | 16 | ## 🚀用法 17 | 18 | ### Install 19 | Install Package (python>=3.9) 20 | ```bash 21 | pip install -r requirements.txt 22 | ``` 23 | 24 | ### Evaluation 25 | *若要复现结果,直接通过[Google Drive](https://drive.google.com/drive/folders/1yBUQ732mPu-KclJnuQELEhtKakdXFc3J)下载我们的`reproduction_data.zip`,解压后置`reproduction_data`于`ToolBench/data/`下即可,可以跳过数据准备流程。* 26 | - 数据准备。若要使用 ToolEval 评估您自己的模型和方法,首先需要为六个测试子集准备所有的模型预测。创建一个以您的模型和方法命名的目录,例如 `chatgpt_cot`,然后将每个测试集的预测放在该目录下。目录的文件结构应如下: 27 | ``` 28 | ├── /chatgpt_cot/ 29 | │ ├── /G1_instruction/ 30 | │ │ ├── /10160_CoT@1.json 31 | │ │ └── ... 32 | │ ├── /G1_tool/ 33 | │ │ ├── /10221_CoT@1.json 34 | │ │ └── ... 35 | │ ├── ... 36 | │ ├── /G3_instruction/ 37 | │ │ ├── /10221_CoT@1.json 38 | │ │ └── ... 39 | ``` 40 | 41 | 然后对模型预测进行预处理: 42 | 43 | ```bash 44 | export RAW_ANSWER_PATH=../../data/reproduction_data/model_predictions/ 45 | export CONVERTED_ANSWER_PATH=../../data/reproduction_data/model_predictions_converted/ 46 | export MODEL_NAME=chatgpt_cot 47 | export METHOD=CoT 48 | mkdir ${CONVERTED_ANSWER_PATH}/${MODEL_NAME} 49 | for test_set in G1_instruction G1_category G1_tool G2_category G2_instruction G3_instruction 50 | do 51 | answer_dir=${RAW_ANSWER_PATH}/${MODEL_NAME}/${test_set} 52 | output_file=${CONVERTED_ANSWER_PATH}/${MODEL_NAME}/${test_set}.json 53 | python convert_to_answer_format.py\ 54 | --answer_dir ${answer_dir} \ 55 | --method ${METHOD} \ 56 | --output ${output_file} 57 | done 58 | ``` 59 | 之后,检查`${CONVERTED_ANSWER_PATH}/${MODEL_NAME}`下是否有测试集的预处理JSON文件。如果有,你就可以准备运行以下评估过程了。如果没有,请检查模型的预测是否有问题。 60 | 61 | - OpenAI Key 62 | 准备您的OpenAI Key来搭建我们的evaluator。Key需要被存储到一个json file中,如`path/to/your/openai_key_json_file.json`: 63 | ```bash 64 | [ 65 | { 66 | "username": "your_user_name", 67 | "passwd": "your_password", 68 | "api_key": "your_openai_key", 69 | "organization": "your_organization" 70 | }, 71 | ... 72 | ] 73 | ``` 74 | - Pass rate. 75 | ```bash 76 | export CONVERTED_ANSWER_PATH=../../data/reproduction_data/model_predictions_converted/ 77 | export SAVE_PATH=pass_rate_results 78 | export CANDIDATE_MODEL=chatgpt_cot 79 | export API_POOL_FILE=path/to/your/openai_key_json_file.json 80 | 81 | python eval_pass_rate.py \ 82 | --converted_answer_path ${CONVERTED_ANSWER_PATH} \ 83 | --save_path ${SAVE_PATH} \ 84 | --reference_model ${CANDIDATE_MODEL} \ 85 | --test_ids ../../data/test_query_ids/ \ 86 | --max_eval_threads 20 \ 87 | --evaluate_times 4 88 | 89 | ``` 90 | 91 | 结果文件会被存储至${SAVE_PATH}中。 92 | 93 | - Win rate. 
以下示例以ChatGPT-ReACT作为参考模型,GPT4-ReACT作为候选模型。请注意,您首先需要获取两个模型的pass rate结果,然后运行以下命令来评估GPT4-ReACT的win rate结果: 94 | ```bash 95 | export CONVERTED_ANSWER_PATH=../../data/reproduction_data/model_predictions_converted/ 96 | export SAVE_PATH=preference_results 97 | export PASS_TARE_PATH=pass_rate_results 98 | export REFERENCE_MODEL=chatgpt_cot 99 | export CANDIDATE_MODEL=gpt-4-0613_cot 100 | export API_POOL_FILE=path/to/your/openai_key_json_file.json 101 | 102 | python eval_preference.py \ 103 | --converted_answer_path ${CONVERTED_ANSWER_PATH} \ 104 | --reference_model ${REFERENCE_MODEL} \ 105 | --output_model ${CANDIDATE_MODEL} \ 106 | --test_ids ../../data/test_query_ids/ \ 107 | --save_path ${SAVE_PATH} \ 108 | --pass_rate_result_path ${PASS_TARE_PATH} \ 109 | --max_eval_threads 20 \ 110 | --use_pass_rate true \ 111 | --evaluate_times 4 112 | ``` 113 | 114 | 结果文件会被存储至${SAVE_PATH}中。 115 | 116 | ### 评估新方法 117 | 要评估除了ReACT和DFSDT之外的方法,您需要遵循以上Data preparation的步骤准备您的预处理好的answer数据。预处理好的answer数据需遵循以下json格式: 118 | 119 | ```json 120 | [ 121 | { 122 | "method":"method name", 123 | "total_steps": int, // a integer count total steps in answer details 124 | "final_answer": "final answer from the method", 125 | "answer_details":[{ 126 | "role":"node role, can be system, user, assistant and tool", 127 | "message":"message for the node", 128 | "next":[//next steps, can have multiple elements if the node have multiple candidates. 129 | { 130 | "role":"", 131 | "message":"", 132 | "next":[...] 133 | }, 134 | ...//more candidates 135 | ] 136 | }] 137 | } 138 | ... // more answers for the give query in the testdata 139 | ] 140 | ``` 141 | 142 | 143 | ### 更新排行榜 144 | 145 | 如果您想将您的模型的结果上传到[ToolEval Leaderboard](https://openbmb.github.io/ToolBench/),请您将您的结果文件整理成上述格式发送给我们(urtoolbench@gmail.com)或者开一个pull request。 146 | 我们将运行评测脚本更新结果并将您的模型添加到排行榜中。 147 | 148 | 149 | ### 创建新的自动评估器 150 | 如果您想创建新的自动评估器,您需要按下列步骤进行: 151 | 1. 在路径`toolbench/tooleval/evaluators`下创建一个评测器配置文件目录,命名与你的评测器名一致。在其中添加`config.yaml`文件与`template.txt`文件。具体配置方式可参考`toolbench/tooleval/evaluators/tooleval_gpt-3.5-turbo_normalized`中的实现。 152 | 2. 创建你的evaluator类并实现`fn_completions`函数在文件夹`toolbench/tooleval/evaluators/registered_cls`中,或者你可以使用我们预先定义好的类例如`OpenAINormalizedEvaluator`。 153 | 完成后将配置文件中`registered_cls_name`字段填写为该类的名称。 154 | 这里给出一个例子: 155 | ```Python 156 | from evaluators import register_evaluator,BaseEvaluator 157 | from typing import Dict,List 158 | 159 | @register_evaluator 160 | class MyEvaluator(BaseEvaluator): 161 | def __init__(self,config): 162 | super().__init__( 163 | fn_completions=self.fn_completions, 164 | ) 165 | # set your configures here 166 | 167 | def fn_completions(self,query:Dict,answers:List[Dict])->int: 168 | # implement your evaluator here 169 | # return the index of the preferred answer 170 | return 0 171 | ``` 172 | 其中register_evaluator是一个装饰器,用于注册评估器,BaseEvaluator是一个基类,用于实现评估器的基本功能。 173 | 3. 
测试评估器的性能,运行脚本`evaluators_comparison.py`。 174 | -------------------------------------------------------------------------------- /toolbench/tooleval/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/ToolBench/d56fdd89faf8c91fa135090b212bb9057ee5cfc2/toolbench/tooleval/__init__.py -------------------------------------------------------------------------------- /toolbench/tooleval/automatic_eval_sample.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import time 4 | from concurrent.futures import ThreadPoolExecutor,as_completed 5 | from tqdm import tqdm 6 | import numpy as np 7 | import argparse 8 | import random 9 | from evaluation import UserEvaluation,BaseToolMethod 10 | from evaluators import load_registered_automatic_evaluator 11 | from typing import List,Dict,Callable 12 | import pandas as pd 13 | 14 | abs_dir = os.path.split(__file__)[0] 15 | 16 | 17 | def parse_args(): 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument('--output',default=os.path.join(abs_dir,'dataset','test.json'),help='where to store the method output.') 20 | parser.add_argument('--method',default='unknown',help='what the name of the method.') 21 | parser.add_argument('--ref_method',default='gpt-3.5-turbo_CoT',help='what the reference method is') 22 | parser.add_argument('--ref_output',default=os.path.join(abs_dir,'dataset','ref_sample.json'),help='where the reference answer stored') 23 | parser.add_argument('--evaluators_cfg_path',default=os.path.join(abs_dir,'evaluators'),help='where the evaluators config files are stored') 24 | parser.add_argument('--evaluator',default='tooleval_gpt-3.5-turbo_normalized',help='which evaluator to use') 25 | parser.add_argument('--max_eval_threads',default=16,type=int,help='how many threads to use for evaluation') 26 | parser.add_argument('--evalset',default='default_evalset',help='which the evaluation dataset to use') 27 | parser.add_argument('--eval_server_address',default='http://localhost:8000',help='the address of the evaluation server') 28 | parser.add_argument('--use_existed_output',default=False,action='store_true',help='whether to use the existed output') 29 | 30 | return parser.parse_args() 31 | 32 | 33 | ## !!define your method here !! 34 | class SampleMethod(BaseToolMethod): 35 | def __init__(self): 36 | super().__init__() 37 | def forward(self,query:str,tools:List[Dict],tool_func:Callable)->Dict: 38 | return {} 39 | def convert_result_to_dict(self,result): 40 | return { 41 | 'method': 'sample', 42 | 'total_steps': 0, 43 | 'final_answer': '', 44 | 'answer_details': [] 45 | } 46 | 47 | if __name__=='__main__': 48 | args = parse_args() 49 | 50 | exec_generating_method_outputs = True 51 | if os.path.exists(args.output): 52 | print('Output file {} already exists!'.format(args.output)) 53 | if args.use_existed_output: 54 | exec_generating_method_outputs = False 55 | else: 56 | print('Overwrite? 
(y/n)') 57 | exec_generating_method_outputs = input()=='y' 58 | 59 | if exec_generating_method_outputs: 60 | ## change the SampleMethod to your method 61 | usereval = UserEvaluation(SampleMethod(),args.eval_server_address,args.evalset) 62 | print('Generating method outputs...') 63 | results = usereval.run() 64 | print('Saving method outputs...') 65 | with open(args.output,'w') as f: 66 | json.dump(results,f) 67 | else: 68 | print('Use existed output.') 69 | results = json.load(open(args.output)) 70 | 71 | print('Loading reference answer for evaluation...') 72 | try: 73 | ref_output = json.load(open(args.ref_output)) 74 | except: 75 | raise Exception('Cannot load reference answer from {}\n Please Download before evaluation!'.format(args.ref_output)) 76 | 77 | print('Loading automatic evaluators...') 78 | evaluators = [load_registered_automatic_evaluator(vars(args)) for _ in range(args.max_eval_threads)] 79 | 80 | def get_preference(qid,query,tools,ref_ans,ans,): 81 | global evaluators 82 | evaluator = random.choice(evaluators) 83 | ret = evaluator.annotate_preference( 84 | query, 85 | tools, 86 | [ref_ans,ans]) 87 | return qid,ret 88 | def get_most_preferred(d:list)->np.ndarray: 89 | if np.iterable(d): 90 | d = np.asanyarray(d) 91 | bins = np.bincount(d) 92 | max_val = np.max(bins) 93 | argmax = np.where(max_val==bins)[0] 94 | return argmax 95 | else: 96 | return np.asarray([d]) 97 | 98 | print('Evaluating...') 99 | prefer_dict = {} 100 | with ThreadPoolExecutor(args.max_eval_threads) as pool: 101 | future = [] 102 | for qid in ref_output.keys(): 103 | try: 104 | future.append(pool.submit( 105 | get_preference, 106 | qid, 107 | ref_output[qid]['query'], 108 | ref_output[qid]['available_tools'], 109 | ref_output[qid]['answer'], 110 | results[qid]['answer'] 111 | )) 112 | except KeyError as e: 113 | print('Warning : Missing answer for query {} in answer file! 
'.format(e)) 114 | 115 | for thd in tqdm(as_completed(future),total=len(future),ncols=100): 116 | qid,preference = thd.result() 117 | prefer_dict[qid] = get_most_preferred(preference)[0] 118 | 119 | prefer = list(prefer_dict.values()) 120 | 121 | prefer = np.array(prefer) 122 | df = pd.DataFrame.from_dict([{ 123 | 'Method':args.method, 124 | 'Win Rate':prefer.mean(), 125 | 'Std Error':np.std(prefer)/np.sqrt(len(prefer)) 126 | }]) 127 | print('###### Leaderboard vs {} ######'.format(args.ref_method)) 128 | print(df) 129 | save_file = os.path.join(abs_dir,'results',args.evalset,args.method) 130 | os.makedirs(save_file,exist_ok=True) 131 | df.to_csv(os.path.join(save_file,'win.csv')) 132 | -------------------------------------------------------------------------------- /toolbench/tooleval/convert_answers.py: -------------------------------------------------------------------------------- 1 | from convert_to_answer_format import process_invalid_data,process_valid_data 2 | import json 3 | from glob import glob 4 | import os 5 | 6 | save_dir = 'path/to/save/dir' 7 | 8 | groups_dirs = ['path/to/dataset/eval/result/folders'] 9 | 10 | for groups_dir in groups_dirs: 11 | method = os.path.split(groups_dir)[1] 12 | print(method) 13 | groups_save_dir = os.path.join(save_dir,method) 14 | os.makedirs(groups_save_dir,exist_ok=True) 15 | groups = [os.path.split(g)[1] for g in glob(groups_dir+'/*')] 16 | full_answer = {} 17 | for g in groups: 18 | print(g) 19 | answer_dict = {} 20 | files = glob(os.path.join(groups_dir,g,'*.json')) 21 | for file in files: 22 | qid = os.path.split(file)[1].split('_')[0] 23 | try: 24 | data = json.load(open(file)) 25 | except: 26 | print('Read error: ',file) 27 | continue 28 | if not data['answer_generation']['valid_data']: 29 | answer_dict[qid] = process_invalid_data(method,data) 30 | else: 31 | answer_dict[qid] = process_valid_data(method,data['answer_generation']) 32 | json.dump(answer_dict,open(os.path.join(groups_save_dir,f'{g}.json'),'w')) 33 | full_answer.update(answer_dict) 34 | # json.dump(full_answer,open(os.path.join(groups_save_dir,f'fullanswer.json'),'w')) -------------------------------------------------------------------------------- /toolbench/tooleval/dataset/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/ToolBench/d56fdd89faf8c91fa135090b212bb9057ee5cfc2/toolbench/tooleval/dataset/__init__.py -------------------------------------------------------------------------------- /toolbench/tooleval/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | from .usereval import UserEvaluation 2 | from .methodcls import BaseToolMethod 3 | from .dataclass import ExecutionGraph,ExecutionNode,DirectedEdge -------------------------------------------------------------------------------- /toolbench/tooleval/evaluation/methodcls.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List,Callable 2 | 3 | class BaseToolMethod: 4 | def __init__(self): 5 | pass 6 | def convert_result_to_dict(self,result): 7 | '''Return Format 8 | -------- 9 | { 10 | 'method': 'method name', 11 | 'total_steps': int, 12 | 'final_answer': 'answer', 13 | 'answer_details': [{ 14 | "role": "system", 15 | "message": "", 16 | "next": [ 17 | { 18 | "role": "user", 19 | "message": "I am planning ...", 20 | "next": [ 21 | { 22 | "role": "tool", 23 | "message": "{'name': 'Finish', 'arguments': '{\\n 
\"return_type\": \"give_answer\",\\n \"final_answer\": \"I encountere...", 24 | "next": [] 25 | } 26 | ] 27 | } 28 | ] 29 | }] 30 | } 31 | 32 | ''' 33 | pass 34 | def forward(self,query:str,tools:List[Dict],tool_func:Callable)->Dict: 35 | pass 36 | 37 | def __call__(self,query:str,tools:List[Dict],tool_func:Callable)->Dict: 38 | result = self.forward(query,tools,tool_func) 39 | return self.convert_result_to_dict(result) 40 | 41 | -------------------------------------------------------------------------------- /toolbench/tooleval/evaluation/usereval.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from tqdm import tqdm 3 | from typing import Union, Dict, List, Optional,Tuple 4 | from .methodcls import BaseToolMethod 5 | from .dataclass import * 6 | import json 7 | 8 | class UserEvaluation: 9 | def __init__(self, 10 | method:BaseToolMethod, 11 | eval_server_addr='http://localhost:8000', 12 | evalset='eval20230718'): 13 | self.eval_server_addr = eval_server_addr 14 | self.evalset = evalset 15 | self.method = method 16 | res = requests.post(self.eval_server_addr+'/neweval',json=self.evalset) 17 | if res.status_code != 200: 18 | raise Exception('Failed to obtain new evaluation id! Error: '+res.text) 19 | ret = res.json() 20 | self.eval_id = ret['evaluation_id'] 21 | self.len = ret['len'] 22 | 23 | def get_new_question(self)->Tuple[str,List]: 24 | res = requests.post(self.eval_server_addr+'/next_question',json=self.eval_id) 25 | if res.status_code == 204: 26 | raise EvalCompleted() 27 | if res.status_code != 200: 28 | raise Exception('Failed to obtain new question!') 29 | 30 | self.question = Question(**res.json()) 31 | self.tool_name_to_id = {} 32 | tools = [tool.model_dump() for tool in self.question.available_tools] 33 | for tool in tools: 34 | self.tool_name_to_id[tool['name']] = tool.pop('tid') 35 | 36 | 37 | return self.question.query,tools 38 | def tool_func(self,tool_name:str,tool_args:str)->requests.Response: 39 | tid = self.tool_name_to_id[tool_name] 40 | # res = requests.post(self.eval_server_addr+'/api',json={ 41 | # 'evaluation_id':self.eval_id, 42 | # 'tool_id':tid, 43 | # 'tool_args':tool_args 44 | # }) 45 | res = requests.post(self.eval_server_addr+'/rapidapi',json={ 46 | 'evaluation_id':self.eval_id, 47 | 'tool_id':tid, 48 | 'tool_args':tool_args 49 | }) 50 | 51 | return res 52 | def _forward(self,query:str,tools:List[Dict])->Dict: 53 | method_ret = self.method(query,tools,self.tool_func) 54 | 55 | return self.question.qid,{ 56 | 'query':query, 57 | 'available_tools':tools, 58 | 'answer':method_ret 59 | } 60 | 61 | 62 | def run(self)->Dict: 63 | results = {} 64 | for _ in tqdm(range(self.len),ncols=100): 65 | try: 66 | qid,ret = self._forward(*self.get_new_question()) 67 | except EvalCompleted: 68 | return results 69 | results[qid] = ret 70 | return results 71 | -------------------------------------------------------------------------------- /toolbench/tooleval/evaluators/__init__.py: -------------------------------------------------------------------------------- 1 | from .registered_cls import BaseEvaluator,register_evaluator,get_evaluator_cls 2 | 3 | __all__=['register_evaluator','get_evaluator_cls','BaseEvaluator','load_registered_automatic_evaluator'] 4 | 5 | 6 | 7 | def load_registered_automatic_evaluator(config:dict={},evaluator_name=None,evaluators_cfg_path=None)->BaseEvaluator: 8 | import os 9 | import yaml 10 | 11 | evaluator_name = config['evaluator'] if evaluator_name is None else evaluator_name 12 | 
cfg_path = config['evaluators_cfg_path'] if evaluators_cfg_path is None else evaluators_cfg_path 13 | cfg_path = os.path.join(cfg_path,evaluator_name) 14 | 15 | cls_name = yaml.load(open(os.path.join(cfg_path,'config.yaml')),Loader=yaml.FullLoader)['registered_cls_name'] 16 | 17 | evaluator:BaseEvaluator = get_evaluator_cls(cls_name)(cfg_path) 18 | return evaluator -------------------------------------------------------------------------------- /toolbench/tooleval/evaluators/registered_cls/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import BaseEvaluator 2 | from .utils import register_evaluator,get_evaluator_cls 3 | 4 | __all__ = ['register_evaluator','get_evaluator_cls','BaseEvaluator'] 5 | 6 | import os 7 | import importlib 8 | current_dir = os.path.dirname(__file__) 9 | 10 | for item in os.listdir(current_dir): 11 | item_path = os.path.join(current_dir, item) 12 | 13 | if os.path.isfile(item_path) and item != '__init__.py' and item.endswith('.py'): 14 | module_name = item[:-3] 15 | 16 | full_module_path = f"{__name__}.{module_name}" 17 | 18 | imported_module = importlib.import_module(full_module_path) 19 | 20 | globals()[module_name] = imported_module 21 | -------------------------------------------------------------------------------- /toolbench/tooleval/evaluators/registered_cls/base.py: -------------------------------------------------------------------------------- 1 | import random 2 | from typing import List, Union, Dict, Any, Callable 3 | import os 4 | import yaml 5 | from .utils import register_evaluator 6 | 7 | def process_answer(answer: Dict): 8 | answer['final_answer'] = answer['final_answer'][:1000] 9 | answer['answer_details'] = answer['answer_details'][:3000] 10 | answer.pop('method', None) 11 | return answer 12 | 13 | 14 | def process_tools(tools: List[Dict]): 15 | for tool in tools: 16 | tool.pop('description', None) 17 | tool.pop('parameters', None) 18 | return tools 19 | 20 | @register_evaluator 21 | class BaseEvaluator: 22 | """Base class for evaluators. 23 | 24 | Attributes: 25 | ---------- 26 | fn_completions : Callable[[Dict,List[Dict]],int] 27 | The completion function of the evaluator, used to get annotated results. 28 | This function should take two arguments: `task_description`:Dict and `answers`:List[Dict], return a int stand for the index of best answer. 29 | 30 | Functions: 31 | --------- 32 | annotate_preference : Callable 33 | Annotate and return the index of the preferred answer. 34 | 35 | """ 36 | def __init__(self, 37 | fn_completions: Callable[[Dict,List[Dict]],int] = None, 38 | *args, 39 | **kwargs): 40 | self.fn_completions = fn_completions 41 | def annotate_preference(self, 42 | query: str, 43 | available_tools: List[Dict[Any, Any]], 44 | answers:List[Dict], 45 | multisample=False, 46 | sample_n=4, 47 | task_status=None, 48 | answer_statuss=[None, None]) -> Union[List[int], int]: 49 | """Annotate and return the index of the preferred answer. 50 | 51 | For given query, available tools, and two answers, return the index of the preferred answer by calling function `fn_completions` of the evaluator. 52 | 53 | Parameters: 54 | ---------- 55 | query : str 56 | The query of the task. 57 | available_tools : List[Dict[Any, Any]] 58 | The list of available tools for the task. The specific format of the tool is defined in `tooleval/evaluation/dataclass.py` 59 | answers : List[Dict] 60 | The list of answers for comparison. 
61 | multisample : bool, optional 62 | Whether to use multisample to get the preference. If True, the function will return a list of preferences, otherwise return a single preference. 63 | sample_n : int, optional 64 | The number of samples to get the preference. 65 | 66 | Returns: 67 | ------- 68 | preference : Union[List[int], int] 69 | The index of the preferred answer. If `multisample` is True, return a list of preferences, otherwise return a single preference. 70 | 71 | Raise: 72 | ----- 73 | 74 | """ 75 | answers_processed = [process_answer(ans) for ans in answers] 76 | available_tools = process_tools(available_tools) 77 | 78 | def shuffle_run() -> int: 79 | indexs = list(range(len(answers_processed))) 80 | random.shuffle(indexs) 81 | 82 | answers_projected = [answers[idx] for idx in indexs] 83 | 84 | preferred_index = self.fn_completions( 85 | { 86 | 'query':query, 87 | 'available_tools':available_tools, 88 | }, 89 | answers_projected, 90 | task_status, 91 | answer_statuss 92 | ) 93 | if preferred_index in indexs: 94 | return indexs.index(preferred_index) 95 | raise ValueError(f'Preferred index {preferred_index} is invalid!') 96 | 97 | if not multisample: 98 | return shuffle_run() 99 | else: 100 | prefers = [shuffle_run() for _ in range(sample_n)] 101 | return prefers 102 | 103 | @register_evaluator 104 | class ToolEvalEvaluator(BaseEvaluator): 105 | """ToolEval common evaluator class. 106 | 107 | Attributes: 108 | ---------- 109 | cfg_path : str 110 | A path store the configuration of the evaluator. 111 | 112 | 113 | """ 114 | def __init__(self, 115 | cfg_path: str = None, 116 | ): 117 | eval_config = yaml.load(open(os.path.join(cfg_path,'config.yaml')),Loader=yaml.FullLoader) 118 | template = open(os.path.join(cfg_path,eval_config['prompt_template'])).read() 119 | 120 | super().__init__( 121 | fn_completions=getattr(self,eval_config['fn_completions']) 122 | ) 123 | self.eval_config = eval_config 124 | self.template = template -------------------------------------------------------------------------------- /toolbench/tooleval/evaluators/registered_cls/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | from typing import List,Dict 4 | import requests 5 | from tenacity import retry, wait_random_exponential, stop_after_attempt 6 | 7 | import openai 8 | import random 9 | 10 | __registered_evaluators__ = {} 11 | 12 | def register_evaluator(cls): 13 | """ 14 | Decorator function to register classes with the registered_evaluators list. 15 | """ 16 | __registered_evaluators__[cls.__name__] = cls 17 | return cls 18 | 19 | def get_evaluator_cls(clsname): 20 | """ 21 | Return the evaluator class with the given name. 
22 | """ 23 | try: 24 | return __registered_evaluators__.get(clsname) 25 | except: 26 | raise ModuleNotFoundError('Cannot find evaluator class {}'.format(clsname)) 27 | 28 | 29 | class OpenaiPoolRequest: 30 | def __init__(self, pool_json_file=None): 31 | self.pool:List[Dict] = [] 32 | __pool_file = pool_json_file 33 | if os.environ.get('API_POOL_FILE',None) is not None: 34 | __pool_file = os.environ.get('API_POOL_FILE') 35 | self.now_pos = random.randint(-1, len(self.pool)) 36 | if os.path.exists(__pool_file): 37 | self.pool = json.load(open(__pool_file)) 38 | self.now_pos = random.randint(-1, len(self.pool)) 39 | print(__pool_file) 40 | if os.environ.get('OPENAI_KEY',None) is not None: 41 | self.pool.append({ 42 | 'api_key':os.environ.get('OPENAI_KEY'), 43 | 'organization':os.environ.get('OPENAI_ORG',None), 44 | 'api_type':os.environ.get('OPENAI_TYPE',None), 45 | 'api_version':os.environ.get('OPENAI_VER',None) 46 | }) 47 | 48 | # @retry(wait=wait_random_exponential(multiplier=1, max=30), stop=stop_after_attempt(10),reraise=True) 49 | def request(self,messages,**kwargs): 50 | self.now_pos = (self.now_pos + 1) % len(self.pool) 51 | key_pos = self.now_pos 52 | item = self.pool[key_pos] 53 | print(len(self.pool)) 54 | kwargs['api_key'] = item['api_key'] 55 | if item.get('organization',None) is not None: 56 | kwargs['organization'] = item['organization'] 57 | return openai.ChatCompletion.create(messages=messages,**kwargs) 58 | 59 | def __call__(self,messages,**kwargs): 60 | return self.request(messages,**kwargs) 61 | -------------------------------------------------------------------------------- /toolbench/tooleval/evaluators/tooleval_gpt-3.5-turbo_default/config.yaml: -------------------------------------------------------------------------------- 1 | evaluator_name: "tooleval_gpt-3.5-turbo_default" 2 | registered_cls_name: "ReinforceToolLearningEvaluator" 3 | prompt_template: "template.txt" 4 | fn_completions: "normalized_openai_completions" 5 | apis_json: "your/path/to/api_pool.json" 6 | completions_kwargs: 7 | model: "gpt-3.5-turbo-16k" 8 | max_tokens: 1000 9 | temperature: 0.2 10 | timeout: 10 11 | functions: 12 | - name: "check_answer_status" 13 | description: "Parse the json answer with layerd nodes and return the answer_status about the answer" 14 | parameters: 15 | type: "object" 16 | properties: 17 | answer_status: 18 | type: "string" 19 | enum: ["Unsure","Unsolved","Solved"] 20 | required: ["answer_status"] 21 | - name: "parse_answer_status" 22 | description: "Parse the json answer with layerd nodes and return the answer_status about the answer" 23 | parameters: 24 | type: "object" 25 | properties: 26 | answer_status: 27 | type: "string" 28 | enum: ["Unsure","Unsolved","Solved"] 29 | required: ["answer_status"] 30 | - name: "check_task_solvable" 31 | description: "Parse the task description and return the task_status about the task" 32 | parameters: 33 | type: "object" 34 | properties: 35 | task_status: 36 | type: "string" 37 | enum: ["Unsure","Unsolvable","Solvable"] 38 | required: ["task_status"] 39 | - name: "select_better_answer" 40 | description: "Select the better answer with a comprehensive investigation on given aspects. You should ignore the impact of the order of candidate answers." 41 | parameters: 42 | type: "object" 43 | properties: 44 | index: 45 | type: "number" 46 | description: "The `index` value in the selected better answer." 
47 | required: ["index"] 48 | fn_completion_parser: "index_parser" 49 | batch_size: 1 50 | -------------------------------------------------------------------------------- /toolbench/tooleval/evaluators/tooleval_gpt-3.5-turbo_default/template.txt: -------------------------------------------------------------------------------- 1 | 2 | check_answer_status 3 | 4 | Giving the query and answer, you need give `answer_status` of the answer by following rules: 5 | 1. If the answer is a sorry message or not a positive/straight response for the given query, return "Unsolved". 6 | 2. If the answer is a positive/straight response for the given query, you have to further check. 7 | 2.1 If the answer is not sufficient to determine whether the solve the query or not, return "Unsure". 8 | 2.2 If you are confident that the answer is sufficient to determine whether the solve the query or not, return "Solved" or "Unsolved". 9 | 10 | Query: 11 | {query} 12 | Answer: 13 | {answer} 14 | 15 | Now give your reason in "content" and `answer_status` of JSON to `check_answer_status`. 16 | 17 | 18 | 19 | 20 | parse_answer_status 21 | 22 | Giving the query and the correspond execution detail of an answer, you need give `answer_status` of the answer by following rules: 23 | 1. If all 'tool' nodes' message indicate that there are errors happened, return "Unsolved" 24 | 2. If you find the information in the "final_answer" is not true/valid according to the messages in 'tool' nodes, return "Unsolved" 25 | 3. If you are unable to verify the authenticity and validity of the information, return "Unsure" 26 | 4. If there are 'tool' node in the chain contains successful func calling and those calling indeed solve the query, return "Solved" 27 | 28 | Query: 29 | {query} 30 | Answer: 31 | {answer} 32 | 33 | Now you are requested to give reason in "content" and `answer_status` of JSON to `parse_answer_status`. 34 | 35 | 36 | 37 | 38 | check_task_solvable 39 | 40 | Please check whether the given task solvable with following rules: 41 | 1. If the `query` provide invalid information (e.g. invalid email address or phone number), return "Unsolvable" 42 | 2. If the `query` needs more information to solve (e.g. the target restaurant name in a navigation task), return "Unsolvable" 43 | 3. If you are unable to draw a conclusion, return "Unsure" 44 | 4. If the currently `available_tools` are enough to solve the query, return "Solvable" 45 | 46 | Task: 47 | {task} 48 | 49 | Now give your reason in "content" and `task_status` of JSON to `check_task_solvable`. 50 | 51 | 52 | 53 | 54 | 55 | 56 | select_better_answer 57 | 58 | Query: 59 | {query} 60 | 61 | Answer_0: 62 | {answer_0} 63 | 64 | Answer_1: 65 | {answer_1} 66 | 67 | Given above query and answers in JSON format, you must follow the rules to select the relatively better answer and give the index of the answer **(0 for Answer_0, 1 for Answer_1)**: 68 | 1. Compare the value of "final_answer" in following aspects: 69 | - Informative: whether it contains all necessary information to reply to the query. 70 | - Factuality: whether it accurately describes what has been done, and what failed in the end. 71 | - Reasoning: If answer does not solve the query, whether gives a detailed and accurate reason for failure. 72 | 2. If you cannot determine yet, compare the value of "answer_details" in following aspects: 73 | - Tool calling costs: calculating the percentage of failed and replicated tools calling. 74 | - Running costs: calculating the total tokens T used in execution. 
75 | - Milestone: calculating the milestones (fixed subtasks) reached in execution. 76 | - Exploration: whether it tries potentially useful tools in execution. Just count the times of successful tool calls with different tools/arguments in execution. 77 | 78 | If you have made your decision, call `select_better_answer`; if you still cannot determine, select a random answer. 79 | -------------------------------------------------------------------------------- /toolbench/tooleval/evaluators/tooleval_gpt-3.5-turbo_fn/config.yaml: -------------------------------------------------------------------------------- 1 | evaluator_name: "tooleval_gpt-3.5-turbo_fn" 2 | registered_cls_name: "OpenAIEvaluator" 3 | prompt_template: "template.txt" 4 | fn_completions: "openai_completions" 5 | apis_json: "your/path/to/api_pool.json" 6 | completions_kwargs: 7 | model: "gpt-3.5-turbo-16k" 8 | max_tokens: 100 9 | temperature: 0 10 | timeout: 10 11 | function_call: 12 | name: "choose_preference" 13 | functions: 14 | - name: "choose_preference" 15 | description: "Choose the preferred answer for the query within all given answers." 16 | parameters: 17 | type: "object" 18 | properties: 19 | preference: 20 | type: "number" 21 | description: "The index of the preferred answer in all given answers." 22 | required: [ "preference" ] 23 | fn_completion_parser: "index_parser" 24 | batch_size: 1 25 | -------------------------------------------------------------------------------- /toolbench/tooleval/evaluators/tooleval_gpt-3.5-turbo_fn/template.txt: -------------------------------------------------------------------------------- 1 | 2 | system 3 | You are a helpful annotator that helps the user annotate data. 4 | 5 | 6 | user 7 | Given a task description and candidate answers, I want you to choose one preferred answer based on the rules. To do so, I will give you the task description that was given to the models, and the candidate answers in a list to choose from. To choose the one preferred answer, you need to first analyse the answers based on the rules, then give the index number of the preferred answer as JSON to `choose_preference`. 8 | 9 | Here are the preference rules: 10 | 1. if both answers give a non-empty `final_answer`, check whether the given `final_answer` solves the given query. 11 | 1.1 if both answers solve the query, choose the one with smaller `total_steps`. 12 | 1.1.1 if `total_steps` are the same, choose the answer with better `final_answer` quality. 13 | 1.2 if one answer solves the query while the other does not, choose the answer that solves it. 14 | 1.3 if both answers failed, check the `answer_details` and choose one considering the following preferences: 15 | 1.3.1 check `response` and prefer more successful tool calls. 16 | 1.3.2 check `name` and prefer more varied tool usage. 17 | 1.3.3 prefer smaller `total_steps`. 18 | 2. if one gives a non-empty `final_answer` while the other does not, choose the one that gives a `final_answer`. 19 | 3. if both fail to give a non-empty `final_answer`, follow 1.3 to choose the one with better `answer_details`. 20 | 21 | Here is the task description in JSON format: 22 | {task_description} 23 | 24 | Here are the candidate answers in JSON format: 25 | {answers} 26 | 27 | Now choose the preferred answer by analysing the results and the rules given, and return the index in the range [0, 1].
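As a reference point, the sketch below shows one way a template like the one above can be turned into a forced `choose_preference` function call with the pre-1.0 `openai` SDK used elsewhere in this directory; the names `user_block`, `task_description`, `answers`, and `functions_from_config` are illustrative assumptions, not part of `OpenAIEvaluator`.

```python
import json
import openai

# Illustrative sketch only: fill the `user` block of template.txt, force the
# `choose_preference` call declared in config.yaml, and read back the chosen index.
messages = [
    {"role": "system", "content": "You are a helpful annotator that helps the user annotate data."},
    {"role": "user", "content": user_block.format(      # assumed: the `user` block of template.txt
        task_description=json.dumps(task_description),  # assumed inputs
        answers=json.dumps(answers),
    )},
]
response = openai.ChatCompletion.create(
    model="gpt-3.5-turbo-16k",
    messages=messages,
    functions=functions_from_config,                     # the `functions` list from config.yaml
    function_call={"name": "choose_preference"},
    temperature=0,
    max_tokens=100,
)
arguments = json.loads(response["choices"][0]["message"]["function_call"]["arguments"])
preference = int(arguments["preference"])                # 0 or 1
```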
28 | -------------------------------------------------------------------------------- /toolbench/tooleval/evaluators/tooleval_gpt-3.5-turbo_normalized/config.yaml: -------------------------------------------------------------------------------- 1 | evaluator_name: "tooleval_gpt-3.5-turbo_normalized" 2 | registered_cls_name: "OpenAINormalizedEvaluator" 3 | prompt_template: "template.txt" 4 | fn_completions: "normalized_openai_completions" 5 | apis_json: "your/path/to/api_pool.json" 6 | completions_kwargs: 7 | model: "gpt-3.5-turbo-16k" 8 | max_tokens: 100 9 | temperature: 0 10 | timeout: 10 11 | functions: 12 | - name: "parse_answer_details" 13 | description: "Parse the JSON answer with layered nodes and return the information about the answer" 14 | parameters: 15 | type: "object" 16 | properties: 17 | succeed_tool_calling: 18 | type: "number" 19 | description: "Give the number of times that a 'tool' node's call succeeds without any errors in the response" 20 | used_tool_types: 21 | type: "number" 22 | description: "Give the number of different 'name' values in the 'tool' nodes' messages" 23 | required: [ "succeed_tool_calling", "used_tool_types"] 24 | - name: "select_best_final_answer" 25 | description: "For the given query, select the best answer in the answer list and return the index of the best answer" 26 | parameters: 27 | type: "object" 28 | properties: 29 | best_answer_index: 30 | type: "number" 31 | description: "The index of the best answer in the answer list, starting from 0" 32 | required: [ "best_answer_index"] 33 | - name: "check_solve_query" 34 | description: "Check whether the given answer solves the given query; return true or false" 35 | parameters: 36 | type: "object" 37 | properties: 38 | is_solved: 39 | type: "boolean" 40 | description: "true if solved and false if not" 41 | required: ["is_solved"] 42 | fn_completion_parser: "index_parser" 43 | batch_size: 1 44 | -------------------------------------------------------------------------------- /toolbench/tooleval/evaluators/tooleval_gpt-3.5-turbo_normalized/template.txt: -------------------------------------------------------------------------------- 1 | 2 | parse_answer_details 3 | 4 | Given answer details in the following JSON format: 5 | {answer_details} 6 | 7 | I want you to parse the answer details and give the information as JSON to `parse_answer_details`. Now parse the answer. 8 | 9 | 10 | 11 | select_best_final_answer 12 | 13 | For query {query}, you have the following answers in JSON format: 14 | {final_answers} 15 | 16 | I want you to select the best answer from the above answers and give the index of the answer as JSON to `select_best_final_answer`. Now select the best answer. 17 | 18 | 19 | 20 | check_solve_query 21 | 22 | Please check whether the answer solves the query or not. 23 | Query: 24 | {query} 25 | 26 | Answer: 27 | {final_answer} 28 | 29 | Now give your judgment as JSON to `check_solve_query`; remember not to be too strict.
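As a rough illustration of how the three functions above could be composed into a single preference decision (check whether each answer solves the query, fall back to execution details, otherwise ask for the best final answer), here is an assumed outline for readability, not the `OpenAINormalizedEvaluator` source:

```python
# Illustrative composition only; `call_fn(name, **vars)` is assumed to fill the matching
# block of template.txt and return the parsed JSON arguments of that function call.
def pick_preference(query, answers, call_fn):
    solved = [call_fn("check_solve_query", query=query,
                      final_answer=a["final_answer"])["is_solved"] for a in answers]
    if solved.count(True) == 1:
        # exactly one answer solves the query: prefer it
        return solved.index(True)
    if not any(solved):
        # neither solves it: compare execution details instead
        details = [call_fn("parse_answer_details", answer_details=a["answer_details"])
                   for a in answers]
        score = lambda d: (d["succeed_tool_calling"], d["used_tool_types"])
        return max(range(len(details)), key=lambda i: score(details[i]))
    # both solve it: let the model pick the better final answer directly
    return call_fn("select_best_final_answer", query=query,
                   final_answers=[a["final_answer"] for a in answers])["best_answer_index"]
```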
30 | 31 | -------------------------------------------------------------------------------- /toolbench/tooleval/evaluators_comparison.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import json 3 | from concurrent.futures import ThreadPoolExecutor,as_completed 4 | from tqdm import tqdm 5 | from evaluators import load_registered_automatic_evaluator 6 | import os 7 | import numpy as np 8 | import copy 9 | from typing import List 10 | from scipy.stats import pearsonr,spearmanr 11 | import random 12 | random.seed(42) 13 | 14 | abs_dir = os.path.split(__file__)[0] 15 | annotated_data = json.load(open(os.path.join(abs_dir,'dataset/human_cross_annotated_data.json'))) 16 | NUM_WORKERS=16 17 | 18 | def get_most_preferred(d:list)->np.ndarray: 19 | if np.iterable(d): 20 | d = np.asanyarray(d) 21 | bins = np.bincount(d) 22 | max_val = np.max(bins) 23 | argmax = np.where(max_val==bins)[0] 24 | return argmax 25 | else: 26 | return np.asarray([d]) 27 | 28 | def agreement_score(x,ref:list)->float: 29 | majority_x = get_most_preferred(x) 30 | majority_ref = get_most_preferred(ref) 31 | score_unit = 1/len(majority_x)/len(majority_ref) 32 | score = 0.0 33 | for x in majority_x: 34 | if x in majority_ref: 35 | score += score_unit 36 | return score 37 | def get_correlation(x,y): 38 | x= np.asarray(x) 39 | y = np.asarray(y) 40 | x = x+1 41 | y = y+1 42 | if np.var(x)==0 or np.var(y)==0: 43 | return float(random.choice(get_most_preferred(x))==random.choice(get_most_preferred(y))) 44 | return pearsonr(x,y)[0] 45 | 46 | def test_on_annotated_data(evaluator_cfg)->List[List[int]]: 47 | evaluators = [load_registered_automatic_evaluator(evaluator_cfg) for _ in range(NUM_WORKERS)] 48 | def get_preference(idx): 49 | data = annotated_data[idx] 50 | def process_tools(tools:list): 51 | for tool in tools: 52 | tool.pop('description',None) 53 | tool.pop('parameters',None) 54 | return tools 55 | 56 | tools = process_tools(data['available_tools']) 57 | ret = evaluators[idx%NUM_WORKERS].annotate_preference( 58 | data['query'], 59 | tools, 60 | data['answers'],multisample=True) 61 | return idx,ret 62 | prefer_dict = {} 63 | with ThreadPoolExecutor(NUM_WORKERS) as pool: 64 | # future = [pool.submit(get_preference,idx) for idx in range(100)] 65 | future = [pool.submit(get_preference,idx) for idx in range(len(annotated_data))] 66 | for thd in tqdm(as_completed(future),total=len(future),ncols=100): 67 | if thd.exception() is not None: 68 | pool.shutdown(cancel_futures=True) 69 | raise thd.exception() 70 | exit(-1) 71 | idx,preference = thd.result() 72 | prefer_dict[idx] = preference 73 | prefer = [prefer_dict[idx] for idx in range(len(future))] 74 | return prefer 75 | 76 | def get_popped_and_rest(d:list,index:int): 77 | l = copy.deepcopy(d) 78 | popped = l.pop(index) 79 | return popped,l 80 | 81 | def calculate_human_performance(): 82 | human_agreement = [] 83 | variance = [] 84 | for data in annotated_data: 85 | agreement_scores = [ 86 | agreement_score(*get_popped_and_rest(data['preference'],idx)) 87 | for idx in range(len(data['preference'])) 88 | ] 89 | human_agreement.append(np.mean(agreement_scores)) 90 | variance.append(np.var([1-agreement_scores[idx] for idx in range(len(agreement_scores))])) 91 | 92 | 93 | return { 94 | 'human_agreement':np.mean(human_agreement), 95 | 'bias':0, 96 | 'variance':np.mean(variance) 97 | } 98 | 99 | 100 | 101 | def calculate_evaluator_performance(evaluator_preference,human_preference): 102 | human_agreement = [] 103 | bias = [] 104 
| variance = [] 105 | assert len(evaluator_preference)==len(human_preference),'length of evaluator_preference and human_preference should be the same!' 106 | correlation = [] 107 | for idx in range(len(evaluator_preference)): 108 | human_pref = human_preference[idx] 109 | evaluator_pref = evaluator_preference[idx] 110 | 111 | human_agreement.append([ 112 | agreement_score(pref,human_pref) for pref in evaluator_pref 113 | ]) 114 | bias.append( 115 | 1 - agreement_score(human_pref,evaluator_pref) 116 | ) 117 | variance.append( 118 | np.var([1-score for score in human_agreement[-1]]) 119 | ) 120 | correlation.append(get_correlation(human_pref,evaluator_pref)) 121 | 122 | return{ 123 | 'correlation': np.mean(correlation), 124 | 'human_agreement':np.mean(np.mean(human_agreement,axis=1)), 125 | 'bias':np.mean(bias), 126 | 'variance':np.mean(variance) 127 | } 128 | 129 | if __name__=='__main__': 130 | evaluators = ['tooleval_gpt-3.5-turbo_normalized',] 131 | human_perference = [ 132 | data['preference'] for data in annotated_data 133 | ] 134 | 135 | evaluator_performance = [calculate_human_performance()] 136 | for evaluator in evaluators: 137 | if not os.path.exists(os.path.join(abs_dir,'dataset',f'performance_{evaluator}.npy')): 138 | evaluator_cfg = { 139 | 'evaluators_cfg_path':os.path.join(abs_dir,'evaluators'), 140 | 'evaluator':evaluator 141 | } 142 | evaluator_perference = test_on_annotated_data(evaluator_cfg) 143 | np.save(os.path.join(abs_dir,'dataset',f'performance_{evaluator}.npy'),evaluator_perference) 144 | 145 | evaluator_perference = np.load(os.path.join(abs_dir,'dataset',f'performance_{evaluator}.npy'),allow_pickle=True) 146 | performance = calculate_evaluator_performance(evaluator_perference,human_perference) 147 | print(performance) 148 | evaluator_performance.append(performance) 149 | 150 | df = pd.DataFrame(evaluator_performance,index=['human']+evaluators) 151 | df.to_csv(os.path.join(abs_dir,'dataset','evaluator_performance.csv')) 152 | print(df) -------------------------------------------------------------------------------- /toolbench/tooleval/requirements.txt: -------------------------------------------------------------------------------- 1 | tqdm 2 | numpy 3 | pandas 4 | pydantic 5 | tenacity 6 | openai 7 | pyyaml -------------------------------------------------------------------------------- /toolbench/tooleval/results/default_evalset/DFS/win.csv: -------------------------------------------------------------------------------- 1 | ,Method,Win Rate,Std Error 2 | 0,DFS,, 3 | -------------------------------------------------------------------------------- /toolbench/tooleval/results/leaderboard###default_evalset###tooleval_gpt-3.5-turbo_normalized###ChatGPT-DFSDT.csv: -------------------------------------------------------------------------------- 1 | Method,WinRate,G1_instruction_WinRate,G1_tool_WinRate,G1_category_WinRate,G2_instruction_WinRate,G2_category_WinRate,G3_instruction_WinRate 2 | GPT4-DFSDT,70.4,60,71.5,67,79.5,77.5,71 3 | GPT4-ReACT,64.4,53.5,50,53.5,67,72,47 4 | ChatGPT-DFSDT,64.3,54.5,65,60.5,75,71.5,62 5 | ToolLLaMA-DFSDT-Retriever,63.1,64,64,60.5,81.5,68.5,65 6 | ToolLLaMA-DFSDT,60,57,61,62,77,77,66 7 | ChatGPT-ReACT,50,41.5,44,44.5,42.5,46.5,22 8 | Text-Davinci-003-DFSDT,46.3,43.5,44,46,37,42,46 9 | Claude-2-DFSDT,43.5,20.5,31,18.5,17,20.5,28 10 | Claude-2-ReACT,34.4,5.5,3.5,5.5,6,6,14 11 | Text-Davinci-003-ReACT,33.2,12,20,20,8.5,14.5,24 -------------------------------------------------------------------------------- 
/toolbench/tooleval/results/leaderboard###default_evalset###tooleval_gpt-3.5-turbo_normalized###gpt-3.5-turbo_CoT.csv: -------------------------------------------------------------------------------- 1 | Method,WinRate,StdError,G1_tool_WinRate,G2_instruction_WinRate,G1_category_WinRate,G1_instruction_WinRate,G2_category_WinRate,G3_instruction_WinRate,G1_tool_StdError,G2_instruction_StdError,G1_category_StdError,G1_instruction_StdError,G2_category_StdError,G3_instruction_StdError 2 | llama-65B-finetuned-5k_CoT,0.675,0.0191213231759729,0.55,0.74,0.55,0.67,0.8,0.74,0.049749371855331,0.0438634243989226,0.049749371855331,0.0470212717820349,0.04,0.0438634243989226 3 | llama-65B-finetuned-1k_CoT,0.666110183639399,0.0192690903060015,0.49,0.696969696969697,0.53,0.66,0.86,0.76,0.0499899989997999,0.0461883428464987,0.0499099188538711,0.047370877129308,0.0346987031457949,0.0427083130081252 4 | llama-65B-finetuned-300_CoT,0.5383333333333333,0.0203523362932267,0.41,0.66,0.43,0.51,0.65,0.57,0.0491833305094317,0.047370877129308,0.0495075751779462,0.0499899989997999,0.0476969600708472,0.0495075751779462 5 | gpt-3.5-turbo_CoT,0.5,0.0,0.5,0.5,0.5,0.5,0.5,0.5,0.0,0.0,0.0,0.0,0.0,0.0 6 | -------------------------------------------------------------------------------- /toolbench/tooleval/run_convert_answer.sh: -------------------------------------------------------------------------------- 1 | export RAW_ANSWER_PATH=../../data/reproduction_data/model_predictions/ 2 | export CONVERTED_ANSWER_PATH=../../data/reproduction_data/model_predictions_converted/ 3 | export MODEL_NAME=chatgpt_cot 4 | export METHOD=CoT 5 | mkdir ${CONVERTED_ANSWER_PATH}/${MODEL_NAME} 6 | 7 | for test_set in G1_instruction G1_category G1_tool G2_category G2_instruction G3_instruction 8 | do 9 | answer_dir=${RAW_ANSWER_PATH}/${MODEL_NAME}/${test_set} 10 | output_file=${CONVERTED_ANSWER_PATH}/${MODEL_NAME}/${test_set}.json 11 | 12 | python convert_to_answer_format.py\ 13 | --answer_dir ${answer_dir} \ 14 | --method ${METHOD} \ 15 | --output ${output_file} 16 | done 17 | -------------------------------------------------------------------------------- /toolbench/tooleval/run_pass_rate.sh: -------------------------------------------------------------------------------- 1 | export CONVERTED_ANSWER_PATH=../../data/reproduction_data/model_predictions_converted/ 2 | export SAVE_PATH=pass_rate_results 3 | export CANDIDATE_MODEL=chatgpt_cot 4 | export API_POOL_FILE=path/to/your/openai_key_json_file.json 5 | 6 | python eval_pass_rate.py \ 7 | --converted_answer_path ${CONVERTED_ANSWER_PATH} \ 8 | --save_path ${SAVE_PATH} \ 9 | --reference_model ${CANDIDATE_MODEL} \ 10 | --test_ids ../../data/test_query_ids/ \ 11 | --max_eval_threads 20 \ 12 | --evaluate_times 7 13 | -------------------------------------------------------------------------------- /toolbench/tooleval/run_preference.sh: -------------------------------------------------------------------------------- 1 | export CONVERTED_ANSWER_PATH=../../data/reproduction_data/model_predictions_converted/ 2 | export SAVE_PATH=preference_results 3 | export PASS_TARE_PATH=pass_rate_results 4 | export REFERENCE_MODEL=chatgpt_cot 5 | export CANDIDATE_MODEL=gpt-4-0613_cot 6 | export API_POOL_FILE=path/to/your/openai_key_json_file.json 7 | 8 | python eval_preference.py \ 9 | --converted_answer_path ${CONVERTED_ANSWER_PATH} \ 10 | --reference_model ${REFERENCE_MODEL} \ 11 | --output_model ${CANDIDATE_MODEL} \ 12 | --test_ids ../../data/test_query_ids/ \ 13 | --save_path ${SAVE_PATH} \ 14 | 
--pass_rate_result_path ${PASS_TARE_PATH} \ 15 | --max_eval_threads 20 \ 16 | --use_pass_rate true \ 17 | --evaluate_times 7 18 | -------------------------------------------------------------------------------- /toolbench/train/llama_condense_monkey_patch.py: -------------------------------------------------------------------------------- 1 | # code adapted from https://huggingface.co/kaiokendev/superhot-13b-8k-no-rlhf-test/blob/main/llama_rope_scaled_monkey_patch.py 2 | import torch 3 | import transformers 4 | import transformers.models.llama.modeling_llama 5 | 6 | from functools import partial 7 | 8 | class CondenseRotaryEmbedding(torch.nn.Module): 9 | def __init__(self, dim, ratio, max_position_embeddings=2048, base=10000, device=None): 10 | super().__init__() 11 | inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float().to(device) / dim)) 12 | self.register_buffer("inv_freq", inv_freq) 13 | 14 | # Build here to make `torch.jit.trace` work. 15 | self.ratio = ratio 16 | max_position_embeddings *= ratio 17 | print(f"Condensing Positional embeddings from {max_position_embeddings} to {max_position_embeddings // ratio}") 18 | self.max_seq_len_cached = max_position_embeddings 19 | t = torch.arange(self.max_seq_len_cached, device=self.inv_freq.device, dtype=self.inv_freq.dtype) / ratio 20 | freqs = torch.einsum("i,j->ij", t, self.inv_freq) 21 | # Different from paper, but it uses a different permutation in order to obtain the same calculation 22 | emb = torch.cat((freqs, freqs), dim=-1) 23 | dtype = torch.get_default_dtype() 24 | self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False) 25 | self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False) 26 | 27 | def forward(self, x, seq_len=None): 28 | # x: [bs, num_attention_heads, seq_len, head_size] 29 | # This `if` block is unlikely to be run after we build sin/cos in `__init__`. Keep the logic here just in case. 
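# If a longer sequence does appear at run time, the cached cos/sin tables are rebuilt below
# with the positions again divided by `self.ratio`, so the condensed (interpolated) rotary
# positions stay consistent with those precomputed in `__init__`.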
30 | if seq_len > self.max_seq_len_cached: 31 | self.max_seq_len_cached = seq_len 32 | t = torch.arange(self.max_seq_len_cached, device=x.device, dtype=self.inv_freq.dtype) / self.ratio 33 | freqs = torch.einsum("i,j->ij", t, self.inv_freq) 34 | # Different from paper, but it uses a different permutation in order to obtain the same calculation 35 | emb = torch.cat((freqs, freqs), dim=-1).to(x.device) 36 | self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(x.dtype), persistent=False) 37 | self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(x.dtype), persistent=False) 38 | return ( 39 | self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype), 40 | self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype), 41 | ) 42 | 43 | def replace_llama_with_condense(ratio): 44 | transformers.models.llama.modeling_llama.LlamaRotaryEmbedding = partial(CondenseRotaryEmbedding, ratio=ratio) 45 | -------------------------------------------------------------------------------- /toolbench/train/llama_flash_attn_monkey_patch.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional, Tuple 2 | 3 | import torch 4 | from torch import nn 5 | 6 | import transformers 7 | from transformers.models.llama.modeling_llama import apply_rotary_pos_emb 8 | import torch 9 | from torch import nn 10 | import torch.nn.functional as F 11 | import math 12 | import transformers 13 | from transformers.models.llama.modeling_llama import apply_rotary_pos_emb 14 | 15 | 16 | def forward_2( 17 | self, 18 | hidden_states: torch.Tensor, 19 | attention_mask: Optional[torch.Tensor] = None, 20 | position_ids: Optional[torch.LongTensor] = None, 21 | past_key_value: Optional[Tuple[torch.Tensor]] = None, 22 | output_attentions: bool = False, 23 | use_cache: bool = False, 24 | ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: 25 | bsz, q_len, _ = hidden_states.size() 26 | 27 | query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) 28 | key_states = self.k_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) 29 | value_states = self.v_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) 30 | 31 | kv_seq_len = key_states.shape[-2] 32 | if past_key_value is not None: 33 | kv_seq_len += past_key_value[0].shape[-2] 34 | cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) 35 | query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) 36 | 37 | assert not output_attentions, "output_attentions is not supported" 38 | assert not use_cache, "use_cache is not supported" 39 | assert past_key_value is None, "past_key_value is not supported" 40 | 41 | 42 | if past_key_value is not None: 43 | # reuse k, v, self_attention 44 | key_states = torch.cat([past_key_value[0], key_states], dim=2) 45 | value_states = torch.cat([past_key_value[1], value_states], dim=2) 46 | 47 | past_key_value = (key_states, value_states) if use_cache else None 48 | attn_output= F.scaled_dot_product_attention(query_states,key_states,value_states,dropout_p=0.0, is_causal=True) 49 | attn_weights = None 50 | 51 | if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): 52 | raise ValueError( 53 | f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" 54 | f" {attn_output.size()}" 55 | ) 56 | 57 | attn_output = attn_output.transpose(1, 2) 58 | attn_output = 
attn_output.reshape(bsz, q_len, self.hidden_size) 59 | 60 | attn_output = self.o_proj(attn_output) 61 | 62 | if not output_attentions: 63 | attn_weights = None 64 | 65 | return attn_output, attn_weights, past_key_value 66 | 67 | 68 | def _prepare_decoder_attention_mask(self, attention_mask, input_shape, 69 | inputs_embeds, past_key_values_length): 70 | # [bsz, seq_len] 71 | return attention_mask 72 | 73 | def replace_llama_attn_with_flash_attn(): 74 | transformers.models.llama.modeling_llama.LlamaModel._prepare_decoder_attention_mask = _prepare_decoder_attention_mask 75 | transformers.models.llama.modeling_llama.LlamaAttention.forward = forward_2 76 | -------------------------------------------------------------------------------- /toolbench/train/train_lora.py: -------------------------------------------------------------------------------- 1 | # Usage: deepspeed train_lora.py --deepspeed <$PATH_TO_DEEPSPEED_CONFIG> 2 | 3 | # Adopted from tatsu-lab@stanford_alpaca. Below is the original copyright: 4 | # Copyright 2023 Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | from dataclasses import dataclass, field 19 | import logging 20 | import pathlib 21 | import typing 22 | import os 23 | from deepspeed import zero 24 | from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus 25 | from peft import LoraConfig, get_peft_model 26 | import transformers 27 | from transformers import Trainer 28 | 29 | from toolbench.train.train import ( 30 | DataArguments, 31 | ModelArguments, 32 | TrainingArguments, 33 | make_supervised_data_module, 34 | ) 35 | 36 | from toolbench.train.llama_flash_attn_monkey_patch import ( 37 | replace_llama_attn_with_flash_attn, 38 | ) 39 | from toolbench.train.llama_condense_monkey_patch import replace_llama_with_condense 40 | replace_llama_attn_with_flash_attn() 41 | 42 | 43 | @dataclass 44 | class LoraArguments: 45 | lora_r: int = 8 46 | lora_alpha: int = 16 47 | lora_dropout: float = 0.05 48 | lora_target_modules: typing.List[str] = field( 49 | default_factory=lambda: ["q_proj", "v_proj"] 50 | ) 51 | lora_weight_path: str = "" 52 | lora_bias: str = "none" 53 | 54 | 55 | def maybe_zero_3(param): 56 | if hasattr(param, "ds_id"): 57 | assert param.ds_status == ZeroParamStatus.NOT_AVAILABLE 58 | with zero.GatheredParameters([param]): 59 | param = param.data.detach().cpu().clone() 60 | else: 61 | param = param.detach().cpu().clone() 62 | return param 63 | 64 | 65 | # Borrowed from peft.utils.get_peft_model_state_dict 66 | def get_peft_state_maybe_zero_3(named_params, bias): 67 | if bias == "none": 68 | to_return = {k: t for k, t in named_params if "lora_" in k} 69 | elif bias == "all": 70 | to_return = {k: t for k, t in named_params if "lora_" in k or "bias" in k} 71 | elif bias == "lora_only": 72 | to_return = {} 73 | maybe_lora_bias = {} 74 | lora_bias_names = set() 75 | for k, t in named_params: 76 | if "lora_" in k: 77 | to_return[k] = t 78 | 
bias_name = k.split("lora_")[0] + "bias" 79 | lora_bias_names.add(bias_name) 80 | elif "bias" in k: 81 | maybe_lora_bias[k] = t 82 | for k, t in maybe_lora_bias: 83 | if bias_name in lora_bias_names: 84 | to_return[bias_name] = t 85 | else: 86 | raise NotImplementedError 87 | to_return = {k: maybe_zero_3(v) for k, v in to_return.items()} 88 | return to_return 89 | 90 | 91 | def train(): 92 | parser = transformers.HfArgumentParser( 93 | (ModelArguments, DataArguments, TrainingArguments, LoraArguments) 94 | ) 95 | ( 96 | model_args, 97 | data_args, 98 | training_args, 99 | lora_args, 100 | ) = parser.parse_args_into_dataclasses() 101 | 102 | if training_args.source_model_max_length < training_args.model_max_length: 103 | condense_ratio = int(training_args.model_max_length/training_args.source_model_max_length) 104 | # ratio = N means the sequence length is expanded by N, remember to change the model_max_length to 8192 (2048 * ratio) for ratio = 4 105 | replace_llama_with_condense(ratio=condense_ratio) 106 | 107 | world_size = int(os.environ.get("WORLD_SIZE", 1)) 108 | ddp = world_size != 1 109 | device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)} if ddp else None 110 | model = transformers.AutoModelForCausalLM.from_pretrained( 111 | model_args.model_name_or_path, 112 | cache_dir=training_args.cache_dir, 113 | device_map=device_map 114 | ) 115 | lora_config = LoraConfig( 116 | r=lora_args.lora_r, 117 | lora_alpha=lora_args.lora_alpha, 118 | target_modules=lora_args.lora_target_modules, 119 | lora_dropout=lora_args.lora_dropout, 120 | bias=lora_args.lora_bias, 121 | task_type="CAUSAL_LM", 122 | ) 123 | model = get_peft_model(model, lora_config) 124 | if training_args.deepspeed is not None and training_args.local_rank == 0: 125 | model.print_trainable_parameters() 126 | 127 | if training_args.gradient_checkpointing: 128 | logging.warning( 129 | "gradient checkpointing with lora makes requires_grad " 130 | "incorrect and needs a monkey patch in Trainer or the " 131 | "wrapped model's forward. ref: " 132 | "https://github.com/lm-sys/FastChat/pull/138#issuecomment-1509172198" 133 | ) 134 | model.enable_input_require_grads() 135 | 136 | tokenizer = transformers.AutoTokenizer.from_pretrained( 137 | model_args.model_name_or_path, 138 | cache_dir=training_args.cache_dir, 139 | model_max_length=training_args.model_max_length, 140 | padding_side="right", 141 | use_fast=False, 142 | ) 143 | tokenizer.pad_token = tokenizer.unk_token 144 | 145 | data_module = make_supervised_data_module(tokenizer=tokenizer, data_args=data_args) 146 | trainer = Trainer( 147 | model=model, tokenizer=tokenizer, args=training_args, **data_module 148 | ) 149 | 150 | model.config.use_cache = False 151 | 152 | if list(pathlib.Path(training_args.output_dir).glob("checkpoint-*")): 153 | trainer.train(resume_from_checkpoint=True) 154 | else: 155 | trainer.train() 156 | trainer.save_state() 157 | 158 | # Save states. Weights might be a placeholder in zero3 and need a gather 159 | state_dict = get_peft_state_maybe_zero_3( 160 | model.named_parameters(), lora_args.lora_bias 161 | ) 162 | if training_args.local_rank == 0: 163 | model.save_pretrained(training_args.output_dir, state_dict=state_dict) 164 | 165 | 166 | if __name__ == "__main__": 167 | train() -------------------------------------------------------------------------------- /toolbench/train/train_mem.py: -------------------------------------------------------------------------------- 1 | # Make it more memory efficient by monkey patching the LLaMA model with FlashAttn. 
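# replace_llama_attn_with_flash_attn() rebinds LlamaAttention.forward to an implementation built on
# torch.nn.functional.scaled_dot_product_attention and skips building the expanded decoder attention
# mask (see llama_flash_attn_monkey_patch.py), so the patch is applied before train() constructs the model.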
2 | 3 | # Need to call this before importing transformers. 4 | from toolbench.train.llama_flash_attn_monkey_patch import ( 5 | replace_llama_attn_with_flash_attn, 6 | ) 7 | 8 | replace_llama_attn_with_flash_attn() 9 | 10 | from toolbench.train.train import train 11 | 12 | if __name__ == "__main__": 13 | train() 14 | -------------------------------------------------------------------------------- /toolbench/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | import torch 4 | import transformers 5 | import transformers.models.llama.modeling_llama 6 | from functools import partial 7 | 8 | 9 | def process_system_message(system_message, functions): 10 | assert "with a function call to actually excute your step." in system_message 11 | # we find that following ReACT format and merging the thought node and function call node is easier for model to learn to integrate the action input json string in its prediction than learn to predict a json string directly. 12 | system_message = system_message.replace("with a function call to actually excute your step.", "with a function call to actually excute your step. Your output should follow this format:\nThought:\nAction\nAction Input:\n") 13 | # add all the function dicts in the prompt. 14 | system_message = system_message + "\nSpecifically, you have access to the following APIs: " + str(functions) 15 | return system_message 16 | 17 | def get_gpu_memory(max_gpus=None): 18 | """Get available memory for each GPU.""" 19 | gpu_memory = [] 20 | num_gpus = ( 21 | torch.cuda.device_count() 22 | if max_gpus is None 23 | else min(max_gpus, torch.cuda.device_count()) 24 | ) 25 | 26 | for gpu_id in range(num_gpus): 27 | with torch.cuda.device(gpu_id): 28 | device = torch.cuda.current_device() 29 | gpu_properties = torch.cuda.get_device_properties(device) 30 | total_memory = gpu_properties.total_memory / (1024**3) 31 | allocated_memory = torch.cuda.memory_allocated() / (1024**3) 32 | available_memory = total_memory - allocated_memory 33 | gpu_memory.append(available_memory) 34 | return gpu_memory 35 | 36 | 37 | def standardize_category(category): 38 | save_category = category.replace(" ", "_").replace(",", "_").replace("/", "_") 39 | while " " in save_category or "," in save_category: 40 | save_category = save_category.replace(" ", "_").replace(",", "_") 41 | save_category = save_category.replace("__", "_") 42 | return save_category 43 | 44 | def standardize(string): 45 | res = re.compile("[^\\u4e00-\\u9fa5^a-z^A-Z^0-9^_]") 46 | string = res.sub("_", string) 47 | string = re.sub(r"(_)\1+","_", string).lower() 48 | while True: 49 | if len(string) == 0: 50 | return string 51 | if string[0] == "_": 52 | string = string[1:] 53 | else: 54 | break 55 | while True: 56 | if len(string) == 0: 57 | return string 58 | if string[-1] == "_": 59 | string = string[:-1] 60 | else: 61 | break 62 | if string[0].isdigit(): 63 | string = "get_" + string 64 | return string 65 | 66 | def change_name(name): 67 | change_list = ["from", "class", "return", "false", "true", "id", "and"] 68 | if name in change_list: 69 | name = "is_" + name 70 | return name 71 | 72 | # code adapted from https://huggingface.co/kaiokendev/superhot-13b-8k-no-rlhf-test/blob/main/llama_rope_scaled_monkey_patch.py 73 | class CondenseRotaryEmbedding(torch.nn.Module): 74 | def __init__(self, dim, ratio, max_position_embeddings=2048, base=10000, device=None): 75 | super().__init__() 76 | inv_freq = 1.0 / (base ** (torch.arange(0, dim, 
2).float().to(device) / dim)) 77 | self.register_buffer("inv_freq", inv_freq) 78 | 79 | # Build here to make `torch.jit.trace` work. 80 | self.ratio = ratio 81 | max_position_embeddings *= ratio 82 | print(f"Condensing Positional embeddings from {max_position_embeddings} to {max_position_embeddings // ratio}") 83 | self.max_seq_len_cached = max_position_embeddings 84 | t = torch.arange(self.max_seq_len_cached, device=self.inv_freq.device, dtype=self.inv_freq.dtype) / ratio 85 | freqs = torch.einsum("i,j->ij", t, self.inv_freq) 86 | # Different from paper, but it uses a different permutation in order to obtain the same calculation 87 | emb = torch.cat((freqs, freqs), dim=-1) 88 | dtype = torch.get_default_dtype() 89 | self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False) 90 | self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False) 91 | 92 | def forward(self, x, seq_len=None): 93 | # x: [bs, num_attention_heads, seq_len, head_size] 94 | # This `if` block is unlikely to be run after we build sin/cos in `__init__`. Keep the logic here just in case. 95 | if seq_len > self.max_seq_len_cached: 96 | self.max_seq_len_cached = seq_len 97 | t = torch.arange(self.max_seq_len_cached, device=x.device, dtype=self.inv_freq.dtype) / self.ratio 98 | freqs = torch.einsum("i,j->ij", t, self.inv_freq) 99 | # Different from paper, but it uses a different permutation in order to obtain the same calculation 100 | emb = torch.cat((freqs, freqs), dim=-1).to(x.device) 101 | self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(x.dtype), persistent=False) 102 | self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(x.dtype), persistent=False) 103 | return ( 104 | self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype), 105 | self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype), 106 | ) 107 | 108 | def replace_llama_with_condense(ratio): 109 | transformers.models.llama.modeling_llama.LlamaRotaryEmbedding = partial(CondenseRotaryEmbedding, ratio=ratio) 110 | 111 | 112 | def process_retrieval_ducoment(documents_df): 113 | ir_corpus = {} 114 | corpus2tool = {} 115 | for row in documents_df.itertuples(): 116 | doc = json.loads(row.document_content) 117 | ir_corpus[row.docid] = (doc.get('category_name', '') or '') + ', ' + \ 118 | (doc.get('tool_name', '') or '') + ', ' + \ 119 | (doc.get('api_name', '') or '') + ', ' + \ 120 | (doc.get('api_description', '') or '') + \ 121 | ', required_params: ' + json.dumps(doc.get('required_parameters', '')) + \ 122 | ', optional_params: ' + json.dumps(doc.get('optional_parameters', '')) + \ 123 | ', return_schema: ' + json.dumps(doc.get('template_response', '')) 124 | corpus2tool[(doc.get('category_name', '') or '') + ', ' + \ 125 | (doc.get('tool_name', '') or '') + ', ' + \ 126 | (doc.get('api_name', '') or '') + ', ' + \ 127 | (doc.get('api_description', '') or '') + \ 128 | ', required_params: ' + json.dumps(doc.get('required_parameters', '')) + \ 129 | ', optional_params: ' + json.dumps(doc.get('optional_parameters', '')) + \ 130 | ', return_schema: ' + json.dumps(doc.get('template_response', ''))] = doc['category_name'] + '\t' + doc['tool_name'] + '\t' + doc['api_name'] 131 | return ir_corpus, corpus2tool 132 | --------------------------------------------------------------------------------
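A small, self-contained sketch of how the retrieval-corpus helper above can be used follows; the two documents are made-up placeholders, not data shipped with this repo.

```python
import json
import pandas as pd

from toolbench.utils import process_retrieval_ducoment, standardize

# Hypothetical two-row corpus; the real pipeline loads the retriever documents with pandas.
docs = pd.DataFrame({
    "docid": [0, 1],
    "document_content": [
        json.dumps({"category_name": "Music", "tool_name": "getsongs", "api_name": "search_song",
                    "api_description": "Search songs by keyword.",
                    "required_parameters": [{"name": "q", "type": "STRING"}],
                    "optional_parameters": [], "template_response": {}}),
        json.dumps({"category_name": "Business", "tool_name": "crime_rate", "api_name": "get_city_rate",
                    "api_description": "Crime rate for a given city.",
                    "required_parameters": [{"name": "city", "type": "STRING"}],
                    "optional_parameters": [], "template_response": {}}),
    ],
})

ir_corpus, corpus2tool = process_retrieval_ducoment(docs)
print(ir_corpus[0])                 # "Music, getsongs, search_song, Search songs by keyword., required_params: ..."
print(corpus2tool[ir_corpus[0]])    # "Music\tgetsongs\tsearch_song"
print(standardize("Search Song!"))  # "search_song": lower-cases and collapses non-alphanumerics to "_"
```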