├── .github └── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature-request-or-questions.md ├── LICENSE ├── README.md ├── assets ├── authors.png ├── framework.jpg └── icon.png ├── configs └── agent_config.yaml ├── data_generation.sh ├── data_generation └── gaia_pipeline │ ├── 0_query_generation_tonggpt.py │ ├── 1_query2file_content_parallel_tonggpt.py │ ├── 2_file_content2file_tonggpt.py │ ├── 3_traj_genetation_tonggpt.py │ ├── merge.py │ ├── prompts │ ├── file │ │ ├── gaia_file_generation_system.prompt │ │ ├── gaia_file_generation_user.prompt │ │ ├── gaia_file_verifier_system.prompt │ │ ├── gaia_file_verifier_user.prompt │ │ ├── gaia_system.prompt │ │ └── gaia_user.prompt │ ├── query │ │ ├── gaia_test_query_generation.prompt │ │ ├── gaia_test_tool.prompt │ │ ├── gaia_val_metadata.jsonl │ │ └── gaia_val_query_generation.prompt │ └── statistics │ │ ├── pie_chart.py │ │ └── topic.prompt │ └── verifier │ ├── 0_collect.py │ ├── 1_gaia_q_f_filter.py │ ├── 2_convert_format.py │ ├── 3_gaia_verifier_parallel.py │ ├── mdconvert.py │ └── prompt │ ├── gaia_file_verifier_system.prompt │ ├── gaia_file_verifier_user.prompt │ ├── gaia_traj_verifier_system.prompt │ ├── gaia_traj_verifier_user.prompt │ ├── gta_file_verifier_system.prompt │ ├── gta_file_verifier_user.prompt │ ├── gta_traj_verifier_system.prompt │ └── gta_traj_verifier_user.prompt ├── examples ├── gaia │ ├── analysis.py │ ├── eval.py │ ├── main.py │ ├── playground.py │ └── view.py └── gta │ ├── eval.py │ └── main.py ├── experiments ├── CPM-FT │ ├── README.md │ ├── assets │ │ ├── airplane.jpeg │ │ ├── prompt.txt │ │ └── sosa.png │ ├── finetune │ │ ├── __init__.py │ │ ├── dataset.py │ │ ├── ds_config_zero2.json │ │ ├── ds_config_zero3.json │ │ ├── finetune.py │ │ ├── finetune_ds.sh │ │ ├── finetune_lora.sh │ │ ├── readme.md │ │ └── trainer.py │ ├── inference │ │ ├── __init__.py │ │ ├── eval.py │ │ ├── inference.py │ │ └── utils.py │ ├── main.py │ ├── output │ │ └── cpm_fire_test.json │ ├── requirements.txt │ ├── scripts │ │ ├── convert_baai_stats.py │ │ ├── download_cauldron.py │ │ ├── ds_config_zero2.json │ │ ├── ds_config_zero3.json │ │ ├── filter_baai_dataset.py │ │ ├── finetune_lora.sh │ │ ├── reset_system_prompt.py │ │ ├── sanity_check.py │ │ ├── subset.py │ │ └── tokenizer.py │ ├── slurm_jobs │ │ ├── job_lora_5_gaia_1206.sh │ │ └── job_lora_5_gta_with_verifier.sh │ └── tests │ │ ├── __init__.py │ │ ├── test_infer.py │ │ └── test_infer_lora.py └── Qwen-VL │ ├── .github │ └── ISSUE_TEMPLATE │ │ ├── bug_report.yaml │ │ ├── config.yaml │ │ └── feature_request.yaml │ ├── .gitignore │ ├── BUILD.md │ ├── Dockerfile.qwendemo │ ├── Dockerfile.qwenint4openai │ ├── Dockerfile.qwenopenai │ ├── FAQ.md │ ├── FAQ_ja.md │ ├── FAQ_ko.md │ ├── FAQ_zh.md │ ├── LICENSE │ ├── NOTICE │ ├── README.md │ ├── README_CN.md │ ├── README_JA.md │ ├── README_KO.md │ ├── TUTORIAL.md │ ├── TUTORIAL_ja.md │ ├── TUTORIAL_ko.md │ ├── TUTORIAL_zh.md │ ├── assets │ ├── apple.jpeg │ ├── apple_r.jpeg │ ├── demo.jpeg │ ├── demo_highfive.jpg │ ├── demo_spotting_caption.jpg │ ├── demo_vl.gif │ ├── logo.jpg │ ├── mm_tutorial │ │ ├── Beijing.jpeg │ │ ├── Beijing_Small.jpeg │ │ ├── Chongqing.jpeg │ │ ├── Chongqing_Small.jpeg │ │ ├── Hospital.jpg │ │ ├── Hospital_Small.jpg │ │ ├── Menu.jpeg │ │ ├── Rebecca_(1939_poster).jpeg │ │ ├── Rebecca_(1939_poster)_Small.jpeg │ │ ├── Shanghai.jpg │ │ ├── Shanghai_Output.jpg │ │ ├── Shanghai_Output_Small.jpeg │ │ ├── Shanghai_Small.jpeg │ │ └── TUTORIAL.ipynb │ ├── qwenvl.jpeg │ ├── radar.png │ ├── radar_qwenvlplus.jpg │ ├── touchstone_datasets.jpg 
│ ├── touchstone_eval.png │ ├── touchstone_logo.png │ └── wechat.png │ ├── data │ ├── train_20241116_1625_subset.json │ └── train_20241116_1628_subset.json │ ├── eval_mm │ ├── EVALUATION.md │ ├── data │ ├── evaluate_caption.py │ ├── evaluate_grounding.py │ ├── evaluate_multiple_choice.py │ ├── evaluate_vqa.py │ ├── infographicsvqa_eval.py │ ├── mmbench │ │ ├── MMBENCH.md │ │ ├── evaluate_multiple_choice_mmbench.py │ │ ├── mmbench_converter_dev.py │ │ ├── mmbench_converter_test.py │ │ ├── mmbench_evaluation.py │ │ ├── mmbench_evaluation_tricky.py │ │ └── mmbench_predict_to_submission.py │ ├── mme │ │ ├── EVAL_MME.md │ │ ├── cognition.jpg │ │ ├── eval.py │ │ ├── get_images.py │ │ └── perception.jpg │ ├── seed_bench │ │ ├── EVAL_SEED.md │ │ ├── eval.py │ │ ├── leaderboard.jpg │ │ └── trans.py │ ├── vqa.py │ └── vqa_eval.py │ ├── finetune.py │ ├── finetune │ ├── ds_config_zero2.json │ ├── ds_config_zero3.json │ ├── finetune_ds.sh │ ├── finetune_lora_ds.sh │ ├── finetune_lora_ds_gaia.sh │ ├── finetune_lora_single_gpu.sh │ ├── finetune_qlora_ds.sh │ └── finetune_qlora_single_gpu.sh │ ├── openai_api.py │ ├── output │ └── error-out-7903664.out │ ├── requirements.txt │ ├── requirements_openai_api.txt │ ├── requirements_web_demo.txt │ ├── scripts │ ├── convert_dataset.py │ ├── convert_dataset_v2.py │ ├── inference.py │ ├── inference_lora.py │ └── tokenizer.py │ ├── slurm_jobs │ ├── train_gaia.sh │ └── train_gta.sh │ ├── touchstone │ ├── README.md │ ├── README_CN.md │ ├── README_JA.md │ └── README_KO.md │ └── web_demo_mm.py ├── main.py ├── requirements.txt ├── requirements_generation.txt ├── scripts ├── report.py └── search.py ├── slurm_jobs ├── deploy_qwen2_5_72b.sh ├── deploy_qwen2_VL_72b.sh ├── evaluate.sh ├── evaluate_gaia.sh ├── evaluate_gaia_exp1.sh ├── evaluate_gaia_exp1_setting1.sh ├── evaluate_gaia_exp1_setting2.sh ├── evaluate_gaia_exp1_setting3.sh ├── evaluate_gaia_exp2.sh ├── evaluate_gaia_exp3.sh ├── evaluate_gaia_exp4.sh ├── evaluate_gaia_internvl2.sh ├── evaluate_gaia_llava.sh ├── evaluate_gaia_qwen.sh ├── evaluate_gaia_qwen_tuned.sh ├── evaluate_gta_internvl.sh ├── evaluate_gta_internvl2.sh ├── evaluate_gta_llava.sh ├── evaluate_gta_qwen.sh ├── evaluate_gta_qwen_llm.sh ├── gaia_pipeline.sh ├── gaia_pipeline_query_gen.sh ├── occupy.sh ├── qwen_test.sh ├── train.slurm └── traj_gen.sh ├── tests ├── __init__.py ├── data │ ├── 254.jpg │ ├── annotated_cars.png │ ├── cars.png │ └── draw.jpg ├── test_activate.py ├── test_agent.py ├── test_agent_data.py ├── test_agent_gaia.py ├── test_code.py ├── test_create_agent.py ├── test_debug.py ├── test_edit.py ├── test_engine.py ├── test_file_reader.py ├── test_find.py ├── test_format_answer.py ├── test_gaia_1107.py ├── test_inpector.py ├── test_internvl.py ├── test_llava_ov.py ├── test_llm.py ├── test_ocr.py ├── test_ov_engine.py ├── test_qwen.py ├── test_seg.py ├── test_vision_map.py └── test_vllm.py └── tongagent ├── __init__.py ├── agents ├── __init__.py ├── data_sampling_agent.py ├── gaia_agent.py ├── general_agent.py ├── search_agent.py └── search_agent_api.py ├── cmd ├── __init__.py └── task_generate.py ├── evaluation ├── __init__.py ├── evaluation.py ├── gaia_scorer.py ├── optimize_prompt.py └── unsolved_questions.py ├── llm_engine ├── __init__.py ├── gpt.py ├── internvl2.py ├── llava.py ├── mini_cpm.py └── qwen.py ├── prompt.py ├── tools ├── __init__.py ├── browser.py ├── cookies.py ├── mdconvert.py ├── new_added │ ├── __init__.py │ ├── face_det.py │ ├── image_edit.py │ ├── image_generation.py │ ├── object_loc.py │ ├── ocr.py │ ├── seg.py │ 
└── video_qa.py ├── rag_browser.py ├── text_inspector.py ├── tool_box.py ├── visual_qa.py └── web_surfer.py └── utils.py /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Run the script 16 | ```bash 17 | python main.py 18 | ``` 19 | 2. It throws an error 20 | KeyError: xxxx 21 | 22 | **Expected behavior** 23 | A clear and concise description of what you expected to happen. 24 | 25 | **Screenshots** 26 | If applicable, add screenshots to help explain your problem. 27 | 28 | **OS (please complete the following information):** 29 | - Mac/Linux/Windows 30 | 31 | **Additional context** 32 | Add any other context about the problem here. 33 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature-request-or-questions.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request or questions 3 | about: Suggest an idea for this project or ask the maintainers a question 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Machine Learning Lab @ BIGAI 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE.
22 | -------------------------------------------------------------------------------- /assets/authors.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/assets/authors.png -------------------------------------------------------------------------------- /assets/framework.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/assets/framework.jpg -------------------------------------------------------------------------------- /assets/icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/assets/icon.png -------------------------------------------------------------------------------- /configs/agent_config.yaml: -------------------------------------------------------------------------------- 1 | 2 | 3 | tonggpt: 4 | model_name: gpt-4o-2024-08-06 5 | region: eastus 6 | api_key: 7 | open_ai_client_type: openai # or azure 8 | endpoint: # only for azure, you need to specify the endpoint you are using 9 | 10 | visualizer: 11 | model_name: gpt-4o-2024-08-06 12 | region: eastus 13 | api_key: 14 | open_ai_client_type: openai # or azure 15 | endpoint: # only for azure, you need to specify the endpoint you are using 16 | 17 | qwen: 18 | model_name: Qwen/Qwen2-VL-7B-Instruct 19 | endpoint: 20 | 21 | agent_controller: 22 | engine_type: qwen # qwen, minicpm, tonggpt 23 | web_qa: 24 | model_name: gpt-4o-mini-2024-07-18 25 | 26 | internvl2: 27 | model_name: OpenGVLab/InternVL2-8B 28 | 29 | llava: 30 | model_name: Lin-Chen/open-llava-next-llama3-8b 31 | 32 | search_agent: 33 | type: agent 34 | model_name: gpt-4o-2024-08-06 35 | region: eastus 36 | api_key: 37 | 38 | search_engine: 39 | - 40 | cx: 41 | key: 42 | 43 | data_generation: 44 | # the llm can be azure or openai 45 | llm: azure 46 | model: gpt-4o-mini-2024-07-18 47 | api_key: 48 | ape_base: 49 | region: eastus 50 | query_embedding_save_path: image_source/support_embedding_sharegpt4v_100k_chartqa_all.npy 51 | image_base_path: image_source/open_llava_next 52 | caption_data_path: image_source/chartqa_sharegpt4v_all.json 53 | -------------------------------------------------------------------------------- /data_generation.sh: -------------------------------------------------------------------------------- 1 | #1. query, file, and traj generation 2 | python -m data_generation.gaia_pipeline.0_query_generation_tonggpt --timestamp 20241223-213646 3 | python -m data_generation.gaia_pipeline.1_query2file_content_parallel_tonggpt --timestamp 20241223-213646 4 | python -m data_generation.gaia_pipeline.2_file_content2file_tonggpt --timestamp 20241223-213646 --start 0 --end 1000 5 | python -m data_generation.gaia_pipeline.3_traj_genetation_tonggpt --timestamp 20241223-213646 --start 0 --end 1000 6 | 7 | #2. 
verification and structure conversion 8 | python -m data_generation.gaia_pipeline.verifier.0_collect --timestamp 20241223-213646 9 | python -m data_generation.gaia_pipeline.verifier.1_gaia_q_f_filter --timestamp 20241223-213646 10 | python -m data_generation.gaia_pipeline.verifier.2_convert_format --timestamp 20241223-213646 11 | python -m data_generation.gaia_pipeline.verifier.3_gaia_verifier_parallel --timestamp 20241223-213646 12 | -------------------------------------------------------------------------------- /data_generation/gaia_pipeline/merge.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | 4 | 5 | def merge(source_folder, output_folder, filename): 6 | # source_folder = path + '/query/query_json/' 7 | # output_folder = path + '/query/queries_merged' 8 | if os.path.exists(output_folder) is False: 9 | os.makedirs(output_folder) 10 | 11 | save_path = os.path.join(output_folder, filename) 12 | json_files = [pos_json for pos_json in os.listdir(source_folder) if pos_json.endswith('.json')] 13 | data = [] 14 | for json_file in json_files: 15 | print ('===============',os.path.join(source_folder, json_file)) 16 | with open(os.path.join(source_folder, json_file)) as f: 17 | tmp = json.load(f) 18 | if isinstance(tmp, list) and len(tmp) == 1: 19 | tmp = tmp[0] 20 | if isinstance(tmp, list): 21 | data += tmp 22 | else: 23 | data.append(tmp) 24 | length = len(data) 25 | 26 | if os.path.exists(output_folder): 27 | pass 28 | else: 29 | os.makedirs(output_folder) 30 | 31 | with open(save_path, 'w') as f: 32 | json.dump(data, f) 33 | print(f"Successfully merged {length} json files") -------------------------------------------------------------------------------- /data_generation/gaia_pipeline/prompts/file/gaia_file_generation_system.prompt: -------------------------------------------------------------------------------- 1 | You are a helpful assistant that can generate a file by writing Python code. You will be given a content description of the file. You need to first substantially extend the content, and then write Python code to generate a file. GUARANTEE that the provided content is in the file. 2 | 3 | The output Python code MUST use the following template. 4 | ``` 5 | ##extention start 6 | Extened content: 7 | 8 | ##code start 9 | ```python 10 | file> 11 | ``` 12 | ##code end 13 | ``` -------------------------------------------------------------------------------- /data_generation/gaia_pipeline/prompts/file/gaia_file_generation_user.prompt: -------------------------------------------------------------------------------- 1 | Now, given the following content: , first substantially extend the content, and then output code to generate a file, where the file name is and the file will be saved in .
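The two file-generation prompts above ask the model to return an extended version of the provided content plus a Python script that writes that content to disk, guaranteeing the original content ends up in the file. For illustration only, the ##code section of a response might contain a script along the lines of the sketch below; the content, file name, and save directory here are hypothetical and not taken from the pipeline.

```python
import os

# Hypothetical values that would normally come from the prompt placeholders.
content = "Monthly rainfall for Greenfield station: Jan 42 mm, Feb 38 mm, Mar 51 mm."
extended_rows = [("Jan", 42), ("Feb", 38), ("Mar", 51), ("Apr", 47), ("May", 60)]
save_dir = "generated_files"
file_name = "greenfield_rainfall.csv"

os.makedirs(save_dir, exist_ok=True)
file_path = os.path.join(save_dir, file_name)

with open(file_path, "w", encoding="utf-8") as f:
    # Keep the provided content verbatim so it is guaranteed to appear in the file.
    f.write(f"# {content}\n")
    f.write("month,rainfall_mm\n")
    for month, rainfall in extended_rows:
        f.write(f"{month},{rainfall}\n")

print(f"Saved {file_path}")
```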
-------------------------------------------------------------------------------- /data_generation/gaia_pipeline/prompts/file/gaia_file_verifier_user.prompt: -------------------------------------------------------------------------------- 1 | Following are files, and the query: . Infer whether the files can solve the query based on the perception ability, reasoning ability, and information search ability of an AI agent. -------------------------------------------------------------------------------- /data_generation/gaia_pipeline/prompts/file/gaia_user.prompt: -------------------------------------------------------------------------------- 1 | Now, given the query: , and suggested tools to solve this query: . First analyze the information needed to solve the query and divide it into two groups: information searched from the Internet, and information extracted from files using tools. Then, for the information from files, imagine a concrete answer for each piece of information (these should be concrete answers instead of descriptions). Finally, output the JSON for the inferred information and the content of the files. -------------------------------------------------------------------------------- /data_generation/gaia_pipeline/prompts/query/gaia_test_query_generation.prompt: -------------------------------------------------------------------------------- 1 | You are tasked with generating user queries that will prompt an agent to call various tools (only use the tools listed in our toolset), including internet search capabilities, to solve real-world, practical problems. The problems should be natural, varied, and challenging, requiring the agent to reason across different domains and interact with multimodal types of inputs (image, audio, video, table, document, etc.). Ensure that the problems span a range of practical scenarios. 2 | 3 | Our toolset: TOOL_SET 4 | [ 5 | { 6 | "tool_name":"ask_search_agent", 7 | "description": "This will send a message to an agent that will browse the internet to answer your question. Ask him for all your web-search related questions, but he's unable to do problem-solving. Provide him as much context as possible, in particular if you need to search on a specific timeframe! And don't hesitate to provide them with a complex search task, like finding a difference between two webpages." 8 | }, 9 | { 10 | "tool_name":"visualizer", 11 | "description": "A tool that can answer questions about attached images." 12 | }, 13 | { 14 | "tool_name":"PythonInterpreter", 15 | "description": "A tool that can execute Python code to do calculation and plotting, etc." 16 | }, 17 | { 18 | "tool_name":"inspect_file_as_text", 19 | "description": "A tool that can read a file as markdown text and answer questions about it. This tool handles the following file extensions: [".html", ".htm", ".xlsx", ".pptx", ".wav", ".mp3", ".flac", ".pdf", ".docx"], and all other types of text files. IT DOES NOT HANDLE IMAGES." 20 | } 21 | ] 22 | 23 | I will now provide examples, along with the tools. Examples of user queries: IN_CONTEXT_EXAMPLES 24 | 25 | Please output the queries in JSON format. Make sure that the queries share a similar style with the in-context examples. The output template is: 26 | ```json 27 | [ 28 | { 29 | "query": "What is the weather today?", # 30 | "tools": ["tool1", "tool2",...] # 31 | }, 32 | ... 33 | ] 34 | ```
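The query-generation prompt above (and the tool-inference prompt that follows) insists that generated queries reference only the four tools in the toolset. As a purely illustrative sketch, not part of the actual pipeline, a post-check on the model's JSON output could look like the following; the sample queries and variable names are hypothetical.

```python
import json

# The four tools listed in the prompt's toolset.
TOOLSET = {"ask_search_agent", "visualizer", "PythonInterpreter", "inspect_file_as_text"}

# Hypothetical model output following the prompt's JSON template.
raw_output = """
[
    {"query": "What is the weather today?", "tools": ["ask_search_agent"]},
    {"query": "Summarize the attached report.", "tools": ["inspect_file_as_text", "summarizer"]}
]
"""

queries = json.loads(raw_output)
# Keep only queries whose requested tools all appear in the toolset.
valid = [q for q in queries if set(q["tools"]) <= TOOLSET]
print(f"Kept {len(valid)} of {len(queries)} generated queries")
```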
-------------------------------------------------------------------------------- /data_generation/gaia_pipeline/prompts/query/gaia_test_tool.prompt: -------------------------------------------------------------------------------- 1 | You are tasked with inferring which tools an agent will use to solve a given user query. Concretely, the agent will be given a user query, and it needs to call various tools (only use the tools listed in our toolset) to solve real-world, practical problems. Now, you will be given a query; ensure that the inferred tools are indeed necessary to solve the query. 2 | 3 | Our toolset: TOOL_SET 4 | [ 5 | { 6 | "tool_name":"ask_search_agent", 7 | "description": "This will send a message to an agent that will browse the internet to answer your question. Ask him for all your web-search related questions, but he's unable to do problem-solving. Provide him as much context as possible, in particular if you need to search on a specific timeframe! And don't hesitate to provide them with a complex search task, like finding a difference between two webpages." 8 | }, 9 | { 10 | "tool_name":"visualizer", 11 | "description": "A tool that can answer questions about attached images." 12 | }, 13 | { 14 | "tool_name":"PythonInterpreter", 15 | "description": "A tool that can execute Python code to do calculation and plotting, etc." 16 | }, 17 | { 18 | "tool_name":"inspect_file_as_text", 19 | "description": "A tool that can read a file as markdown text and answer questions about it. This tool handles the following file extensions: [".html", ".htm", ".xlsx", ".pptx", ".wav", ".mp3", ".flac", ".pdf", ".docx"], and all other types of text files. IT DOES NOT HANDLE IMAGES." 20 | } 21 | ] 22 | 23 | I will now provide examples, along with the tools. Examples of user queries and tools: IN_CONTEXT_EXAMPLES 24 | 25 | Please output the tools in JSON format. Make sure that the tools are in the toolset. The output template is 26 | ```json 27 | { 28 | "Tools": ["tool1",...] # 29 | } 30 | ``` -------------------------------------------------------------------------------- /data_generation/gaia_pipeline/prompts/query/gaia_val_query_generation.prompt: -------------------------------------------------------------------------------- 1 | You are tasked with generating user queries that will prompt an agent to call various tools (only use the tools listed in our toolset), including internet search capabilities, to solve real-world, practical problems. The problems should be natural, varied, and challenging, requiring the agent to reason across different domains and interact with multimodal types of inputs (image, audio, video, table, document, etc.). Ensure that the problems span a range of practical scenarios. 2 | 3 | Our toolset: TOOL_SET 4 | [ 5 | { 6 | "tool_name":"ask_search_agent", 7 | "description": "This will send a message to an agent that will browse the internet to answer your question. Ask him for all your web-search related questions, but he's unable to do problem-solving. Provide him as much context as possible, in particular if you need to search on a specific timeframe! And don't hesitate to provide them with a complex search task, like finding a difference between two webpages." 8 | }, 9 | { 10 | "tool_name":"visualizer", 11 | "description": "A tool that can answer questions about attached images." 12 | }, 13 | { 14 | "tool_name":"PythonInterpreter", 15 | "description": "A tool that can execute Python code to do calculation and plotting, etc." 16 | }, 17 | { 18 | "tool_name":"inspect_file_as_text", 19 | "description": "A tool that can read a file as markdown text and answer questions about it. This tool handles the following file extensions: [".html", ".htm", ".xlsx", ".pptx", ".wav", ".mp3", ".flac", ".pdf", ".docx"], and all other types of text files. IT DOES NOT HANDLE IMAGES." 20 | } 21 | ] 22 | 23 | I will now provide examples, along with the tools. Examples of user queries: IN_CONTEXT_EXAMPLES 24 | 25 | Please output the queries in JSON format. Make sure that the queries share a similar style with the in-context examples. The output template is: 26 | ```json 27 | [ 28 | { 29 | "query": "What is the weather today?", # 30 | "tools": ["tool1", "tool2",...] # 31 | }, 32 | ...
33 | ] 34 | ``` -------------------------------------------------------------------------------- /data_generation/gaia_pipeline/prompts/statistics/pie_chart.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | 4 | import json 5 | 6 | # Data to plot 7 | def pie_chart(labels, sizes, pdf_save_path): 8 | 9 | 10 | # Generate a list of colors from a colormap 11 | cmap = plt.get_cmap("tab20c") 12 | colors = cmap(np.linspace(0, 1, len(labels))) 13 | 14 | # Plot 15 | plt.figure(figsize=(10, 7)) 16 | plt.pie(sizes, labels=labels, colors=colors, 17 | autopct='%1.1f%%', shadow=False, startangle=140, textprops={'fontsize': 14}, 18 | pctdistance=0.9) # Move the percentage text outward 19 | 20 | plt.axis('equal') # Equal aspect ratio ensures that pie is drawn as a circle. 21 | 22 | plt.title('Programming Language Usage', fontsize=20) 23 | 24 | # Save the figure as a PDF 25 | plt.savefig(pdf_save_path) 26 | 27 | plt.show() 28 | 29 | 30 | 31 | def load_json(path): 32 | with open(path, 'r', encoding='utf-8') as file: 33 | data=json.load(file) 34 | return data 35 | 36 | def write_json(data, filename): 37 | """ 38 | Write a JSON-compatible Python dictionary to a file. 39 | 40 | :param data: The JSON-compatible dictionary to write. 41 | :param filename: The name of the file to write to. 42 | """ 43 | try: 44 | with open(filename, 'w', encoding='utf-8') as file: 45 | json.dump(data, file, ensure_ascii=False, indent=4) 46 | print(f"Data successfully written to {filename}") 47 | except Exception as e: 48 | print(f"An error occurred while writing to the file: {e}") 49 | 50 | 51 | 52 | json_path="data/final_dataset/tool_statistics.json" 53 | pdf_save_path='data/final_dataset/tool_statistics.pdf' 54 | 55 | # json_path="data/final_dataset/file_statistics.json" 56 | # pdf_save_path='data/final_dataset/file_statistics.pdf' 57 | 58 | # json_path="data/final_dataset/topic_statistics.json" 59 | # pdf_save_path='data/final_dataset/topic_statistics.pdf' 60 | 61 | json_data=load_json(json_path) 62 | labels=list(json_data.keys()) 63 | values=list(json_data.values()) 64 | 65 | pie_chart(labels,values,pdf_save_path) -------------------------------------------------------------------------------- /data_generation/gaia_pipeline/prompts/statistics/topic.prompt: -------------------------------------------------------------------------------- 1 | You are a helpful assistant. You will be given a query, and you need to classify the topic of the given query from the following candidates: CANDIDATE. 2 | 3 | Please output the topic in a json format. Make sure the output topic is one of the above candidates. 
The output template is 4 | ```json 5 | [ 6 | { 7 | "Topic": 8 | "Other topic": 9 | } 10 | ] 11 | ``` -------------------------------------------------------------------------------- /data_generation/gaia_pipeline/verifier/0_collect.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import argparse 4 | 5 | def read_json(path): 6 | with open(path, 'r', encoding='utf-8') as file: 7 | data=json.load(file) 8 | return data 9 | 10 | 11 | def save_json(path, data): 12 | with open(path, 'w') as file: 13 | json.dump(data, file, indent=4) 14 | 15 | 16 | 17 | def list_files_in_folder(folder_path): 18 | try: 19 | # Get the list of files and directories in the specified folder 20 | files_and_dirs = os.listdir(folder_path) 21 | 22 | # Filter out directories, keeping only files 23 | files = [f for f in files_and_dirs if os.path.isfile(os.path.join(folder_path, f))] 24 | 25 | return files 26 | except Exception as e: 27 | print(f"An error occurred: {e}") 28 | return [] 29 | 30 | 31 | 32 | parser = argparse.ArgumentParser(description='Generate queries using GAIA data') 33 | parser.add_argument("--timestamp", type=str) 34 | 35 | args = parser.parse_args() 36 | timestamp=args.timestamp 37 | 38 | def list_files_in_directory(path): 39 | try: 40 | # Get a list of all files and directories in the given path 41 | items = os.listdir(path) 42 | 43 | # Filter out directories, keeping only files 44 | files = [item for item in items if os.path.isfile(os.path.join(path, item))] 45 | 46 | return files 47 | except Exception as e: 48 | print(f"An error occurred: {e}") 49 | return [] 50 | 51 | json_list = list_files_in_directory(f'./data_generation/gaia_pipeline/save/{timestamp}/traj/') 52 | 53 | # Example usage 54 | json_root_path = './data_generation/gaia_pipeline/final_save/' 55 | 56 | print ('json list', json_list) 57 | save_name=f'all_json_{timestamp}_gpt4omini.json' 58 | 59 | 60 | all_data=[] 61 | for json_name in json_list: 62 | data = read_json(os.path.join(json_root_path,json_name)) 63 | all_data=all_data+data 64 | 65 | save_json(os.path.join(json_root_path,save_name),all_data) 66 | 67 | print ('total num', len(all_data)) 68 | 69 | 70 | 71 | -------------------------------------------------------------------------------- /data_generation/gaia_pipeline/verifier/prompt/gaia_file_verifier_system.prompt: -------------------------------------------------------------------------------- 1 | You are a helpful assistant that is given a query and several files. You need to check whether the files match the query. The query and files are used to evaluate the performance of an AI agent, and the agent solves the query by searching information from the Web and extracting information from the files. In some cases, based on the given files, the agent cannot solve the query, even if it searches information from the Web (e.g., some specific knowledge). You need to pick out these bad cases. 2 | 3 | Thus, the files should follow these requirements. 4 | 1. Relevance: The depicted scenarios or objects in the files should be relevant to the query and contain the necessary information to address the query. The files should contain scenarios or objects that are mentioned in the query. 5 | 2. Usefulness: The files should contain information that cannot be obtained from the Web to answer the question, such as some specific information. It should not be too simplistic or lack necessary details. 6 | 3. Some queries require the agent to search some knowledge from the Web and combine it with information in the files to solve the queries. Thus, in some cases, the files do not contain all the information needed to solve the query, but the missing information can be searched from the Web. These cases should be regarded as correct cases. 7 | 8 | The output MUST use the following JSON template to evaluate the files. 9 | ''' 10 | ### start json 11 | { 12 | "information_for_query": 13 | "useful_information_in_files": 14 | "missed_information_in_files": 15 | "missed_information_web_search": 16 | "missed_information_computed": 17 | "thought": 18 | "correct": 19 | } 20 | ### end json 21 | '''
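For context: a downstream verifier would typically extract the JSON object from a response that follows this "### start json … ### end json" template before acting on the verdict. The snippet below is a minimal, hypothetical parsing sketch, not the actual pipeline code (the real logic lives in the verifier scripts); the example response values are made up.

```python
import json
import re

def parse_verifier_output(response: str) -> dict:
    """Pull the JSON object between '### start json' and '### end json'."""
    match = re.search(r"### start json(.*?)### end json", response, re.DOTALL)
    if match is None:
        raise ValueError("No JSON block found in verifier response")
    return json.loads(match.group(1).strip())

example_response = """
### start json
{
    "information_for_query": "release year of the pictured album",
    "useful_information_in_files": "album cover image",
    "missed_information_in_files": "release year",
    "missed_information_web_search": "yes",
    "missed_information_computed": "no",
    "thought": "The missing year can be found on the Web.",
    "correct": "yes"
}
### end json
"""

print(parse_verifier_output(example_response)["correct"])  # -> yes
```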
-------------------------------------------------------------------------------- /data_generation/gaia_pipeline/verifier/prompt/gaia_file_verifier_user.prompt: -------------------------------------------------------------------------------- 1 | Given the query: , and the contents of the given files: , infer whether the files are correct or not. -------------------------------------------------------------------------------- /data_generation/gaia_pipeline/verifier/prompt/gaia_traj_verifier_system.prompt: -------------------------------------------------------------------------------- 1 | You are a data quality evaluator that needs to determine whether a query-solving trajectory between a human and an agent is correct. The human gives files and a query, and the agent calls tools to solve the query. The query-solving trajectory contains a task query, the thoughts and code generated by the agent to call tools (Python functions), the tool response of each step, and the final answer. You must assess the alignment between the task query, the corresponding tool usage (thoughts and code generated by the agent), and the execution results (tool responses). Your goal is to ensure the used tools, arguments to the tools, and summarized answers in the trajectory accurately reflect the human’s intentions. 2 | 3 | The query-solving trajectory is incorrect if: 4 | 1. The tool usage does not align with the query’s objective and context, there is useless or unreasonable tool usage, or the agent does not use tools and instead solves the query by itself. 5 | 2. The input arguments to the tools appear incorrect or unreasonable. 6 | 3. The final answers or intermediate results summarized from the observation appear incorrect or unreasonable. 7 | 4. The final answer is not relevant to the task query, or the final answer seems incorrect. 8 | 5. The trajectory (such as tool usage and observations) conflicts with or is not consistent with the file content. -------------------------------------------------------------------------------- /data_generation/gaia_pipeline/verifier/prompt/gaia_traj_verifier_user.prompt: -------------------------------------------------------------------------------- 1 | Now, given the used files and corresponding information, determine whether the trajectory is correct. 2 | − All Available Tools: 3 | 4 | − User Query: 5 | − Trajectory, including generated thought and code from the agent, and intermediate results of using tools: 6 | 7 | − Execution Results: 8 | 9 | Output MUST use the following JSON template to determine whether the query-solving trajectory is correct.
10 | ''' 11 | ### start json 12 | { 13 | "thought": "Concisely describe your reasoning here", 14 | "correct": "yes" or "no" 15 | } 16 | ### end json 17 | ''' -------------------------------------------------------------------------------- /data_generation/gaia_pipeline/verifier/prompt/gta_file_verifier_user.prompt: -------------------------------------------------------------------------------- 1 | Following are images, the query: , inference whether the images can solve the query based on the perception ability, reasoning ability, and information search ability of an AI agent. -------------------------------------------------------------------------------- /data_generation/gaia_pipeline/verifier/prompt/gta_traj_verifier_user.prompt: -------------------------------------------------------------------------------- 1 | Now, given used images and corresponding information, determine the trajectory is correct or not. 2 | 3 | − User Query: 4 | − Image Content: 5 | − Trajectory, including generated thought and code from the agent, and intermediate results of using tools: 6 | 7 | − Execution Results: 8 | 9 | Output MUST use the following json template to determine the query-solving trajectory is correct or not. 10 | ''' 11 | ### start json 12 | { 13 | "thought": "Concisely describe your reasoning here", 14 | "correct": "yes" or "no" 15 | } 16 | ### end json 17 | ''' -------------------------------------------------------------------------------- /examples/gaia/analysis.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | import numpy as np 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("--data-path") 7 | args = parser.parse_args() 8 | 9 | 10 | files = os.listdir(args.data_path) 11 | 12 | files = [os.path.join(args.data_path, f) for f in files] 13 | 14 | files = [f for f in files if os.path.isdir(f)] 15 | 16 | counts = [] 17 | for f in files: 18 | 19 | f = os.path.join(f, "agent_memory.json") 20 | with open(f, "r") as f: 21 | dataset = json.load(f) 22 | 23 | conv = dataset["conversations"] 24 | turn = len(conv) 25 | steps = (turn - 2) // 2 26 | print(steps) 27 | counts.append(steps) 28 | # print(conv) 29 | # break 30 | 31 | import matplotlib.pyplot as plt 32 | 33 | plt.figure(dpi=300) 34 | plt.hist(counts, bins=7) 35 | plt.xlabel("Steps") 36 | plt.ylabel("Task counts") 37 | plt.grid() 38 | plt.show() -------------------------------------------------------------------------------- /examples/gaia/eval.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.insert(0, "./") 3 | import sqlite3 4 | 5 | from tongagent.evaluation.gaia_scorer import question_scorer 6 | from tongagent.llm_engine.gpt import get_tonggpt_open_ai_client 7 | from tongagent.prompt import FORMAT_ANSWER_PROMPT_GAIA 8 | from langchain.prompts import ChatPromptTemplate 9 | from tqdm import tqdm 10 | 11 | import argparse 12 | 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("--data-path") 15 | args = parser.parse_args() 16 | 17 | cache_db = sqlite3.connect(args.data_path) 18 | cursor = cache_db.cursor() 19 | cursor.execute(f"SELECT * FROM qa_cache") 20 | rows = cursor.fetchall() 21 | cache_db.close() 22 | 23 | # print(rows) 24 | client, model = get_tonggpt_open_ai_client() 25 | template = ChatPromptTemplate.from_template(FORMAT_ANSWER_PROMPT_GAIA) 26 | n_total = len(rows) 27 | correct = 0 28 | eval_data = [] 29 | for row in tqdm(rows): 30 | try: 31 | 
is_this_correct = question_scorer( 32 | ground_truth=row[-1], 33 | model_answer=row[-2] 34 | ) 35 | except Exception as e: 36 | print("question_scorer failed", e) 37 | is_this_correct = 0 38 | if is_this_correct == 0: 39 | task = row[0] 40 | final_answer = row[-2] 41 | prompt_input = { 42 | "question": task, 43 | "answer": final_answer 44 | } 45 | prompt = template.invoke(prompt_input) 46 | messages = [ 47 | {"role": "user", "content": prompt.to_messages()[0].content} 48 | ] 49 | 50 | response = client.chat.completions.create( 51 | messages = messages, 52 | model = model 53 | ) 54 | final_answer: str = response.choices[0].message.content 55 | if "Educated guess:" in final_answer: 56 | final_answer = final_answer.replace("Educated guess:", "").strip() 57 | try: 58 | is_this_correct = question_scorer( 59 | ground_truth=row[-1], 60 | model_answer=final_answer 61 | ) 62 | except Exception as e: 63 | print("question_scorer failed", e) 64 | is_this_correct = 0 65 | else: 66 | final_answer = row[-2] 67 | eval_data.append( 68 | row + (final_answer, is_this_correct) 69 | ) 70 | print("Correct" if is_this_correct == 1 else 'Incorrect', "GT:",row[-1], "Prediction:", row[-2]) 71 | correct += is_this_correct 72 | import pandas as pd 73 | 74 | df = pd.DataFrame(eval_data, columns=["question", 'task_id', 'answer', 'ground_truth', 'formatted_answer', "correct"]) 75 | df.to_csv(args.data_path.replace('.db', '.csv')) 76 | print("Total:", n_total) 77 | print("Correct Item:", correct) 78 | print("Accuracy:", round(100 * correct / n_total, 2), "%") -------------------------------------------------------------------------------- /examples/gaia/playground.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.insert(0, "./") 3 | import sqlite3 4 | 5 | from tongagent.agents.data_sampling_agent import create_agent 6 | from tongagent.utils import load_config 7 | from datasets import load_dataset 8 | import os 9 | import argparse 10 | from typing import Optional 11 | from tqdm import tqdm 12 | 13 | DATA_NAME = "2023_level2" 14 | SPLIT = "validation" 15 | 16 | def run(agent, raw_question, attachment_name): 17 | if attachment_name is not None and attachment_name.strip() != "": 18 | question = f"{raw_question}\nAttachment: data/GAIA/2023/{SPLIT}/{attachment_name}" 19 | else: 20 | question = raw_question 21 | 22 | if attachment_name is not None and (attachment_name.endswith(".png") or attachment_name.endswith(".jpg")): 23 | agent.image_paths = [f'data/GAIA/2023/{SPLIT}/{attachment_name}'] 24 | else: 25 | agent.image_paths = [] 26 | 27 | result = agent.run(question) 28 | agent.save_trajectory() 29 | return result 30 | 31 | 32 | ds = load_dataset("gaia-benchmark/GAIA", DATA_NAME, split=SPLIT) 33 | agent = create_agent(llm_engine="tonggpt", task="gaia", error_tolerance=3) 34 | 35 | # selected = "e8cb5b03-41e0-4086-99e5-f6806cd97211" 36 | # item = [item for item in ds if item["task_id"] == selected][0] 37 | # print("item", item) 38 | 39 | # question = "The object in the British Museum's collection with a museum number of 2012,5015.17 is the shell of a particular mollusk species. According to the abstract of a research article published in Science Advances in 2021, beads made from the shells of this species were found that are at least how many thousands of years old?" 40 | 41 | # question = "The year is 2022. I am at the National Air and Space Museum east of the Potomac River. I want to go to Fire Station 301 DCA ARFF using the metro. 
I go in the wrong direction and end up at the station closest to Cleveland Elementary School. How many metro stations am I away from my original destination if I don't change lines? Your answer should be a numerical integer value." 42 | 43 | # question = "In the YouTube 360 VR video from March 2018 narrated by the voice actor of Lord of the Rings' Gollum, what number was mentioned by the narrator directly after dinosaurs were first shown in the video?" 44 | 45 | # question = "In the YouTube 360 VR video from March 2018 narrated by the voice actor of Lord of the Rings' Gollum, what chemical terminology was mentioned by the narrator directly after H2O were first mentioned in the video?" 46 | 47 | question = "Visit Bofei's Site to find his current position in industry." 48 | file_name = None 49 | result = run( 50 | agent, 51 | raw_question=question, 52 | attachment_name=file_name 53 | ) 54 | 55 | print(result) -------------------------------------------------------------------------------- /examples/gaia/view.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.insert(0, "./") 3 | import sqlite3 4 | 5 | from tongagent.evaluation.gaia_scorer import question_scorer 6 | from tongagent.llm_engine.gpt import get_tonggpt_open_ai_client 7 | from tongagent.prompt import FORMAT_ANSWER_PROMPT_GAIA 8 | from langchain.prompts import ChatPromptTemplate 9 | from tqdm import tqdm 10 | 11 | import argparse 12 | 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("--data-path") 15 | args = parser.parse_args() 16 | 17 | # cache_db = sqlite3.connect(args.data_path) 18 | # cursor = cache_db.cursor() 19 | # cursor.execute(f"SELECT * FROM qa_cache") 20 | # rows = cursor.fetchall() 21 | # print(rows) 22 | # print(len(rows)) 23 | 24 | from datasets import load_dataset 25 | ds = load_dataset("gaia-benchmark/GAIA", "2023_level1", split="validation") 26 | 27 | subset = ds[0:10] 28 | for k, v in subset.items(): 29 | print(k, len(v)) 30 | -------------------------------------------------------------------------------- /examples/gta/eval.py: -------------------------------------------------------------------------------- 1 | import json 2 | import argparse 3 | import os 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("--folder", required=True) 7 | 8 | args = parser.parse_args() 9 | subfolders = os.listdir(args.folder) 10 | 11 | total = 0 12 | correct = 0 13 | all_samples = len(subfolders) 14 | for subfolder in subfolders: 15 | data_path = os.path.join(args.folder, subfolder, "agent_memory.json") 16 | 17 | with open(data_path, "r") as f: 18 | dataset = json.load(f) 19 | 20 | gt, answer = dataset["ground_truth"], dataset["final_answer"] 21 | if gt is None: 22 | continue 23 | skip = False 24 | is_correct = True 25 | for each in gt: 26 | if type(each) is str: 27 | skip = True 28 | break 29 | 30 | if type(each) is list: 31 | is_this_gt_correct = [] 32 | for item in each: 33 | is_this_gt_correct.append(item.lower() in str(answer).lower()) 34 | 35 | is_correct = is_correct and any(is_this_gt_correct) 36 | else: 37 | raise ValueError("unexpected") 38 | 39 | if skip: 40 | continue 41 | if is_correct: 42 | print("Correct:", gt, answer) 43 | correct += 1 44 | else: 45 | print("Incorrect", gt, answer) 46 | total += 1 47 | # print(gt, answer) 48 | 49 | print("Folder", args) 50 | print("Total samples valid:", total, "Correct sample", correct, "all samples", all_samples) 51 | print("Accuracy", round(correct / total, 4) * 100, "%") 52 | 
print("Accuracy (all samples)", round(correct / all_samples, 4) * 100, "%") -------------------------------------------------------------------------------- /experiments/CPM-FT/README.md: -------------------------------------------------------------------------------- 1 | # Setup 2 | 3 | Install environment 4 | ```bash 5 | conda create -n cpm python=3.10 6 | conda activate cpm 7 | pip install -r requirements.txt 8 | ``` 9 | 10 | Setup data path 11 | ``` 12 | 13 | ln -s /home/lipengxiang/codes/TongAgent/data/tongagent data/tongagent 14 | ``` -------------------------------------------------------------------------------- /experiments/CPM-FT/assets/airplane.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/experiments/CPM-FT/assets/airplane.jpeg -------------------------------------------------------------------------------- /experiments/CPM-FT/assets/prompt.txt: -------------------------------------------------------------------------------- 1 | You are an autonomous intelligent agent tasked with navigating a web browser . You will be given web - based tasks . These tasks will be accomplished through the use of specific actions you can issue. Here’s the information you’ll have: 2 | The user’s objective: Tell me about birthday of Mercedes Sosa 3 | The current web page’s URL: https://en.wikipedia.org/wiki/Mercedes_Sosa 4 | The open tabs: Mercedes_Sosa 5 | The previous action: None 6 | The actions you can perform fall into several categories: 7 | Page Operation Actions: 8 | ```click[id]```: This action clicks on an element with a specific id on the webpage. 9 | ```type[id][content]```: Use this to type the content into the field with id. By default, the " Enter " key is pressed after typing unless press_enter_after is set to 0, i.e., ```type[id][content][0]```. 10 | ```hover[id]```: Hover over an element with id. 11 | ```press[key_comb]```: Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v). 12 | ```scroll[down]``` or ```scroll[up]```: Scroll the page up or down. 13 | Tab Management Actions : 14 | ```new_tab```: Open a new, empty browser tab. 15 | ```tab_focus[tab_index]```: Switch the browser’s focus to a specific tab using its index. 16 | ```close_tab```: Close the currently active tab. 17 | URL Navigation Actions: 18 | ```goto[url]```: Navigate to a specific URL. 19 | ```go_back```: Navigate to the previously viewed page. 20 | ```go_forward```: Navigate to the next page (if a previous’ go_back’ action was performed). 21 | Completion Action : 22 | ```stop[answer]```: Issue this action when you believe the task is 23 | complete. If the objective is to find a text-based answer, provide 24 | the answer in the bracket. 
25 | Homepage: https://www.google.com.hk/ -------------------------------------------------------------------------------- /experiments/CPM-FT/assets/sosa.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/experiments/CPM-FT/assets/sosa.png -------------------------------------------------------------------------------- /experiments/CPM-FT/finetune/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/experiments/CPM-FT/finetune/__init__.py -------------------------------------------------------------------------------- /experiments/CPM-FT/finetune/ds_config_zero2.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | 11 | "bf16": { 12 | "enabled": "auto" 13 | }, 14 | 15 | "optimizer": { 16 | "type": "AdamW", 17 | "params": { 18 | "lr": "auto", 19 | "betas": "auto", 20 | "eps": "auto", 21 | "weight_decay": "auto" 22 | } 23 | }, 24 | 25 | "scheduler": { 26 | "type": "WarmupLR", 27 | "params": { 28 | "warmup_min_lr": "auto", 29 | "warmup_max_lr": "auto", 30 | "warmup_num_steps": "auto" 31 | } 32 | }, 33 | 34 | "zero_optimization": { 35 | "stage": 2, 36 | "offload_optimizer": { 37 | "device": "none", 38 | "pin_memory": true 39 | }, 40 | "allgather_partitions": true, 41 | "allgather_bucket_size": 2e8, 42 | "overlap_comm": true, 43 | "reduce_scatter": true, 44 | "reduce_bucket_size": 2e8, 45 | "contiguous_gradients": true 46 | }, 47 | 48 | "gradient_accumulation_steps": "auto", 49 | "gradient_clipping": "auto", 50 | "steps_per_print": 100, 51 | "train_batch_size": "auto", 52 | "train_micro_batch_size_per_gpu": "auto", 53 | "wall_clock_breakdown": false 54 | } 55 | -------------------------------------------------------------------------------- /experiments/CPM-FT/finetune/ds_config_zero3.json: -------------------------------------------------------------------------------- 1 | 2 | { 3 | "fp16": { 4 | "enabled": "auto", 5 | "loss_scale": 0, 6 | "loss_scale_window": 1000, 7 | "initial_scale_power": 16, 8 | "hysteresis": 2, 9 | "min_loss_scale": 1 10 | }, 11 | "bf16": { 12 | "enabled": "auto" 13 | }, 14 | "optimizer": { 15 | "type": "AdamW", 16 | "params": { 17 | "lr": "auto", 18 | "betas": "auto", 19 | "eps": "auto", 20 | "weight_decay": "auto" 21 | } 22 | }, 23 | 24 | "scheduler": { 25 | "type": "WarmupLR", 26 | "params": { 27 | "warmup_min_lr": "auto", 28 | "warmup_max_lr": "auto", 29 | "warmup_num_steps": "auto" 30 | } 31 | }, 32 | 33 | "zero_optimization": { 34 | "stage": 3, 35 | "offload_optimizer": { 36 | "device": "none", 37 | "pin_memory": true 38 | }, 39 | "offload_param": { 40 | "device": "none", 41 | "pin_memory": true 42 | }, 43 | "overlap_comm": true, 44 | "contiguous_gradients": true, 45 | "sub_group_size": 1e9, 46 | "reduce_bucket_size": "auto", 47 | "stage3_prefetch_bucket_size": "auto", 48 | "stage3_param_persistence_threshold": "auto", 49 | "stage3_max_live_parameters": 1e9, 50 | "stage3_max_reuse_distance": 1e9, 51 | "stage3_gather_16bit_weights_on_model_save": true 52 | }, 53 | 54 | "gradient_accumulation_steps": "auto", 55 | "gradient_clipping": "auto", 56 | "steps_per_print": 100, 57 | 
"train_batch_size": "auto", 58 | "train_micro_batch_size_per_gpu": "auto", 59 | "wall_clock_breakdown": false 60 | } 61 | 62 | -------------------------------------------------------------------------------- /experiments/CPM-FT/finetune/finetune_ds.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | GPUS_PER_NODE=8 4 | NNODES=1 5 | NODE_RANK=0 6 | MASTER_ADDR=localhost 7 | MASTER_PORT=6001 8 | 9 | MODEL="openbmb/MiniCPM-V-2_6" 10 | # or openbmb/MiniCPM-V-2, openbmb/MiniCPM-Llama3-V-2_5 11 | # ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations. 12 | # See the section for finetuning in README for more information. 13 | DATA="path/to/trainging_data" 14 | EVAL_DATA="path/to/test_data" 15 | LLM_TYPE="qwen2" # if use openbmb/MiniCPM-V-2, please set LLM_TYPE=minicpm, if use openbmb/MiniCPM-Llama3-V-2_5, please set LLM_TYPE="llama3" 16 | 17 | 18 | 19 | DISTRIBUTED_ARGS=" 20 | --nproc_per_node $GPUS_PER_NODE \ 21 | --nnodes $NNODES \ 22 | --node_rank $NODE_RANK \ 23 | --master_addr $MASTER_ADDR \ 24 | --master_port $MASTER_PORT 25 | " 26 | torchrun $DISTRIBUTED_ARGS finetune.py \ 27 | --model_name_or_path $MODEL \ 28 | --llm_type $LLM_TYPE \ 29 | --data_path $DATA \ 30 | --eval_data_path $EVAL_DATA \ 31 | --remove_unused_columns false \ 32 | --label_names "labels" \ 33 | --prediction_loss_only false \ 34 | --bf16 true \ 35 | --bf16_full_eval true \ 36 | --fp16 false \ 37 | --fp16_full_eval false \ 38 | --do_train \ 39 | --do_eval \ 40 | --tune_vision true \ 41 | --tune_llm true \ 42 | --model_max_length 2048 \ 43 | --max_slice_nums 9 \ 44 | --max_steps 10000 \ 45 | --eval_steps 1000 \ 46 | --output_dir output/output_minicpmv26 \ 47 | --logging_dir output/output_minicpmv26 \ 48 | --logging_strategy "steps" \ 49 | --per_device_train_batch_size 1 \ 50 | --per_device_eval_batch_size 1 \ 51 | --gradient_accumulation_steps 1 \ 52 | --evaluation_strategy "steps" \ 53 | --save_strategy "steps" \ 54 | --save_steps 1000 \ 55 | --save_total_limit 10 \ 56 | --learning_rate 1e-6 \ 57 | --weight_decay 0.1 \ 58 | --adam_beta2 0.95 \ 59 | --warmup_ratio 0.01 \ 60 | --lr_scheduler_type "cosine" \ 61 | --logging_steps 1 \ 62 | --gradient_checkpointing true \ 63 | --deepspeed ds_config_zero2.json \ 64 | --report_to "tensorboard" 65 | -------------------------------------------------------------------------------- /experiments/CPM-FT/finetune/finetune_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | GPUS_PER_NODE=8 4 | NNODES=1 5 | NODE_RANK=0 6 | MASTER_ADDR=localhost 7 | MASTER_PORT=6001 8 | 9 | MODEL="openbmb/MiniCPM-V-2_6" # or openbmb/MiniCPM-V-2, openbmb/MiniCPM-Llama3-V-2_5 10 | # ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations. 11 | # See the section for finetuning in README for more information. 
12 | DATA="path/to/trainging_data" 13 | EVAL_DATA="path/to/test_data" 14 | LLM_TYPE="qwen2" 15 | # if use openbmb/MiniCPM-V-2, please set LLM_TYPE=minicpm 16 | #if use openbmb/MiniCPM-Llama3-V-2_5, please set LLM_TYPE=llama3 17 | DISTRIBUTED_ARGS=" 18 | --nproc_per_node $GPUS_PER_NODE \ 19 | --nnodes $NNODES \ 20 | --node_rank $NODE_RANK \ 21 | --master_addr $MASTER_ADDR \ 22 | --master_port $MASTER_PORT 23 | " 24 | torchrun $DISTRIBUTED_ARGS finetune.py \ 25 | --model_name_or_path $MODEL \ 26 | --llm_type $LLM_TYPE \ 27 | --data_path $DATA \ 28 | --eval_data_path $EVAL_DATA \ 29 | --remove_unused_columns false \ 30 | --label_names "labels" \ 31 | --prediction_loss_only false \ 32 | --bf16 false \ 33 | --bf16_full_eval false \ 34 | --fp16 true \ 35 | --fp16_full_eval true \ 36 | --do_train \ 37 | --do_eval \ 38 | --tune_vision true \ 39 | --tune_llm false \ 40 | --use_lora true \ 41 | --lora_target_modules "llm\..*layers\.\d+\.self_attn\.(q_proj|k_proj|v_proj|o_proj)" \ 42 | --model_max_length 2048 \ 43 | --max_slice_nums 9 \ 44 | --max_steps 10000 \ 45 | --eval_steps 1000 \ 46 | --output_dir output/output__lora \ 47 | --logging_dir output/output_lora \ 48 | --logging_strategy "steps" \ 49 | --per_device_train_batch_size 1 \ 50 | --per_device_eval_batch_size 1 \ 51 | --gradient_accumulation_steps 1 \ 52 | --evaluation_strategy "steps" \ 53 | --save_strategy "steps" \ 54 | --save_steps 1000 \ 55 | --save_total_limit 10 \ 56 | --learning_rate 1e-6 \ 57 | --weight_decay 0.1 \ 58 | --adam_beta2 0.95 \ 59 | --warmup_ratio 0.01 \ 60 | --lr_scheduler_type "cosine" \ 61 | --logging_steps 1 \ 62 | --gradient_checkpointing true \ 63 | --deepspeed ds_config_zero2.json \ 64 | --report_to "tensorboard" # wandb 65 | -------------------------------------------------------------------------------- /experiments/CPM-FT/inference/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/experiments/CPM-FT/inference/__init__.py -------------------------------------------------------------------------------- /experiments/CPM-FT/inference/eval.py: -------------------------------------------------------------------------------- 1 | from inference.utils import load_pretrained_model 2 | from PIL import Image 3 | 4 | def eval(image_path): 5 | image = Image.open(image_path).convert('RGB') 6 | model, tokenizer = load_pretrained_model() 7 | q = '''You are an autonomous intelligent agent tasked with navigating a web browser . You will be given web - based tasks . These tasks will be accomplished through the use of specific actions you can issue. Here’s the information you’ll have: 8 | The user’s objective: Tell me about birthday of Mercedes Sosa 9 | The current web page’s URL: https://en.wikipedia.org/wiki/Mercedes_Sosa 10 | The open tabs: Mercedes_Sosa 11 | The previous action: None 12 | The actions you can perform fall into several categories: 13 | Page Operation Actions: 14 | ```click[id]```: This action clicks on an element with a specific id on the webpage. 15 | ```type[id][content]```: Use this to type the content into the field with id. By default, the " Enter " key is pressed after typing unless press_enter_after is set to 0, i.e., ```type[id][content][0]```. 16 | ```hover[id]```: Hover over an element with id. 17 | ```press[key_comb]```: Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v). 
18 | ```scroll[down]``` or ```scroll[up]```: Scroll the page up or down. 19 | Tab Management Actions : 20 | ```new_tab```: Open a new, empty browser tab. 21 | ```tab_focus[tab_index]```: Switch the browser’s focus to a specific tab using its index. 22 | ```close_tab```: Close the currently active tab. 23 | URL Navigation Actions: 24 | ```goto[url]```: Navigate to a specific URL. 25 | ```go_back```: Navigate to the previously viewed page. 26 | ```go_forward```: Navigate to the next page (if a previous’ go_back’ action was performed). 27 | Completion Action : 28 | ```stop[answer]```: Issue this action when you believe the task is 29 | complete. If the objective is to find a text-based answer, provide 30 | the answer in the bracket. 31 | Homepage: https://www.google.com.hk/ 32 | ''' 33 | msgs = [{'role': 'user', 'content': [image, q]}] 34 | 35 | answer = model.chat( 36 | image=None, 37 | msgs=msgs, 38 | tokenizer=tokenizer 39 | ) 40 | return answer 41 | 42 | 43 | if __name__ == "__main__": 44 | answer = eval("assets/sosa.png") 45 | print(answer) -------------------------------------------------------------------------------- /experiments/CPM-FT/inference/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from PIL import Image 3 | from transformers import AutoModel, AutoTokenizer 4 | 5 | def load_pretrained_model(): 6 | torch.manual_seed(0) 7 | 8 | model = AutoModel.from_pretrained('openbmb/MiniCPM-V-2_6', trust_remote_code=True, 9 | attn_implementation='sdpa', torch_dtype=torch.bfloat16) # sdpa or flash_attention_2, no eager 10 | model = model.eval().cuda() 11 | tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-V-2_6', trust_remote_code=True) 12 | return model, tokenizer 13 | 14 | from peft import PeftModel 15 | 16 | def load_pretrained_model_lora(peft_model_id): 17 | # model_id = 'openbmb/MiniCPM-V-2_6' 18 | model, tokenizer = load_pretrained_model() 19 | print("Load Lora") 20 | model = PeftModel.from_pretrained(model, peft_model_id) 21 | print("Lora merge and unload") 22 | model.merge_and_unload() 23 | return model, tokenizer -------------------------------------------------------------------------------- /experiments/CPM-FT/main.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /experiments/CPM-FT/requirements.txt: -------------------------------------------------------------------------------- 1 | packaging==23.2 2 | addict==2.4.0 3 | editdistance==0.6.2 4 | einops==0.7.0 5 | fairscale==0.4.0 6 | jsonlines==4.0.0 7 | markdown2==2.4.10 8 | matplotlib==3.7.4 9 | more_itertools==10.1.0 10 | nltk==3.8.1 11 | numpy==1.24.4 12 | opencv_python_headless==4.5.5.64 13 | openpyxl==3.1.2 14 | Pillow==10.1.0 15 | sacrebleu==2.3.2 16 | seaborn==0.13.0 17 | shortuuid==1.0.11 18 | #spacy==3.7.2 19 | timm==0.9.10 20 | torch==2.1.2 21 | torchvision==0.16.2 22 | tqdm==4.66.1 23 | protobuf==4.25.0 24 | transformers==4.40.0 25 | typing_extensions==4.8.0 26 | uvicorn==0.24.0.post1 27 | #xformers==0.0.22.post7 28 | #flash_attn==2.3.4 29 | sentencepiece==0.1.99 30 | accelerate==0.30.1 31 | socksio==1.0.0 32 | gradio==4.41.0 33 | gradio_client 34 | http://thunlp.oss-cn-qingdao.aliyuncs.com/multi_modal/never_delete/modelscope_studio-0.4.0.9-py3-none-any.whl 35 | decord -------------------------------------------------------------------------------- /experiments/CPM-FT/scripts/convert_baai_stats.py: 
-------------------------------------------------------------------------------- 1 | from datasets import load_dataset 2 | from tqdm import tqdm 3 | from collections import defaultdict 4 | ds = load_dataset("BAAI/Infinity-Instruct", "0625") 5 | 6 | ds = ds["train"] 7 | ability = defaultdict(lambda : 0) 8 | cate_ability = defaultdict(lambda : 0) 9 | for item in tqdm(ds): 10 | # print(item) 11 | abs = item["label"]["ability_en"] 12 | for each in abs: 13 | ability[each] += 1 14 | 15 | abs = item["label"]["cate_ability_en"] 16 | for each in abs: 17 | cate_ability[each] += 1 18 | # break 19 | print(ability) 20 | print(cate_ability) 21 | 22 | with open("stats.json", "w") as f: 23 | import json 24 | json.dump({"abs": ability, "cate_abs": cate_ability}, f, indent=4, ensure_ascii=False) -------------------------------------------------------------------------------- /experiments/CPM-FT/scripts/download_cauldron.py: -------------------------------------------------------------------------------- 1 | import datasets 2 | from datasets import load_dataset 3 | 4 | def convert(item): 5 | pass 6 | ds = load_dataset("HuggingFaceM4/the_cauldron", "ai2d") 7 | dataset = ds["train"] 8 | for item in dataset: 9 | print(item) 10 | break -------------------------------------------------------------------------------- /experiments/CPM-FT/scripts/ds_config_zero2.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | 11 | "bf16": { 12 | "enabled": "auto" 13 | }, 14 | 15 | "optimizer": { 16 | "type": "AdamW", 17 | "params": { 18 | "lr": "auto", 19 | "betas": "auto", 20 | "eps": "auto", 21 | "weight_decay": "auto" 22 | } 23 | }, 24 | 25 | "zero_optimization": { 26 | "stage": 2, 27 | "offload_optimizer": { 28 | "device": "none", 29 | "pin_memory": true 30 | }, 31 | "allgather_partitions": true, 32 | "allgather_bucket_size": 2e8, 33 | "overlap_comm": true, 34 | "reduce_scatter": true, 35 | "reduce_bucket_size": 2e8, 36 | "contiguous_gradients": true 37 | }, 38 | 39 | "gradient_accumulation_steps": "auto", 40 | "gradient_clipping": "auto", 41 | "steps_per_print": 100, 42 | "train_batch_size": "auto", 43 | "train_micro_batch_size_per_gpu": "auto", 44 | "wall_clock_breakdown": false 45 | } 46 | -------------------------------------------------------------------------------- /experiments/CPM-FT/scripts/ds_config_zero3.json: -------------------------------------------------------------------------------- 1 | 2 | { 3 | "fp16": { 4 | "enabled": "auto", 5 | "loss_scale": 0, 6 | "loss_scale_window": 1000, 7 | "initial_scale_power": 16, 8 | "hysteresis": 2, 9 | "min_loss_scale": 1 10 | }, 11 | "bf16": { 12 | "enabled": "auto" 13 | }, 14 | "optimizer": { 15 | "type": "AdamW", 16 | "params": { 17 | "lr": "auto", 18 | "betas": "auto", 19 | "eps": "auto", 20 | "weight_decay": "auto" 21 | } 22 | }, 23 | 24 | "scheduler": { 25 | "type": "WarmupLR", 26 | "params": { 27 | "warmup_min_lr": "auto", 28 | "warmup_max_lr": "auto", 29 | "warmup_num_steps": "auto" 30 | } 31 | }, 32 | 33 | "zero_optimization": { 34 | "stage": 3, 35 | "offload_optimizer": { 36 | "device": "none", 37 | "pin_memory": true 38 | }, 39 | "offload_param": { 40 | "device": "none", 41 | "pin_memory": true 42 | }, 43 | "overlap_comm": true, 44 | "contiguous_gradients": true, 45 | "sub_group_size": 1e9, 46 | "reduce_bucket_size": "auto", 47 | 
"stage3_prefetch_bucket_size": "auto", 48 | "stage3_param_persistence_threshold": "auto", 49 | "stage3_max_live_parameters": 1e9, 50 | "stage3_max_reuse_distance": 1e9, 51 | "stage3_gather_16bit_weights_on_model_save": true 52 | }, 53 | 54 | "gradient_accumulation_steps": "auto", 55 | "gradient_clipping": "auto", 56 | "steps_per_print": 100, 57 | "train_batch_size": "auto", 58 | "train_micro_batch_size_per_gpu": "auto", 59 | "wall_clock_breakdown": false 60 | } 61 | 62 | -------------------------------------------------------------------------------- /experiments/CPM-FT/scripts/filter_baai_dataset.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset 2 | from tqdm import tqdm 3 | from collections import defaultdict 4 | ds = load_dataset("BAAI/Infinity-Instruct", "0625") 5 | 6 | ds = ds["train"] 7 | selected = [ 8 | "python programming", 9 | #"search skills", 10 | #"code refactoring", 11 | #"search engine optimization", 12 | #"code debugging", 13 | #"code modification", 14 | #"code implementation", 15 | ] 16 | import copy 17 | def process(item): 18 | item_new = copy.deepcopy(item) 19 | item_new["image"] = dict() 20 | 21 | conv = [] 22 | for turn in item["conversations"]: 23 | role = "user" if turn["from"] == "human" else "assistant" 24 | conv.append( 25 | {"role": role, "content": turn["value"]} 26 | ) 27 | item_new["conversations"] = conv 28 | return item_new 29 | saved = [] 30 | for item in tqdm(ds): 31 | abs = item["label"]["ability_en"] 32 | keep = False 33 | for each in abs: 34 | if each in selected: 35 | keep = True 36 | break 37 | 38 | 39 | 40 | if not keep: 41 | continue 42 | 43 | saved.append(process(item)) 44 | 45 | print("Total", len(saved)) 46 | with open("subset.json", "w") as f: 47 | import json 48 | json.dump(saved, f, indent=4, ensure_ascii=False) -------------------------------------------------------------------------------- /experiments/CPM-FT/scripts/finetune_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | GPUS_PER_NODE=$(nvidia-smi -L | wc -l) 4 | NNODES=1 5 | NODE_RANK=0 6 | MASTER_ADDR=localhost 7 | MASTER_PORT=6001 8 | 9 | MODEL="openbmb/MiniCPM-V-2_6" # or openbmb/MiniCPM-V-2, openbmb/MiniCPM-Llama3-V-2_5 10 | # ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations. 11 | # See the section for finetuning in README for more information. 
12 | DATA="data/agent_tune_dataset_cpm_cleaned_9k.json" 13 | LLM_TYPE="qwen2" 14 | # if use openbmb/MiniCPM-V-2, please set LLM_TYPE=minicpm 15 | #if use openbmb/MiniCPM-Llama3-V-2_5, please set LLM_TYPE=llama3 16 | DISTRIBUTED_ARGS=" 17 | --nproc_per_node $GPUS_PER_NODE \ 18 | --nnodes $NNODES \ 19 | --node_rank $NODE_RANK \ 20 | --master_addr $MASTER_ADDR \ 21 | --master_port $MASTER_PORT 22 | " 23 | 24 | export WANDB_PROJECT=minicpm 25 | torchrun $DISTRIBUTED_ARGS finetune/finetune.py \ 26 | --model_name_or_path $MODEL \ 27 | --llm_type $LLM_TYPE \ 28 | --data_path $DATA \ 29 | --remove_unused_columns false \ 30 | --label_names "labels" \ 31 | --prediction_loss_only false \ 32 | --bf16 false \ 33 | --bf16_full_eval false \ 34 | --fp16 true \ 35 | --fp16_full_eval true \ 36 | --do_train \ 37 | --tune_vision false \ 38 | --tune_llm false \ 39 | --use_lora true \ 40 | --lora_target_modules "llm\..*layers\.\d+\.self_attn\.(q_proj|k_proj|v_proj|o_proj)" \ 41 | --model_max_length 10240 \ 42 | --max_slice_nums 9 \ 43 | --eval_steps 100000 \ 44 | --output_dir output/cpm_v2_6_$SLURM_JOB_ID \ 45 | --logging_dir output/cpm_v2_6_log_$SLURM_JOB_ID \ 46 | --logging_strategy "steps" \ 47 | --per_device_train_batch_size 2 \ 48 | --per_device_eval_batch_size 1 \ 49 | --gradient_accumulation_steps 1 \ 50 | --evaluation_strategy "steps" \ 51 | --save_strategy "steps" \ 52 | --save_steps 100000 \ 53 | --save_total_limit 1 \ 54 | --learning_rate 1e-6 \ 55 | --weight_decay 0.1 \ 56 | --adam_beta2 0.95 \ 57 | --warmup_ratio 0.01 \ 58 | --lr_scheduler_type "cosine" \ 59 | --logging_steps 1 \ 60 | --gradient_checkpointing true \ 61 | --deepspeed scripts/ds_config_zero2.json \ 62 | --report_to wandb \ 63 | --num_train_epochs 1 \ 64 | --image_base_path ./ 65 | -------------------------------------------------------------------------------- /experiments/CPM-FT/scripts/subset.py: -------------------------------------------------------------------------------- 1 | 2 | import json 3 | 4 | 5 | source_file = "data/agent_tune_dataset_cpm.json" 6 | with open(source_file, "r") as f: 7 | dataset = json.load(f) 8 | 9 | 10 | with open("data/debug_small.json", "w") as f: 11 | json.dump(dataset[:100], f, indent=4, ensure_ascii=False) -------------------------------------------------------------------------------- /experiments/CPM-FT/scripts/tokenizer.py: -------------------------------------------------------------------------------- 1 | 2 | from transformers import AutoModel, AutoTokenizer 3 | 4 | tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-V-2_6', trust_remote_code=True) 5 | 6 | 7 | print(tokenizer.decode([151646, 151647, 151656, 151657])) -------------------------------------------------------------------------------- /experiments/CPM-FT/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/experiments/CPM-FT/tests/__init__.py -------------------------------------------------------------------------------- /experiments/CPM-FT/tests/test_infer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from PIL import Image 3 | from transformers import AutoModel, AutoTokenizer 4 | 5 | torch.manual_seed(0) 6 | 7 | model = AutoModel.from_pretrained('openbmb/MiniCPM-V-2_6', trust_remote_code=True, 8 | attn_implementation='sdpa', torch_dtype=torch.bfloat16) # sdpa or flash_attention_2, no eager 9 | model = model.eval().cuda() 10 | 
tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-V-2_6', trust_remote_code=True) 11 | 12 | image = Image.open('./assets/airplane.jpeg').convert('RGB') 13 | 14 | # First round chat 15 | question = "Tell me the model of this aircraft." 16 | msgs = [{'role': 'user', 'content': [image, question]}] 17 | print(type(model)) 18 | model.terminators.append("3") 19 | # exit() 20 | answer = model.chat( 21 | image=None, 22 | msgs=msgs, 23 | system_prompt="Respond in chinese.", 24 | tokenizer=tokenizer 25 | ) 26 | print(answer) 27 | 28 | # Second round chat 29 | # pass history context of multi-turn conversation 30 | # msgs.append({"role": "assistant", "content": [answer]}) 31 | # msgs.append({"role": "user", "content": ["Introduce something about Airbus A380."]}) 32 | 33 | # answer = model.chat( 34 | # image=None, 35 | # msgs=msgs, 36 | # tokenizer=tokenizer 37 | # ) 38 | # print(answer) -------------------------------------------------------------------------------- /experiments/CPM-FT/tests/test_infer_lora.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from PIL import Image 3 | from transformers import AutoModel, AutoTokenizer 4 | from inference.utils import load_pretrained_model_lora 5 | torch.manual_seed(0) 6 | 7 | model, tokenizer = load_pretrained_model_lora("output/cpm_v2_6_7680255/") 8 | # model = AutoModel.from_pretrained('openbmb/MiniCPM-V-2_6', trust_remote_code=True, 9 | # attn_implementation='sdpa', torch_dtype=torch.bfloat16) # sdpa or flash_attention_2, no eager 10 | model = model.eval().cuda() 11 | # tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-V-2_6', trust_remote_code=True) 12 | 13 | image = Image.open('./assets/airplane.jpeg').convert('RGB') 14 | 15 | # First round chat 16 | question = "Tell me the model of this aircraft." 
17 | msgs = [{'role': 'user', 'content': [image, question]}] 18 | 19 | answer = model.chat( 20 | image=None, 21 | msgs=msgs, 22 | tokenizer=tokenizer 23 | ) 24 | print("=" * 10) 25 | print(answer) 26 | print("=" * 10) 27 | 28 | # Second round chat 29 | # pass history context of multi-turn conversation 30 | msgs.append({"role": "assistant", "content": [answer]}) 31 | msgs.append({"role": "user", "content": ["Introduce something about Airbus A380."]}) 32 | 33 | answer = model.chat( 34 | image=None, 35 | msgs=msgs, 36 | tokenizer=tokenizer 37 | ) 38 | print(answer) -------------------------------------------------------------------------------- /experiments/Qwen-VL/.github/ISSUE_TEMPLATE/config.yaml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: true 2 | -------------------------------------------------------------------------------- /experiments/Qwen-VL/.github/ISSUE_TEMPLATE/feature_request.yaml: -------------------------------------------------------------------------------- 1 | name: "💡 Feature Request" 2 | description: 创建新功能请求 | Create a new ticket for a new feature request 3 | title: "💡 [REQUEST] - " 4 | labels: [ 5 | "question" 6 | ] 7 | body: 8 | - type: input 9 | id: start_date 10 | attributes: 11 | label: "起始日期 | Start Date" 12 | description: | 13 | 起始开发日期 14 | Start of development 15 | placeholder: "month/day/year" 16 | validations: 17 | required: false 18 | - type: textarea 19 | id: implementation_pr 20 | attributes: 21 | label: "实现PR | Implementation PR" 22 | description: | 23 | 实现该功能的Pull request 24 | Pull request used 25 | placeholder: "#Pull Request ID" 26 | validations: 27 | required: false 28 | - type: textarea 29 | id: reference_issues 30 | attributes: 31 | label: "相关Issues | Reference Issues" 32 | description: | 33 | 与该功能相关的issues 34 | Common issues 35 | placeholder: "#Issues IDs" 36 | validations: 37 | required: false 38 | - type: textarea 39 | id: summary 40 | attributes: 41 | label: "摘要 | Summary" 42 | description: | 43 | 简要描述新功能的特点 44 | Provide a brief explanation of the feature 45 | placeholder: | 46 | Describe in a few lines your feature request 47 | validations: 48 | required: true 49 | - type: textarea 50 | id: basic_example 51 | attributes: 52 | label: "基本示例 | Basic Example" 53 | description: Indicate here some basic examples of your feature. 54 | placeholder: A few specific words about your feature request. 55 | validations: 56 | required: true 57 | - type: textarea 58 | id: drawbacks 59 | attributes: 60 | label: "缺陷 | Drawbacks" 61 | description: | 62 | 该新功能有哪些缺陷/可能造成哪些影响? 63 | What are the drawbacks/impacts of your feature request ? 64 | placeholder: | 65 | Identify the drawbacks and impacts while being neutral on your feature request 66 | validations: 67 | required: true 68 | - type: textarea 69 | id: unresolved_question 70 | attributes: 71 | label: "未解决问题 | Unresolved questions" 72 | description: | 73 | 有哪些尚未解决的问题? 74 | What questions still remain unresolved ? 75 | placeholder: | 76 | Identify any unresolved issues. 
77 | validations: 78 | required: false -------------------------------------------------------------------------------- /experiments/Qwen-VL/.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | *.so 3 | build 4 | .coverage_* 5 | *.egg-info 6 | *~ 7 | .vscode/ 8 | .idea/ 9 | .DS_Store 10 | 11 | /private/ 12 | Qwen-VL-Chat/ 13 | Qwen-VL-Chat-Int4/ 14 | SimSun.ttf 15 | -------------------------------------------------------------------------------- /experiments/Qwen-VL/BUILD.md: -------------------------------------------------------------------------------- 1 | ## qwen web demo 2 | 3 | ### build 4 | 5 | ``` 6 | docker build -t qwen-vl-chat:webdemo --platform linux/amd64 -f Dockerfile.qwendemo . 7 | ``` 8 | 9 | ### run 10 | 11 | ``` 12 | docker run -it --gpus device=0 -d --restart always -v /var/run/docker.sock:/var/run/docker.sock --name qwen-vl-chat -p 8000:8000 --user=20001:20001 --platform linux/amd64 qwen-vl-chat:webdemo 13 | ``` 14 | 15 | ## qwen openai api 16 | 17 | ### build 18 | 19 | ``` 20 | docker build -t qwen-vl-chat:openai --platform linux/amd64 -f Dockerfile.qwenopenai . 21 | ``` 22 | 23 | ### run 24 | 25 | ``` 26 | docker run -it --gpus device=0 -d --restart always -v /var/run/docker.sock:/var/run/docker.sock --name qwen-vl-chat -p 8080:8080 --user=20001:20001 --platform linux/amd64 qwen-vl-chat:openai 27 | ``` 28 | 29 | ## qwen-int4 openai api 30 | 31 | ### build 32 | 33 | ``` 34 | docker build -t qwen-vl-chat:int4-openai --platform linux/amd64 -f Dockerfile.qwenint4openai . 35 | ``` 36 | 37 | ### run 38 | 39 | ``` 40 | docker run -it --gpus device=0 -d --restart always -v /var/run/docker.sock:/var/run/docker.sock --name qwen-vl-chat-int4 -p 8080:8080 --user=20001:20001 --platform linux/amd64 qwen-vl-chat:int4-openai 41 | ``` 42 | -------------------------------------------------------------------------------- /experiments/Qwen-VL/Dockerfile.qwendemo: -------------------------------------------------------------------------------- 1 | # python 3.8 and above 2 | # pytorch 1.12 and above, 2.0 and above are recommended 3 | # CUDA 11.4 and above are recommended (this is for GPU users, flash-attention users, etc.) 
4 | 5 | # based on modelscope docker image 6 | # registry.cn-hangzhou.aliyuncs.com/modelscope-repo/modelscope:ubuntu20.04-cuda11.7.1-py38-torch2.0.1-tf1.15.5-1.8.0 7 | # registry.cn-beijing.aliyuncs.com/modelscope-repo/modelscope:ubuntu20.04-cuda11.7.1-py38-torch2.0.1-tf1.15.5-1.8.0 8 | FROM registry.cn-hangzhou.aliyuncs.com/modelscope-repo/modelscope:ubuntu20.04-cuda11.7.1-py38-torch2.0.1-tf1.15.5-1.8.0 9 | 10 | ARG workdir=/var/app 11 | RUN mkdir -p ${workdir} 12 | 13 | RUN git lfs install 14 | 15 | WORKDIR ${workdir} 16 | COPY requirements.txt requirements_web_demo.txt ./ 17 | 18 | # Install Qwen dependencies 19 | RUN pip install -r requirements.txt 20 | 21 | # Install webUI dependencies 22 | WORKDIR ${workdir} 23 | RUN pip install -r requirements_web_demo.txt 24 | 25 | # Offline mode, check https://huggingface.co/docs/transformers/v4.15.0/installation#offline-mode 26 | ENV HF_DATASETS_OFFLINE=1 27 | ENV TRANSFORMERS_OFFLINE=1 28 | 29 | # set TZ, make logs dir, and expose port 8080 30 | ENV TZ=Asia/Shanghai 31 | RUN mkdir -p ${workdir}/logs && chmod 777 ${workdir}/logs 32 | VOLUME /var/app/logs 33 | 34 | # create user 20001 35 | RUN useradd -r -m appuser -u 20001 -g 0 36 | 37 | WORKDIR ${workdir} 38 | # copy model 39 | RUN git clone https://huggingface.co/Qwen/Qwen-VL-Chat 40 | # COPY --chown=20001:20001 Qwen-VL-Chat ./Qwen-VL-Chat 41 | # copy fonts 42 | ADD --chown=20001:20001 https://github.com/StellarCN/scp_zh/raw/master/fonts/SimSun.ttf ./ 43 | # COPY --chown=20001:20001 SimSun.ttf ./ 44 | # copy main app 45 | COPY --chown=20001:20001 web_demo_mm.py ./ 46 | 47 | EXPOSE 8000 48 | CMD ["python3", "web_demo_mm.py", "-c", "./Qwen-VL-Chat", "--server-name", "0.0.0.0", "--server-port", "8000"] 49 | -------------------------------------------------------------------------------- /experiments/Qwen-VL/Dockerfile.qwenint4openai: -------------------------------------------------------------------------------- 1 | # python 3.8 and above 2 | # pytorch 1.12 and above, 2.0 and above are recommended 3 | # CUDA 11.4 and above are recommended (this is for GPU users, flash-attention users, etc.) 
4 | 5 | # based on modelscope docker image 6 | # registry.cn-hangzhou.aliyuncs.com/modelscope-repo/modelscope:ubuntu20.04-cuda11.7.1-py38-torch2.0.1-tf1.15.5-1.8.0 7 | # registry.cn-beijing.aliyuncs.com/modelscope-repo/modelscope:ubuntu20.04-cuda11.7.1-py38-torch2.0.1-tf1.15.5-1.8.0 8 | FROM registry.cn-hangzhou.aliyuncs.com/modelscope-repo/modelscope:ubuntu20.04-cuda11.7.1-py38-torch2.0.1-tf1.15.5-1.8.0 9 | 10 | ARG workdir=/var/app 11 | RUN mkdir -p ${workdir} 12 | 13 | RUN git lfs install 14 | 15 | WORKDIR ${workdir} 16 | COPY requirements.txt requirements_web_demo.txt ./ 17 | 18 | # Install Qwen dependencies 19 | RUN pip install -r requirements.txt 20 | 21 | # Install webUI dependencies 22 | WORKDIR ${workdir} 23 | RUN pip install -r requirements_web_demo.txt 24 | 25 | # Offline mode, check https://huggingface.co/docs/transformers/v4.15.0/installation#offline-mode 26 | ENV HF_DATASETS_OFFLINE=1 27 | ENV TRANSFORMERS_OFFLINE=1 28 | 29 | # set TZ, make logs dir, and expose port 8080 30 | ENV TZ=Asia/Shanghai 31 | RUN mkdir -p ${workdir}/logs && chmod 777 ${workdir}/logs 32 | VOLUME /var/app/logs 33 | 34 | # create user 20001 35 | RUN useradd -r -m appuser -u 20001 -g 0 36 | 37 | WORKDIR ${workdir} 38 | # copy model 39 | RUN git clone https://huggingface.co/Qwen/Qwen-VL-Chat-Int4 40 | # COPY --chown=20001:20001 Qwen-VL-Chat-Int4 ./Qwen-VL-Chat-Int4 41 | 42 | # Install AutoGPTQ 43 | RUN pip install optimum 44 | # RUN git clone https://github.com/JustinLin610/AutoGPTQ.git && \ 45 | # cd AutoGPTQ && \ 46 | # pip install -v . 47 | RUN pip install auto-gptq --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu117/ 48 | 49 | # Install OpenAI API dependencies 50 | WORKDIR ${workdir} 51 | COPY requirements_openai_api.txt ./ 52 | RUN pip install -r requirements_openai_api.txt 53 | # copy fonts 54 | ADD --chown=20001:20001 https://github.com/StellarCN/scp_zh/raw/master/fonts/SimSun.ttf ./ 55 | # COPY --chown=20001:20001 SimSun.ttf ./ 56 | # copy main app 57 | COPY --chown=20001:20001 openai_api.py ./ 58 | 59 | EXPOSE 8080 60 | # CMD ["python3", "openai_api.py", "-c", "./Qwen-VL-Chat", "--server-name", "0.0.0.0", "--server-port", "8080"] 61 | CMD ["python3", "openai_api.py", "-c", "./Qwen-VL-Chat-Int4", "--server-name", "0.0.0.0", "--server-port", "8080"] 62 | -------------------------------------------------------------------------------- /experiments/Qwen-VL/Dockerfile.qwenopenai: -------------------------------------------------------------------------------- 1 | # python 3.8 and above 2 | # pytorch 1.12 and above, 2.0 and above are recommended 3 | # CUDA 11.4 and above are recommended (this is for GPU users, flash-attention users, etc.) 
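# (Illustrative, not part of the original Dockerfile.) After starting a container as shown in
# BUILD.md, the service is expected to expose an OpenAI-compatible chat endpoint on port 8080;
# a request along these lines is a reasonable first smoke test. The exact route and payload
# are assumptions -- check openai_api.py for the actual schema and image format:
#   curl http://localhost:8080/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{"model": "Qwen-VL-Chat", "messages": [{"role": "user", "content": "Describe this image: <img>https://example.com/demo.jpeg</img>"}]}'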
4 | 5 | # based on modelscope docker image 6 | # registry.cn-hangzhou.aliyuncs.com/modelscope-repo/modelscope:ubuntu20.04-cuda11.7.1-py38-torch2.0.1-tf1.15.5-1.8.0 7 | # registry.cn-beijing.aliyuncs.com/modelscope-repo/modelscope:ubuntu20.04-cuda11.7.1-py38-torch2.0.1-tf1.15.5-1.8.0 8 | FROM registry.cn-hangzhou.aliyuncs.com/modelscope-repo/modelscope:ubuntu20.04-cuda11.7.1-py38-torch2.0.1-tf1.15.5-1.8.0 9 | 10 | ARG workdir=/var/app 11 | RUN mkdir -p ${workdir} 12 | 13 | RUN git lfs install 14 | 15 | WORKDIR ${workdir} 16 | COPY requirements.txt requirements_web_demo.txt ./ 17 | 18 | # Install Qwen dependencies 19 | RUN pip install -r requirements.txt 20 | 21 | # Install webUI dependencies 22 | WORKDIR ${workdir} 23 | RUN pip install -r requirements_web_demo.txt 24 | 25 | # Offline mode, check https://huggingface.co/docs/transformers/v4.15.0/installation#offline-mode 26 | ENV HF_DATASETS_OFFLINE=1 27 | ENV TRANSFORMERS_OFFLINE=1 28 | 29 | # set TZ, make logs dir, and expose port 8080 30 | ENV TZ=Asia/Shanghai 31 | RUN mkdir -p ${workdir}/logs && chmod 777 ${workdir}/logs 32 | VOLUME /var/app/logs 33 | 34 | # create user 20001 35 | RUN useradd -r -m appuser -u 20001 -g 0 36 | 37 | WORKDIR ${workdir} 38 | # copy model 39 | RUN git clone https://huggingface.co/Qwen/Qwen-VL-Chat 40 | # COPY --chown=20001:20001 Qwen-VL-Chat ./Qwen-VL-Chat 41 | 42 | # Install OpenAI API dependencies 43 | WORKDIR ${workdir} 44 | COPY requirements_openai_api.txt ./ 45 | RUN pip install -r requirements_openai_api.txt 46 | # copy fonts 47 | ADD --chown=20001:20001 https://github.com/StellarCN/scp_zh/raw/master/fonts/SimSun.ttf ./ 48 | # COPY --chown=20001:20001 SimSun.ttf ./ 49 | # copy main app 50 | COPY --chown=20001:20001 openai_api.py ./ 51 | 52 | EXPOSE 8080 53 | CMD ["python3", "openai_api.py", "-c", "./Qwen-VL-Chat", "--server-name", "0.0.0.0", "--server-port", "8080"] 54 | -------------------------------------------------------------------------------- /experiments/Qwen-VL/FAQ.md: -------------------------------------------------------------------------------- 1 | # FAQ 2 | 3 | ## Installation & Environment 4 | 5 | #### Which version of transformers should I use? 6 | 7 | 4.31.0 is preferred. 8 | 9 | #### I downloaded the codes and checkpoints but I can't load the model locally. What should I do? 10 | 11 | Please check if you have updated the code to the latest, and correctly downloaded all the sharded checkpoint files. 12 | 13 | #### `qwen.tiktoken` is not found. What is it? 14 | 15 | This is the merge file of the tokenizer. You have to download it. Note that if you just git clone the repo without [git-lfs](https://git-lfs.com), you cannot download this file. 16 | 17 | #### transformers_stream_generator/tiktoken/accelerate not found 18 | 19 | Run the command `pip install -r requirements.txt`. You can find the file at [https://github.com/QwenLM/Qwen-VL/blob/main/requirements.txt](https://github.com/QwenLM/Qwen-VL/blob/main/requirements.txt). 20 | <br><br> 21 | 22 | 23 | 24 | ## Demo & Inference 25 | 26 | #### Is there any demo? 27 | 28 | Yes, see `web_demo_mm.py` for web demo. See README for more information. 29 | 30 | 31 | 32 | #### Can Qwen-VL support streaming? 33 | 34 | No. We do not support streaming yet. 35 | 36 | #### It seems that the generation is not related to the instruction... 37 | 38 | Please check if you are loading Qwen-VL-Chat instead of Qwen-VL. Qwen-VL is the base model without alignment, which behaves differently from the SFT/Chat model. 39 | 40 | #### Is quantization supported? 
41 | 42 | No. We would support quantization asap. 43 | 44 | #### Unsatisfactory performance in processing long sequences 45 | 46 | Please ensure that NTK is applied. `use_dynamc_ntk` and `use_logn_attn` in `config.json` should be set to `true` (`true` by default). 47 | <br><br> 48 | 49 | 50 | ## Tokenizer 51 | 52 | #### bos_id/eos_id/pad_id not found 53 | 54 | In our training, we only use `<|endoftext|>` as the separator and padding token. You can set bos_id, eos_id, and pad_id to tokenizer.eod_id. Learn more about our tokenizer from our documents about the tokenizer. 55 | 56 | -------------------------------------------------------------------------------- /experiments/Qwen-VL/FAQ_ja.md: -------------------------------------------------------------------------------- 1 | # FAQ 2 | 3 | ## インストールと環境 4 | 5 | #### transformers のバージョンは? 6 | 7 | 4.31.0 が望ましいです。 8 | 9 | #### コードとチェックポイントをダウンロードしましたが、モデルをローカルにロードできません。どうすればよいでしょうか? 10 | 11 | コードを最新のものに更新し、すべてのシャードされたチェックポイントファイルを正しくダウンロードしたかどうか確認してください。 12 | 13 | #### `qwen.tiktoken` が見つかりません。これは何ですか? 14 | 15 | これは tokenizer のマージファイルです。ダウンロードする必要があります。[git-lfs](https://git-lfs.com) を使わずにリポジトリを git clone しただけでは、このファイルをダウンロードできないことに注意してください。 16 | 17 | #### transformers_stream_generator/tiktoken/accelerate が見つかりません。 18 | 19 | コマンド `pip install -r requirements.txt` を実行してください。このファイルは [https://github.com/QwenLM/Qwen-VL/blob/main/requirements.txt](https://github.com/QwenLM/Qwen-VL/blob/main/requirements.txt) にあります。 20 | <br><br> 21 | 22 | 23 | 24 | ## デモと推論 25 | 26 | #### デモはありますか? 27 | 28 | ウェブデモは `web_demo_mm.py` を参照してください。詳細は README を参照してください。 29 | 30 | 31 | 32 | #### Qwen-VLはストリーミングに対応していますか? 33 | 34 | いいえ、まだサポートしていません。 35 | 36 | #### 世代と命令は関係ないようですが... 37 | 38 | Qwen-VL ではなく Qwen-VL-Chat を読み込んでいないか確認してください。Qwen-VL はアライメントなしのベースモデルで、SFT/Chat モデルとは動作が異なります。 39 | 40 | #### 量子化はサポートされていますか? 41 | 42 | いいえ。早急に量子化をサポートするつもりです。 43 | 44 | #### 長いシーケンスの処理で不満足なパフォーマンス 45 | 46 | NTK が適用されていることを確認してください。`config.json` の `use_dynamc_ntk` と `use_logn_attn` を `true` に設定する必要がある(デフォルトでは `true`)。 47 | <br><br> 48 | 49 | 50 | ## Tokenizer 51 | 52 | #### bos_id/eos_id/pad_id が見つかりません。 53 | 54 | 私たちのトレーニングでは、セパレータとパディングトークンとして `<|endoftext|>` のみを使用しています。bos_id、eos_id、pad_id は tokenizer.eod_id に設定できます。私たちの tokenizer について詳しくは、tokenizer についてのドキュメントをご覧ください。 55 | 56 | -------------------------------------------------------------------------------- /experiments/Qwen-VL/FAQ_ko.md: -------------------------------------------------------------------------------- 1 | # FAQ 2 | 3 | ## 설치 및 환경 4 | 5 | #### 어떤 버전의 transformers를 사용해야 하나요? 6 | 7 | 4.31.0 버전을 사용하는 것을 선호합니다. 8 | 9 | #### 코드와 체크포인트를 다운로드했는데 모델을 로컬에서 불러올 수 없어요. 어떻게 해야 하나요? 10 | 11 | 코드를 최신 버전으로 업데이트했는지, 그리고 모든 샤드 체크포인트 파일을 올바르게 다운로드했는지 확인해 주세요. 12 | 13 | #### `qwen.tiktoken`을 찾을 수 없어요. 이게 무엇인가요? 14 | 15 | 이것은 토크나이저의 병합 파일입니다. 이 파일을 다운로드해야 합니다. [git-lfs](https://git-lfs.com) 없이 단순히 깃 저장소를 복제했다면 이 파일을 다운로드할 수 없습니다. 16 | 17 | #### transformers_stream_generator/tiktoken/accelerate not found 오류 18 | 19 | `pip install -r requirements.txt` 명령을 실행하세요. 이 파일은 [https://github.com/QwenLM/Qwen-VL/blob/main/requirements.txt](https://github.com/QwenLM/Qwen-VL/blob/main/requirements.txt)에서 찾을 수 있습니다. 20 | <br><br> 21 | 22 | 23 | ## Demo & Inference 24 | 25 | #### 데모가 있나요? 26 | 27 | 네, 웹 데모는 `web_demo_mm.py`를 참고하세요. 더 많은 정보는 README 파일에서 확인할 수 있습니다. 28 | 29 | 30 | 31 | #### Qwen-VL은 스트리밍을 지원하나요? 32 | 33 | 아니요. 아직 스트리밍을 지원하지 않습니다. 34 | 35 | #### 생성된 내용이 지시사항과 관련 없는 것 같습니다. 36 | 37 | Qwen-VL 대신 Qwen-VL-Chat을 로드하고 있는지 확인해 주세요. 
Qwen-VL은 SFT/Chat 모델과 달리 정렬이 없는 기본 모델이므로 다르게 작동합니다. 38 | 39 | #### 양자화를 지원하나요? 40 | 41 | 아니요. 가능한 빨리 양자화를 지원할 예정입니다. 42 | 43 | #### 긴 시퀀스 처리에서 만족스럽지 못한 성능 44 | 45 | NTK가 적용되었는지 확인해 주세요. `config.json`의 `use_dynamc_ntk`과 `use_logn_attn`은 `true`로 설정되어야 합니다(`true`가 기본값). 46 | <br><br> 47 | 48 | 49 | ## Tokenizer 50 | 51 | #### bos_id/eos_id/pad_id not found 오류 52 | 53 | 저희 훈련에서는 ``을 구분자 및 패딩 토큰으로만 사용합니다. bos_id, eos_id, pad_id를 tokenizer.eod_id로 설정할 수 있습니다. 토크나이저에 대한 문서에서 토크나이저에 대해 더 알아보세요. -------------------------------------------------------------------------------- /experiments/Qwen-VL/FAQ_zh.md: -------------------------------------------------------------------------------- 1 | # FAQ 2 | 3 | ## 安装&环境 4 | 5 | #### 我应该用哪个transformers版本? 6 | 7 | 建议使用4.31.0。 8 | 9 | #### 我把模型和代码下到本地,按照教程无法使用,该怎么办? 10 | 11 | 答:别着急,先检查你的代码是不是更新到最新版本,然后确认你是否完整地将模型checkpoint下到本地。 12 | 13 | #### `qwen.tiktoken`这个文件找不到,怎么办? 14 | 15 | 这个是我们的tokenizer的merge文件,你必须下载它才能使用我们的tokenizer。注意,如果你使用git clone却没有使用git-lfs,这个文件不会被下载。如果你不了解git-lfs,可点击[官网](https://git-lfs.com/)了解。 16 | 17 | #### transformers_stream_generator/tiktoken/accelerate,这几个库提示找不到,怎么办? 18 | 19 | 运行如下命令:`pip install -r requirements.txt`。相关依赖库在[https://github.com/QwenLM/Qwen-VL/blob/main/requirements.txt](https://github.com/QwenLM/Qwen-VL/blob/main/requirements.txt) 可以找到。 20 | <br><br> 21 | 22 | 23 | ## Demo & 推理 24 | 25 | #### 是否提供Demo? 26 | 27 | `web_demo_mm.py`提供了Web UI。请查看README相关内容了解更多。 28 | 29 | #### Qwen-VL支持流式推理吗? 30 | 31 | Qwen-VL当前不支持流式推理。 32 | 33 | #### 模型的输出看起来与输入无关/没有遵循指令/看起来呆呆的 34 | 35 | 请检查是否加载的是Qwen-VL-Chat模型进行推理,Qwen-VL模型是未经align的预训练基模型,不期望具备响应用户指令的能力。我们在模型最新版本已经对`chat`接口内进行了检查,避免您误将预训练模型作为SFT/Chat模型使用。 36 | 37 | #### 是否有量化版本模型 38 | 39 | 目前Qwen-VL不支持量化,后续我们将支持高效的量化推理实现。 40 | 41 | #### 处理长序列时效果有问题 42 | 43 | 请确认是否开启ntk。若要启用这些技巧,请将`config.json`里的`use_dynamc_ntk`和`use_logn_attn`设置为`true`。最新代码默认为`true`。 44 | <br><br> 45 | 46 | 47 | ## Tokenizer 48 | 49 | #### bos_id/eos_id/pad_id,这些token id不存在,为什么? 50 | 51 | 在训练过程中,我们仅使用<|endoftext|>这一token作为sample/document之间的分隔符及padding位置占位符,你可以将bos_id, eos_id, pad_id均指向tokenizer.eod_id。请阅读我们关于tokenizer的文档,了解如何设置这些id。 52 | 53 | -------------------------------------------------------------------------------- /experiments/Qwen-VL/NOTICE: -------------------------------------------------------------------------------- 1 | ------------- LICENSE FOR NVIDIA Megatron-LM code -------------- 2 | 3 | Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions 7 | are met: 8 | * Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | * Redistributions in binary form must reproduce the above copyright 11 | notice, this list of conditions and the following disclaimer in the 12 | documentation and/or other materials provided with the distribution. 13 | * Neither the name of NVIDIA CORPORATION nor the names of its 14 | contributors may be used to endorse or promote products derived 15 | from this software without specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 18 | EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 20 | PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR 21 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 22 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 23 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 24 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 25 | OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | 29 | 30 | ------------- LICENSE FOR OpenAI tiktoken code -------------- 31 | 32 | MIT License 33 | 34 | Copyright (c) 2022 OpenAI, Shantanu Jain 35 | 36 | Permission is hereby granted, free of charge, to any person obtaining a copy 37 | of this software and associated documentation files (the "Software"), to deal 38 | in the Software without restriction, including without limitation the rights 39 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 40 | copies of the Software, and to permit persons to whom the Software is 41 | furnished to do so, subject to the following conditions: 42 | 43 | The above copyright notice and this permission notice shall be included in all 44 | copies or substantial portions of the Software. 45 | 46 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 47 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 48 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 49 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 50 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 51 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 52 | SOFTWARE. 
-------------------------------------------------------------------------------- /experiments/Qwen-VL/assets/apple.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/experiments/Qwen-VL/assets/apple.jpeg -------------------------------------------------------------------------------- /experiments/Qwen-VL/assets/apple_r.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/experiments/Qwen-VL/assets/apple_r.jpeg -------------------------------------------------------------------------------- /experiments/Qwen-VL/assets/demo.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/experiments/Qwen-VL/assets/demo.jpeg -------------------------------------------------------------------------------- /experiments/Qwen-VL/assets/demo_highfive.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/experiments/Qwen-VL/assets/demo_highfive.jpg -------------------------------------------------------------------------------- /experiments/Qwen-VL/assets/demo_spotting_caption.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/experiments/Qwen-VL/assets/demo_spotting_caption.jpg -------------------------------------------------------------------------------- /experiments/Qwen-VL/assets/demo_vl.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/experiments/Qwen-VL/assets/demo_vl.gif -------------------------------------------------------------------------------- /experiments/Qwen-VL/assets/logo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/experiments/Qwen-VL/assets/logo.jpg -------------------------------------------------------------------------------- /experiments/Qwen-VL/assets/mm_tutorial/Beijing.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/experiments/Qwen-VL/assets/mm_tutorial/Beijing.jpeg -------------------------------------------------------------------------------- /experiments/Qwen-VL/assets/mm_tutorial/Beijing_Small.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/experiments/Qwen-VL/assets/mm_tutorial/Beijing_Small.jpeg -------------------------------------------------------------------------------- /experiments/Qwen-VL/assets/mm_tutorial/Chongqing.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/experiments/Qwen-VL/assets/mm_tutorial/Chongqing.jpeg 
-------------------------------------------------------------------------------- /experiments/Qwen-VL/assets/mm_tutorial/Chongqing_Small.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/experiments/Qwen-VL/assets/mm_tutorial/Chongqing_Small.jpeg -------------------------------------------------------------------------------- /experiments/Qwen-VL/assets/mm_tutorial/Hospital.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/experiments/Qwen-VL/assets/mm_tutorial/Hospital.jpg -------------------------------------------------------------------------------- /experiments/Qwen-VL/assets/mm_tutorial/Hospital_Small.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/experiments/Qwen-VL/assets/mm_tutorial/Hospital_Small.jpg -------------------------------------------------------------------------------- /experiments/Qwen-VL/assets/mm_tutorial/Menu.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/experiments/Qwen-VL/assets/mm_tutorial/Menu.jpeg -------------------------------------------------------------------------------- /experiments/Qwen-VL/assets/mm_tutorial/Rebecca_(1939_poster).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/experiments/Qwen-VL/assets/mm_tutorial/Rebecca_(1939_poster).jpeg -------------------------------------------------------------------------------- /experiments/Qwen-VL/assets/mm_tutorial/Rebecca_(1939_poster)_Small.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/experiments/Qwen-VL/assets/mm_tutorial/Rebecca_(1939_poster)_Small.jpeg -------------------------------------------------------------------------------- /experiments/Qwen-VL/assets/mm_tutorial/Shanghai.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/experiments/Qwen-VL/assets/mm_tutorial/Shanghai.jpg -------------------------------------------------------------------------------- /experiments/Qwen-VL/assets/mm_tutorial/Shanghai_Output.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/experiments/Qwen-VL/assets/mm_tutorial/Shanghai_Output.jpg -------------------------------------------------------------------------------- /experiments/Qwen-VL/assets/mm_tutorial/Shanghai_Output_Small.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/experiments/Qwen-VL/assets/mm_tutorial/Shanghai_Output_Small.jpeg -------------------------------------------------------------------------------- /experiments/Qwen-VL/assets/mm_tutorial/Shanghai_Small.jpeg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/experiments/Qwen-VL/assets/mm_tutorial/Shanghai_Small.jpeg -------------------------------------------------------------------------------- /experiments/Qwen-VL/assets/mm_tutorial/TUTORIAL.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/experiments/Qwen-VL/assets/mm_tutorial/TUTORIAL.ipynb -------------------------------------------------------------------------------- /experiments/Qwen-VL/assets/qwenvl.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/experiments/Qwen-VL/assets/qwenvl.jpeg -------------------------------------------------------------------------------- /experiments/Qwen-VL/assets/radar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/experiments/Qwen-VL/assets/radar.png -------------------------------------------------------------------------------- /experiments/Qwen-VL/assets/radar_qwenvlplus.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/experiments/Qwen-VL/assets/radar_qwenvlplus.jpg -------------------------------------------------------------------------------- /experiments/Qwen-VL/assets/touchstone_datasets.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/experiments/Qwen-VL/assets/touchstone_datasets.jpg -------------------------------------------------------------------------------- /experiments/Qwen-VL/assets/touchstone_eval.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/experiments/Qwen-VL/assets/touchstone_eval.png -------------------------------------------------------------------------------- /experiments/Qwen-VL/assets/touchstone_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/experiments/Qwen-VL/assets/touchstone_logo.png -------------------------------------------------------------------------------- /experiments/Qwen-VL/assets/wechat.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/experiments/Qwen-VL/assets/wechat.png -------------------------------------------------------------------------------- /experiments/Qwen-VL/eval_mm/data: -------------------------------------------------------------------------------- 1 | /cpfs01/shared/public/shusheng.yss/datasets/qwenvl_evaluation -------------------------------------------------------------------------------- /experiments/Qwen-VL/eval_mm/mmbench/MMBENCH.md: -------------------------------------------------------------------------------- 1 | # MMBench Evaluation 2 | 3 | ## Data 4 | 5 | 
```bash 6 | /cpfs01/shared/public/shusheng.yss/workspace/23082502_qwenvl_eval_test/eval_mm/data/mmbench 7 | ``` 8 | 9 | ## Dev 10 | 11 | ```bash 12 | checkpoint=/PATH/TO/CHECKPOINT 13 | ds=mmbench_dev_20230712 14 | python -m torch.distributed.launch --use-env \ 15 | --nproc_per_node ${NPROC_PER_NODE:-8} \ 16 | --nnodes ${WORLD_SIZE:-1} \ 17 | --node_rank ${RANK:-0} \ 18 | --master_addr ${MASTER_ADDR:-127.0.0.1} \ 19 | --master_port ${MASTER_PORT:-12345} \ 20 | evaluate_multiple_choice_mmbench.py \ 21 | --checkpoint $checkpoint \ 22 | --dataset $ds \ 23 | --batch-size 2 \ 24 | --num-workers 2 25 | 26 | # the results will be saved to mmbench_dev_20230712.json 27 | 28 | # without consistency constrain 29 | 30 | python mmbench_evaluation.py 31 | 32 | # with consistency constrain 33 | 34 | python mmbench_evaluation_tricky.py 35 | 36 | ``` 37 | 38 | ## Test 39 | 40 | ```bash 41 | checkpoint=/PATH/TO/CHECKPOINT 42 | ds=mmbench_test_20230712 43 | python -m torch.distributed.launch --use-env \ 44 | --nproc_per_node ${NPROC_PER_NODE:-8} \ 45 | --nnodes ${WORLD_SIZE:-1} \ 46 | --node_rank ${RANK:-0} \ 47 | --master_addr ${MASTER_ADDR:-127.0.0.1} \ 48 | --master_port ${MASTER_PORT:-12345} \ 49 | evaluate_multiple_choice_mmbench.py \ 50 | --checkpoint $checkpoint \ 51 | --dataset $ds \ 52 | --batch-size 2 \ 53 | --num-workers 2 54 | 55 | # the results will be saved to mmbench_test_20230712.json 56 | 57 | # convert to submission format with consistency constrain 58 | 59 | python mmbench_predict_to_submission.py 60 | 61 | ``` 62 | -------------------------------------------------------------------------------- /experiments/Qwen-VL/eval_mm/mmbench/mmbench_converter_dev.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import io 3 | import base64 4 | import json 5 | from PIL import Image 6 | 7 | ''' 8 | This scripts convert mmbench_dev tsv file to jsonl 9 | ''' 10 | 11 | datas = pd.read_csv("data/mmbench/mmbench_dev_20230712/mmbench_dev_20230712.tsv", sep='\t') 12 | 13 | global_choices = ['A', 'B', 'C', 'D'] 14 | 15 | def decode_base64_to_image(base64_string): 16 | image_data = base64.b64decode(base64_string) 17 | image = Image.open(io.BytesIO(image_data)) 18 | return image 19 | 20 | 21 | with open('./data/mmbench/mmbench_dev_20230712/mmbench_dev_20230712.jsonl', 'w') as f: 22 | for idx in range(len(datas)): 23 | data = datas.iloc[idx] 24 | 25 | index = int(data['index']) 26 | question = data['question'] 27 | hint = data['hint'] if not pd.isna(data['hint']) else 'N/A' 28 | 29 | choices = [] 30 | for opt in global_choices: 31 | if pd.isna(data[opt]): 32 | continue 33 | choices.append(data[opt]) 34 | 35 | answer = global_choices.index(data['answer']) 36 | 37 | image = decode_base64_to_image(data['image']) 38 | image.save("data/mmbench/mmbench_dev_20230712/images/%d.jpg" % index) 39 | 40 | f.write(json.dumps({ 41 | "index": index, 42 | "image": "data/mmbench/mmbench_dev_20230712/images/%d.jpg" % index, 43 | "hint": hint, 44 | "question": question, 45 | "choices": choices, 46 | "answer": answer, 47 | }) + "\n") 48 | 49 | -------------------------------------------------------------------------------- /experiments/Qwen-VL/eval_mm/mmbench/mmbench_converter_test.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import io 3 | import base64 4 | import json 5 | from PIL import Image 6 | 7 | ''' 8 | This script convert mmbench_test tsv file to jsonl 9 | This script is very similar to 
mmbench_converter_dev except there's no answer for accuracy calculation 10 | ''' 11 | 12 | datas = pd.read_csv("data/mmbench/mmbench_test_20230712/mmbench_test_20230712.tsv", sep='\t') 13 | 14 | global_choices = ['A', 'B', 'C', 'D'] 15 | 16 | def decode_base64_to_image(base64_string): 17 | image_data = base64.b64decode(base64_string) 18 | image = Image.open(io.BytesIO(image_data)) 19 | return image 20 | 21 | 22 | with open('./data/mmbench/mmbench_test_20230712/mmbench_test_20230712.jsonl', 'w') as f: 23 | for idx in range(len(datas)): 24 | data = datas.iloc[idx] 25 | 26 | index = int(data['index']) 27 | question = data['question'] 28 | hint = data['hint'] if not pd.isna(data['hint']) else 'N/A' 29 | 30 | choices = [] 31 | for opt in global_choices: 32 | if pd.isna(data[opt]): 33 | continue 34 | choices.append(data[opt]) 35 | 36 | # answer = global_choices.index(data['answer']) 37 | 38 | image = decode_base64_to_image(data['image']) 39 | image.save("data/mmbench/mmbench_test_20230712/images/%d.jpg" % index) 40 | 41 | f.write(json.dumps({ 42 | "index": index, 43 | "image": "data/mmbench/mmbench_test_20230712/images/%d.jpg" % index, 44 | "hint": hint, 45 | "question": question, 46 | "choices": choices, 47 | # "answer": answer, 48 | }) + "\n") 49 | 50 | -------------------------------------------------------------------------------- /experiments/Qwen-VL/eval_mm/mmbench/mmbench_evaluation.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import json 3 | 4 | ''' 5 | This script provides `global top-1 accuracy` metric calculation for mmbench_dev. 6 | ''' 7 | 8 | predictions = json.load(open('mmbench_dev_20230712.json')) 9 | 10 | index2predictions = {} 11 | for pred in predictions: 12 | index2predictions[pred['index']] = pred['prediction'] 13 | 14 | datas = pd.read_csv("data/mmbench/mmbench_dev_20230712/mmbench_dev_20230712.tsv", sep='\t') 15 | 16 | glb_opts = ['A', 'B', 'C', 'D'] 17 | index2answer = {} 18 | for idx in range(len(datas)): 19 | data = datas.iloc[idx] 20 | index2answer[data['index']] = glb_opts.index(data['answer']) 21 | 22 | identity_indexes = list(set([int(_ % 1e6) for _ in index2predictions.keys()])) 23 | 24 | correct = 0 25 | total = 0 26 | for index in identity_indexes: 27 | for _ in range(4): 28 | cycle_index = int(_ * 1e6 + index) 29 | if index2predictions.get(cycle_index, None) is not None: 30 | if index2predictions[cycle_index] == index2answer[cycle_index]: 31 | continue 32 | else: 33 | print(cycle_index) 34 | break 35 | else: 36 | correct += 1 37 | total += 1 38 | 39 | print(correct, total) 40 | -------------------------------------------------------------------------------- /experiments/Qwen-VL/eval_mm/mmbench/mmbench_evaluation_tricky.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import json 3 | import random 4 | 5 | ''' 6 | This script provides metric calculation for mmbench_dev with the same accuarcy algo as OpenCompass server 7 | ''' 8 | 9 | predictions = json.load(open('mmbench_dev_20230712.json')) 10 | 11 | index2predictions = {} 12 | for pred in predictions: 13 | index2predictions[pred['index']] = pred['prediction'] 14 | 15 | 16 | from collections import Counter 17 | 18 | def most_common_elements(lst): 19 | counter = Counter(lst) 20 | max_count = max(counter.values()) 21 | most_common = [element for element, count in counter.items() if count == max_count] 22 | return random.choice(most_common) # random sample from random choice 23 | 24 | 
datas = pd.read_csv("data/mmbench/mmbench_dev_20230712/mmbench_dev_20230712.tsv", sep='\t') 25 | 26 | glb_opts = ['A', 'B', 'C', 'D'] 27 | index2answer = {} 28 | index2choices = {} 29 | index2rawanswer = {} 30 | for idx in range(len(datas)): 31 | data = datas.iloc[idx] 32 | 33 | choices = [] 34 | for opt in glb_opts: 35 | if not pd.isna(data[opt]): 36 | choices.append(data[opt]) 37 | index2choices[data['index']] = choices 38 | 39 | index2answer[data['index']] = glb_opts.index(data['answer']) 40 | index2rawanswer[data['index']] = choices[glb_opts.index(data['answer'])] 41 | 42 | identity_indexes = list(set([int(_ % 1e6) for _ in index2predictions.keys()])) 43 | 44 | correct = 0 45 | total = 0 46 | for index in identity_indexes: 47 | raw_preds = [] 48 | raw_answer = [] 49 | for _ in range(4): 50 | cycle_index = int(_ * 1e6 + index) 51 | if index2predictions.get(cycle_index, None) is not None: 52 | raw_answer = index2rawanswer[cycle_index] 53 | raw_pred = index2choices[cycle_index][index2predictions[cycle_index]] 54 | raw_preds.append(raw_pred) 55 | 56 | if len(set(raw_preds)) == 1: 57 | if raw_preds[0] == raw_answer: 58 | correct += 1 59 | else: 60 | result = most_common_elements(raw_preds) 61 | if result == raw_answer: 62 | correct += 1 63 | 64 | total += 1 65 | 66 | print(correct, total, correct / total * 100.) 67 | -------------------------------------------------------------------------------- /experiments/Qwen-VL/eval_mm/mmbench/mmbench_predict_to_submission.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import json 3 | import random 4 | 5 | ''' 6 | This script convert the output file of our inference processor to target formation of OpenCompass evaluator server 7 | ''' 8 | 9 | predictions = json.load(open('mmbench_test_20230712.json')) 10 | 11 | index2predictions = {} 12 | for pred in predictions: 13 | index2predictions[pred['index']] = pred['prediction'] 14 | 15 | from collections import Counter 16 | 17 | def most_common_elements(lst): 18 | counter = Counter(lst) 19 | max_count = max(counter.values()) 20 | most_common = [element for element, count in counter.items() if count == max_count] 21 | print(most_common) 22 | return random.choice(most_common) 23 | # return most_common 24 | 25 | datas = pd.read_csv("data/mmbench/mmbench_test_20230712/mmbench_test_20230712.tsv", sep='\t') 26 | 27 | datas = datas.drop('image', axis=1) 28 | 29 | glb_opts = ['A', 'B', 'C', 'D'] 30 | index2choices = {} 31 | for idx in range(len(datas)): 32 | data = datas.iloc[idx] 33 | 34 | choices = [] 35 | for opt in glb_opts: 36 | if not pd.isna(data[opt]): 37 | choices.append(data[opt]) 38 | index2choices[data['index']] = choices 39 | 40 | identity_indexes = list(set([int(_ % 1e6) for _ in index2predictions.keys()])) 41 | 42 | 43 | processed_index2predictions = {} 44 | for index in identity_indexes: 45 | raw_preds = [] 46 | for _ in range(4): 47 | cycle_index = int(_ * 1e6 + index) 48 | if index2predictions.get(cycle_index, None) is not None: 49 | raw_pred = index2choices[cycle_index][index2predictions[cycle_index]] 50 | raw_preds.append(raw_pred) 51 | 52 | if len(set(raw_preds)) == 1: 53 | pred_answer = raw_preds[0] 54 | else: 55 | pred_answer = most_common_elements(raw_preds) 56 | 57 | print(index, pred_answer) 58 | for _ in range(4): 59 | cycle_index = int(_ * 1e6 + index) 60 | if index2predictions.get(cycle_index, None) is not None: 61 | processed_index2predictions[cycle_index] = index2choices[cycle_index].index(pred_answer) 62 | 63 | 64 | 
predictions = [] 65 | for idx in range(len(datas)): 66 | data = datas.iloc[idx] 67 | index = data['index'] 68 | prediction = glb_opts[processed_index2predictions[index]] 69 | predictions.append(prediction) 70 | 71 | datas['prediction'] = predictions 72 | datas.to_excel("mmbench_test_20230712_230831_constrained.xlsx", index=False) 73 | # constrained means we force the model predict same answer when tested on a question for multiple times 74 | -------------------------------------------------------------------------------- /experiments/Qwen-VL/eval_mm/mme/EVAL_MME.md: -------------------------------------------------------------------------------- 1 | # MME Benchmark 2 | 3 | [MME](https://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models/tree/Evaluation) is a comprehensive evaluation benchmark for multimodal large language models. It measures both perception and cognition abilities on a total of 14 subtasks, including existence, count, position, color, poster, celebrity, scene, landmark, artwork, OCR, commonsense reasoning, numerical calculation, text translation, and code reasoning. 4 | 5 | Qwen-VL-Chat achieves SOTAs on both perception and cognition evaluation. 6 | 7 | Perception Evaluation 8 | 9 | | Rank | Model | Version | Score | 10 | |:----:|:---------------:|:------------------------:|:-------:| 11 | | 1 | **[Qwen-VL-Chat](https://github.com/QwenLM/Qwen-VL/)**| **[Qwen-7B](https://github.com/QwenLM/Qwen-7B)** | **1487.57** | 12 | | 2 | Skywork-MM | Skywork-MM-13B | 1419.08 | 13 | | 3 | MMICL | FlanT5xxl | 1376.00 | 14 | | 4 | Lynx | vicuna-7b | 1373.23 | 15 | | 5 | BLIVA | FlanT5xxl | 1337.73 | 16 | 17 | Cognition Evaluation 18 | 19 | | Rank | Model | Version | Score | 20 | |:----:|:----------------:|:--------------:|:----------:| 21 | | 1 | **[Qwen-VL-Chat](https://github.com/QwenLM/Qwen-VL/)** | **[Qwen-7B](https://github.com/QwenLM/Qwen-7B)** | **360.71** | 22 | | 2 | MMICL | FlanT5xxl | 360.36 | 23 | | 3 | Skywork-MM | Skywork-MM-13B | 356.43 | 24 | | 4 | BLIVA | FlanT5xxl | 331.43 | 25 | | 5 | LRV-Instruction | LRV-7B | 328.21 | 26 | 27 | Full Metrics 28 | 29 | ``` 30 | =========== Perception =========== 31 | total score: 1487.576330532213 32 | 33 | existence score: 158.33333333333331 34 | count score: 150.0 35 | position score: 128.33333333333334 36 | color score: 170.0 37 | posters score: 178.57142857142856 38 | celebrity score: 120.58823529411764 39 | scene score: 152.25 40 | landmark score: 164.0 41 | artwork score: 125.5 42 | OCR score: 140.0 43 | 44 | 45 | =========== Cognition =========== 46 | total score: 360.71428571428567 47 | 48 | commonsense_reasoning score: 130.7142857142857 49 | numerical_calculation score: 40.0 50 | text_translation score: 147.5 51 | code_reasoning score: 42.5 52 | ``` 53 | 54 | ## How To Reproduce Results of MME Benchmark 55 | 56 | 1. Download MME images and eval_tool from the [MME repo](https://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models/blob/Evaluation/README.md) 57 | 2. Rearrange images by executing `python get_images.py` 58 | 3. Evaluate Qwen-VL-Chat results by executing `python eval.py` 59 | 4. Calculate MME results by executing `python calculation.py --results_dir Qwen-VL-Chat`, which the calculation script comes from the MME eval_tool. 
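
As a quick sanity check before running the official `calculation.py`, the per-category answer files written by `eval.py` (one tab-separated `image  question  ground-truth  response` line per sample) can be scanned with a short script such as the sketch below. This only does a crude yes/no prefix match and is not the official metric (which also reports the stricter acc+); the `Qwen-VL-Chat` directory name follows the `output` variable in `eval.py`.

```python
import os

results_dir = "Qwen-VL-Chat"  # directory written by eval.py
for name in sorted(os.listdir(results_dir)):
    right = total = 0
    with open(os.path.join(results_dir, name)) as f:
        for line in f:
            parts = line.rstrip("\n").split("\t", 3)
            if len(parts) < 4:
                continue  # skip malformed lines
            _, _, gt, response = parts
            total += 1
            # crude check: does the generated answer start with the Yes/No ground truth?
            if response.strip().lower().startswith(gt.strip().lower()):
                right += 1
    if total:
        print(f"{name}: {right}/{total} ({right / total * 100:.1f}%)")
```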
60 | -------------------------------------------------------------------------------- /experiments/Qwen-VL/eval_mm/mme/cognition.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/experiments/Qwen-VL/eval_mm/mme/cognition.jpg -------------------------------------------------------------------------------- /experiments/Qwen-VL/eval_mm/mme/eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | from tqdm import tqdm 3 | 4 | from transformers import AutoModelForCausalLM, AutoTokenizer 5 | from transformers.generation import GenerationConfig 6 | 7 | checkpoint = 'Qwen/Qwen-VL-Chat' 8 | tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True) 9 | model = AutoModelForCausalLM.from_pretrained( 10 | checkpoint, device_map='cuda', trust_remote_code=True).eval() 11 | 12 | model.generation_config = GenerationConfig.from_pretrained(checkpoint, trust_remote_code=True) 13 | model.generation_config.top_p = 0.01 14 | 15 | 16 | root = 'Your_Results' 17 | output = 'Qwen-VL-Chat' 18 | os.makedirs(output, exist_ok=True) 19 | for filename in os.listdir(root): 20 | with open(os.path.join(root, filename), 'r') as fin, open(os.path.join(output, filename), 'w') as fout: 21 | lines = fin.read().splitlines() 22 | filename = filename.replace('.txt', '') 23 | for line in tqdm(lines): 24 | img, question, gt = line.strip().split('\t') 25 | img_path = os.path.join('images', filename, img) 26 | assert os.path.exists(img_path), img_path 27 | query = f'<img>{img_path}</img>\n{question}' 28 | response, _ = model.chat(tokenizer, query=query, history=None) 29 | 30 | print(img, question, gt, response, sep='\t', file=fout) 31 | -------------------------------------------------------------------------------- /experiments/Qwen-VL/eval_mm/mme/get_images.py: -------------------------------------------------------------------------------- 1 | import os 2 | from tqdm import tqdm 3 | 4 | os.system('rm -rf images') 5 | os.system('mkdir images') 6 | 7 | os.system('cp -r ../MME_Benchmark_release/OCR images/') 8 | 9 | os.system('mkdir images/artwork') 10 | os.system('cp ../MME_Benchmark_release/artwork/questions_answers_YN/* images/artwork/') 11 | with open('LaVIN/artwork.txt') as fin: 12 | paths = [ line.strip().split('\t', 1)[0] for line in fin ] 13 | paths = list(set(paths)) 14 | for path in tqdm(paths): 15 | os.system(f'cp ../MME_Benchmark_release/artwork/images/toy_dataset/{path} images/artwork/{path}') 16 | 17 | os.system('mkdir images/celebrity') 18 | os.system('cp ../MME_Benchmark_release/celebrity/images/* images/celebrity/') 19 | os.system('cp ../MME_Benchmark_release/celebrity/questions_answers_YN/* images/celebrity/') 20 | 21 | os.system('cp -r ../MME_Benchmark_release/code_reasoning images/') 22 | 23 | os.system('cp -r ../MME_Benchmark_release/color images/') 24 | 25 | os.system('cp -r ../MME_Benchmark_release/commonsense_reasoning images/') 26 | 27 | os.system('cp -r ../MME_Benchmark_release/count images/') 28 | 29 | os.system('cp -r ../MME_Benchmark_release/existence images/') 30 | 31 | os.system('mkdir images/landmark') 32 | os.system('cp ../MME_Benchmark_release/landmark/images/* images/landmark/') 33 | os.system('cp ../MME_Benchmark_release/landmark/questions_answers_YN/* images/landmark/') 34 | 35 | os.system('cp -r ../MME_Benchmark_release/numerical_calculation images/') 36 | 37 | os.system('cp -r 
../MME_Benchmark_release/position images/') 38 | 39 | os.system('mkdir images/posters') 40 | os.system('cp ../MME_Benchmark_release/posters/images/* images/posters/') 41 | os.system('cp ../MME_Benchmark_release/posters/questions_answers_YN/* images/posters/') 42 | 43 | os.system('mkdir images/scene') 44 | os.system('cp ../MME_Benchmark_release/scene/images/* images/scene/') 45 | os.system('cp ../MME_Benchmark_release/scene/questions_answers_YN/* images/scene/') 46 | 47 | os.system('cp -r ../MME_Benchmark_release/text_translation images/') 48 | -------------------------------------------------------------------------------- /experiments/Qwen-VL/eval_mm/mme/perception.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/experiments/Qwen-VL/eval_mm/mme/perception.jpg -------------------------------------------------------------------------------- /experiments/Qwen-VL/eval_mm/seed_bench/leaderboard.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/experiments/Qwen-VL/eval_mm/seed_bench/leaderboard.jpg -------------------------------------------------------------------------------- /experiments/Qwen-VL/finetune/ds_config_zero2.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "optimizer": { 14 | "type": "AdamW", 15 | "params": { 16 | "lr": "auto", 17 | "betas": "auto", 18 | "eps": "auto", 19 | "weight_decay": "auto" 20 | } 21 | }, 22 | 23 | "scheduler": { 24 | "type": "WarmupLR", 25 | "params": { 26 | "warmup_min_lr": "auto", 27 | "warmup_max_lr": "auto", 28 | "warmup_num_steps": "auto" 29 | } 30 | }, 31 | 32 | "zero_optimization": { 33 | "stage": 2, 34 | "offload_optimizer": { 35 | "device": "none", 36 | "pin_memory": true 37 | }, 38 | "allgather_partitions": true, 39 | "allgather_bucket_size": 2e8, 40 | "overlap_comm": false, 41 | "reduce_scatter": true, 42 | "reduce_bucket_size": 2e8, 43 | "contiguous_gradients": true 44 | }, 45 | 46 | "gradient_accumulation_steps": "auto", 47 | "gradient_clipping": "auto", 48 | "steps_per_print": 100, 49 | "train_batch_size": "auto", 50 | "train_micro_batch_size_per_gpu": "auto", 51 | "wall_clock_breakdown": false 52 | } -------------------------------------------------------------------------------- /experiments/Qwen-VL/finetune/ds_config_zero3.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "optimizer": { 14 | "type": "AdamW", 15 | "params": { 16 | "lr": "auto", 17 | "betas": "auto", 18 | "eps": "auto", 19 | "weight_decay": "auto" 20 | } 21 | }, 22 | 23 | "scheduler": { 24 | "type": "WarmupLR", 25 | "params": { 26 | "warmup_min_lr": "auto", 27 | "warmup_max_lr": "auto", 28 | "warmup_num_steps": "auto" 29 | } 30 | }, 31 | 32 | "zero_optimization": { 33 | "stage": 3, 34 | "offload_optimizer": { 35 | "device": "none", 36 | "pin_memory": true 37 | }, 38 | "offload_param": { 39 | 
"device": "none", 40 | "pin_memory": true 41 | }, 42 | "overlap_comm": true, 43 | "contiguous_gradients": true, 44 | "sub_group_size": 1e9, 45 | "reduce_bucket_size": "auto", 46 | "stage3_prefetch_bucket_size": "auto", 47 | "stage3_param_persistence_threshold": "auto", 48 | "stage3_max_live_parameters": 1e9, 49 | "stage3_max_reuse_distance": 1e9, 50 | "stage3_gather_16bit_weights_on_model_save": true 51 | }, 52 | 53 | "gradient_accumulation_steps": "auto", 54 | "gradient_clipping": "auto", 55 | "steps_per_print": 100, 56 | "train_batch_size": "auto", 57 | "train_micro_batch_size_per_gpu": "auto", 58 | "wall_clock_breakdown": false 59 | } 60 | -------------------------------------------------------------------------------- /experiments/Qwen-VL/finetune/finetune_ds.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export CUDA_DEVICE_MAX_CONNECTIONS=1 3 | DIR=`pwd` 4 | 5 | GPUS_PER_NODE=8 6 | NNODES=1 7 | NODE_RANK=0 8 | MASTER_ADDR=localhost 9 | MASTER_PORT=6001 10 | 11 | MODEL="Qwen/Qwen-VL-Chat" #"Qwen/Qwen-VL-Chat"/"Qwen/Qwen-VL" # Set the path if you do not want to load from huggingface directly 12 | # ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations. 13 | # See the section for finetuning in README for more information. 14 | DATA="path_to_data" 15 | 16 | DISTRIBUTED_ARGS=" 17 | --nproc_per_node $GPUS_PER_NODE \ 18 | --nnodes $NNODES \ 19 | --node_rank $NODE_RANK \ 20 | --master_addr $MASTER_ADDR \ 21 | --master_port $MASTER_PORT 22 | " 23 | 24 | torchrun $DISTRIBUTED_ARGS finetune.py \ 25 | --model_name_or_path $MODEL \ 26 | --data_path $DATA \ 27 | --bf16 True \ 28 | --fix_vit True \ 29 | --output_dir output_qwen \ 30 | --num_train_epochs 5 \ 31 | --per_device_train_batch_size 1 \ 32 | --per_device_eval_batch_size 1 \ 33 | --gradient_accumulation_steps 16 \ 34 | --evaluation_strategy "no" \ 35 | --save_strategy "steps" \ 36 | --save_steps 1000 \ 37 | --save_total_limit 10 \ 38 | --learning_rate 1e-5 \ 39 | --weight_decay 0.1 \ 40 | --adam_beta2 0.95 \ 41 | --warmup_ratio 0.01 \ 42 | --lr_scheduler_type "cosine" \ 43 | --logging_steps 1 \ 44 | --report_to "none" \ 45 | --model_max_length 2048 \ 46 | --gradient_checkpointing True \ 47 | --lazy_preprocess True \ 48 | --deepspeed finetune/ds_config_zero3.json 49 | -------------------------------------------------------------------------------- /experiments/Qwen-VL/finetune/finetune_lora_ds.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export CUDA_DEVICE_MAX_CONNECTIONS=1 3 | DIR=`pwd` 4 | 5 | GPUS_PER_NODE=$(nvidia-smi --query-gpu=gpu_name --format=csv,noheader | wc -l) 6 | NNODES=1 7 | NODE_RANK=0 8 | MASTER_ADDR=localhost 9 | MASTER_PORT=6001 10 | 11 | MODEL="Qwen/Qwen2-VL-7B-Instruct" #"Qwen/Qwen-VL-Chat"/"Qwen/Qwen-VL" Set the path if you do not want to load from huggingface directly 12 | # ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations. 13 | # See the section for finetuning in README for more information. 
14 | DATA="data/train_20241209_1731.json" 15 | 16 | DISTRIBUTED_ARGS=" 17 | --nproc_per_node $GPUS_PER_NODE \ 18 | --nnodes $NNODES \ 19 | --node_rank $NODE_RANK \ 20 | --master_addr $MASTER_ADDR \ 21 | --master_port $MASTER_PORT 22 | " 23 | export WANDB_PROJECT="mat_qwen_vl_gta" 24 | torchrun $DISTRIBUTED_ARGS finetune.py \ 25 | --model_name_or_path $MODEL \ 26 | --data_path $DATA \ 27 | --bf16 True \ 28 | --fix_vit True \ 29 | --output_dir output_qwen/Qwen2-VL-7B-Instruct-$SLURM_JOB_ID \ 30 | --num_train_epochs 7 \ 31 | --per_device_train_batch_size 2 \ 32 | --per_device_eval_batch_size 1 \ 33 | --gradient_accumulation_steps 8 \ 34 | --evaluation_strategy "no" \ 35 | --save_strategy "steps" \ 36 | --save_steps 1000000 \ 37 | --save_total_limit 1 \ 38 | --learning_rate 1e-5 \ 39 | --weight_decay 0.1 \ 40 | --adam_beta2 0.95 \ 41 | --warmup_ratio 0.01 \ 42 | --lr_scheduler_type "cosine" \ 43 | --logging_steps 1 \ 44 | --report_to "wandb" \ 45 | --model_max_length 10240 \ 46 | --lazy_preprocess True \ 47 | --use_lora \ 48 | --gradient_checkpointing \ 49 | --deepspeed finetune/ds_config_zero2.json \ 50 | --lora_target_modules "llm\..*layers\.\d+\.self_attn\.(q_proj|k_proj|v_proj|o_proj)" -------------------------------------------------------------------------------- /experiments/Qwen-VL/finetune/finetune_lora_ds_gaia.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export CUDA_DEVICE_MAX_CONNECTIONS=1 3 | DIR=`pwd` 4 | 5 | GPUS_PER_NODE=$(nvidia-smi --query-gpu=gpu_name --format=csv,noheader | wc -l) 6 | NNODES=1 7 | NODE_RANK=0 8 | MASTER_ADDR=localhost 9 | MASTER_PORT=6001 10 | 11 | MODEL="Qwen/Qwen2-VL-7B-Instruct" #"Qwen/Qwen-VL-Chat"/"Qwen/Qwen-VL" Set the path if you do not want to load from huggingface directly 12 | # ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations. 13 | # See the section for finetuning in README for more information. 
14 | DATA="data/train_20241211_1748.json" 15 | 16 | DISTRIBUTED_ARGS=" 17 | --nproc_per_node $GPUS_PER_NODE \ 18 | --nnodes $NNODES \ 19 | --node_rank $NODE_RANK \ 20 | --master_addr $MASTER_ADDR \ 21 | --master_port $MASTER_PORT 22 | " 23 | export WANDB_PROJECT="mat_qwen_vl_gaia" 24 | torchrun $DISTRIBUTED_ARGS finetune.py \ 25 | --model_name_or_path $MODEL \ 26 | --data_path $DATA \ 27 | --bf16 True \ 28 | --fix_vit True \ 29 | --output_dir output_qwen/Qwen2-VL-7B-Instruct-$SLURM_JOB_ID \ 30 | --num_train_epochs 3 \ 31 | --per_device_train_batch_size 2 \ 32 | --per_device_eval_batch_size 1 \ 33 | --gradient_accumulation_steps 4 \ 34 | --evaluation_strategy "no" \ 35 | --save_strategy "steps" \ 36 | --save_steps 10000 \ 37 | --save_total_limit 1 \ 38 | --learning_rate 1e-5 \ 39 | --weight_decay 0.1 \ 40 | --adam_beta2 0.95 \ 41 | --warmup_ratio 0.01 \ 42 | --lr_scheduler_type "cosine" \ 43 | --logging_steps 1 \ 44 | --report_to "wandb" \ 45 | --model_max_length 10240 \ 46 | --lazy_preprocess True \ 47 | --use_lora \ 48 | --gradient_checkpointing \ 49 | --deepspeed finetune/ds_config_zero2.json \ 50 | --lora_target_modules "llm\..*layers\.\d+\.self_attn\.(q_proj|k_proj|v_proj|o_proj)" -------------------------------------------------------------------------------- /experiments/Qwen-VL/finetune/finetune_lora_single_gpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export CUDA_DEVICE_MAX_CONNECTIONS=1 3 | DIR=`pwd` 4 | 5 | 6 | MODEL="Qwen/Qwen-VL-Chat" #"Qwen/Qwen-VL-Chat"/"Qwen/Qwen-VL" # Set the path if you do not want to load from huggingface directly 7 | # ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations. 8 | # See the section for finetuning in README for more information. 9 | DATA="path_to_data" 10 | 11 | export CUDA_VISIBLE_DEVICES=0 12 | 13 | python finetune.py \ 14 | --model_name_or_path $MODEL \ 15 | --data_path $DATA \ 16 | --bf16 True \ 17 | --fix_vit True \ 18 | --output_dir output_qwen \ 19 | --num_train_epochs 5 \ 20 | --per_device_train_batch_size 1 \ 21 | --per_device_eval_batch_size 1 \ 22 | --gradient_accumulation_steps 8 \ 23 | --evaluation_strategy "no" \ 24 | --save_strategy "steps" \ 25 | --save_steps 1000 \ 26 | --save_total_limit 10 \ 27 | --learning_rate 1e-5 \ 28 | --weight_decay 0.1 \ 29 | --adam_beta2 0.95 \ 30 | --warmup_ratio 0.01 \ 31 | --lr_scheduler_type "cosine" \ 32 | --logging_steps 1 \ 33 | --report_to "none" \ 34 | --model_max_length 2048 \ 35 | --lazy_preprocess True \ 36 | --gradient_checkpointing \ 37 | --use_lora -------------------------------------------------------------------------------- /experiments/Qwen-VL/finetune/finetune_qlora_ds.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export CUDA_DEVICE_MAX_CONNECTIONS=1 3 | DIR=`pwd` 4 | 5 | GPUS_PER_NODE=8 6 | NNODES=1 7 | NODE_RANK=0 8 | MASTER_ADDR=localhost 9 | MASTER_PORT=6001 10 | 11 | MODEL="Qwen/Qwen-VL-Chat-Int4" # Qwen/Qwen-VL-Chat-Int4 Set the path if you do not want to load from huggingface directly 12 | # ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations. 13 | # See the section for finetuning in README for more information. 
14 | DATA="path_to_data" 15 | 16 | 17 | DISTRIBUTED_ARGS=" 18 | --nproc_per_node $GPUS_PER_NODE \ 19 | --nnodes $NNODES \ 20 | --node_rank $NODE_RANK \ 21 | --master_addr $MASTER_ADDR \ 22 | --master_port $MASTER_PORT 23 | " 24 | 25 | # Remember to use --fp16 instead of --bf16 due to autogptq 26 | torchrun $DISTRIBUTED_ARGS finetune.py \ 27 | --model_name_or_path $MODEL \ 28 | --data_path $DATA \ 29 | --fp16 True \ 30 | --fix_vit True \ 31 | --output_dir output_qwen \ 32 | --num_train_epochs 5 \ 33 | --per_device_train_batch_size 2 \ 34 | --per_device_eval_batch_size 1 \ 35 | --gradient_accumulation_steps 8 \ 36 | --evaluation_strategy "no" \ 37 | --save_strategy "steps" \ 38 | --save_steps 1000 \ 39 | --save_total_limit 10 \ 40 | --learning_rate 1e-5 \ 41 | --weight_decay 0.1 \ 42 | --adam_beta2 0.95 \ 43 | --warmup_ratio 0.01 \ 44 | --lr_scheduler_type "cosine" \ 45 | --logging_steps 1 \ 46 | --report_to "none" \ 47 | --model_max_length 2048 \ 48 | --lazy_preprocess True \ 49 | --use_lora \ 50 | --q_lora \ 51 | --gradient_checkpointing \ 52 | --deepspeed finetune/ds_config_zero2.json -------------------------------------------------------------------------------- /experiments/Qwen-VL/finetune/finetune_qlora_single_gpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export CUDA_DEVICE_MAX_CONNECTIONS=1 3 | DIR=`pwd` 4 | 5 | MODEL="Qwen/Qwen-VL-Chat-Int4" # Qwen/Qwen-VL-Chat-Int4 Set the path if you do not want to load from huggingface directly 6 | # ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations. 7 | # See the section for finetuning in README for more information. 8 | DATA="path_to_data" 9 | 10 | export CUDA_VISIBLE_DEVICES=0 11 | 12 | # Remember to use --fp16 instead of --bf16 due to autogptq 13 | python finetune.py \ 14 | --model_name_or_path $MODEL \ 15 | --data_path $DATA \ 16 | --fp16 True \ 17 | --fix_vit True \ 18 | --output_dir output_qwen \ 19 | --num_train_epochs 5 \ 20 | --per_device_train_batch_size 1 \ 21 | --per_device_eval_batch_size 1 \ 22 | --gradient_accumulation_steps 8 \ 23 | --evaluation_strategy "no" \ 24 | --save_strategy "steps" \ 25 | --save_steps 1000 \ 26 | --save_total_limit 10 \ 27 | --learning_rate 1e-5 \ 28 | --weight_decay 0.1 \ 29 | --adam_beta2 0.95 \ 30 | --warmup_ratio 0.01 \ 31 | --lr_scheduler_type "cosine" \ 32 | --logging_steps 1 \ 33 | --report_to "none" \ 34 | --model_max_length 2048 \ 35 | --lazy_preprocess True \ 36 | --gradient_checkpointing \ 37 | --use_lora \ 38 | --q_lora \ 39 | --deepspeed finetune/ds_config_zero2.json 40 | -------------------------------------------------------------------------------- /experiments/Qwen-VL/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers==4.32.0 2 | accelerate 3 | tiktoken 4 | einops 5 | transformers_stream_generator==0.0.4 6 | scipy 7 | torchvision 8 | pillow 9 | tensorboard 10 | matplotlib 11 | -------------------------------------------------------------------------------- /experiments/Qwen-VL/requirements_openai_api.txt: -------------------------------------------------------------------------------- 1 | fastapi 2 | uvicorn 3 | openai 4 | pydantic 5 | sse_starlette 6 | -------------------------------------------------------------------------------- /experiments/Qwen-VL/requirements_web_demo.txt: -------------------------------------------------------------------------------- 1 | gradio 2 | modelscope 3 | 
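The `convert_dataset.py` script that follows rewrites the CPM-style training records (OpenAI-style `role`/`content` turns with `<image>` placeholders) into Qwen-VL's `from`/`value` format with inline `Picture N: <img>path</img>` tags. The sketch below shows that transformation on a single hypothetical record; the image path and question are made up for illustration, and the real script additionally handles multi-image records keyed by custom placeholders.

```python
# Illustrative only: one hypothetical record converted the way convert_dataset.py
# does it (placeholder replaced by an inline <img> tag, double newline collapsed).
record = {
    "image": "data/images/0001.jpg",  # hypothetical path
    "conversations": [
        {"role": "user", "content": "<image>\nWhat is shown in the image?"},
        {"role": "assistant", "content": "A red bicycle leaning against a wall."},
    ],
}

image_path_map = {"<image>": record["image"]}

converted = []
for turn in record["conversations"]:
    content = turn["content"]
    pid = 1
    for placeholder, path in sorted(image_path_map.items()):
        if placeholder in content:
            content = content.replace(placeholder, f"Picture {pid}: <img>{path}</img>\n")
            content = content.replace("</img>\n\n", "</img>\n")
            pid += 1
    converted.append({"from": turn["role"], "value": content})

print(converted[0]["value"])
# Picture 1: <img>data/images/0001.jpg</img>
# What is shown in the image?
```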
-------------------------------------------------------------------------------- /experiments/Qwen-VL/scripts/convert_dataset.py: -------------------------------------------------------------------------------- 1 | import json 2 | from tqdm import tqdm 3 | # GTA 4 | data_path = "/scratch/zhangbofei/Projects/Multimodal-CL/iclr_09/TongAgent/experiments/CPM-FT/data/agent_tune_dataset_cpm_8k_gta_with_verifier.json" 5 | # GAIA 6 | data_path = "/scratch/zhangbofei/Projects/Multimodal-CL/iclr_09/TongAgent/experiments/CPM-FT/data/agent_tune_dataset_cpm_17k_gaia_with_verifier.json" 7 | 8 | 9 | with open(data_path, "r") as f: 10 | dataset = json.load(f) 11 | 12 | def _convert(image_path_map, conversations): 13 | output = [] 14 | for turn in conversations: 15 | role = turn["role"] 16 | content = turn["content"] 17 | turn_new = dict() 18 | turn_new["from"] = role 19 | pid = 1 20 | keys = sorted(list(image_path_map.keys())) 21 | for k in keys: 22 | v = image_path_map[k] 23 | if k in content: 24 | content = content.replace(k, f"Picture {pid}: <img>{v}</img>\n") 25 | content = content.replace(f"</img>\n\n", "</img>\n") 26 | pid += 1 27 | turn_new["value"] = content 28 | output.append(turn_new) 29 | return output 30 | 31 | 32 | for item in tqdm(dataset): 33 | #print(item["image"]) 34 | #print(item.keys()) 35 | conversations = item["conversations"] 36 | #print(len(conversations), conversations[1]) 37 | image_path_map = dict() 38 | if type(item["image"]) == str: 39 | image_path_map["<image>"] = item["image"] 40 | else: 41 | for k, v in item["image"].items(): 42 | image_path_map[k] = v 43 | item["conversations"] = _convert(image_path_map, conversations) 44 | 45 | from datetime import datetime 46 | import json 47 | 48 | now = datetime.now().strftime("%Y%m%d_%H%M") 49 | print("write to", f"data/train_{now}.json") 50 | with open(f"data/train_{now}.json", "w") as f: 51 | json.dump(dataset, f, indent=4, ensure_ascii=False) 52 | 53 | import random 54 | with open(f"data/train_{now}_subset.json", "w") as f: 55 | random.shuffle(dataset) 56 | json.dump(dataset[:1000], f, indent=4, ensure_ascii=False) -------------------------------------------------------------------------------- /experiments/Qwen-VL/scripts/convert_dataset_v2.py: -------------------------------------------------------------------------------- 1 | import json 2 | from tqdm import tqdm 3 | 4 | # GAIA 5 | data_path = "/scratch/zhangbofei/Projects/Multimodal-CL/iclr_09/TongAgent/experiments/CPM-FT/data/agent_tune_dataset_gaia_57k_20241210.json" 6 | 7 | with open(data_path, "r") as f: 8 | dataset = json.load(f) 9 | 10 | def _convert(image_path_map, conversations): 11 | output = [] 12 | for turn in conversations: 13 | role = turn["role"] 14 | content = turn["content"] 15 | turn_new = dict() 16 | turn_new["from"] = role 17 | pid = 1 18 | keys = sorted(list(image_path_map.keys())) 19 | for k in keys: 20 | v = image_path_map[k] 21 | if k in content: 22 | content = content.replace(k, f"Picture {pid}: <img>{v}</img>\n") 23 | content = content.replace(f"</img>\n\n", "</img>\n") 24 | pid += 1 25 | turn_new["value"] = content 26 | output.append(turn_new) 27 | return output 28 | 29 | 30 | for item in tqdm(dataset): 31 | #print(item["image"]) 32 | #print(item.keys()) 33 | conversations = item["conversations"] 34 | #print(len(conversations), conversations[1]) 35 | image_path_map = dict() 36 | if "image" not in item: 37 | pass 38 | elif type(item["image"]) == str: 39 | image_path_map["<image>"] = item["image"] 40 | else: 41 | for k, v in item["image"].items(): 42 | 
image_path_map[k] = v 43 | item["conversations"] = _convert(image_path_map, conversations) 44 | 45 | from datetime import datetime 46 | import json 47 | 48 | now = datetime.now().strftime("%Y%m%d_%H%M") 49 | print("write to", f"data/train_{now}.json") 50 | with open(f"data/train_{now}.json", "w") as f: 51 | json.dump(dataset, f, indent=4, ensure_ascii=False) 52 | 53 | import random 54 | with open(f"data/train_{now}_subset.json", "w") as f: 55 | random.shuffle(dataset) 56 | json.dump(dataset[:1000], f, indent=4, ensure_ascii=False) -------------------------------------------------------------------------------- /experiments/Qwen-VL/scripts/tokenizer.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoTokenizer 2 | 3 | tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-VL-7B-Instruct") 4 | print(tokenizer.pad_token_id) 5 | print(tokenizer.eos_token_id) 6 | print(tokenizer.bos_token_id) 7 | print(tokenizer.encode("<|im_start|>")) 8 | print(tokenizer.encode("<|im_end|>")) 9 | print(tokenizer.encode("<|im_start|>assistant")) 10 | print(tokenizer.encode("<|im_end|>")) 11 | 12 | print(tokenizer.decode([872])) 13 | 14 | msgs = [ 15 | {"role": "system", "content": "You are a helpful assistant."}, 16 | {"role": "user", "content": "Hello, how are you?"}, 17 | {"role": "assistant", "content": "I am fine, thank you!"}, 18 | ] 19 | 20 | print(tokenizer.apply_chat_template(msgs, tokenize=False)) -------------------------------------------------------------------------------- /experiments/Qwen-VL/slurm_jobs/train_gaia.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=gaia_qwen_vl # create a short name for your job 3 | #SBATCH --partition=HGX,DGX # specify the partition name: gpu 4 | #SBATCH --qos=lv1 5 | #SBATCH --nodes=1 # node count 6 | #SBATCH --ntasks=1 # total number of tasks across all nodes 7 | #SBATCH --ntasks-per-node=1 8 | #SBATCH --mem=64G # total memory (RAM) per node 9 | #SBATCH --time=72:00:00 # total run time limit (HH:MM:SS) 10 | #SBATCH --cpus-per-task=32 # cpu-cores per task (>1 if multi-threaded tasks) 11 | #SBATCH --gres=gpu:8 # number of gpus per node 12 | #SBATCH --output=output/out-%j.out # output format 13 | #SBATCH --error=output/error-out-%j.out # error output file 14 | #SBATCH --account=engineering 15 | #--------------------task part------------------------- 16 | 17 | ## clean env 18 | module purge 19 | ## load environment need by this task 20 | module load slurm/BigAI/23.02.2 21 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/zhangbofei/anaconda3/lib 22 | source /home/zhangbofei/anaconda3/bin/activate 23 | 24 | conda activate qwen_vl 25 | 26 | bash finetune/finetune_lora_ds_gaia.sh -------------------------------------------------------------------------------- /experiments/Qwen-VL/slurm_jobs/train_gta.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=mat_qwen_vl # create a short name for your job 3 | #SBATCH --partition=HGX,DGX # specify the partition name: gpu 4 | #SBATCH --qos=lv1 5 | #SBATCH --nodes=1 # node count 6 | #SBATCH --ntasks=1 # total number of tasks across all nodes 7 | #SBATCH --ntasks-per-node=1 8 | #SBATCH --mem=64G # total memory (RAM) per node 9 | #SBATCH --time=72:00:00 # total run time limit (HH:MM:SS) 10 | #SBATCH --cpus-per-task=32 # cpu-cores per task (>1 if multi-threaded tasks) 11 | #SBATCH --gres=gpu:8 # number of gpus per node 12 | 
#SBATCH --output=output/out-%j.out # output format 13 | #SBATCH --error=output/error-out-%j.out # error output file 14 | #SBATCH --account=engineering 15 | #--------------------task part------------------------- 16 | 17 | ## clean env 18 | module purge 19 | ## load environment need by this task 20 | module load slurm/BigAI/23.02.2 21 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/zhangbofei/anaconda3/lib 22 | source /home/zhangbofei/anaconda3/bin/activate 23 | 24 | conda activate qwen_vl 25 | 26 | bash finetune/finetune_lora_ds.sh 27 | -------------------------------------------------------------------------------- /experiments/Qwen-VL/touchstone/README_CN.md: -------------------------------------------------------------------------------- 1 | <br> 2 | 3 | <p align="center"> 4 | <img src="../assets/touchstone_logo.png" width="300"/> 5 | <p> 6 | <br> 7 | 8 | <p align="center"> 9 | 中文  |  <a href="../touchstone/README.md">English</a> |  <a href="../touchstone/README_JA.md">日本語</a> 10 | </p> 11 | <br><br> 12 | 13 | **TOUCHSTONE** 是一种针对多模态语言模型(LVLM)的自动化综合评估方法,评估不仅包括基本的认知和理解,还延伸到文学创作。通过人类注解将多模态信息转换为文本,我们的 TouchStone 可以利用SOTA的语言模型来自动化地完成对LVLMs的多模态对话质量评估。 14 | 15 | ## 数据集 16 | 17 | 为了评估 LVLMs 的能力,我们构建了一个多样化且全面的数据集,涵盖五个关键维度:基本描述能力、视觉识别能力、视觉理解能力、视觉叙事能力和多图分析能力。 18 | 19 | - **基本描述能力** 图像描述考验模型总结图片信息的能力,包括简单描述和详细描述。 简单描述通常是描述图像的主要内容和关系的简短短语,而详细描述则提供有关图像场景、其属性和关系的更深入的信息。 20 | 21 | - **视觉识别能力** 图像识别考察模型提取图像中内容的属性以及关联到知识库的能力。为了考察这方面能力,测试的问题包括属性QA、影视识别、艺术识别、地标识别、名人识别、情感识别、文本识别、物体识别和结构内容识别。 22 | 23 | - **视觉理解能力** 图像理解需要模型理解图像内容并完成推理进行相关任务。 这方面包含了例如风格欣赏、抽象图像理解、模因理解、图像分析、图表分析、一般问题解决和推理问答等任务。 24 | 25 | - **视觉叙事能力** 视觉叙事能力是基于视觉内容的文学创作能力,包括撰写电子邮件、诗歌、故事、广告/商品推荐、头脑风暴等。 26 | 27 | - **多图分析能力** 多图分析是分析和比较多幅图像的任务。该领域包括比较两个/多个图像、总结多个图像信息、比较商品以及逐步分析图像等任务。 28 | 29 | <p align="center"> 30 | <img src="../assets/touchstone_datasets.jpg" width="600"/> 31 | <p> 32 | 33 | 我们从五个维度综合评估了模型的能力。 如上图所示,给出了27个子任务的示例。 从感知到认知,再到创造力,随着难度的增加,对模型的要求也越来越高。 目前,LVLM的能力还处于早期阶段。 我们的数据集包含800+道题目、27个类别。 34 | 35 | ## 测评方式 36 | 37 | 我们应用SOTA的LLM进行自动化评估。 为了有效地理解图像的内容,我们人工用细粒度的文本注释替换实际的图像输入。 通过将这些注释和相应的问题输入到像GPT4这样强LLM中,我们可以获得参考答案。 38 | 39 | 对于待测评的LVLM,我们提供实际图像和问题作为输入并获得各自的答案。 最后,我们使用GPT4根据细粒度注释和问题对LVLM生成的答案进行评分。 评分指令要求模型评估答案的有用性、相关性和准确性,并将人工注解视为图像的内容。 为了确保评估的公平性,每个模型的答案都会与 GPT4生成的参考答案进行比较。 模型在所有问题上的平均得分作为最终得分。 40 | 41 | 为了消除答案位置的影响,我们通过交换答案的位置来进行第二轮评分,然后计算获得的两次分数的平均值。 42 | 43 | <p align="center"> 44 | <img src="../assets/touchstone_eval.png" width="600"/> 45 | <p> 46 | 47 | 48 | ## 测评结果 49 | 50 | #### 英文版本测评 51 | 52 | | Model | Score | 53 | |---------------|-------| 54 | | PandaGPT | 488.5 | 55 | | MiniGPT4 | 531.7 | 56 | | InstructBLIP | 552.4 | 57 | | LLaMA-AdapterV2 | 590.1 | 58 | | mPLUG-Owl | 605.4 | 59 | | LLaVA | 602.7 | 60 | | Qwen-VL-Chat | 645.2 | 61 | 62 | #### 中文版本测评 63 | 64 | | Model | Score | 65 | |---------------|-------| 66 | | VisualGLM | 247.1 | 67 | | Qwen-VL-Chat | 401.2 | 68 | 69 | -------------------------------------------------------------------------------- /experiments/Qwen-VL/touchstone/README_JA.md: -------------------------------------------------------------------------------- 1 | <br> 2 | 3 | <p align="center"> 4 | <img src="../assets/touchstone_logo.png" width="300"/> 5 | <p> 6 | <br> 7 | 8 | <p align="center"> 9 | <a href="touchstone/README_CN.md">中文</a>  |  <a href="../touchstone/README.md">English</a>|  日本語 10 | </p> 11 | <br><br> 12 | 13 | **TOUCHSTONE** は、マルチモーダル言語モデルの包括的な評価であり、基本的な認識や理解だけでなく、文学的な創作にまで及びます。評価プロセスを自動化し、マルチモーダル情報をテキストに変換することで、私達の TouchStone は、人手を介することなく高度な言語モデルの力を活用し、対話の質を効率的かつ正確に評価することができます。 14 
| 15 | ## DATASET 16 | 17 | LVLMの能力を評価するために、基本的な記述能力、視覚認識能力、視覚理解能力、視覚ストーリーテリング能力、複数画像解析能力の5つの主要な次元をカバーする多様で包括的なデータセットを構築する。 18 | 19 | - **基本的描写力** 画像記述には、単純な記述と詳細な記述を含め、画像に含まれる情報を記述するモデルの能力が含まれる。単純な記述は、通常、画像の主な主題とアクションを記述する短いフレーズであり、詳細な記述は、画像のシーン、それらの属性、および関係についてのより詳細な情報を提供します。 20 | 21 | - **視覚認識能力** 画像認識とは、画像内のオブジェクトやシーンを認識し、関連情報を推論するタスクである。この分野はさらに、属性QA、映画/テレビ認識、アート認識、ランドマーク認識、有名人認識、感情認識、テキスト認識、オブジェクト認識、構造コンテンツ認識など、いくつかのサブタスクに分けることができる。 22 | 23 | - **視覚理解能力** 画像理解とは、モデルが画像の意味や関連するタスクを理解する能力のことである。この分野には、スタイル理解、抽象画像理解、ミーム理解、画像分析、チャート分析、一般的な問題解決、推論QAなど、いくつかのサブタスクが含まれる。 24 | 25 | - **視覚的ストーリーテリング能力** ビジュアルストーリーテリング能力とは、メール、詩、物語、広告/商品推薦、ブレーンストーミングの執筆など、ビジュアルコンテンツに基づいた文学創作のプロセスである。 26 | 27 | - **マルチ画像解析能力** 複数画像解析とは、複数の画像を解析・比較する作業である。この分野には、2つまたは複数の画像を比較する、複数の画像情報を要約する、商品を比較する、画像を段階的に分析するなどのタスクが含まれます。 28 | 29 | 30 | <p align="center"> 31 | <img src="../assets/touchstone_datasets.jpg" width="600"/> 32 | <p> 33 | 34 | モデルの能力を 5 つの次元から総合的に評価する。上図のように、27 のサブタスクの例を示す。知覚から認知、創造性まで、難易度が上がるにつれて、モデルに求められる要件もどんどん高くなっている。現在、LVLM の機能は初期段階にある。我々のデータセットには 800 以上の質問と 27 のカテゴリーが含まれている。 35 | 36 | ## 方法 37 | 38 | 39 | 自動評価を可能にするために、強力な LLM を判定器として適用する。画像の内容を効果的に理解するために、実際の画像入力をきめ細かいテキスト注釈に手動で置き換える。これらの注釈と対応する質問を GPT4 のような強力な LLM に入力することで、参照解答を得る。 40 | 41 | LVLMの評価には、実際の画像と質問を入力として与え、それぞれの回答を得る。最後に、GPT4を用いて、LVLMが生成した回答を、細かいアノテーションと質問に基づいてスコアリングする。スコアリングの指示は、注釈を画像の内容とみなして、回答の有用性、関連性、正確性を評価するようモデルに要求する。評価の公平性を確保するため、各モデルの回答はGPT4の一貫した参照回答と比較されます。全問題におけるモデルの平均スコアを最終スコアとする。 42 | 43 | 解答位置の影響を排除するために、解答位置を入れ替えて2回目の採点ラウンドを行い、得られた2つのスコアの平均を計算します。このアプローチは、解答の配置によって生じるバイアスを軽減することを目的としています。 44 | <p align="center"> 45 | <img src="../assets/touchstone_eval.png" width="600"/> 46 | <p> 47 | 48 | ### 評価 49 | 50 | #### 英語ベースのマルチモーダル対話における評価 51 | 52 | | Model | Score | 53 | |---------------|-------| 54 | | PandaGPT | 488.5 | 55 | | MiniGPT4 | 531.7 | 56 | | InstructBLIP | 552.4 | 57 | | LLaMA-AdapterV2 | 590.1 | 58 | | mPLUG-Owl | 605.4 | 59 | | LLaVA | 602.7 | 60 | | Qwen-VL-Chat | 645.2 | 61 | 62 | #### 中国語ベースのマルチモーダル対話における評価 63 | 64 | | Model | Score | 65 | |---------------|-------| 66 | | VisualGLM | 247.1 | 67 | | Qwen-VL-Chat | 401.2 | 68 | 69 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from tongagent.agents.general_agent import create_agent 3 | 4 | def main(): 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument( 7 | "--prompt", 8 | required=True, 9 | help="Instructions that you want agent to execute.") 10 | args = parser.parse_args() 11 | agent = create_agent() 12 | result = agent.run(args.prompt) 13 | print("Agent Response:", result) 14 | 15 | if __name__ == "__main__": 16 | main() -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | transformers==4.46.0 2 | vllm==0.6.1 3 | openai 4 | langchain 5 | pypdf 6 | markdownify 7 | pathvalidate 8 | puremagic 9 | mammoth 10 | python-pptx 11 | pandas 12 | pdfminer-six 13 | youtube-transcript-api 14 | serpapi 15 | google-search-results 16 | face-detection 17 | pygments 18 | paddlepaddle-gpu 19 | paddleocr>=2.0.1 20 | shortuuid 21 | diffusers 22 | accelerate 23 | langchain_community 24 | langchain_chroma 25 | langchain_openai 26 | omegaconf 27 | tiktoken 28 | git+https://github.com/facebookresearch/segment-anything-2.git 29 | openpyxl 30 | google-cloud-aiplatform>=1.38 31 | ray[default] 32 
| vidgear 33 | xlrd>=2.0.1 34 | loguru -------------------------------------------------------------------------------- /scripts/report.py: -------------------------------------------------------------------------------- 1 | import wandb 2 | import time 3 | wandb.init(project="occupy", name="occupy") 4 | print("Get GPU!") 5 | wandb.alert(title="Get GPU!", text="Get GPU!") 6 | wandb.finish() 7 | -------------------------------------------------------------------------------- /scripts/search.py: -------------------------------------------------------------------------------- 1 | import json 2 | data_path = "experiments/CPM-FT/data/agent_tune_dataset_gaia_1206_11k.json" 3 | with open(data_path, "r") as f: 4 | data = json.load(f) 5 | 6 | search_str = '''The attached file contains a list of vendors in the Liminal Springs mall, along with each vendor’s monthly revenue and the rent they pay the mall. I want you to find the vendor that makes the most money, relative to the rent it pays. Then, tell me what is listed in the “type” column for that vendor.''' 7 | found = False 8 | for item in data: 9 | # print(item) 10 | conversations = item["conversations"] 11 | for conversation in conversations: 12 | if search_str in conversation["content"]: 13 | print(item) 14 | found = True 15 | 16 | 17 | print(found) 18 | 19 | -------------------------------------------------------------------------------- /slurm_jobs/deploy_qwen2_5_72b.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=qwen2.5_72b # create a short name for your job 3 | #SBATCH --partition=DGX,HGX # specify the partition name: gpu 4 | #SBATCH --qos=lv1 5 | #SBATCH --nodes=1 # node count 6 | #SBATCH --ntasks=1 # total number of tasks across all nodes 7 | #SBATCH --ntasks-per-node=1 8 | #SBATCH --mem=64G # total memory (RAM) per node 9 | #SBATCH --time=72:00:00 # total run time limit (HH:MM:SS) 10 | #SBATCH --cpus-per-task=32 # cpu-cores per task (>1 if multi-threaded tasks) 11 | #SBATCH --gres=gpu:4 # number of gpus per node 12 | #SBATCH --output=output/out-%j.out # output format 13 | #SBATCH --error=output/error-out-%j.out # error output file 14 | #SBATCH --account=engineering 15 | #--------------------task part------------------------- 16 | 17 | 18 | ## clean env 19 | module purge 20 | ## load environment need by this task 21 | module load slurm/BigAI/23.02.2 22 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/zhangbofei/anaconda3/lib 23 | source /home/zhangbofei/anaconda3/bin/activate # commented out by conda initialize 24 | conda init 25 | conda activate agent_tune 26 | GPUS_PER_NODE=$(nvidia-smi -L | wc -l) 27 | nvidia-smi 28 | 29 | 30 | vllm serve /scratch/ml/zhangxintong/A_Models/Qwen/Qwen2.5-72B-Instruct --tensor-parallel-size $GPUS_PER_NODE --dtype bfloat16 --gpu-memory-utilization 0.90 --max-model-len 20000 31 | 32 | 33 | 34 | 35 | -------------------------------------------------------------------------------- /slurm_jobs/deploy_qwen2_VL_72b.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=qwen2_vl_72b # create a short name for your job 3 | #SBATCH --partition=DGX # specify the partition name: gpu 4 | #SBATCH --qos=lv2 5 | #SBATCH --nodes=1 # node count 6 | #SBATCH --ntasks=1 # total number of tasks across all nodes 7 | #SBATCH --ntasks-per-node=1 8 | #SBATCH --mem=64G # total memory (RAM) per node 9 | #SBATCH --time=12:00:00 # total run time limit (HH:MM:SS) 10 | #SBATCH --cpus-per-task=32 # 
cpu-cores per task (>1 if multi-threaded tasks) 11 | #SBATCH --gres=gpu:2 # number of gpus per node 12 | #SBATCH --output=output/out-%j.out # output format 13 | #SBATCH --error=output/error-out-%j.out # error output file 14 | #SBATCH --account=engineering 15 | #--------------------task part------------------------- 16 | 17 | 18 | ## clean env 19 | module purge 20 | ## load environment need by this task 21 | module load slurm/BigAI/23.02.2 22 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/zhangbofei/anaconda3/lib 23 | source /home/zhangbofei/anaconda3/bin/activate # commented out by conda initialize 24 | conda init 25 | conda activate qwen_vl 26 | GPUS_PER_NODE=$(nvidia-smi -L | wc -l) 27 | nvidia-smi 28 | 29 | 30 | 31 | # vllm serve /scratch/TecManDep/llm_weights/Qwen2-VL-72B-Instruct/ --tensor-parallel-size 2 --dtype bfloat16 --gpu-memory-utilization 0.90 --max-model-len 20000 32 | 33 | python -m vllm.entrypoints.openai.api_server --tensor-parallel-size $GPUS_PER_NODE --served-model-name Qwen2-VL-7B-Instruct --model /scratch/TecManDep/llm_weights/Qwen2-VL-72B-Instruct/ 34 | -------------------------------------------------------------------------------- /slurm_jobs/evaluate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=eval_gta # create a short name for your job 3 | #SBATCH --partition=HGX # specify the partition name: gpu 4 | #SBATCH --qos=lv1 5 | #SBATCH --nodes=1 # node count 6 | #SBATCH --ntasks=1 # total number of tasks across all nodes 7 | #SBATCH --ntasks-per-node=1 8 | #SBATCH --mem=64G # total memory (RAM) per node 9 | #SBATCH --time=72:00:00 # total run time limit (HH:MM:SS) 10 | #SBATCH --cpus-per-task=32 # cpu-cores per task (>1 if multi-threaded tasks) 11 | #SBATCH --gres=gpu:4 # number of gpus per node 12 | #SBATCH --output=output/out-%j.out # output format 13 | #SBATCH --error=output/error-out-%j.out # error output file 14 | #SBATCH --account=engineering 15 | #SBATCH --dependency=7787293 16 | #--------------------task part------------------------- 17 | 18 | 19 | ## clean env 20 | module purge 21 | ## load environment need by this task 22 | module load slurm/BigAI/23.02.2 23 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/zhangbofei/anaconda3/lib 24 | source /home/zhangbofei/anaconda3/bin/activate # commented out by conda initialize 25 | conda init 26 | conda activate agent_tune 27 | export AGENT_CONFIG='configs/agent_config.yaml' 28 | python examples/gta/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7879870_2024_09_30_15_55/ --disable-vision 29 | # python examples/gta/main.py --engine tonggpt --disable-vision -------------------------------------------------------------------------------- /slurm_jobs/evaluate_gaia.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=eval_gaia # create a short name for your job 3 | #SBATCH --partition=HGX # specify the partition name: gpu 4 | #SBATCH --qos=lv0a 5 | #SBATCH --nodes=1 # node count 6 | #SBATCH --ntasks=1 # total number of tasks across all nodes 7 | #SBATCH --ntasks-per-node=1 8 | #SBATCH --mem=64G # total memory (RAM) per node 9 | #SBATCH --time=08:00:00 # total run time limit (HH:MM:SS) 10 | #SBATCH --cpus-per-task=32 # cpu-cores per task (>1 if multi-threaded tasks) 11 | #SBATCH --gres=gpu:1 # number of gpus per node 12 | #SBATCH --output=output/out-%j.out # output format 13 | #SBATCH --error=output/error-out-%j.out # error output file 14 | #SBATCH 
--account=engineering 15 | ## SBATCH --dependency=7787293 16 | #--------------------task part------------------------- 17 | 18 | 19 | ## clean env 20 | module purge 21 | ## load environment need by this task 22 | module load slurm/BigAI/23.02.2 23 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/zhangbofei/anaconda3/lib 24 | source /home/zhangbofei/anaconda3/bin/activate # commented out by conda initialize 25 | conda init 26 | conda activate agent_tune 27 | export RUN_MODE=eval 28 | python examples/gaia/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7904295_2024_12_10_23_05/ --data-name 2023_level2 --split validation 29 | 30 | #python examples/gaia/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7880273_2024_10_01_16_09/ --data-name 2023_level3 --split validation 31 | 32 | #python examples/gaia/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7880273_2024_10_01_16_09/ --data-name 2023_level2 --split validation 33 | 34 | 35 | # python examples/gaia/main.py --engine minicpm --data-name 2023_level1 --split validation 36 | 37 | # python examples/gaia/main.py --engine tonggpt --data-name 2023_level1 --split validation 38 | 39 | #python examples/gaia/main.py --engine tonggpt --data-name 2023_level3 --split validation 40 | -------------------------------------------------------------------------------- /slurm_jobs/evaluate_gaia_exp1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=eval_gaia # create a short name for your job 3 | #SBATCH --partition=DGX,HGX # specify the partition name: gpu 4 | #SBATCH --qos=lv0a 5 | #SBATCH --nodes=1 # node count 6 | #SBATCH --ntasks=1 # total number of tasks across all nodes 7 | #SBATCH --ntasks-per-node=1 8 | #SBATCH --mem=64G # total memory (RAM) per node 9 | #SBATCH --time=72:00:00 # total run time limit (HH:MM:SS) 10 | #SBATCH --cpus-per-task=32 # cpu-cores per task (>1 if multi-threaded tasks) 11 | #SBATCH --gres=gpu:1 # number of gpus per node 12 | #SBATCH --output=output/out-%j.out # output format 13 | #SBATCH --error=output/error-out-%j.out # error output file 14 | #SBATCH --account=engineering 15 | #SBATCH --dependency=7880273 16 | #--------------------task part------------------------- 17 | 18 | 19 | ## clean env 20 | module purge 21 | ## load environment need by this task 22 | module load slurm/BigAI/23.02.2 23 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/zhangbofei/anaconda3/lib 24 | source /home/zhangbofei/anaconda3/bin/activate # commented out by conda initialize 25 | conda init 26 | conda activate agent_tune 27 | export RUN_MODE=eval 28 | python examples/gaia/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7880273_2024_10_01_16_09/ --data-name 2023_level1 --split validation 29 | 30 | python examples/gaia/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7880273_2024_10_01_16_09/ --data-name 2023_level3 --split validation 31 | 32 | python examples/gaia/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7880273_2024_10_01_16_09/ --data-name 2023_level2 --split validation 33 | 34 | 35 | # python examples/gaia/main.py --engine minicpm --data-name 2023_level1 --split validation 36 | 37 | # python examples/gaia/main.py --engine tonggpt --data-name 2023_level1 --split validation 38 | 39 | #python examples/gaia/main.py --engine tonggpt --data-name 2023_level3 --split validation 40 | 
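The Slurm scripts in this directory each hard-code one checkpoint and one GAIA level. When iterating locally, it can be convenient to sweep all three validation levels for a single LoRA checkpoint; the sketch below simply shells out to `examples/gaia/main.py` with the same flags the scripts above use (`--engine`, `--lora-path`, `--data-name`, `--split`). It is a convenience wrapper, not part of the repository, and the checkpoint path is a placeholder.

```python
# Convenience wrapper (assumed, not part of the repo): run the GAIA validation
# levels sequentially for one checkpoint, mirroring the Slurm scripts above.
import os
import subprocess

lora_path = "experiments/CPM-FT/output/your_checkpoint_here/"  # placeholder path
levels = ["2023_level1", "2023_level2", "2023_level3"]

env = dict(os.environ, RUN_MODE="eval")  # the scripts above export this variable
for level in levels:
    cmd = [
        "python", "examples/gaia/main.py",
        "--engine", "minicpm",
        "--lora-path", lora_path,
        "--data-name", level,
        "--split", "validation",
    ]
    print("Running:", " ".join(cmd))
    subprocess.run(cmd, check=True, env=env)
```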
-------------------------------------------------------------------------------- /slurm_jobs/evaluate_gaia_exp1_setting1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=eval_gaia # create a short name for your job 3 | #SBATCH --partition=DGX,HGX # specify the partition name: gpu 4 | #SBATCH --qos=lv0a 5 | #SBATCH --nodes=1 # node count 6 | #SBATCH --ntasks=1 # total number of tasks across all nodes 7 | #SBATCH --ntasks-per-node=1 8 | #SBATCH --mem=64G # total memory (RAM) per node 9 | #SBATCH --time=08:00:00 # total run time limit (HH:MM:SS) 10 | #SBATCH --cpus-per-task=32 # cpu-cores per task (>1 if multi-threaded tasks) 11 | #SBATCH --gres=gpu:1 # number of gpus per node 12 | #SBATCH --output=output/out-%j.out # output format 13 | #SBATCH --error=output/error-out-%j.out # error output file 14 | #SBATCH --account=engineering 15 | #SBATCH --dependency=7880273 16 | #--------------------task part------------------------- 17 | 18 | 19 | ## clean env 20 | module purge 21 | ## load environment need by this task 22 | module load slurm/BigAI/23.02.2 23 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/zhangbofei/anaconda3/lib 24 | source /home/zhangbofei/anaconda3/bin/activate # commented out by conda initialize 25 | conda init 26 | conda activate agent_tune 27 | export RUN_MODE=eval 28 | python examples/gaia/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7897481_2024_11_10_07_40/ --data-name 2023_level1 --split validation 29 | 30 | 31 | # CUDA_VISIBLE_DEVICES=1 python examples/gaia/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7897481_2024_11_10_07_40/ --data-name 2023_level3 --split validation > eval_settings1_lv3.log 2>&1 & 32 | #python examples/gaia/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7880273_2024_10_01_16_09/ --data-name 2023_level3 --split validation 33 | 34 | #python examples/gaia/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7880273_2024_10_01_16_09/ --data-name 2023_level2 --split validation 35 | 36 | 37 | # python examples/gaia/main.py --engine minicpm --data-name 2023_level1 --split validation 38 | 39 | # python examples/gaia/main.py --engine tonggpt --data-name 2023_level1 --split validation 40 | 41 | #python examples/gaia/main.py --engine tonggpt --data-name 2023_level3 --split validation 42 | -------------------------------------------------------------------------------- /slurm_jobs/evaluate_gaia_exp1_setting2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=eval_gaia # create a short name for your job 3 | #SBATCH --partition=DGX,HGX # specify the partition name: gpu 4 | #SBATCH --qos=lv0a 5 | #SBATCH --nodes=1 # node count 6 | #SBATCH --ntasks=1 # total number of tasks across all nodes 7 | #SBATCH --ntasks-per-node=1 8 | #SBATCH --mem=64G # total memory (RAM) per node 9 | #SBATCH --time=08:00:00 # total run time limit (HH:MM:SS) 10 | #SBATCH --cpus-per-task=32 # cpu-cores per task (>1 if multi-threaded tasks) 11 | #SBATCH --gres=gpu:1 # number of gpus per node 12 | #SBATCH --output=output/out-%j.out # output format 13 | #SBATCH --error=output/error-out-%j.out # error output file 14 | #SBATCH --account=engineering 15 | #SBATCH --dependency=7880273 16 | #--------------------task part------------------------- 17 | 18 | 19 | ## clean env 20 | module purge 21 | ## load environment need by this task 22 | module load 
slurm/BigAI/23.02.2 23 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/zhangbofei/anaconda3/lib 24 | source /home/zhangbofei/anaconda3/bin/activate # commented out by conda initialize 25 | conda init 26 | conda activate agent_tune 27 | export RUN_MODE=eval 28 | 29 | CUDA_VISIBLE_DEVICES=5 python examples/gaia/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7897482_2024_11_10_09_38/ --data-name 2023_level1 --split validation > eval_settings2.log 2>&1 & 30 | 31 | CUDA_VISIBLE_DEVICES=0 python examples/gaia/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7897482_2024_11_10_09_38/ --data-name 2023_level3 --split validation > eval_settings2_lv3.log 2>&1 & 32 | # CUDA_VISIBLE_DEVICES=4 python examples/gaia/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7897481_2024_11_10_07_40/ --data-name 2023_level1 --split validation 33 | 34 | #python examples/gaia/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7880273_2024_10_01_16_09/ --data-name 2023_level3 --split validation 35 | 36 | #python examples/gaia/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7880273_2024_10_01_16_09/ --data-name 2023_level2 --split validation 37 | 38 | 39 | # python examples/gaia/main.py --engine minicpm --data-name 2023_level1 --split validation 40 | 41 | # python examples/gaia/main.py --engine tonggpt --data-name 2023_level1 --split validation 42 | 43 | #python examples/gaia/main.py --engine tonggpt --data-name 2023_level3 --split validation 44 | -------------------------------------------------------------------------------- /slurm_jobs/evaluate_gaia_exp1_setting3.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=eval_gaia # create a short name for your job 3 | #SBATCH --partition=DGX,HGX # specify the partition name: gpu 4 | #SBATCH --qos=lv1 5 | #SBATCH --nodes=1 # node count 6 | #SBATCH --ntasks=1 # total number of tasks across all nodes 7 | #SBATCH --ntasks-per-node=1 8 | #SBATCH --mem=64G # total memory (RAM) per node 9 | #SBATCH --time=08:00:00 # total run time limit (HH:MM:SS) 10 | #SBATCH --cpus-per-task=32 # cpu-cores per task (>1 if multi-threaded tasks) 11 | #SBATCH --gres=gpu:1 # number of gpus per node 12 | #SBATCH --output=output/out-%j.out # output format 13 | #SBATCH --error=output/error-out-%j.out # error output file 14 | #SBATCH --account=engineering 15 | #SBATCH --dependency=7880273 16 | #--------------------task part------------------------- 17 | 18 | 19 | ## clean env 20 | module purge 21 | ## load environment need by this task 22 | module load slurm/BigAI/23.02.2 23 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/zhangbofei/anaconda3/lib 24 | source /home/zhangbofei/anaconda3/bin/activate # commented out by conda initialize 25 | conda init 26 | conda activate agent_tune 27 | export RUN_MODE=eval 28 | 29 | python examples/gaia/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7897483_2024_11_10_09_41/ --data-name 2023_level2 --split validation 30 | 31 | 32 | # CUDA_VISIBLE_DEVICES=2 python examples/gaia/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7897483_2024_11_10_09_41/ --data-name 2023_level3 --split validation > eval_settings3_lv3.log 2>&1 & 33 | # CUDA_VISIBLE_DEVICES=4 python examples/gaia/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7897481_2024_11_10_07_40/ --data-name 2023_level1 --split validation 34 | 35 | #python 
examples/gaia/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7880273_2024_10_01_16_09/ --data-name 2023_level3 --split validation 36 | 37 | #python examples/gaia/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7880273_2024_10_01_16_09/ --data-name 2023_level2 --split validation 38 | 39 | 40 | # python examples/gaia/main.py --engine minicpm --data-name 2023_level1 --split validation 41 | 42 | # python examples/gaia/main.py --engine tonggpt --data-name 2023_level1 --split validation 43 | 44 | #python examples/gaia/main.py --engine tonggpt --data-name 2023_level3 --split validation 45 | -------------------------------------------------------------------------------- /slurm_jobs/evaluate_gaia_exp2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=eval_gaia # create a short name for your job 3 | #SBATCH --partition=DGX # specify the partition name: gpu 4 | #SBATCH --qos=lv2 5 | #SBATCH --nodes=1 # node count 6 | #SBATCH --ntasks=1 # total number of tasks across all nodes 7 | #SBATCH --ntasks-per-node=1 8 | #SBATCH --mem=64G # total memory (RAM) per node 9 | #SBATCH --time=72:00:00 # total run time limit (HH:MM:SS) 10 | #SBATCH --cpus-per-task=32 # cpu-cores per task (>1 if multi-threaded tasks) 11 | #SBATCH --gres=gpu:1 # number of gpus per node 12 | #SBATCH --output=output/out-%j.out # output format 13 | #SBATCH --error=output/error-out-%j.out # error output file 14 | #SBATCH --account=engineering 15 | #SBATCH --dependency=7880278 16 | #--------------------task part------------------------- 17 | 18 | 19 | ## clean env 20 | module purge 21 | ## load environment need by this task 22 | module load slurm/BigAI/23.02.2 23 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/zhangbofei/anaconda3/lib 24 | source /home/zhangbofei/anaconda3/bin/activate # commented out by conda initialize 25 | conda init 26 | conda activate agent_tune 27 | export RUN_MODE=eval 28 | python examples/gaia/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7880278_2024_10_01_16_31/ --data-name 2023_level1 --split validation 29 | 30 | python examples/gaia/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7880278_2024_10_01_16_31/ --data-name 2023_level3 --split validation 31 | 32 | python examples/gaia/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7880278_2024_10_01_16_31/ --data-name 2023_level2 --split validation 33 | 34 | 35 | # python examples/gaia/main.py --engine minicpm --data-name 2023_level1 --split validation 36 | 37 | # python examples/gaia/main.py --engine tonggpt --data-name 2023_level1 --split validation 38 | 39 | #python examples/gaia/main.py --engine tonggpt --data-name 2023_level3 --split validation 40 | -------------------------------------------------------------------------------- /slurm_jobs/evaluate_gaia_exp3.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=eval_gaia # create a short name for your job 3 | #SBATCH --partition=DGX # specify the partition name: gpu 4 | #SBATCH --qos=lv2 5 | #SBATCH --nodes=1 # node count 6 | #SBATCH --ntasks=1 # total number of tasks across all nodes 7 | #SBATCH --ntasks-per-node=1 8 | #SBATCH --mem=64G # total memory (RAM) per node 9 | #SBATCH --time=72:00:00 # total run time limit (HH:MM:SS) 10 | #SBATCH --cpus-per-task=32 # cpu-cores per task (>1 if multi-threaded tasks) 11 | #SBATCH --gres=gpu:1 # number of 
gpus per node 12 | #SBATCH --output=output/out-%j.out # output format 13 | #SBATCH --error=output/error-out-%j.out # error output file 14 | #SBATCH --account=engineering 15 | #SBATCH --dependency=7880279 16 | #--------------------task part------------------------- 17 | 18 | 19 | ## clean env 20 | module purge 21 | ## load environment need by this task 22 | module load slurm/BigAI/23.02.2 23 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/zhangbofei/anaconda3/lib 24 | source /home/zhangbofei/anaconda3/bin/activate # commented out by conda initialize 25 | conda init 26 | conda activate agent_tune 27 | export RUN_MODE=eval 28 | python examples/gaia/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7880279_2024_10_01_16_32/ --data-name 2023_level1 --split validation 29 | 30 | #python examples/gaia/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7880279_2024_10_01_16_32/ --data-name 2023_level3 --split validation 31 | 32 | #python examples/gaia/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7880279_2024_10_01_16_32/ --data-name 2023_level2 --split validation 33 | 34 | 35 | # python examples/gaia/main.py --engine minicpm --data-name 2023_level1 --split validation 36 | 37 | # python examples/gaia/main.py --engine tonggpt --data-name 2023_level1 --split validation 38 | 39 | #python examples/gaia/main.py --engine tonggpt --data-name 2023_level3 --split validation 40 | -------------------------------------------------------------------------------- /slurm_jobs/evaluate_gaia_exp4.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=eval_gaia # create a short name for your job 3 | #SBATCH --partition=DGX # specify the partition name: gpu 4 | #SBATCH --qos=lv2 5 | #SBATCH --nodes=1 # node count 6 | #SBATCH --ntasks=1 # total number of tasks across all nodes 7 | #SBATCH --ntasks-per-node=1 8 | #SBATCH --mem=64G # total memory (RAM) per node 9 | #SBATCH --time=72:00:00 # total run time limit (HH:MM:SS) 10 | #SBATCH --cpus-per-task=32 # cpu-cores per task (>1 if multi-threaded tasks) 11 | #SBATCH --gres=gpu:1 # number of gpus per node 12 | #SBATCH --output=output/out-%j.out # output format 13 | #SBATCH --error=output/error-out-%j.out # error output file 14 | #SBATCH --account=engineering 15 | #SBATCH --dependency=7880280 16 | #--------------------task part------------------------- 17 | 18 | 19 | ## clean env 20 | module purge 21 | ## load environment need by this task 22 | module load slurm/BigAI/23.02.2 23 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/zhangbofei/anaconda3/lib 24 | source /home/zhangbofei/anaconda3/bin/activate # commented out by conda initialize 25 | conda init 26 | conda activate agent_tune 27 | export RUN_MODE=eval 28 | python examples/gaia/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7880280_2024_10_01_16_46/ --data-name 2023_level1 --split validation 29 | 30 | python examples/gaia/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7880280_2024_10_01_16_46/ --data-name 2023_level3 --split validation 31 | 32 | python examples/gaia/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7880280_2024_10_01_16_46/ --data-name 2023_level2 --split validation 33 | 34 | 35 | # python examples/gaia/main.py --engine minicpm --data-name 2023_level1 --split validation 36 | 37 | # python examples/gaia/main.py --engine tonggpt --data-name 2023_level1 --split validation 38 | 39 | #python 
examples/gaia/main.py --engine tonggpt --data-name 2023_level3 --split validation 40 | -------------------------------------------------------------------------------- /slurm_jobs/evaluate_gaia_internvl2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=eval_gaia # create a short name for your job 3 | #SBATCH --partition=HGX # specify the partition name: gpu 4 | #SBATCH --qos=lv1 5 | #SBATCH --nodes=1 # node count 6 | #SBATCH --ntasks=1 # total number of tasks across all nodes 7 | #SBATCH --ntasks-per-node=1 8 | #SBATCH --mem=64G # total memory (RAM) per node 9 | #SBATCH --time=72:00:00 # total run time limit (HH:MM:SS) 10 | #SBATCH --cpus-per-task=32 # cpu-cores per task (>1 if multi-threaded tasks) 11 | #SBATCH --gres=gpu:1 # number of gpus per node 12 | #SBATCH --output=output/out-%j.out # output format 13 | #SBATCH --error=output/error-out-%j.out # error output file 14 | #SBATCH --account=engineering 15 | ## SBATCH --dependency=7787293 16 | #--------------------task part------------------------- 17 | 18 | 19 | ## clean env 20 | module purge 21 | ## load environment need by this task 22 | module load slurm/BigAI/23.02.2 23 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/zhangbofei/anaconda3/lib 24 | source /home/zhangbofei/anaconda3/bin/activate # commented out by conda initialize 25 | conda init 26 | conda activate agent_tune 27 | export AGENT_CONFIG=configs/agent_config.yaml 28 | python examples/gaia/main.py --engine internvl2 --data-name 2023_level2 --split validation 29 | 30 | #python examples/gaia/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7880273_2024_10_01_16_09/ --data-name 2023_level3 --split validation 31 | 32 | #python examples/gaia/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7880273_2024_10_01_16_09/ --data-name 2023_level2 --split validation 33 | 34 | 35 | # python examples/gaia/main.py --engine minicpm --data-name 2023_level1 --split validation 36 | 37 | # python examples/gaia/main.py --engine tonggpt --data-name 2023_level1 --split validation 38 | 39 | #python examples/gaia/main.py --engine tonggpt --data-name 2023_level3 --split validation 40 | -------------------------------------------------------------------------------- /slurm_jobs/evaluate_gaia_llava.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=eval_gaia # create a short name for your job 3 | #SBATCH --partition=HGX # specify the partition name: gpu 4 | #SBATCH --qos=lv1 5 | #SBATCH --nodes=1 # node count 6 | #SBATCH --ntasks=1 # total number of tasks across all nodes 7 | #SBATCH --ntasks-per-node=1 8 | #SBATCH --mem=64G # total memory (RAM) per node 9 | #SBATCH --time=72:00:00 # total run time limit (HH:MM:SS) 10 | #SBATCH --cpus-per-task=32 # cpu-cores per task (>1 if multi-threaded tasks) 11 | #SBATCH --gres=gpu:1 # number of gpus per node 12 | #SBATCH --output=output/out-%j.out # output format 13 | #SBATCH --error=output/error-out-%j.out # error output file 14 | #SBATCH --account=engineering 15 | ## SBATCH --dependency=7787293 16 | #--------------------task part------------------------- 17 | 18 | 19 | ## clean env 20 | module purge 21 | ## load environment need by this task 22 | module load slurm/BigAI/23.02.2 23 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/zhangbofei/anaconda3/lib 24 | source /home/zhangbofei/anaconda3/bin/activate # commented out by conda initialize 25 | conda init 26 | conda activate 
agent_tune 27 | export AGENT_CONFIG=configs/agent_config.yaml 28 | python examples/gaia/main.py --engine llava --data-name 2023_level3 --split validation 29 | 30 | 31 | -------------------------------------------------------------------------------- /slurm_jobs/evaluate_gaia_qwen.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=eval_gaia # create a short name for your job 3 | #SBATCH --partition=HGX # specify the partition name: gpu 4 | #SBATCH --qos=lv1 5 | #SBATCH --nodes=1 # node count 6 | #SBATCH --ntasks=1 # total number of tasks across all nodes 7 | #SBATCH --ntasks-per-node=1 8 | #SBATCH --mem=64G # total memory (RAM) per node 9 | #SBATCH --time=72:00:00 # total run time limit (HH:MM:SS) 10 | #SBATCH --cpus-per-task=32 # cpu-cores per task (>1 if multi-threaded tasks) 11 | #SBATCH --gres=gpu:1 # number of gpus per node 12 | #SBATCH --output=output/out-%j.out # output format 13 | #SBATCH --error=output/error-out-%j.out # error output file 14 | #SBATCH --account=engineering 15 | ## SBATCH --dependency=7787293 16 | #--------------------task part------------------------- 17 | 18 | 19 | ## clean env 20 | module purge 21 | ## load environment need by this task 22 | module load slurm/BigAI/23.02.2 23 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/zhangbofei/anaconda3/lib 24 | source /home/zhangbofei/anaconda3/bin/activate # commented out by conda initialize 25 | conda init 26 | conda activate agent_tune 27 | export AGENT_CONFIG=configs/agent_config.yaml 28 | python examples/gaia/main.py --engine qwen --data-name 2023_level3 --split validation 29 | 30 | #python examples/gaia/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7880273_2024_10_01_16_09/ --data-name 2023_level3 --split validation 31 | 32 | #python examples/gaia/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7880273_2024_10_01_16_09/ --data-name 2023_level2 --split validation 33 | 34 | 35 | # python examples/gaia/main.py --engine minicpm --data-name 2023_level1 --split validation 36 | 37 | # python examples/gaia/main.py --engine tonggpt --data-name 2023_level1 --split validation 38 | 39 | #python examples/gaia/main.py --engine tonggpt --data-name 2023_level3 --split validation 40 | -------------------------------------------------------------------------------- /slurm_jobs/evaluate_gaia_qwen_tuned.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=eval_gaia # create a short name for your job 3 | #SBATCH --partition=DGX # specify the partition name: gpu 4 | #SBATCH --qos=lv0b 5 | #SBATCH --nodes=1 # node count 6 | #SBATCH --ntasks=1 # total number of tasks across all nodes 7 | #SBATCH --ntasks-per-node=1 8 | #SBATCH --mem=64G # total memory (RAM) per node 9 | #SBATCH --time=08:00:00 # total run time limit (HH:MM:SS) 10 | #SBATCH --cpus-per-task=32 # cpu-cores per task (>1 if multi-threaded tasks) 11 | #SBATCH --gres=gpu:1 # number of gpus per node 12 | #SBATCH --output=output/out-%j.out # output format 13 | #SBATCH --error=output/error-out-%j.out # error output file 14 | #SBATCH --account=engineering 15 | ## SBATCH --dependency=7787293 16 | #--------------------task part------------------------- 17 | 18 | 19 | ## clean env 20 | module purge 21 | ## load environment need by this task 22 | module load slurm/BigAI/23.02.2 23 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/zhangbofei/anaconda3/lib 24 | source 
/home/zhangbofei/anaconda3/bin/activate # commented out by conda initialize 25 | conda init 26 | conda activate agent_tune 27 | export AGENT_CONFIG=configs/agent_config.yaml 28 | python examples/gaia/main.py --engine qwen --data-name 2023_level3 --split validation --lora-path output_qwen/Qwen2-VL-7B-Instruct-7906426/ 29 | 30 | 31 | #python examples/gaia/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7880273_2024_10_01_16_09/ --data-name 2023_level3 --split validation 32 | 33 | #python examples/gaia/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7880273_2024_10_01_16_09/ --data-name 2023_level2 --split validation 34 | 35 | 36 | # python examples/gaia/main.py --engine minicpm --data-name 2023_level1 --split validation 37 | 38 | # python examples/gaia/main.py --engine tonggpt --data-name 2023_level1 --split validation 39 | 40 | #python examples/gaia/main.py --engine tonggpt --data-name 2023_level3 --split validation 41 | -------------------------------------------------------------------------------- /slurm_jobs/evaluate_gta_internvl.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=eval_gta # create a short name for your job 3 | #SBATCH --partition=DGX # specify the partition name: gpu 4 | #SBATCH --qos=lv1 5 | #SBATCH --nodes=1 # node count 6 | #SBATCH --ntasks=1 # total number of tasks across all nodes 7 | #SBATCH --ntasks-per-node=1 8 | #SBATCH --mem=64G # total memory (RAM) per node 9 | #SBATCH --time=72:00:00 # total run time limit (HH:MM:SS) 10 | #SBATCH --cpus-per-task=32 # cpu-cores per task (>1 if multi-threaded tasks) 11 | #SBATCH --gres=gpu:1 # number of gpus per node 12 | #SBATCH --output=output/out-%j.out # output format 13 | #SBATCH --error=output/error-out-%j.out # error output file 14 | #SBATCH --account=engineering 15 | #--------------------task part------------------------- 16 | 17 | 18 | ## clean env 19 | module purge 20 | ## load environment need by this task 21 | module load slurm/BigAI/23.02.2 22 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/zhangbofei/anaconda3/lib 23 | source /home/zhangbofei/anaconda3/bin/activate # commented out by conda initialize 24 | conda init 25 | conda activate agent_tune 26 | export AGENT_CONFIG=configs/agent_config.yaml 27 | # python examples/gta/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7879846_2024_09_30_14_34 28 | python examples/gta/main.py --engine internvl2 -------------------------------------------------------------------------------- /slurm_jobs/evaluate_gta_internvl2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=eval_gta # create a short name for your job 3 | #SBATCH --partition=HGX # specify the partition name: gpu 4 | #SBATCH --qos=lv1 5 | #SBATCH --nodes=1 # node count 6 | #SBATCH --ntasks=1 # total number of tasks across all nodes 7 | #SBATCH --ntasks-per-node=1 8 | #SBATCH --mem=64G # total memory (RAM) per node 9 | #SBATCH --time=72:00:00 # total run time limit (HH:MM:SS) 10 | #SBATCH --cpus-per-task=32 # cpu-cores per task (>1 if multi-threaded tasks) 11 | #SBATCH --gres=gpu:4 # number of gpus per node 12 | #SBATCH --output=output/out-%j.out # output format 13 | #SBATCH --error=output/error-out-%j.out # error output file 14 | #SBATCH --account=engineering 15 | #SBATCH --dependency=7899951 16 | #--------------------task part------------------------- 17 | 18 | 19 | ## clean env 20 | module purge 
21 | ## load environment need by this task 22 | module load slurm/BigAI/23.02.2 23 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/zhangbofei/anaconda3/lib 24 | source /home/zhangbofei/anaconda3/bin/activate # commented out by conda initialize 25 | conda init 26 | conda activate agent_tune 27 | export AGENT_CONFIG=configs/agent_config.yaml 28 | # python examples/gta/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7879846_2024_09_30_14_34 29 | python examples/gta/main.py --engine internvl2 30 | 31 | # python examples/gta/main.py --engine qwen -------------------------------------------------------------------------------- /slurm_jobs/evaluate_gta_llava.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=eval_gta # create a short name for your job 3 | #SBATCH --partition=HGX,DGX # specify the partition name: gpu 4 | #SBATCH --qos=lv1 5 | #SBATCH --nodes=1 # node count 6 | #SBATCH --ntasks=1 # total number of tasks across all nodes 7 | #SBATCH --ntasks-per-node=1 8 | #SBATCH --mem=64G # total memory (RAM) per node 9 | #SBATCH --time=72:00:00 # total run time limit (HH:MM:SS) 10 | #SBATCH --cpus-per-task=32 # cpu-cores per task (>1 if multi-threaded tasks) 11 | #SBATCH --gres=gpu:4 # number of gpus per node 12 | #SBATCH --output=output/out-%j.out # output format 13 | #SBATCH --error=output/error-out-%j.out # error output file 14 | #SBATCH --account=engineering 15 | #SBATCH --dependency=7899951 16 | #--------------------task part------------------------- 17 | 18 | 19 | ## clean env 20 | module purge 21 | ## load environment need by this task 22 | module load slurm/BigAI/23.02.2 23 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/zhangbofei/anaconda3/lib 24 | source /home/zhangbofei/anaconda3/bin/activate # commented out by conda initialize 25 | conda init 26 | conda activate agent_tune 27 | export AGENT_CONFIG=configs/agent_config.yaml 28 | # python examples/gta/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7879846_2024_09_30_14_34 29 | python examples/gta/main.py --engine llava 30 | # python examples/gta/main.py --engine qwen -------------------------------------------------------------------------------- /slurm_jobs/evaluate_gta_qwen.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=eval_gta # create a short name for your job 3 | #SBATCH --partition=HGX,DGX # specify the partition name: gpu 4 | #SBATCH --qos=lv1 5 | #SBATCH --nodes=1 # node count 6 | #SBATCH --ntasks=1 # total number of tasks across all nodes 7 | #SBATCH --ntasks-per-node=1 8 | #SBATCH --mem=64G # total memory (RAM) per node 9 | #SBATCH --time=72:00:00 # total run time limit (HH:MM:SS) 10 | #SBATCH --cpus-per-task=32 # cpu-cores per task (>1 if multi-threaded tasks) 11 | #SBATCH --gres=gpu:4 # number of gpus per node 12 | #SBATCH --output=output/out-%j.out # output format 13 | #SBATCH --error=output/error-out-%j.out # error output file 14 | #SBATCH --account=engineering 15 | #SBATCH --dependency=7899951 16 | #--------------------task part------------------------- 17 | 18 | 19 | ## clean env 20 | module purge 21 | ## load environment need by this task 22 | module load slurm/BigAI/23.02.2 23 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/zhangbofei/anaconda3/lib 24 | source /home/zhangbofei/anaconda3/bin/activate # commented out by conda initialize 25 | conda init 26 | conda activate agent_tune 27 | export 
AGENT_CONFIG=configs/agent_config.yaml 28 | # python examples/gta/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7879846_2024_09_30_14_34 29 | python examples/gta/main.py --engine qwen --lora-path output_qwen/Qwen2-VL-7B-Instruct-7899951 30 | 31 | # python examples/gta/main.py --engine qwen -------------------------------------------------------------------------------- /slurm_jobs/evaluate_gta_qwen_llm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=eval_gta # create a short name for your job 3 | #SBATCH --partition=HGX # specify the partition name: gpu 4 | #SBATCH --qos=lv2 5 | #SBATCH --nodes=1 # node count 6 | #SBATCH --ntasks=1 # total number of tasks across all nodes 7 | #SBATCH --ntasks-per-node=1 8 | #SBATCH --mem=64G # total memory (RAM) per node 9 | #SBATCH --time=72:00:00 # total run time limit (HH:MM:SS) 10 | #SBATCH --cpus-per-task=32 # cpu-cores per task (>1 if multi-threaded tasks) 11 | #SBATCH --gres=gpu:1 # number of gpus per node 12 | #SBATCH --output=output/out-%j.out # output format 13 | #SBATCH --error=output/error-out-%j.out # error output file 14 | #SBATCH --account=engineering 15 | #--------------------task part------------------------- 16 | 17 | 18 | ## clean env 19 | module purge 20 | ## load environment need by this task 21 | module load slurm/BigAI/23.02.2 22 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/zhangbofei/anaconda3/lib 23 | source /home/zhangbofei/anaconda3/bin/activate # commented out by conda initialize 24 | conda init 25 | conda activate agent_tune 26 | export AGENT_CONFIG=configs/agent_config_qwen_llm.yaml 27 | # python examples/gta/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7879846_2024_09_30_14_34 28 | python examples/gta/main.py --engine qwen -------------------------------------------------------------------------------- /slurm_jobs/gaia_pipeline.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=gaia_gen_step_2_4 # create a short name for your job 3 | #SBATCH --partition=DGX # specify the partition name: gpu 4 | #SBATCH --qos=lv2 5 | #SBATCH --nodes=1 # node count 6 | #SBATCH --ntasks=1 # total number of tasks across all nodes 7 | #SBATCH --ntasks-per-node=1 8 | #SBATCH --mem=64G # total memory (RAM) per node 9 | #SBATCH --time=72:00:00 # total run time limit (HH:MM:SS) 10 | #SBATCH --cpus-per-task=32 # cpu-cores per task (>1 if multi-threaded tasks) 11 | #SBATCH --gres=gpu:1 # number of gpus per node 12 | #SBATCH --output=output/out-%j.out # output format 13 | #SBATCH --error=output/error-out-%j.out # error output file 14 | #SBATCH --account=engineering 15 | #--------------------task part------------------------- 16 | 17 | 18 | ## clean env 19 | module purge 20 | ## load environment need by this task 21 | module load slurm/BigAI/23.02.2 22 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/zhangbofei/anaconda3/lib 23 | source /home/zhangbofei/anaconda3/bin/activate # commented out by conda initialize 24 | conda init 25 | 26 | conda activate gaia_gen 27 | 28 | bash data_generation/gaia_pipeline/gaia_worker_5.sh -------------------------------------------------------------------------------- /slurm_jobs/gaia_pipeline_query_gen.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=gaia_query_gen # create a short name for your job 3 | #SBATCH --partition=DGX # specify the 
partition name: gpu 4 | #SBATCH --qos=lv2 5 | #SBATCH --nodes=1 # node count 6 | #SBATCH --ntasks=1 # total number of tasks across all nodes 7 | #SBATCH --ntasks-per-node=1 8 | #SBATCH --mem=64G # total memory (RAM) per node 9 | #SBATCH --time=72:00:00 # total run time limit (HH:MM:SS) 10 | #SBATCH --cpus-per-task=32 # cpu-cores per task (>1 if multi-threaded tasks) 11 | #SBATCH --gres=gpu:1 # number of gpus per node 12 | #SBATCH --output=output/out-%j.out # output format 13 | #SBATCH --error=output/error-out-%j.out # error output file 14 | #SBATCH --account=engineering 15 | #--------------------task part------------------------- 16 | 17 | 18 | ## clean env 19 | module purge 20 | ## load environment need by this task 21 | module load slurm/BigAI/23.02.2 22 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/zhangbofei/anaconda3/lib 23 | source /home/zhangbofei/anaconda3/bin/activate # commented out by conda initialize 24 | conda init 25 | 26 | conda activate gaia_gen 27 | 28 | python -m data_generation.gaia_pipeline.gaia0_query_generation 29 | 30 | python -m data_generation.gaia_pipeline.gaia0_query_generation 31 | 32 | python -m data_generation.gaia_pipeline.gaia0_query_generation 33 | 34 | python -m data_generation.gaia_pipeline.gaia0_query_generation 35 | 36 | python -m data_generation.gaia_pipeline.gaia0_query_generation 37 | 38 | -------------------------------------------------------------------------------- /slurm_jobs/occupy.sh: -------------------------------------------------------------------------------- 1 | srun --time=72:00:00 --partition=HGX,DGX --qos=lv1 --mem=64G --account=engineering --gres=gpu:8 --cpus-per-task=32 --pty bash -c ' 2 | echo "Starting interactive session..." 3 | module load slurm/BigAI/23.02.2 4 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/zhangbofei/anaconda3/lib 5 | source /home/zhangbofei/anaconda3/bin/activate # commented out by conda initialize 6 | conda activate agent_tune 7 | # Execute any other commands you need 8 | echo "Environment ready!" 
9 | cd /scratch/zhangbofei/Projects/Multimodal-CL/iclr_09/TongAgent 10 | GPUS_PER_NODE=$(nvidia-smi -L | wc -l) 11 | nvidia-smi 12 | python scripts/report.py 13 | vllm serve /scratch/ml/zhangxintong/A_Models/Qwen/Qwen2.5-72B-Instruct --tensor-parallel-size $GPUS_PER_NODE --dtype bfloat16 --gpu-memory-utilization 0.90 --max-model-len 20000 & 14 | # Start an interactive shell 15 | exec bash -i 16 | ' 17 | -------------------------------------------------------------------------------- /slurm_jobs/qwen_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=qwen2.5_72b # create a short name for your job 3 | #SBATCH --partition=DGX # specify the partition name: gpu 4 | #SBATCH --qos=lv2 5 | #SBATCH --nodes=1 # node count 6 | #SBATCH --ntasks=1 # total number of tasks across all nodes 7 | #SBATCH --ntasks-per-node=1 8 | #SBATCH --mem=64G # total memory (RAM) per node 9 | #SBATCH --time=00:30:00 # total run time limit (HH:MM:SS) 10 | #SBATCH --cpus-per-task=32 # cpu-cores per task (>1 if multi-threaded tasks) 11 | #SBATCH --gres=gpu:2 # number of gpus per node 12 | #SBATCH --output=output/out-%j.out # output format 13 | #SBATCH --error=output/error-out-%j.out # error output file 14 | #SBATCH --account=engineering 15 | #--------------------task part------------------------- 16 | 17 | 18 | ## clean env 19 | module purge 20 | ## load environment need by this task 21 | module load slurm/BigAI/23.02.2 22 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/zhangbofei/anaconda3/lib 23 | source /home/zhangbofei/anaconda3/bin/activate # commented out by conda initialize 24 | conda init 25 | conda activate agent_tune 26 | GPUS_PER_NODE=$(nvidia-smi -L | wc -l) 27 | 28 | vllm serve Qwen/Qwen2.5-72B-Instruct --tensor-parallel-size $GPUS_PER_NODE --dtype bfloat16 --gpu-memory-utilization 0.98 --max-model-len 20000 29 | 30 | # python tests/test_vllm.py -------------------------------------------------------------------------------- /slurm_jobs/train.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -J gaia 3 | #SBATCH -p HGX 4 | #SBATCH -o %j.out 5 | #SBATCH -e %j.err 6 | #SBATCH -q lv0b 7 | #SBATCH --time=10:00:00 8 | 9 | #SBATCH --nodes=1 10 | #SBATCH --ntasks-per-node=1 11 | #SBATCH --gres=gpu:1 12 | #SBATCH --mem=50G 13 | 14 | module load anaconda3/2021.11 15 | source activate tongagent 16 | 17 | python -m data_generation.gaia_pipeline.gaia21_file_content2file_openai >& gaia_3kfile.log 18 | -------------------------------------------------------------------------------- /slurm_jobs/traj_gen.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=chunk_16 # create a short name for your job 3 | #SBATCH --partition=DGX # specify the partition name: gpu 4 | #SBATCH --qos=lv2 5 | #SBATCH --nodes=1 # node count 6 | #SBATCH --ntasks=1 # total number of tasks across all nodes 7 | #SBATCH --ntasks-per-node=1 8 | #SBATCH --mem=64G # total memory (RAM) per node 9 | #SBATCH --time=72:00:00 # total run time limit (HH:MM:SS) 10 | #SBATCH --cpus-per-task=32 # cpu-cores per task (>1 if multi-threaded tasks) 11 | #SBATCH --gres=gpu:1 # number of gpus per node 12 | #SBATCH --output=output/out-%j.out # output format 13 | #SBATCH --error=output/error-out-%j.out # error output file 14 | #SBATCH --account=engineering 15 | #--------------------task part------------------------- 16 | 17 | 18 | ## clean env 19 | module purge 20 | 
## load environment need by this task 21 | module load slurm/BigAI/23.02.2 22 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/zhangbofei/anaconda3/lib 23 | # source /home/zhangbofei/anaconda3/bin/activate # commented out by conda initialize 24 | 25 | conda activate agent_tune 26 | python data_generation/chunk_traj_generation/gta4_traj_genetation_chunk.py --chunk 16 27 | 28 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/tests/__init__.py -------------------------------------------------------------------------------- /tests/data/254.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/tests/data/254.jpg -------------------------------------------------------------------------------- /tests/data/annotated_cars.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/tests/data/annotated_cars.png -------------------------------------------------------------------------------- /tests/data/cars.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/tests/data/cars.png -------------------------------------------------------------------------------- /tests/data/draw.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/tests/data/draw.jpg -------------------------------------------------------------------------------- /tests/test_activate.py: -------------------------------------------------------------------------------- 1 | from openai import OpenAI 2 | import openai 3 | import httpx 4 | # vllm serve Qwen/Qwen2.5-7B-Instruct > output/vllm.log 2>&1 & 5 | # vllm serve Qwen/Qwen2.5-72B-Instruct --tensor-parallel-size 2 > output/vllm.log 2>&1 & 6 | # Set OpenAI's API key and API base to use vLLM's API server. 
7 | openai_api_key = "EMPTY" 8 | openai_api_base = "http://localhost:8000/v1" 9 | 10 | client = OpenAI( 11 | api_key=openai_api_key, 12 | base_url=openai_api_base, 13 | ) 14 | import time 15 | for i in range(60): 16 | try: 17 | print(client.models.list()) 18 | except httpx.ConnectError as e: 19 | print("service might not ready!") 20 | time.sleep(10) 21 | except openai.APIConnectionError as e: 22 | print("service might not ready!") 23 | time.sleep(10) 24 | except Exception as e: 25 | raise e 26 | 27 | elapse_times = [] 28 | while True: 29 | current = time.time() 30 | try: 31 | chat_response = client.chat.completions.create( 32 | model="Qwen/Qwen2.5-72B-Instruct", 33 | messages=[ 34 | { 35 | "role": "system", 36 | "content": "You are a helpful assistant" 37 | }, 38 | { 39 | "role": "user", 40 | "content": "Hi" 41 | }], 42 | temperature=0.7, 43 | top_p=0.8, 44 | max_tokens=1, 45 | extra_body={ 46 | "repetition_penalty": 1.05, 47 | }, 48 | ) 49 | except httpx.ConnectError as e: 50 | print("service might not ready!") 51 | time.sleep(60) 52 | except openai.APIConnectionError as e: 53 | print("service might not ready!") 54 | time.sleep(60) 55 | except Exception as e: 56 | raise e 57 | 58 | 59 | 60 | print("Chat response:", chat_response.choices[0].message.content) 61 | print("Elapse", round(time.time() - current, 2)) 62 | elapse_times.append(round(time.time() - current, 2)) 63 | time.sleep(600) 64 | 65 | -------------------------------------------------------------------------------- /tests/test_agent.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from tongagent.agents.general_agent import create_agent 4 | class TestAgent(unittest.TestCase): 5 | def test_ocr(self): 6 | agent = create_agent() 7 | 8 | result = agent.run("Can you try to extract text from the image path? Image path: tests/data/254.jpg") 9 | print(result) 10 | 11 | def test_sg(self): 12 | agent = create_agent() 13 | 14 | result = agent.run("Can you try to extract mask from the image path to a pickle file? Image path: tests/data/cars.png. Show me the file name you generated is good.") 15 | print(result) 16 | 17 | def test_edit(self): 18 | agent = create_agent() 19 | 20 | result = agent.run("Can you edit the image to turn him into cyborg? Image path: tests/data/draw.jpg.") 21 | print(result) 22 | 23 | def test_loc(self): 24 | agent = create_agent() 25 | 26 | result = agent.run("Can you try to first detect cars shown in the images and then extract masks for cars? Image path: tests/data/cars.png.") 27 | print(result) 28 | 29 | def test_web_search(self): 30 | agent = create_agent() 31 | question = """If Eliud Kipchoge could maintain his record-making marathon pace indefinitely, how many thousand hours would it take him to run the distance between the Earth and the Moon its closest approach? Please use the minimum perigee value on the Wikipedia page for the Moon when carrying out your calculation. Round your result to the nearest 1000 hours and do not use any comma separators if necessary.""" 32 | 33 | result = agent.run(question) 34 | print(result) 35 | 36 | def test_web_search2(self): 37 | agent = create_agent() 38 | question = """How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? 
You can use the latest 2022 version of english wikipedia.""" 39 | 40 | result = agent.run(question) 41 | print(result) 42 | 43 | def test_gaia_case1(self): 44 | agent = create_agent() 45 | question = """A paper about AI regulation that was originally submitted to arXiv.org in June 2022 shows a figure with three axes, where each axis has a label word at both ends. Which of these words is used to describe a type of society in a Physics and Society article submitted to arXiv.org on August 11, 2016?""" 46 | result = agent.run(question) 47 | print(result) 48 | # answer = egalitarian 49 | 50 | def test_gaia_case2(self): 51 | question = "In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest number of bird species to be on camera simultaneously?" 52 | agent = create_agent() 53 | result = agent.run(question) 54 | print(result) 55 | if __name__ == "__main__": 56 | unittest.main() -------------------------------------------------------------------------------- /tests/test_code.py: -------------------------------------------------------------------------------- 1 | from tongagent.agents.data_sampling_agent import evaluate_python_code_modify 2 | 3 | 4 | code = ''' 5 | import pandas as pd 6 | 7 | df = pd.read_csv("data/GAIA/2023/validation/da52d699-e8d2-4dc5-9191-a2199e0b6a9b.xlsx") 8 | 9 | print(df.head()) 10 | ''' 11 | 12 | result = evaluate_python_code_modify( 13 | code, 14 | authorized_imports=["pandas"] 15 | ) 16 | print(result) 17 | -------------------------------------------------------------------------------- /tests/test_create_agent.py: -------------------------------------------------------------------------------- 1 | from tongagent.agents.data_sampling_agent import create_agent 2 | 3 | agent = create_agent( 4 | llm_engine="tonggpt", 5 | error_tolerance=10, 6 | task="gaia" 7 | ) 8 | 9 | print(agent.authorized_imports) 10 | print(agent.additional_authorized_imports) 11 | 12 | -------------------------------------------------------------------------------- /tests/test_debug.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | with open("debug.json", "r") as f: 4 | data = json.load(f) 5 | 6 | 7 | for item in data: 8 | conversations = item["conversations"] 9 | for conversation in conversations: 10 | if conversation["role"] == "user": 11 | if '.png' in conversation["content"]: 12 | print(conversation["content"]) 13 | print("-" * 10) 14 | -------------------------------------------------------------------------------- /tests/test_edit.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from tongagent.tools.new_added.image_edit import ImageEditTool 3 | class TestEdit(unittest.TestCase): 4 | def test_edit(self): 5 | tool = ImageEditTool() 6 | 7 | output_image = tool.forward("turn him into cyborg", "tests/data/draw.jpg") 8 | print(output_image) 9 | 10 | if __name__ == "__main__": 11 | unittest.main() -------------------------------------------------------------------------------- /tests/test_engine.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from tongagent.llm_engine.mini_cpm import MiniCPMEngine 4 | class TestEngine(unittest.TestCase): 5 | def test_case(self): 6 | engine = MiniCPMEngine() 7 | messages = [ 8 | {"role": "user", "content": "You are a helpful assistant. 
Response in chinese."}, 9 | {"role": "user", "content": "Tell me the model of this aircraft."} 10 | ] 11 | output = engine(messages, stop_sequences=[], image_paths = ["tests/data/airplane.jpeg"]) 12 | 13 | print(output) -------------------------------------------------------------------------------- /tests/test_find.py: -------------------------------------------------------------------------------- 1 | 2 | import json 3 | 4 | with open("data/gta_6350_merged.json", "r") as f: 5 | dataset = json.load(f) 6 | 7 | for item in dataset: 8 | if item["id"] == "vB9O_XTo": 9 | print(item["image"]) 10 | conv = item["conversations"] 11 | for turn in conv: 12 | print(turn["role"]) 13 | print(turn["content"]) 14 | print("-" * 100) 15 | break 16 | 17 | 18 | -------------------------------------------------------------------------------- /tests/test_format_answer.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from tongagent.prompt import FORMAT_ANSWER_PROMPT_GAIA 4 | 5 | from langchain.prompts import ChatPromptTemplate 6 | 7 | class TestFormat(unittest.TestCase): 8 | def test_case(self): 9 | template = ChatPromptTemplate.from_template(FORMAT_ANSWER_PROMPT_GAIA) 10 | prompt = template.invoke({ 11 | "question": "hi", 12 | "answer": "hi" 13 | }) 14 | print(prompt) 15 | print(prompt.to_string()) 16 | print(prompt.to_messages()[0].content) 17 | if __name__ == "__main__": 18 | unittest.main() -------------------------------------------------------------------------------- /tests/test_gaia_1107.py: -------------------------------------------------------------------------------- 1 | import json 2 | with open("data/gaia_1107_train.json", "r") as f: 3 | data = json.load(f) 4 | 5 | import random 6 | with open("data/gaia_1107_train.json", "w") as f: 7 | json.dump(data, f, indent=4, ensure_ascii=False) 8 | 9 | 10 | with open("data/gaia_1107_train_subset.json", "w") as f: 11 | json.dump(random.sample(data, 500), f, indent=4, ensure_ascii=False) 12 | 13 | 14 | -------------------------------------------------------------------------------- /tests/test_inpector.py: -------------------------------------------------------------------------------- 1 | from tongagent.tools.tool_box import TextInspectorTool 2 | 3 | data_path = "data/GAIA/2023/validation/da52d699-e8d2-4dc5-9191-a2199e0b6a9b.xlsx" 4 | 5 | tool = TextInspectorTool() 6 | 7 | question = "What is the list of books read in 2022 along with their reading speeds?" 
8 | result = tool.forward(file_path=data_path, question=question) 9 | print(result) -------------------------------------------------------------------------------- /tests/test_internvl.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from tongagent.llm_engine.internvl2 import InternVL2Engine 3 | from transformers.agents.llm_engine import MessageRole, HfApiEngine, get_clean_message_list 4 | 5 | class TestInternVL(unittest.TestCase): 6 | def test_internvl(self): 7 | engine = InternVL2Engine() 8 | messages = [ 9 | {"role": MessageRole.SYSTEM, "content": "You are a helpful assistant."}, 10 | {"role": MessageRole.USER, "content": "What is the capital of France?"}, 11 | ] 12 | answer = engine(messages) 13 | print(answer) 14 | 15 | def test_internvl_with_image(self): 16 | engine = InternVL2Engine() 17 | messages = [ 18 | {"role": MessageRole.SYSTEM, "content": "Respond in chinese"}, 19 | {"role": MessageRole.USER, "content": "What airplane in the image?"}, 20 | ] 21 | answer = engine(messages, image_paths=["tests/data/airplane.jpeg"]) 22 | print(answer) 23 | 24 | if __name__ == "__main__": 25 | unittest.main() -------------------------------------------------------------------------------- /tests/test_llava_ov.py: -------------------------------------------------------------------------------- 1 | # pip install git+https://github.com/LLaVA-VL/LLaVA-NeXT.git 2 | from llava.model.builder import load_pretrained_model 3 | from llava.mm_utils import get_model_name_from_path, process_images, tokenizer_image_token 4 | from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, IGNORE_INDEX 5 | from llava.conversation import conv_templates, SeparatorStyle 6 | 7 | from PIL import Image 8 | import requests 9 | import copy 10 | import torch 11 | 12 | import sys 13 | import warnings 14 | 15 | warnings.filterwarnings("ignore") 16 | pretrained = "lmms-lab/llava-onevision-qwen2-7b-ov-chat" 17 | pretrained = "Lin-Chen/open-llava-next-llama3-8b" 18 | model_name = "llava_llama_3" 19 | #model_name = "llava_qwen" 20 | device = "cuda" 21 | device_map = "auto" 22 | tokenizer, model, image_processor, max_length = load_pretrained_model(pretrained, None, model_name, device_map=device_map) # Add any other thing you want to pass in llava_model_args 23 | 24 | model.eval() 25 | 26 | url = "https://github.com/haotian-liu/LLaVA/blob/1a91fc274d7c35a9b50b3cb29c4247ae5837ce39/images/llava_v1_5_radar.jpg?raw=true" 27 | image = Image.open(requests.get(url, stream=True).raw) 28 | image_tensor = process_images([image], image_processor, model.config) 29 | image_tensor = [_image.to(dtype=torch.float16, device=device) for _image in image_tensor] 30 | 31 | #conv_template = "qwen_1_5" # Make sure you use correct chat template for different models 32 | conv_template = "llava_llama_3" 33 | question = DEFAULT_IMAGE_TOKEN + "\nWhat is shown in this image?" 34 | question = "How are you doing?" 
35 | conv = copy.deepcopy(conv_templates[conv_template]) 36 | conv.append_message(conv.roles[0], question) 37 | conv.append_message(conv.roles[1], None) 38 | prompt_question = conv.get_prompt() 39 | 40 | input_ids = tokenizer_image_token(prompt_question, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(device) 41 | image_sizes = [image.size] 42 | 43 | for each in image_tensor: 44 | print(each.shape) 45 | 46 | cont = model.generate( 47 | input_ids, 48 | images=None, 49 | image_sizes=None, 50 | do_sample=False, 51 | temperature=0, 52 | max_new_tokens=4096, 53 | ) 54 | text_outputs = tokenizer.batch_decode(cont, skip_special_tokens=True) 55 | print(text_outputs) 56 | -------------------------------------------------------------------------------- /tests/test_ocr.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from tongagent.tools.new_added.ocr import OCRTool 3 | 4 | class TestOCR(unittest.TestCase): 5 | def test_case(self): 6 | tool = OCRTool() 7 | texts = tool.forward("tests/data/254.jpg", debug=True) 8 | print(texts) 9 | 10 | if __name__ == "__main__": 11 | unittest.main() -------------------------------------------------------------------------------- /tests/test_ov_engine.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from tongagent.llm_engine.llava import LLaVAEngine 3 | from transformers.agents.llm_engine import MessageRole, HfApiEngine, get_clean_message_list 4 | 5 | class TestOvEngine(unittest.TestCase): 6 | def test_llava1(self): 7 | engine = LLaVAEngine("Lin-Chen/open-llava-next-llama3-8b") 8 | prompt = [ 9 | {"role": MessageRole.USER, "content": "How are you doing?"}, 10 | ] 11 | answer = engine(prompt, image_path=[]) 12 | print(answer) 13 | 14 | def test_llava2(self): 15 | engine = LLaVAEngine("Lin-Chen/open-llava-next-llama3-8b") 16 | image_path = "tests/data/airplane.jpeg" 17 | prompt = [ 18 | {"role": MessageRole.USER, "content": "What is the image?"}, 19 | ] 20 | answer = engine(prompt, image_paths=[image_path]) 21 | print(answer) 22 | 23 | 24 | if __name__ == "__main__": 25 | unittest.main() -------------------------------------------------------------------------------- /tests/test_seg.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from tongagent.tools.new_added.seg import SegTool 4 | 5 | class TestSeg(unittest.TestCase): 6 | 7 | def test_seg(self): 8 | tool = SegTool() 9 | result = tool.forward("tests/data/cars.png") 10 | print(result) 11 | 12 | def test_seg_2(self): 13 | tool = SegTool() 14 | prompt = [[200.68, 451.94, 354.71, 545.96], [192.86, 359.01, 953.82, 738.95], [908.56, 197.47, 1555.35, 993.67]] 15 | result = tool.forward("tests/data/cars.png", prompt=prompt) 16 | print(result) 17 | 18 | if __name__ == "__main__": 19 | unittest.main() -------------------------------------------------------------------------------- /tests/test_vision_map.py: -------------------------------------------------------------------------------- 1 | from tongagent.llm_engine.mini_cpm import load_pretrained_model_lora 2 | import json 3 | from PIL import Image 4 | 5 | model, tokenizer = load_pretrained_model_lora("experiments/CPM-FT/output/cpm_v2_6_7882650_2024_10_14_19_25/") 6 | input_data = ".cache/gta/cpm_v2_6_7882650_2024_10_14_19_25/0/agent_memory.json" 7 | image_paths = ["data/gta_dataset/image/image_1.jpg", "data/gta_dataset/image/image_2.jpg"] 8 | 9 | 10 | with open(input_data, 
"r") as f: 11 | data = json.load(f) 12 | messages = data["conversations"] 13 | if image_paths is not None and len(image_paths) > 0: 14 | origin_content = messages[1]['content'] 15 | messages[1]['content'] = [] 16 | messages[1]['content'].append(dict(type="text", text=origin_content)) 17 | prompt = [] 18 | for path_item in image_paths: 19 | image = Image.open(path_item).convert('RGB') 20 | prompt.append(image) 21 | prompt.append(origin_content) 22 | messages[1]["content"] = prompt 23 | 24 | system_prompt = messages[0]["content"] 25 | print("prompt", messages[1:2]) 26 | answer = model.chat( 27 | image=None, 28 | msgs=messages[1:2], 29 | system_prompt=system_prompt, 30 | tokenizer=tokenizer 31 | ) 32 | print(answer) -------------------------------------------------------------------------------- /tongagent/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/tongagent/__init__.py -------------------------------------------------------------------------------- /tongagent/agents/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/tongagent/agents/__init__.py -------------------------------------------------------------------------------- /tongagent/agents/general_agent.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, List 2 | from transformers.agents.prompts import DEFAULT_REACT_CODE_SYSTEM_PROMPT 3 | from transformers.agents.tools import DEFAULT_TOOL_DESCRIPTION_TEMPLATE, Tool 4 | from tongagent.tools.tool_box import get_general_tool_box, get_tool_box_gaia 5 | from tongagent.llm_engine.gpt import TongGPTEngine, get_tonggpt_open_ai_client 6 | from tongagent.prompt import DEFAULT_REACT_CODE_SYSTEM_PROMPT, FORMAT_ANSWER_PROMPT_GAIA 7 | from transformers.agents import ReactCodeAgent, HfApiEngine 8 | from transformers.agents.tools import DEFAULT_TOOL_DESCRIPTION_TEMPLATE 9 | from transformers.agents.llm_engine import MessageRole 10 | from typing import Any 11 | from langchain.prompts import ChatPromptTemplate 12 | 13 | def create_agent() -> ReactCodeAgent: 14 | llm_engine = TongGPTEngine() 15 | 16 | react_agent = ReactCodeAgent( 17 | llm_engine=llm_engine, 18 | # tools=TASK_SOLVING_TOOLBOX+WEB_TOOLS, 19 | tools=get_general_tool_box(), 20 | max_iterations=15, 21 | verbose=0, 22 | memory_verbose=True, 23 | system_prompt=DEFAULT_REACT_CODE_SYSTEM_PROMPT, 24 | add_base_tools=False, 25 | additional_authorized_imports=[ 26 | "requests", 27 | "zipfile", 28 | "os", 29 | "pandas", 30 | "numpy", 31 | "sympy", 32 | "json", 33 | "bs4", 34 | "pubchempy", 35 | "xml", 36 | "yahoo_finance", 37 | "Bio", 38 | "sklearn", 39 | "scipy", 40 | "pydub", 41 | "io", 42 | "PIL", 43 | "chess", 44 | "PyPDF2", 45 | "pptx", 46 | "torch", 47 | "datetime", 48 | "csv", 49 | "fractions", 50 | "matplotlib", 51 | "pickle" 52 | ], 53 | planning_interval=None 54 | ) 55 | return react_agent 56 | 57 | -------------------------------------------------------------------------------- /tongagent/cmd/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/tongagent/cmd/__init__.py -------------------------------------------------------------------------------- 
/tongagent/cmd/task_generate.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | def main(): 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("--data-path") 7 | 8 | args = parser.parse_args() 9 | with open(args.data_path, "r") as f: 10 | dataset = json.load(f) 11 | 12 | conv = dataset["conversations"] 13 | task = conv[1]["content"] 14 | if not task.endswith("\n"): 15 | task += "\n" 16 | 17 | for i in range(2, len(conv)): 18 | content = conv[i]["content"] 19 | if i % 2 == 0: 20 | if not content.startswith("Thought:"): 21 | raise ValueError("This trajectory is malformed") 22 | 23 | task += "\n" 24 | task += content 25 | 26 | else: 27 | if not content.startswith("[OUTPUT OF STEP") or "Observation:" not in content: 28 | raise ValueError("This trajectory is malformed") 29 | 30 | content_idx = content.find("]") 31 | task += "\n" 32 | task += content[content_idx+1:].strip() 33 | if not task.endswith("\n"): 34 | task += "\n" 35 | print(task) 36 | if __name__ == "__main__": 37 | main() -------------------------------------------------------------------------------- /tongagent/evaluation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/tongagent/evaluation/__init__.py -------------------------------------------------------------------------------- /tongagent/llm_engine/__init__.py: -------------------------------------------------------------------------------- 1 | from tongagent.llm_engine.gpt import TongGPTEngine 2 | from tongagent.llm_engine.mini_cpm import MiniCPMEngine 3 | from tongagent.llm_engine.qwen import QwenEngine 4 | from tongagent.llm_engine.internvl2 import InternVL2Engine 5 | from tongagent.utils import load_config 6 | from tongagent.llm_engine.llava import LLaVAEngine 7 | def get_llm_engine( 8 | engine_type=None, 9 | lora_path=None, 10 | disable_vision=False, 11 | ): 12 | config = load_config() 13 | if engine_type is None: 14 | engine_type = config.agent_controller.engine_type 15 | 16 | if engine_type == "qwen": 17 | return QwenEngine( 18 | model_name=config.qwen.model_name, 19 | lora_path=lora_path 20 | ) 21 | elif engine_type == "tonggpt": 22 | return TongGPTEngine(engine_type) 23 | elif engine_type == "minicpm": 24 | return MiniCPMEngine( 25 | model=lora_path, 26 | disable_vision=disable_vision 27 | ) 28 | elif engine_type == "internvl2": 29 | return InternVL2Engine( 30 | model_name=config.internvl2.model_name, 31 | lora_path=lora_path 32 | ) 33 | elif engine_type == "llava": 34 | return LLaVAEngine( 35 | model_name=config.llava.model_name, 36 | ) 37 | else: 38 | raise ValueError(f"Unknown LLM engine {engine_type}") 39 | -------------------------------------------------------------------------------- /tongagent/tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/tongagent/tools/__init__.py -------------------------------------------------------------------------------- /tongagent/tools/new_added/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/tongagent/tools/new_added/__init__.py -------------------------------------------------------------------------------- 
/tongagent/tools/new_added/face_det.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoProcessor, Tool 2 | import torch 3 | from PIL import Image 4 | import numpy as np 5 | import requests 6 | import face_detection 7 | 8 | class FaceDetTool(Tool): 9 | name = "facedetection" 10 | description = "A tool that can detect human faces in given images, outputting the bounding boxes of the human faces." 11 | inputs = { 12 | "image_path": { 13 | "description": "The path to the image on which to localize objects. This should be a local path to a downloaded image.", 14 | "type": "string", 15 | }, 16 | } 17 | output_type = "any" 18 | 19 | 20 | device = "cuda:0" if torch.cuda.is_available() else "cpu" 21 | model = face_detection.build_detector("DSFDDetector", confidence_threshold=.5, nms_iou_threshold=.3) 22 | 23 | 24 | def forward(self,image_path:str)-> list: 25 | img = Image.open(image_path) 26 | img = img.convert('RGB') 27 | with torch.no_grad(): 28 | faces = self.model.detect(np.array(img)) 29 | 30 | W,H = img.size 31 | objs = [] 32 | for i,box in enumerate(faces): 33 | x1,y1,x2,y2,c = [int(v) for v in box.tolist()] 34 | x1,y1,x2,y2 = self.enlarge_face([x1,y1,x2,y2],W,H) 35 | mask = np.zeros([H,W]).astype(float) 36 | mask[y1:y2,x1:x2] = 1.0 37 | objs.append([x1,y1,x2,y2]) 38 | return objs 39 | 40 | 41 | def enlarge_face(self,box,W,H,f=1.5): 42 | x1,y1,x2,y2 = box 43 | w = int((f-1)*(x2-x1)/2) 44 | h = int((f-1)*(y2-y1)/2) 45 | x1 = max(0,x1-w) 46 | y1 = max(0,y1-h) 47 | x2 = min(W,x2+w) 48 | y2 = min(H,y2+h) 49 | return [x1,y1,x2,y2] 50 | 51 | 52 | 53 | # m=FaceDetTool() -------------------------------------------------------------------------------- /tongagent/tools/new_added/image_edit.py: -------------------------------------------------------------------------------- 1 | from transformers import Tool 2 | from PIL import Image 3 | from diffusers import StableDiffusionInstructPix2PixPipeline, EulerAncestralDiscreteScheduler 4 | from tongagent.utils import CACHE_FOLDER, gen_random_id 5 | import torch 6 | import os 7 | 8 | class ModelSingleton(): 9 | def __new__(cls): 10 | if hasattr(cls, "pipe"): 11 | return cls 12 | model_id = "timbrooks/instruct-pix2pix" 13 | pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(model_id, torch_dtype=torch.float16, safety_checker=None) 14 | pipe.to("cuda") 15 | pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config) 16 | cls.pipe = pipe 17 | return cls 18 | 19 | class ImageEditTool(Tool): 20 | name = "image_edit" 21 | description = "A tool that can edit an image based on the user prompt. Return a file path for printing."
22 | inputs = { 23 | "prompt": { 24 | "description": "The user prompt that instructs how to edit the image.", 25 | "type": "string", 26 | }, 27 | "image_path": { 28 | "description": "The image path that this tool will try to edit.", 29 | "type": "string", 30 | }, 31 | } 32 | output_type = "string" 33 | 34 | 35 | def forward(self, prompt: str, image_path: str) -> str: 36 | print("ImageEditTool input", prompt, image_path) 37 | image = Image.open(image_path).convert("RGB") 38 | images = ModelSingleton().pipe(prompt, image=image, num_inference_steps=10, image_guidance_scale=1).images 39 | output_image = images[0] 40 | output_image_path = os.path.join(CACHE_FOLDER, f"{gen_random_id()}.png") 41 | output_image.save(output_image_path) 42 | print("save to", output_image_path) 43 | return output_image_path -------------------------------------------------------------------------------- /tongagent/tools/new_added/image_generation.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from transformers.agents import load_tool, Tool 4 | from tongagent.utils import CACHE_FOLDER, gen_random_id 5 | from diffusers import FluxPipeline 6 | from diffusers import DiffusionPipeline 7 | 8 | import torch 9 | 10 | class ImageGenerationTool(Tool): 11 | description = "This is a tool that creates an image according to a prompt, which is a text description." 12 | name = "image_generator" 13 | inputs = {"prompt": {"type": "string", "description": "The image generator prompt. Don't hesitate to add details in the prompt to make the image look better, like 'high-res, photorealistic', etc."}} 14 | output_type = "any" 15 | 16 | def __init__(self, *args, **kwargs): 17 | super().__init__(*args, **kwargs) 18 | 19 | model_id = "stabilityai/stable-diffusion-xl-base-1.0" 20 | if model_id == "black-forest-labs/FLUX.1-dev": 21 | # model_path = '/scratch/zhangbofei/.cache/huggingface/hub/models--black-forest-labs--FLUX.1-dev/' 22 | pipeline = FluxPipeline.from_pretrained(model_id, torch_dtype=torch.bfloat16) 23 | elif model_id == "stabilityai/stable-diffusion-xl-base-1.0": 24 | pipeline = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16, use_safetensors=True, variant="fp16") 25 | else: 26 | raise ValueError(f"unk model {model_id}") 27 | self.pipeline = pipeline 28 | self.pipeline.to("cuda") 29 | self.model_id = model_id 30 | # pipeline.enable_model_cpu_offload() 31 | 32 | def forward(self, prompt): 33 | if self.model_id == "stabilityai/stable-diffusion-xl-base-1.0": 34 | image = self.pipeline( 35 | prompt=prompt 36 | ).images[0] 37 | else: 38 | image = self.pipeline( 39 | prompt, 40 | height=512, 41 | width=512, 42 | guidance_scale=3.5, 43 | num_inference_steps=50, 44 | max_sequence_length=512, 45 | generator=torch.Generator("cpu").manual_seed(0) 46 | ).images[0] 47 | 48 | output_image_path = os.path.join(CACHE_FOLDER, f"{gen_random_id()}.jpeg") 49 | image.save(output_image_path) 50 | # output_image.save(output_image_path) 51 | print("save to", output_image_path) 52 | return output_image_path 53 | 54 | if __name__ == "__main__": 55 | tool = ImageGenerationTool() 56 | 57 | image_path = tool.forward("high-res, photorealistic street view") 58 | -------------------------------------------------------------------------------- /tongagent/tools/new_added/object_loc.py: -------------------------------------------------------------------------------- 1 | from transformers import Tool 2 | import torch 3 | from PIL import Image 4 | from
transformers import OwlViTProcessor, OwlViTForObjectDetection 5 | 6 | 7 | class ObjectLOCTool(Tool): 8 | name = "objectlocation" 9 | description = "A tool that can localize objects in given images, outputting the bounding boxes of the objects." 10 | inputs = { 11 | "object": {"description": "the object that needs to be localized", "type": "string"}, 12 | "image_path": { 13 | "description": "The path to the image on which to localize objects. This should be a local path to a downloaded image.", 14 | "type": "string", 15 | }, 16 | } 17 | output_type = "any" 18 | 19 | 20 | model_path = "google/owlvit-base-patch32" 21 | 22 | device = "cuda:0" if torch.cuda.is_available() else "cpu" 23 | processor = OwlViTProcessor.from_pretrained(model_path) 24 | model = OwlViTForObjectDetection.from_pretrained(model_path) 25 | model = model.to(device) 26 | 27 | 28 | def forward(self, object: str, image_path: str) -> list: 29 | image = Image.open(image_path) 30 | image = image.convert('RGB') 31 | 32 | texts=[] 33 | texts.append(f'a photo of {object}') 34 | texts=[texts] 35 | 36 | inputs = self.processor(text=texts, images=image, return_tensors="pt") 37 | inputs=inputs.to(self.device) 38 | outputs = self.model(**inputs) 39 | 40 | target_sizes = torch.Tensor([image.size[::-1]]) 41 | results = self.processor.post_process_object_detection(outputs=outputs, threshold=0.1, target_sizes=target_sizes) 42 | 43 | i = 0 44 | text = texts[i] 45 | output=[] 46 | 47 | for box, score, pred in zip(results[i]["boxes"], results[i]["scores"], results[i]["labels"]): 48 | # output.append(dict(score=score.item(), label=text[pred], box=[round(i, 2) for i in box.tolist()])) 49 | output.append([round(i, 2) for i in box.tolist()]) 50 | 51 | return output 52 | -------------------------------------------------------------------------------- /tongagent/tools/new_added/ocr.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from transformers import Tool 4 | from PIL import Image 5 | from paddleocr import PaddleOCR, draw_ocr 6 | from tongagent.utils import CACHE_FOLDER, gen_random_id 7 | 8 | class OCRTool(Tool): 9 | name = "ocr" 10 | description = "A tool that can extract texts from the image."
11 | inputs = { 12 | "image_path": { 13 | "description": "The path of image that the tool can read.", 14 | "type": "string", 15 | }, 16 | } 17 | output_type = "any" 18 | 19 | ocr = PaddleOCR(use_angle_cls=True, lang='en') 20 | 21 | def forward(self, image_path: str, debug: bool = False) -> list: 22 | image = Image.open(image_path).convert("RGB") 23 | 24 | result = self.ocr.ocr(image_path, cls=True) 25 | texts = [] 26 | for idx in range(len(result)): 27 | res = result[idx] 28 | for line in res: 29 | if debug: print(line[-1]) 30 | texts.append(line[-1][0]) 31 | if debug: 32 | result = result[0] 33 | boxes = [line[0] for line in result] 34 | txts = [line[1][0] for line in result] 35 | scores = [line[1][1] for line in result] 36 | im_show = draw_ocr(image, boxes, txts, scores, font_path='data/fonts/simfang.ttf') 37 | im_show = Image.fromarray(im_show) 38 | filename = os.path.join(CACHE_FOLDER, f"{gen_random_id()}.jpg") 39 | print("save to", filename) 40 | im_show.save(filename) 41 | return texts -------------------------------------------------------------------------------- /tongagent/utils.py: -------------------------------------------------------------------------------- 1 | import string 2 | import shortuuid 3 | import os 4 | from typing import Union 5 | 6 | from omegaconf import OmegaConf, DictConfig, ListConfig 7 | 8 | CACHE_FOLDER = ".cache" 9 | os.makedirs(CACHE_FOLDER, exist_ok=True) 10 | 11 | def get_uuid_builder() -> shortuuid.ShortUUID: 12 | alphabet = string.ascii_lowercase + string.digits 13 | su = shortuuid.ShortUUID(alphabet=alphabet) 14 | return su 15 | 16 | def load_config() -> Union[DictConfig, ListConfig]: 17 | if "AGENT_CONFIG" in os.environ and len(os.environ["AGENT_CONFIG"]) > 0: 18 | return OmegaConf.load(os.environ["AGENT_CONFIG"]) 19 | 20 | if "RUN_MODE" in os.environ and os.environ["RUN_MODE"] == "eval": 21 | return OmegaConf.load("configs/agent_config.yaml") 22 | 23 | return OmegaConf.load("configs/agent_config.yaml") 24 | 25 | import time 26 | uuid_builder = get_uuid_builder() 27 | 28 | def gen_random_id(): 29 | return f"{int(time.time()*1000)}_{uuid_builder.random(length=8)}" 30 | 31 | if __name__ == "__main__": 32 | print(load_config()) 33 | print(load_config().search_engine[0].cx) --------------------------------------------------------------------------------