├── .github └── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature-request-or-questions.md ├── LICENSE ├── README.md ├── assets ├── authors.png ├── framework.jpg └── icon.png ├── configs └── agent_config.yaml ├── data_generation.sh ├── data_generation └── gaia_pipeline │ ├── 0_query_generation_tonggpt.py │ ├── 1_query2file_content_parallel_tonggpt.py │ ├── 2_file_content2file_tonggpt.py │ ├── 3_traj_genetation_tonggpt.py │ ├── merge.py │ ├── prompts │ ├── file │ │ ├── gaia_file_generation_system.prompt │ │ ├── gaia_file_generation_user.prompt │ │ ├── gaia_file_verifier_system.prompt │ │ ├── gaia_file_verifier_user.prompt │ │ ├── gaia_system.prompt │ │ └── gaia_user.prompt │ ├── query │ │ ├── gaia_test_query_generation.prompt │ │ ├── gaia_test_tool.prompt │ │ ├── gaia_val_metadata.jsonl │ │ └── gaia_val_query_generation.prompt │ └── statistics │ │ ├── pie_chart.py │ │ └── topic.prompt │ └── verifier │ ├── 0_collect.py │ ├── 1_gaia_q_f_filter.py │ ├── 2_convert_format.py │ ├── 3_gaia_verifier_parallel.py │ ├── mdconvert.py │ └── prompt │ ├── gaia_file_verifier_system.prompt │ ├── gaia_file_verifier_user.prompt │ ├── gaia_traj_verifier_system.prompt │ ├── gaia_traj_verifier_user.prompt │ ├── gta_file_verifier_system.prompt │ ├── gta_file_verifier_user.prompt │ ├── gta_traj_verifier_system.prompt │ └── gta_traj_verifier_user.prompt ├── examples ├── gaia │ ├── analysis.py │ ├── eval.py │ ├── main.py │ ├── playground.py │ └── view.py └── gta │ ├── eval.py │ └── main.py ├── experiments ├── CPM-FT │ ├── README.md │ ├── assets │ │ ├── airplane.jpeg │ │ ├── prompt.txt │ │ └── sosa.png │ ├── finetune │ │ ├── __init__.py │ │ ├── dataset.py │ │ ├── ds_config_zero2.json │ │ ├── ds_config_zero3.json │ │ ├── finetune.py │ │ ├── finetune_ds.sh │ │ ├── finetune_lora.sh │ │ ├── readme.md │ │ └── trainer.py │ ├── inference │ │ ├── __init__.py │ │ ├── eval.py │ │ ├── inference.py │ │ └── utils.py │ ├── main.py │ ├── output │ │ └── cpm_fire_test.json │ ├── requirements.txt │ ├── scripts │ │ ├── convert_baai_stats.py │ │ ├── download_cauldron.py │ │ ├── ds_config_zero2.json │ │ ├── ds_config_zero3.json │ │ ├── filter_baai_dataset.py │ │ ├── finetune_lora.sh │ │ ├── reset_system_prompt.py │ │ ├── sanity_check.py │ │ ├── subset.py │ │ └── tokenizer.py │ ├── slurm_jobs │ │ ├── job_lora_5_gaia_1206.sh │ │ └── job_lora_5_gta_with_verifier.sh │ └── tests │ │ ├── __init__.py │ │ ├── test_infer.py │ │ └── test_infer_lora.py └── Qwen-VL │ ├── .github │ └── ISSUE_TEMPLATE │ │ ├── bug_report.yaml │ │ ├── config.yaml │ │ └── feature_request.yaml │ ├── .gitignore │ ├── BUILD.md │ ├── Dockerfile.qwendemo │ ├── Dockerfile.qwenint4openai │ ├── Dockerfile.qwenopenai │ ├── FAQ.md │ ├── FAQ_ja.md │ ├── FAQ_ko.md │ ├── FAQ_zh.md │ ├── LICENSE │ ├── NOTICE │ ├── README.md │ ├── README_CN.md │ ├── README_JA.md │ ├── README_KO.md │ ├── TUTORIAL.md │ ├── TUTORIAL_ja.md │ ├── TUTORIAL_ko.md │ ├── TUTORIAL_zh.md │ ├── assets │ ├── apple.jpeg │ ├── apple_r.jpeg │ ├── demo.jpeg │ ├── demo_highfive.jpg │ ├── demo_spotting_caption.jpg │ ├── demo_vl.gif │ ├── logo.jpg │ ├── mm_tutorial │ │ ├── Beijing.jpeg │ │ ├── Beijing_Small.jpeg │ │ ├── Chongqing.jpeg │ │ ├── Chongqing_Small.jpeg │ │ ├── Hospital.jpg │ │ ├── Hospital_Small.jpg │ │ ├── Menu.jpeg │ │ ├── Rebecca_(1939_poster).jpeg │ │ ├── Rebecca_(1939_poster)_Small.jpeg │ │ ├── Shanghai.jpg │ │ ├── Shanghai_Output.jpg │ │ ├── Shanghai_Output_Small.jpeg │ │ ├── Shanghai_Small.jpeg │ │ └── TUTORIAL.ipynb │ ├── qwenvl.jpeg │ ├── radar.png │ ├── radar_qwenvlplus.jpg │ ├── touchstone_datasets.jpg 
│ ├── touchstone_eval.png │ ├── touchstone_logo.png │ └── wechat.png │ ├── data │ ├── train_20241116_1625_subset.json │ └── train_20241116_1628_subset.json │ ├── eval_mm │ ├── EVALUATION.md │ ├── data │ ├── evaluate_caption.py │ ├── evaluate_grounding.py │ ├── evaluate_multiple_choice.py │ ├── evaluate_vqa.py │ ├── infographicsvqa_eval.py │ ├── mmbench │ │ ├── MMBENCH.md │ │ ├── evaluate_multiple_choice_mmbench.py │ │ ├── mmbench_converter_dev.py │ │ ├── mmbench_converter_test.py │ │ ├── mmbench_evaluation.py │ │ ├── mmbench_evaluation_tricky.py │ │ └── mmbench_predict_to_submission.py │ ├── mme │ │ ├── EVAL_MME.md │ │ ├── cognition.jpg │ │ ├── eval.py │ │ ├── get_images.py │ │ └── perception.jpg │ ├── seed_bench │ │ ├── EVAL_SEED.md │ │ ├── eval.py │ │ ├── leaderboard.jpg │ │ └── trans.py │ ├── vqa.py │ └── vqa_eval.py │ ├── finetune.py │ ├── finetune │ ├── ds_config_zero2.json │ ├── ds_config_zero3.json │ ├── finetune_ds.sh │ ├── finetune_lora_ds.sh │ ├── finetune_lora_ds_gaia.sh │ ├── finetune_lora_single_gpu.sh │ ├── finetune_qlora_ds.sh │ └── finetune_qlora_single_gpu.sh │ ├── openai_api.py │ ├── output │ └── error-out-7903664.out │ ├── requirements.txt │ ├── requirements_openai_api.txt │ ├── requirements_web_demo.txt │ ├── scripts │ ├── convert_dataset.py │ ├── convert_dataset_v2.py │ ├── inference.py │ ├── inference_lora.py │ └── tokenizer.py │ ├── slurm_jobs │ ├── train_gaia.sh │ └── train_gta.sh │ ├── touchstone │ ├── README.md │ ├── README_CN.md │ ├── README_JA.md │ └── README_KO.md │ └── web_demo_mm.py ├── main.py ├── requirements.txt ├── requirements_generation.txt ├── scripts ├── report.py └── search.py ├── slurm_jobs ├── deploy_qwen2_5_72b.sh ├── deploy_qwen2_VL_72b.sh ├── evaluate.sh ├── evaluate_gaia.sh ├── evaluate_gaia_exp1.sh ├── evaluate_gaia_exp1_setting1.sh ├── evaluate_gaia_exp1_setting2.sh ├── evaluate_gaia_exp1_setting3.sh ├── evaluate_gaia_exp2.sh ├── evaluate_gaia_exp3.sh ├── evaluate_gaia_exp4.sh ├── evaluate_gaia_internvl2.sh ├── evaluate_gaia_llava.sh ├── evaluate_gaia_qwen.sh ├── evaluate_gaia_qwen_tuned.sh ├── evaluate_gta_internvl.sh ├── evaluate_gta_internvl2.sh ├── evaluate_gta_llava.sh ├── evaluate_gta_qwen.sh ├── evaluate_gta_qwen_llm.sh ├── gaia_pipeline.sh ├── gaia_pipeline_query_gen.sh ├── occupy.sh ├── qwen_test.sh ├── train.slurm └── traj_gen.sh ├── tests ├── __init__.py ├── data │ ├── 254.jpg │ ├── annotated_cars.png │ ├── cars.png │ └── draw.jpg ├── test_activate.py ├── test_agent.py ├── test_agent_data.py ├── test_agent_gaia.py ├── test_code.py ├── test_create_agent.py ├── test_debug.py ├── test_edit.py ├── test_engine.py ├── test_file_reader.py ├── test_find.py ├── test_format_answer.py ├── test_gaia_1107.py ├── test_inpector.py ├── test_internvl.py ├── test_llava_ov.py ├── test_llm.py ├── test_ocr.py ├── test_ov_engine.py ├── test_qwen.py ├── test_seg.py ├── test_vision_map.py └── test_vllm.py └── tongagent ├── __init__.py ├── agents ├── __init__.py ├── data_sampling_agent.py ├── gaia_agent.py ├── general_agent.py ├── search_agent.py └── search_agent_api.py ├── cmd ├── __init__.py └── task_generate.py ├── evaluation ├── __init__.py ├── evaluation.py ├── gaia_scorer.py ├── optimize_prompt.py └── unsolved_questions.py ├── llm_engine ├── __init__.py ├── gpt.py ├── internvl2.py ├── llava.py ├── mini_cpm.py └── qwen.py ├── prompt.py ├── tools ├── __init__.py ├── browser.py ├── cookies.py ├── mdconvert.py ├── new_added │ ├── __init__.py │ ├── face_det.py │ ├── image_edit.py │ ├── image_generation.py │ ├── object_loc.py │ ├── ocr.py │ ├── seg.py │ 
└── video_qa.py ├── rag_browser.py ├── text_inspector.py ├── tool_box.py ├── visual_qa.py └── web_surfer.py └── utils.py /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Run the script 16 | ```bash 17 | python main.py 18 | ``` 19 | 2. It throws an error 20 | KeyError: xxxx 21 | 22 | **Expected behavior** 23 | A clear and concise description of what you expected to happen. 24 | 25 | **Screenshots** 26 | If applicable, add screenshots to help explain your problem. 27 | 28 | **OS (please complete the following information):** 29 | - Mac/Linux/Windows 30 | 31 | **Additional context** 32 | Add any other context about the problem here. 33 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature-request-or-questions.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request or questions 3 | about: Suggest an idea for this project or ask the maintainers a question 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Machine Learning Lab @ BIGAI 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE.
22 | -------------------------------------------------------------------------------- /assets/authors.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/assets/authors.png -------------------------------------------------------------------------------- /assets/framework.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/assets/framework.jpg -------------------------------------------------------------------------------- /assets/icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/assets/icon.png -------------------------------------------------------------------------------- /configs/agent_config.yaml: -------------------------------------------------------------------------------- 1 | 2 | 3 | tonggpt: 4 | model_name: gpt-4o-2024-08-06 5 | region: eastus 6 | api_key: 7 | open_ai_client_type: openai # or azure 8 | endpoint: # only for azure, you need to specify the endpoint you are using 9 | 10 | visualizer: 11 | model_name: gpt-4o-2024-08-06 12 | region: eastus 13 | api_key: 14 | open_ai_client_type: openai # or azure 15 | endpoint: # only for azure, you need to specify the endpoint you are using 16 | 17 | qwen: 18 | model_name: Qwen/Qwen2-VL-7B-Instruct 19 | endpoint: 20 | 21 | agent_controller: 22 | engine_type: qwen # qwen, minicpm, tonggpt 23 | web_qa: 24 | model_name: gpt-4o-mini-2024-07-18 25 | 26 | internvl2: 27 | model_name: OpenGVLab/InternVL2-8B 28 | 29 | llava: 30 | model_name: Lin-Chen/open-llava-next-llama3-8b 31 | 32 | search_agent: 33 | type: agent 34 | model_name: gpt-4o-2024-08-06 35 | region: eastus 36 | api_key: 37 | 38 | search_engine: 39 | - 40 | cx: 41 | key: 42 | 43 | data_generation: 44 | # the llm can be azure or openai 45 | llm: azure 46 | model: gpt-4o-mini-2024-07-18 47 | api_key: 48 | ape_base: 49 | region: eastus 50 | query_embedding_save_path: image_source/support_embedding_sharegpt4v_100k_chartqa_all.npy 51 | image_base_path: image_source/open_llava_next 52 | caption_data_path: image_source/chartqa_sharegpt4v_all.json 53 | -------------------------------------------------------------------------------- /data_generation.sh: -------------------------------------------------------------------------------- 1 | #1. query, file, and traj generation 2 | python -m data_generation.gaia_pipeline.0_query_generation_tonggpt --timestamp 20241223-213646 3 | python -m data_generation.gaia_pipeline.1_query2file_content_parallel_tonggpt --timestamp 20241223-213646 4 | python -m data_generation.gaia_pipeline.2_file_content2file_tonggpt --timestamp 20241223-213646 --start 0 --end 1000 5 | python -m data_generation.gaia_pipeline.3_traj_genetation_tonggpt --timestamp 20241223-213646 --start 0 --end 1000 6 | 7 | #2. 
verification and structure conversion 8 | python -m data_generation.gaia_pipeline.verifier.0_collect --timestamp 20241223-213646 9 | python -m data_generation.gaia_pipeline.verifier.1_gaia_q_f_filter --timestamp 20241223-213646 10 | python -m data_generation.gaia_pipeline.verifier.2_convert_format --timestamp 20241223-213646 11 | python -m data_generation.gaia_pipeline.verifier.3_gaia_verifier_parallel --timestamp 20241223-213646 12 | -------------------------------------------------------------------------------- /data_generation/gaia_pipeline/merge.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | 4 | 5 | def merge(source_folder, output_folder, filename): 6 | # source_folder = path + '/query/query_json/' 7 | # output_folder = path + '/query/queries_merged' 8 | if os.path.exists(output_folder) is False: 9 | os.makedirs(output_folder) 10 | 11 | save_path = os.path.join(output_folder, filename) 12 | json_files = [pos_json for pos_json in os.listdir(source_folder) if pos_json.endswith('.json')] 13 | data = [] 14 | for json_file in json_files: 15 | print ('===============',os.path.join(source_folder, json_file)) 16 | with open(os.path.join(source_folder, json_file)) as f: 17 | tmp = json.load(f) 18 | if isinstance(tmp, list) and len(tmp) == 1: 19 | tmp = tmp[0] 20 | if isinstance(tmp, list): 21 | data += tmp 22 | else: 23 | data.append(tmp) 24 | length = len(data) 25 | 26 | if os.path.exists(output_folder): 27 | pass 28 | else: 29 | os.makedirs(output_folder) 30 | 31 | with open(save_path, 'w') as f: 32 | json.dump(data, f) 33 | print(f"Successfully merged {length} json files") -------------------------------------------------------------------------------- /data_generation/gaia_pipeline/prompts/file/gaia_file_generation_system.prompt: -------------------------------------------------------------------------------- 1 | You are a helpful assistant that can generate a file by writing Python code. You will be given a content description of the file. You need to first substantially extend the content, and then write Python code to generate a file. GUARANTEE that the provided content is in the file. 2 | 3 | The output Python code MUST use the following template. 4 | ``` 5 | ##extention start 6 | Extened content: 7 | 8 | ##code start 9 | ```python 10 | file> 11 | ``` 12 | ##code end 13 | ``` -------------------------------------------------------------------------------- /data_generation/gaia_pipeline/prompts/file/gaia_file_generation_user.prompt: -------------------------------------------------------------------------------- 1 | Now, given the following content: , first substantially extend the content, and then output code to generate a file, where the file name is and the file will be saved in .
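The two file-generation prompts above ask the model to return an extended version of the provided content plus a Python script that writes that content to disk, guaranteeing the original content ends up in the file. For illustration only, the ##code section of a response might contain a script along the lines of the sketch below; the content, file name, and save directory here are hypothetical and not taken from the pipeline.

```python
import os

# Hypothetical values that would normally come from the prompt placeholders.
content = "Monthly rainfall for Greenfield station: Jan 42 mm, Feb 38 mm, Mar 51 mm."
extended_rows = [("Jan", 42), ("Feb", 38), ("Mar", 51), ("Apr", 47), ("May", 60)]
save_dir = "generated_files"
file_name = "greenfield_rainfall.csv"

os.makedirs(save_dir, exist_ok=True)
file_path = os.path.join(save_dir, file_name)

with open(file_path, "w", encoding="utf-8") as f:
    # Keep the provided content verbatim so it is guaranteed to appear in the file.
    f.write(f"# {content}\n")
    f.write("month,rainfall_mm\n")
    for month, rainfall in extended_rows:
        f.write(f"{month},{rainfall}\n")

print(f"Saved {file_path}")
```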
-------------------------------------------------------------------------------- /data_generation/gaia_pipeline/prompts/file/gaia_file_verifier_user.prompt: -------------------------------------------------------------------------------- 1 | Following are files, and the query: . Infer whether the files can solve the query based on the perception ability, reasoning ability, and information search ability of an AI agent. -------------------------------------------------------------------------------- /data_generation/gaia_pipeline/prompts/file/gaia_user.prompt: -------------------------------------------------------------------------------- 1 | Now, given the query: , and suggested tools to solve this query: . First analyze the information needed to solve the query and divide it into two groups: information searched from the Internet, and information extracted from files using tools. Then, for the information from files, imagine a concrete answer for each piece of information (these should be concrete answers instead of descriptions). Finally, output the JSON for the inferred information and the content of the files. -------------------------------------------------------------------------------- /data_generation/gaia_pipeline/prompts/query/gaia_test_query_generation.prompt: -------------------------------------------------------------------------------- 1 | You are tasked with generating user queries that will prompt an agent to call various tools (only use the tools listed in our toolset), including internet search capabilities, to solve real-world, practical problems. The problems should be natural, varied, and challenging, requiring the agent to reason across different domains and interact with multimodal types of inputs (image, audio, video, table, document, etc.). Ensure that the problems span a range of practical scenarios. 2 | 3 | Our toolset: TOOL_SET 4 | [ 5 | { 6 | "tool_name":"ask_search_agent", 7 | "description": "This will send a message to an agent that will browse the internet to answer your question. Ask him for all your web-search related questions, but he's unable to do problem-solving. Provide him as much context as possible, in particular if you need to search on a specific timeframe! And don't hesitate to provide them with a complex search task, like finding a difference between two webpages." 8 | }, 9 | { 10 | "tool_name":"visualizer", 11 | "description": "A tool that can answer questions about attached images." 12 | }, 13 | { 14 | "tool_name":"PythonInterpreter", 15 | "description": "A tool that can execute Python code to do calculation and plotting, etc." 16 | }, 17 | { 18 | "tool_name":"inspect_file_as_text", 19 | "description": "A tool that can read a file as markdown text and answer questions about it. This tool handles the following file extensions: [".html", ".htm", ".xlsx", ".pptx", ".wav", ".mp3", ".flac", ".pdf", ".docx"], and all other types of text files. IT DOES NOT HANDLE IMAGES." 20 | } 21 | ] 22 | 23 | I will now provide examples, along with the tools. Examples of user queries: IN_CONTEXT_EXAMPLES 24 | 25 | Please output the queries in JSON format. Make sure that the queries share a similar style with the in-context examples. The output template is: 26 | ```json 27 | [ 28 | { 29 | "query": "What is the weather today?", # 30 | "tools": ["tool1", "tool2",...] # 31 | }, 32 | ... 33 | ] 34 | ```
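The query-generation prompt above (and the tool-inference prompt that follows) insists that generated queries reference only the four tools in the toolset. As a purely illustrative sketch, not part of the actual pipeline, a post-check on the model's JSON output could look like the following; the sample queries and variable names are hypothetical.

```python
import json

# The four tools listed in the prompt's toolset.
TOOLSET = {"ask_search_agent", "visualizer", "PythonInterpreter", "inspect_file_as_text"}

# Hypothetical model output following the prompt's JSON template.
raw_output = """
[
    {"query": "What is the weather today?", "tools": ["ask_search_agent"]},
    {"query": "Summarize the attached report.", "tools": ["inspect_file_as_text", "summarizer"]}
]
"""

queries = json.loads(raw_output)
# Keep only queries whose requested tools all appear in the toolset.
valid = [q for q in queries if set(q["tools"]) <= TOOLSET]
print(f"Kept {len(valid)} of {len(queries)} generated queries")
```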
-------------------------------------------------------------------------------- /data_generation/gaia_pipeline/prompts/query/gaia_test_tool.prompt: -------------------------------------------------------------------------------- 1 | You are tasked with inferring which tools an agent will use to solve a given user query. Concretely, the agent will be given a user query, and it needs to call various tools (only use the tools listed in our toolset) to solve real-world, practical problems. Now, you will be given a query; ensure that the inferred tools are indeed necessary to solve the query. 2 | 3 | Our toolset: TOOL_SET 4 | [ 5 | { 6 | "tool_name":"ask_search_agent", 7 | "description": "This will send a message to an agent that will browse the internet to answer your question. Ask him for all your web-search related questions, but he's unable to do problem-solving. Provide him as much context as possible, in particular if you need to search on a specific timeframe! And don't hesitate to provide them with a complex search task, like finding a difference between two webpages." 8 | }, 9 | { 10 | "tool_name":"visualizer", 11 | "description": "A tool that can answer questions about attached images." 12 | }, 13 | { 14 | "tool_name":"PythonInterpreter", 15 | "description": "A tool that can execute Python code to do calculation and plotting, etc." 16 | }, 17 | { 18 | "tool_name":"inspect_file_as_text", 19 | "description": "A tool that can read a file as markdown text and answer questions about it. This tool handles the following file extensions: [".html", ".htm", ".xlsx", ".pptx", ".wav", ".mp3", ".flac", ".pdf", ".docx"], and all other types of text files. IT DOES NOT HANDLE IMAGES." 20 | } 21 | ] 22 | 23 | I will now provide examples, along with the tools. Examples of user queries and tools: IN_CONTEXT_EXAMPLES 24 | 25 | Please output the tools in JSON format. Make sure that the tools are in the toolset. The output template is 26 | ```json 27 | { 28 | "Tools": ["tool1",...] # 29 | } 30 | ``` -------------------------------------------------------------------------------- /data_generation/gaia_pipeline/prompts/query/gaia_val_query_generation.prompt: -------------------------------------------------------------------------------- 1 | You are tasked with generating user queries that will prompt an agent to call various tools (only use the tools listed in our toolset), including internet search capabilities, to solve real-world, practical problems. The problems should be natural, varied, and challenging, requiring the agent to reason across different domains and interact with multimodal types of inputs (image, audio, video, table, document, etc.). Ensure that the problems span a range of practical scenarios. 2 | 3 | Our toolset: TOOL_SET 4 | [ 5 | { 6 | "tool_name":"ask_search_agent", 7 | "description": "This will send a message to an agent that will browse the internet to answer your question. Ask him for all your web-search related questions, but he's unable to do problem-solving. Provide him as much context as possible, in particular if you need to search on a specific timeframe! And don't hesitate to provide them with a complex search task, like finding a difference between two webpages." 8 | }, 9 | { 10 | "tool_name":"visualizer", 11 | "description": "A tool that can answer questions about attached images." 12 | }, 13 | { 14 | "tool_name":"PythonInterpreter", 15 | "description": "A tool that can execute Python code to do calculation and plotting, etc." 16 | }, 17 | { 18 | "tool_name":"inspect_file_as_text", 19 | "description": "A tool that can read a file as markdown text and answer questions about it. This tool handles the following file extensions: [".html", ".htm", ".xlsx", ".pptx", ".wav", ".mp3", ".flac", ".pdf", ".docx"], and all other types of text files. IT DOES NOT HANDLE IMAGES." 20 | } 21 | ] 22 | 23 | I will now provide examples, along with the tools. Examples of user queries: IN_CONTEXT_EXAMPLES 24 | 25 | Please output the queries in JSON format. Make sure that the queries share a similar style with the in-context examples. The output template is: 26 | ```json 27 | [ 28 | { 29 | "query": "What is the weather today?", # 30 | "tools": ["tool1", "tool2",...] # 31 | }, 32 | ...
33 | ] 34 | ``` -------------------------------------------------------------------------------- /data_generation/gaia_pipeline/prompts/statistics/pie_chart.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | 4 | import json 5 | 6 | # Data to plot 7 | def pie_chart(labels, sizes, pdf_save_path): 8 | 9 | 10 | # Generate a list of colors from a colormap 11 | cmap = plt.get_cmap("tab20c") 12 | colors = cmap(np.linspace(0, 1, len(labels))) 13 | 14 | # Plot 15 | plt.figure(figsize=(10, 7)) 16 | plt.pie(sizes, labels=labels, colors=colors, 17 | autopct='%1.1f%%', shadow=False, startangle=140, textprops={'fontsize': 14}, 18 | pctdistance=0.9) # Move the percentage text outward 19 | 20 | plt.axis('equal') # Equal aspect ratio ensures that pie is drawn as a circle. 21 | 22 | plt.title('Programming Language Usage', fontsize=20) 23 | 24 | # Save the figure as a PDF 25 | plt.savefig(pdf_save_path) 26 | 27 | plt.show() 28 | 29 | 30 | 31 | def load_json(path): 32 | with open(path, 'r', encoding='utf-8') as file: 33 | data=json.load(file) 34 | return data 35 | 36 | def write_json(data, filename): 37 | """ 38 | Write a JSON-compatible Python dictionary to a file. 39 | 40 | :param data: The JSON-compatible dictionary to write. 41 | :param filename: The name of the file to write to. 42 | """ 43 | try: 44 | with open(filename, 'w', encoding='utf-8') as file: 45 | json.dump(data, file, ensure_ascii=False, indent=4) 46 | print(f"Data successfully written to {filename}") 47 | except Exception as e: 48 | print(f"An error occurred while writing to the file: {e}") 49 | 50 | 51 | 52 | json_path="data/final_dataset/tool_statistics.json" 53 | pdf_save_path='data/final_dataset/tool_statistics.pdf' 54 | 55 | # json_path="data/final_dataset/file_statistics.json" 56 | # pdf_save_path='data/final_dataset/file_statistics.pdf' 57 | 58 | # json_path="data/final_dataset/topic_statistics.json" 59 | # pdf_save_path='data/final_dataset/topic_statistics.pdf' 60 | 61 | json_data=load_json(json_path) 62 | labels=list(json_data.keys()) 63 | values=list(json_data.values()) 64 | 65 | pie_chart(labels,values,pdf_save_path) -------------------------------------------------------------------------------- /data_generation/gaia_pipeline/prompts/statistics/topic.prompt: -------------------------------------------------------------------------------- 1 | You are a helpful assistant. You will be given a query, and you need to classify the topic of the given query from the following candidates: CANDIDATE. 2 | 3 | Please output the topic in a json format. Make sure the output topic is one of the above candidates. 
The output template is 4 | ```json 5 | [ 6 | { 7 | "Topic": 8 | "Other topic": 9 | } 10 | ] 11 | ``` -------------------------------------------------------------------------------- /data_generation/gaia_pipeline/verifier/0_collect.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import argparse 4 | 5 | def read_json(path): 6 | with open(path, 'r', encoding='utf-8') as file: 7 | data=json.load(file) 8 | return data 9 | 10 | 11 | def save_json(path, data): 12 | with open(path, 'w') as file: 13 | json.dump(data, file, indent=4) 14 | 15 | 16 | 17 | def list_files_in_folder(folder_path): 18 | try: 19 | # Get the list of files and directories in the specified folder 20 | files_and_dirs = os.listdir(folder_path) 21 | 22 | # Filter out directories, keeping only files 23 | files = [f for f in files_and_dirs if os.path.isfile(os.path.join(folder_path, f))] 24 | 25 | return files 26 | except Exception as e: 27 | print(f"An error occurred: {e}") 28 | return [] 29 | 30 | 31 | 32 | parser = argparse.ArgumentParser(description='Generate queries using GAIA data') 33 | parser.add_argument("--timestamp", type=str) 34 | 35 | args = parser.parse_args() 36 | timestamp=args.timestamp 37 | 38 | def list_files_in_directory(path): 39 | try: 40 | # Get a list of all files and directories in the given path 41 | items = os.listdir(path) 42 | 43 | # Filter out directories, keeping only files 44 | files = [item for item in items if os.path.isfile(os.path.join(path, item))] 45 | 46 | return files 47 | except Exception as e: 48 | print(f"An error occurred: {e}") 49 | return [] 50 | 51 | json_list = list_files_in_directory(f'./data_generation/gaia_pipeline/save/{timestamp}/traj/') 52 | 53 | # Example usage 54 | json_root_path = './data_generation/gaia_pipeline/final_save/' 55 | 56 | print ('json list', json_list) 57 | save_name=f'all_json_{timestamp}_gpt4omini.json' 58 | 59 | 60 | all_data=[] 61 | for json_name in json_list: 62 | data = read_json(os.path.join(json_root_path,json_name)) 63 | all_data=all_data+data 64 | 65 | save_json(os.path.join(json_root_path,save_name),all_data) 66 | 67 | print ('total num', len(all_data)) 68 | 69 | 70 | 71 | -------------------------------------------------------------------------------- /data_generation/gaia_pipeline/verifier/prompt/gaia_file_verifier_system.prompt: -------------------------------------------------------------------------------- 1 | You are a helpful assistant that is given a query and several files. You need to check whether the files match the query. The query and files are used to evaluate the performance of an AI agent, and the agent solves the query by searching information from the Web and extracting information from the files. In some cases, based on the given files, the agent cannot solve the query, even if it searches information from the Web (e.g., some specific knowledge). You need to pick out these bad cases. 2 | 3 | Thus, the files should follow these requirements. 4 | 1. Relevance: The depicted scenarios or objects in the files should be relevant to the query and contain the necessary information to address the query. The files should contain scenarios or objects that are mentioned in the query. 5 | 2. Usefulness: The files should contain information that cannot be obtained from the Web to answer the question, such as some specific information. It should not be too simplistic or lack necessary details. 6 | 3. Some queries require the agent to search some knowledge from the Web and combine it with information in the files to solve the queries. Thus, in some cases, the files do not contain all the information needed to solve the query, but the missing information can be searched from the Web. These cases should be regarded as correct cases. 7 | 8 | The output MUST use the following JSON template to evaluate the files. 9 | ''' 10 | ### start json 11 | { 12 | "information_for_query": 13 | "useful_information_in_files": 14 | "missed_information_in_files": 15 | "missed_information_web_search": 16 | "missed_information_computed": 17 | "thought": 18 | "correct": 19 | } 20 | ### end json 21 | '''
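For context: a downstream verifier would typically extract the JSON object from a response that follows this "### start json … ### end json" template before acting on the verdict. The snippet below is a minimal, hypothetical parsing sketch, not the actual pipeline code (the real logic lives in the verifier scripts); the example response values are made up.

```python
import json
import re

def parse_verifier_output(response: str) -> dict:
    """Pull the JSON object between '### start json' and '### end json'."""
    match = re.search(r"### start json(.*?)### end json", response, re.DOTALL)
    if match is None:
        raise ValueError("No JSON block found in verifier response")
    return json.loads(match.group(1).strip())

example_response = """
### start json
{
    "information_for_query": "release year of the pictured album",
    "useful_information_in_files": "album cover image",
    "missed_information_in_files": "release year",
    "missed_information_web_search": "yes",
    "missed_information_computed": "no",
    "thought": "The missing year can be found on the Web.",
    "correct": "yes"
}
### end json
"""

print(parse_verifier_output(example_response)["correct"])  # -> yes
```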
-------------------------------------------------------------------------------- /data_generation/gaia_pipeline/verifier/prompt/gaia_file_verifier_user.prompt: -------------------------------------------------------------------------------- 1 | Given the query: , and the contents of the given files: , infer whether the files are correct or not. -------------------------------------------------------------------------------- /data_generation/gaia_pipeline/verifier/prompt/gaia_traj_verifier_system.prompt: -------------------------------------------------------------------------------- 1 | You are a data quality evaluator that needs to determine whether a query-solving trajectory between a human and an agent is correct. The human gives files and a query, and the agent calls tools to solve the query. The query-solving trajectory contains a task query, the thoughts and code generated by the agent to call tools (Python functions), the tool response of each step, and the final answer. You must assess the alignment between the task query, the corresponding tool usage (thoughts and code generated by the agent), and the execution results (tool responses). Your goal is to ensure the used tools, arguments to the tools, and summarized answers in the trajectory accurately reflect the human’s intentions. 2 | 3 | The query-solving trajectory is incorrect if: 4 | 1. The tool usage does not align with the query’s objective and context, there is useless or unreasonable tool usage, or the agent does not use tools and instead solves the query by itself. 5 | 2. The input arguments to the tools appear incorrect or unreasonable. 6 | 3. The final answers or intermediate results summarized from the observation appear incorrect or unreasonable. 7 | 4. The final answer is not relevant to the task query, or the final answer seems incorrect. 8 | 5. The trajectory (such as tool usage and observations) conflicts with or is not consistent with the file content. -------------------------------------------------------------------------------- /data_generation/gaia_pipeline/verifier/prompt/gaia_traj_verifier_user.prompt: -------------------------------------------------------------------------------- 1 | Now, given the used files and corresponding information, determine whether the trajectory is correct. 2 | − All Available Tools: 3 | 4 | − User Query: 5 | − Trajectory, including generated thought and code from the agent, and intermediate results of using tools: 6 | 7 | − Execution Results: 8 | 9 | Output MUST use the following JSON template to determine whether the query-solving trajectory is correct.
10 | ''' 11 | ### start json 12 | { 13 | "thought": "Concisely describe your reasoning here", 14 | "correct": "yes" or "no" 15 | } 16 | ### end json 17 | ''' -------------------------------------------------------------------------------- /data_generation/gaia_pipeline/verifier/prompt/gta_file_verifier_user.prompt: -------------------------------------------------------------------------------- 1 | Following are images, the query: , inference whether the images can solve the query based on the perception ability, reasoning ability, and information search ability of an AI agent. -------------------------------------------------------------------------------- /data_generation/gaia_pipeline/verifier/prompt/gta_traj_verifier_user.prompt: -------------------------------------------------------------------------------- 1 | Now, given used images and corresponding information, determine the trajectory is correct or not. 2 | 3 | − User Query: 4 | − Image Content: 5 | − Trajectory, including generated thought and code from the agent, and intermediate results of using tools: 6 | 7 | − Execution Results: 8 | 9 | Output MUST use the following json template to determine the query-solving trajectory is correct or not. 10 | ''' 11 | ### start json 12 | { 13 | "thought": "Concisely describe your reasoning here", 14 | "correct": "yes" or "no" 15 | } 16 | ### end json 17 | ''' -------------------------------------------------------------------------------- /examples/gaia/analysis.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | import numpy as np 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("--data-path") 7 | args = parser.parse_args() 8 | 9 | 10 | files = os.listdir(args.data_path) 11 | 12 | files = [os.path.join(args.data_path, f) for f in files] 13 | 14 | files = [f for f in files if os.path.isdir(f)] 15 | 16 | counts = [] 17 | for f in files: 18 | 19 | f = os.path.join(f, "agent_memory.json") 20 | with open(f, "r") as f: 21 | dataset = json.load(f) 22 | 23 | conv = dataset["conversations"] 24 | turn = len(conv) 25 | steps = (turn - 2) // 2 26 | print(steps) 27 | counts.append(steps) 28 | # print(conv) 29 | # break 30 | 31 | import matplotlib.pyplot as plt 32 | 33 | plt.figure(dpi=300) 34 | plt.hist(counts, bins=7) 35 | plt.xlabel("Steps") 36 | plt.ylabel("Task counts") 37 | plt.grid() 38 | plt.show() -------------------------------------------------------------------------------- /examples/gaia/eval.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.insert(0, "./") 3 | import sqlite3 4 | 5 | from tongagent.evaluation.gaia_scorer import question_scorer 6 | from tongagent.llm_engine.gpt import get_tonggpt_open_ai_client 7 | from tongagent.prompt import FORMAT_ANSWER_PROMPT_GAIA 8 | from langchain.prompts import ChatPromptTemplate 9 | from tqdm import tqdm 10 | 11 | import argparse 12 | 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("--data-path") 15 | args = parser.parse_args() 16 | 17 | cache_db = sqlite3.connect(args.data_path) 18 | cursor = cache_db.cursor() 19 | cursor.execute(f"SELECT * FROM qa_cache") 20 | rows = cursor.fetchall() 21 | cache_db.close() 22 | 23 | # print(rows) 24 | client, model = get_tonggpt_open_ai_client() 25 | template = ChatPromptTemplate.from_template(FORMAT_ANSWER_PROMPT_GAIA) 26 | n_total = len(rows) 27 | correct = 0 28 | eval_data = [] 29 | for row in tqdm(rows): 30 | try: 31 | 
is_this_correct = question_scorer( 32 | ground_truth=row[-1], 33 | model_answer=row[-2] 34 | ) 35 | except Exception as e: 36 | print("question_scorer failed", e) 37 | is_this_correct = 0 38 | if is_this_correct == 0: 39 | task = row[0] 40 | final_answer = row[-2] 41 | prompt_input = { 42 | "question": task, 43 | "answer": final_answer 44 | } 45 | prompt = template.invoke(prompt_input) 46 | messages = [ 47 | {"role": "user", "content": prompt.to_messages()[0].content} 48 | ] 49 | 50 | response = client.chat.completions.create( 51 | messages = messages, 52 | model = model 53 | ) 54 | final_answer: str = response.choices[0].message.content 55 | if "Educated guess:" in final_answer: 56 | final_answer = final_answer.replace("Educated guess:", "").strip() 57 | try: 58 | is_this_correct = question_scorer( 59 | ground_truth=row[-1], 60 | model_answer=final_answer 61 | ) 62 | except Exception as e: 63 | print("question_scorer failed", e) 64 | is_this_correct = 0 65 | else: 66 | final_answer = row[-2] 67 | eval_data.append( 68 | row + (final_answer, is_this_correct) 69 | ) 70 | print("Correct" if is_this_correct == 1 else 'Incorrect', "GT:",row[-1], "Prediction:", row[-2]) 71 | correct += is_this_correct 72 | import pandas as pd 73 | 74 | df = pd.DataFrame(eval_data, columns=["question", 'task_id', 'answer', 'ground_truth', 'formatted_answer', "correct"]) 75 | df.to_csv(args.data_path.replace('.db', '.csv')) 76 | print("Total:", n_total) 77 | print("Correct Item:", correct) 78 | print("Accuracy:", round(100 * correct / n_total, 2), "%") -------------------------------------------------------------------------------- /examples/gaia/playground.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.insert(0, "./") 3 | import sqlite3 4 | 5 | from tongagent.agents.data_sampling_agent import create_agent 6 | from tongagent.utils import load_config 7 | from datasets import load_dataset 8 | import os 9 | import argparse 10 | from typing import Optional 11 | from tqdm import tqdm 12 | 13 | DATA_NAME = "2023_level2" 14 | SPLIT = "validation" 15 | 16 | def run(agent, raw_question, attachment_name): 17 | if attachment_name is not None and attachment_name.strip() != "": 18 | question = f"{raw_question}\nAttachment: data/GAIA/2023/{SPLIT}/{attachment_name}" 19 | else: 20 | question = raw_question 21 | 22 | if attachment_name is not None and (attachment_name.endswith(".png") or attachment_name.endswith(".jpg")): 23 | agent.image_paths = [f'data/GAIA/2023/{SPLIT}/{attachment_name}'] 24 | else: 25 | agent.image_paths = [] 26 | 27 | result = agent.run(question) 28 | agent.save_trajectory() 29 | return result 30 | 31 | 32 | ds = load_dataset("gaia-benchmark/GAIA", DATA_NAME, split=SPLIT) 33 | agent = create_agent(llm_engine="tonggpt", task="gaia", error_tolerance=3) 34 | 35 | # selected = "e8cb5b03-41e0-4086-99e5-f6806cd97211" 36 | # item = [item for item in ds if item["task_id"] == selected][0] 37 | # print("item", item) 38 | 39 | # question = "The object in the British Museum's collection with a museum number of 2012,5015.17 is the shell of a particular mollusk species. According to the abstract of a research article published in Science Advances in 2021, beads made from the shells of this species were found that are at least how many thousands of years old?" 40 | 41 | # question = "The year is 2022. I am at the National Air and Space Museum east of the Potomac River. I want to go to Fire Station 301 DCA ARFF using the metro. 
I go in the wrong direction and end up at the station closest to Cleveland Elementary School. How many metro stations am I away from my original destination if I don't change lines? Your answer should be a numerical integer value." 42 | 43 | # question = "In the YouTube 360 VR video from March 2018 narrated by the voice actor of Lord of the Rings' Gollum, what number was mentioned by the narrator directly after dinosaurs were first shown in the video?" 44 | 45 | # question = "In the YouTube 360 VR video from March 2018 narrated by the voice actor of Lord of the Rings' Gollum, what chemical terminology was mentioned by the narrator directly after H2O were first mentioned in the video?" 46 | 47 | question = "Visit Bofei's Site to find his current position in industry." 48 | file_name = None 49 | result = run( 50 | agent, 51 | raw_question=question, 52 | attachment_name=file_name 53 | ) 54 | 55 | print(result) -------------------------------------------------------------------------------- /examples/gaia/view.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.insert(0, "./") 3 | import sqlite3 4 | 5 | from tongagent.evaluation.gaia_scorer import question_scorer 6 | from tongagent.llm_engine.gpt import get_tonggpt_open_ai_client 7 | from tongagent.prompt import FORMAT_ANSWER_PROMPT_GAIA 8 | from langchain.prompts import ChatPromptTemplate 9 | from tqdm import tqdm 10 | 11 | import argparse 12 | 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("--data-path") 15 | args = parser.parse_args() 16 | 17 | # cache_db = sqlite3.connect(args.data_path) 18 | # cursor = cache_db.cursor() 19 | # cursor.execute(f"SELECT * FROM qa_cache") 20 | # rows = cursor.fetchall() 21 | # print(rows) 22 | # print(len(rows)) 23 | 24 | from datasets import load_dataset 25 | ds = load_dataset("gaia-benchmark/GAIA", "2023_level1", split="validation") 26 | 27 | subset = ds[0:10] 28 | for k, v in subset.items(): 29 | print(k, len(v)) 30 | -------------------------------------------------------------------------------- /examples/gta/eval.py: -------------------------------------------------------------------------------- 1 | import json 2 | import argparse 3 | import os 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("--folder", required=True) 7 | 8 | args = parser.parse_args() 9 | subfolders = os.listdir(args.folder) 10 | 11 | total = 0 12 | correct = 0 13 | all_samples = len(subfolders) 14 | for subfolder in subfolders: 15 | data_path = os.path.join(args.folder, subfolder, "agent_memory.json") 16 | 17 | with open(data_path, "r") as f: 18 | dataset = json.load(f) 19 | 20 | gt, answer = dataset["ground_truth"], dataset["final_answer"] 21 | if gt is None: 22 | continue 23 | skip = False 24 | is_correct = True 25 | for each in gt: 26 | if type(each) is str: 27 | skip = True 28 | break 29 | 30 | if type(each) is list: 31 | is_this_gt_correct = [] 32 | for item in each: 33 | is_this_gt_correct.append(item.lower() in str(answer).lower()) 34 | 35 | is_correct = is_correct and any(is_this_gt_correct) 36 | else: 37 | raise ValueError("unexpected") 38 | 39 | if skip: 40 | continue 41 | if is_correct: 42 | print("Correct:", gt, answer) 43 | correct += 1 44 | else: 45 | print("Incorrect", gt, answer) 46 | total += 1 47 | # print(gt, answer) 48 | 49 | print("Folder", args) 50 | print("Total samples valid:", total, "Correct sample", correct, "all samples", all_samples) 51 | print("Accuracy", round(correct / total, 4) * 100, "%") 52 | 
print("Accuracy (all samples)", round(correct / all_samples, 4) * 100, "%") -------------------------------------------------------------------------------- /experiments/CPM-FT/README.md: -------------------------------------------------------------------------------- 1 | # Setup 2 | 3 | Install environment 4 | ```bash 5 | conda create -n cpm python=3.10 6 | conda activate cpm 7 | pip install -r requirements.txt 8 | ``` 9 | 10 | Setup data path 11 | ``` 12 | 13 | ln -s /home/lipengxiang/codes/TongAgent/data/tongagent data/tongagent 14 | ``` -------------------------------------------------------------------------------- /experiments/CPM-FT/assets/airplane.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/experiments/CPM-FT/assets/airplane.jpeg -------------------------------------------------------------------------------- /experiments/CPM-FT/assets/prompt.txt: -------------------------------------------------------------------------------- 1 | You are an autonomous intelligent agent tasked with navigating a web browser . You will be given web - based tasks . These tasks will be accomplished through the use of specific actions you can issue. Here’s the information you’ll have: 2 | The user’s objective: Tell me about birthday of Mercedes Sosa 3 | The current web page’s URL: https://en.wikipedia.org/wiki/Mercedes_Sosa 4 | The open tabs: Mercedes_Sosa 5 | The previous action: None 6 | The actions you can perform fall into several categories: 7 | Page Operation Actions: 8 | ```click[id]```: This action clicks on an element with a specific id on the webpage. 9 | ```type[id][content]```: Use this to type the content into the field with id. By default, the " Enter " key is pressed after typing unless press_enter_after is set to 0, i.e., ```type[id][content][0]```. 10 | ```hover[id]```: Hover over an element with id. 11 | ```press[key_comb]```: Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v). 12 | ```scroll[down]``` or ```scroll[up]```: Scroll the page up or down. 13 | Tab Management Actions : 14 | ```new_tab```: Open a new, empty browser tab. 15 | ```tab_focus[tab_index]```: Switch the browser’s focus to a specific tab using its index. 16 | ```close_tab```: Close the currently active tab. 17 | URL Navigation Actions: 18 | ```goto[url]```: Navigate to a specific URL. 19 | ```go_back```: Navigate to the previously viewed page. 20 | ```go_forward```: Navigate to the next page (if a previous’ go_back’ action was performed). 21 | Completion Action : 22 | ```stop[answer]```: Issue this action when you believe the task is 23 | complete. If the objective is to find a text-based answer, provide 24 | the answer in the bracket. 
25 | Homepage: https://www.google.com.hk/ -------------------------------------------------------------------------------- /experiments/CPM-FT/assets/sosa.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/experiments/CPM-FT/assets/sosa.png -------------------------------------------------------------------------------- /experiments/CPM-FT/finetune/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/experiments/CPM-FT/finetune/__init__.py -------------------------------------------------------------------------------- /experiments/CPM-FT/finetune/ds_config_zero2.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | 11 | "bf16": { 12 | "enabled": "auto" 13 | }, 14 | 15 | "optimizer": { 16 | "type": "AdamW", 17 | "params": { 18 | "lr": "auto", 19 | "betas": "auto", 20 | "eps": "auto", 21 | "weight_decay": "auto" 22 | } 23 | }, 24 | 25 | "scheduler": { 26 | "type": "WarmupLR", 27 | "params": { 28 | "warmup_min_lr": "auto", 29 | "warmup_max_lr": "auto", 30 | "warmup_num_steps": "auto" 31 | } 32 | }, 33 | 34 | "zero_optimization": { 35 | "stage": 2, 36 | "offload_optimizer": { 37 | "device": "none", 38 | "pin_memory": true 39 | }, 40 | "allgather_partitions": true, 41 | "allgather_bucket_size": 2e8, 42 | "overlap_comm": true, 43 | "reduce_scatter": true, 44 | "reduce_bucket_size": 2e8, 45 | "contiguous_gradients": true 46 | }, 47 | 48 | "gradient_accumulation_steps": "auto", 49 | "gradient_clipping": "auto", 50 | "steps_per_print": 100, 51 | "train_batch_size": "auto", 52 | "train_micro_batch_size_per_gpu": "auto", 53 | "wall_clock_breakdown": false 54 | } 55 | -------------------------------------------------------------------------------- /experiments/CPM-FT/finetune/ds_config_zero3.json: -------------------------------------------------------------------------------- 1 | 2 | { 3 | "fp16": { 4 | "enabled": "auto", 5 | "loss_scale": 0, 6 | "loss_scale_window": 1000, 7 | "initial_scale_power": 16, 8 | "hysteresis": 2, 9 | "min_loss_scale": 1 10 | }, 11 | "bf16": { 12 | "enabled": "auto" 13 | }, 14 | "optimizer": { 15 | "type": "AdamW", 16 | "params": { 17 | "lr": "auto", 18 | "betas": "auto", 19 | "eps": "auto", 20 | "weight_decay": "auto" 21 | } 22 | }, 23 | 24 | "scheduler": { 25 | "type": "WarmupLR", 26 | "params": { 27 | "warmup_min_lr": "auto", 28 | "warmup_max_lr": "auto", 29 | "warmup_num_steps": "auto" 30 | } 31 | }, 32 | 33 | "zero_optimization": { 34 | "stage": 3, 35 | "offload_optimizer": { 36 | "device": "none", 37 | "pin_memory": true 38 | }, 39 | "offload_param": { 40 | "device": "none", 41 | "pin_memory": true 42 | }, 43 | "overlap_comm": true, 44 | "contiguous_gradients": true, 45 | "sub_group_size": 1e9, 46 | "reduce_bucket_size": "auto", 47 | "stage3_prefetch_bucket_size": "auto", 48 | "stage3_param_persistence_threshold": "auto", 49 | "stage3_max_live_parameters": 1e9, 50 | "stage3_max_reuse_distance": 1e9, 51 | "stage3_gather_16bit_weights_on_model_save": true 52 | }, 53 | 54 | "gradient_accumulation_steps": "auto", 55 | "gradient_clipping": "auto", 56 | "steps_per_print": 100, 57 | 
"train_batch_size": "auto", 58 | "train_micro_batch_size_per_gpu": "auto", 59 | "wall_clock_breakdown": false 60 | } 61 | 62 | -------------------------------------------------------------------------------- /experiments/CPM-FT/finetune/finetune_ds.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | GPUS_PER_NODE=8 4 | NNODES=1 5 | NODE_RANK=0 6 | MASTER_ADDR=localhost 7 | MASTER_PORT=6001 8 | 9 | MODEL="openbmb/MiniCPM-V-2_6" 10 | # or openbmb/MiniCPM-V-2, openbmb/MiniCPM-Llama3-V-2_5 11 | # ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations. 12 | # See the section for finetuning in README for more information. 13 | DATA="path/to/trainging_data" 14 | EVAL_DATA="path/to/test_data" 15 | LLM_TYPE="qwen2" # if use openbmb/MiniCPM-V-2, please set LLM_TYPE=minicpm, if use openbmb/MiniCPM-Llama3-V-2_5, please set LLM_TYPE="llama3" 16 | 17 | 18 | 19 | DISTRIBUTED_ARGS=" 20 | --nproc_per_node $GPUS_PER_NODE \ 21 | --nnodes $NNODES \ 22 | --node_rank $NODE_RANK \ 23 | --master_addr $MASTER_ADDR \ 24 | --master_port $MASTER_PORT 25 | " 26 | torchrun $DISTRIBUTED_ARGS finetune.py \ 27 | --model_name_or_path $MODEL \ 28 | --llm_type $LLM_TYPE \ 29 | --data_path $DATA \ 30 | --eval_data_path $EVAL_DATA \ 31 | --remove_unused_columns false \ 32 | --label_names "labels" \ 33 | --prediction_loss_only false \ 34 | --bf16 true \ 35 | --bf16_full_eval true \ 36 | --fp16 false \ 37 | --fp16_full_eval false \ 38 | --do_train \ 39 | --do_eval \ 40 | --tune_vision true \ 41 | --tune_llm true \ 42 | --model_max_length 2048 \ 43 | --max_slice_nums 9 \ 44 | --max_steps 10000 \ 45 | --eval_steps 1000 \ 46 | --output_dir output/output_minicpmv26 \ 47 | --logging_dir output/output_minicpmv26 \ 48 | --logging_strategy "steps" \ 49 | --per_device_train_batch_size 1 \ 50 | --per_device_eval_batch_size 1 \ 51 | --gradient_accumulation_steps 1 \ 52 | --evaluation_strategy "steps" \ 53 | --save_strategy "steps" \ 54 | --save_steps 1000 \ 55 | --save_total_limit 10 \ 56 | --learning_rate 1e-6 \ 57 | --weight_decay 0.1 \ 58 | --adam_beta2 0.95 \ 59 | --warmup_ratio 0.01 \ 60 | --lr_scheduler_type "cosine" \ 61 | --logging_steps 1 \ 62 | --gradient_checkpointing true \ 63 | --deepspeed ds_config_zero2.json \ 64 | --report_to "tensorboard" 65 | -------------------------------------------------------------------------------- /experiments/CPM-FT/finetune/finetune_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | GPUS_PER_NODE=8 4 | NNODES=1 5 | NODE_RANK=0 6 | MASTER_ADDR=localhost 7 | MASTER_PORT=6001 8 | 9 | MODEL="openbmb/MiniCPM-V-2_6" # or openbmb/MiniCPM-V-2, openbmb/MiniCPM-Llama3-V-2_5 10 | # ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations. 11 | # See the section for finetuning in README for more information. 
12 | DATA="path/to/trainging_data" 13 | EVAL_DATA="path/to/test_data" 14 | LLM_TYPE="qwen2" 15 | # if use openbmb/MiniCPM-V-2, please set LLM_TYPE=minicpm 16 | #if use openbmb/MiniCPM-Llama3-V-2_5, please set LLM_TYPE=llama3 17 | DISTRIBUTED_ARGS=" 18 | --nproc_per_node $GPUS_PER_NODE \ 19 | --nnodes $NNODES \ 20 | --node_rank $NODE_RANK \ 21 | --master_addr $MASTER_ADDR \ 22 | --master_port $MASTER_PORT 23 | " 24 | torchrun $DISTRIBUTED_ARGS finetune.py \ 25 | --model_name_or_path $MODEL \ 26 | --llm_type $LLM_TYPE \ 27 | --data_path $DATA \ 28 | --eval_data_path $EVAL_DATA \ 29 | --remove_unused_columns false \ 30 | --label_names "labels" \ 31 | --prediction_loss_only false \ 32 | --bf16 false \ 33 | --bf16_full_eval false \ 34 | --fp16 true \ 35 | --fp16_full_eval true \ 36 | --do_train \ 37 | --do_eval \ 38 | --tune_vision true \ 39 | --tune_llm false \ 40 | --use_lora true \ 41 | --lora_target_modules "llm\..*layers\.\d+\.self_attn\.(q_proj|k_proj|v_proj|o_proj)" \ 42 | --model_max_length 2048 \ 43 | --max_slice_nums 9 \ 44 | --max_steps 10000 \ 45 | --eval_steps 1000 \ 46 | --output_dir output/output__lora \ 47 | --logging_dir output/output_lora \ 48 | --logging_strategy "steps" \ 49 | --per_device_train_batch_size 1 \ 50 | --per_device_eval_batch_size 1 \ 51 | --gradient_accumulation_steps 1 \ 52 | --evaluation_strategy "steps" \ 53 | --save_strategy "steps" \ 54 | --save_steps 1000 \ 55 | --save_total_limit 10 \ 56 | --learning_rate 1e-6 \ 57 | --weight_decay 0.1 \ 58 | --adam_beta2 0.95 \ 59 | --warmup_ratio 0.01 \ 60 | --lr_scheduler_type "cosine" \ 61 | --logging_steps 1 \ 62 | --gradient_checkpointing true \ 63 | --deepspeed ds_config_zero2.json \ 64 | --report_to "tensorboard" # wandb 65 | -------------------------------------------------------------------------------- /experiments/CPM-FT/inference/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/experiments/CPM-FT/inference/__init__.py -------------------------------------------------------------------------------- /experiments/CPM-FT/inference/eval.py: -------------------------------------------------------------------------------- 1 | from inference.utils import load_pretrained_model 2 | from PIL import Image 3 | 4 | def eval(image_path): 5 | image = Image.open(image_path).convert('RGB') 6 | model, tokenizer = load_pretrained_model() 7 | q = '''You are an autonomous intelligent agent tasked with navigating a web browser . You will be given web - based tasks . These tasks will be accomplished through the use of specific actions you can issue. Here’s the information you’ll have: 8 | The user’s objective: Tell me about birthday of Mercedes Sosa 9 | The current web page’s URL: https://en.wikipedia.org/wiki/Mercedes_Sosa 10 | The open tabs: Mercedes_Sosa 11 | The previous action: None 12 | The actions you can perform fall into several categories: 13 | Page Operation Actions: 14 | ```click[id]```: This action clicks on an element with a specific id on the webpage. 15 | ```type[id][content]```: Use this to type the content into the field with id. By default, the " Enter " key is pressed after typing unless press_enter_after is set to 0, i.e., ```type[id][content][0]```. 16 | ```hover[id]```: Hover over an element with id. 17 | ```press[key_comb]```: Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v). 
18 | ```scroll[down]``` or ```scroll[up]```: Scroll the page up or down. 19 | Tab Management Actions : 20 | ```new_tab```: Open a new, empty browser tab. 21 | ```tab_focus[tab_index]```: Switch the browser’s focus to a specific tab using its index. 22 | ```close_tab```: Close the currently active tab. 23 | URL Navigation Actions: 24 | ```goto[url]```: Navigate to a specific URL. 25 | ```go_back```: Navigate to the previously viewed page. 26 | ```go_forward```: Navigate to the next page (if a previous’ go_back’ action was performed). 27 | Completion Action : 28 | ```stop[answer]```: Issue this action when you believe the task is 29 | complete. If the objective is to find a text-based answer, provide 30 | the answer in the bracket. 31 | Homepage: https://www.google.com.hk/ 32 | ''' 33 | msgs = [{'role': 'user', 'content': [image, q]}] 34 | 35 | answer = model.chat( 36 | image=None, 37 | msgs=msgs, 38 | tokenizer=tokenizer 39 | ) 40 | return answer 41 | 42 | 43 | if __name__ == "__main__": 44 | answer = eval("assets/sosa.png") 45 | print(answer) -------------------------------------------------------------------------------- /experiments/CPM-FT/inference/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from PIL import Image 3 | from transformers import AutoModel, AutoTokenizer 4 | 5 | def load_pretrained_model(): 6 | torch.manual_seed(0) 7 | 8 | model = AutoModel.from_pretrained('openbmb/MiniCPM-V-2_6', trust_remote_code=True, 9 | attn_implementation='sdpa', torch_dtype=torch.bfloat16) # sdpa or flash_attention_2, no eager 10 | model = model.eval().cuda() 11 | tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-V-2_6', trust_remote_code=True) 12 | return model, tokenizer 13 | 14 | from peft import PeftModel 15 | 16 | def load_pretrained_model_lora(peft_model_id): 17 | # model_id = 'openbmb/MiniCPM-V-2_6' 18 | model, tokenizer = load_pretrained_model() 19 | print("Load Lora") 20 | model = PeftModel.from_pretrained(model, peft_model_id) 21 | print("Lora merge and unload") 22 | model.merge_and_unload() 23 | return model, tokenizer -------------------------------------------------------------------------------- /experiments/CPM-FT/main.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /experiments/CPM-FT/requirements.txt: -------------------------------------------------------------------------------- 1 | packaging==23.2 2 | addict==2.4.0 3 | editdistance==0.6.2 4 | einops==0.7.0 5 | fairscale==0.4.0 6 | jsonlines==4.0.0 7 | markdown2==2.4.10 8 | matplotlib==3.7.4 9 | more_itertools==10.1.0 10 | nltk==3.8.1 11 | numpy==1.24.4 12 | opencv_python_headless==4.5.5.64 13 | openpyxl==3.1.2 14 | Pillow==10.1.0 15 | sacrebleu==2.3.2 16 | seaborn==0.13.0 17 | shortuuid==1.0.11 18 | #spacy==3.7.2 19 | timm==0.9.10 20 | torch==2.1.2 21 | torchvision==0.16.2 22 | tqdm==4.66.1 23 | protobuf==4.25.0 24 | transformers==4.40.0 25 | typing_extensions==4.8.0 26 | uvicorn==0.24.0.post1 27 | #xformers==0.0.22.post7 28 | #flash_attn==2.3.4 29 | sentencepiece==0.1.99 30 | accelerate==0.30.1 31 | socksio==1.0.0 32 | gradio==4.41.0 33 | gradio_client 34 | http://thunlp.oss-cn-qingdao.aliyuncs.com/multi_modal/never_delete/modelscope_studio-0.4.0.9-py3-none-any.whl 35 | decord -------------------------------------------------------------------------------- /experiments/CPM-FT/scripts/convert_baai_stats.py: 
-------------------------------------------------------------------------------- 1 | from datasets import load_dataset 2 | from tqdm import tqdm 3 | from collections import defaultdict 4 | ds = load_dataset("BAAI/Infinity-Instruct", "0625") 5 | 6 | ds = ds["train"] 7 | ability = defaultdict(lambda : 0) 8 | cate_ability = defaultdict(lambda : 0) 9 | for item in tqdm(ds): 10 | # print(item) 11 | abs = item["label"]["ability_en"] 12 | for each in abs: 13 | ability[each] += 1 14 | 15 | abs = item["label"]["cate_ability_en"] 16 | for each in abs: 17 | cate_ability[each] += 1 18 | # break 19 | print(ability) 20 | print(cate_ability) 21 | 22 | with open("stats.json", "w") as f: 23 | import json 24 | json.dump({"abs": ability, "cate_abs": cate_ability}, f, indent=4, ensure_ascii=False) -------------------------------------------------------------------------------- /experiments/CPM-FT/scripts/download_cauldron.py: -------------------------------------------------------------------------------- 1 | import datasets 2 | from datasets import load_dataset 3 | 4 | def convert(item): 5 | pass 6 | ds = load_dataset("HuggingFaceM4/the_cauldron", "ai2d") 7 | dataset = ds["train"] 8 | for item in dataset: 9 | print(item) 10 | break -------------------------------------------------------------------------------- /experiments/CPM-FT/scripts/ds_config_zero2.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | 11 | "bf16": { 12 | "enabled": "auto" 13 | }, 14 | 15 | "optimizer": { 16 | "type": "AdamW", 17 | "params": { 18 | "lr": "auto", 19 | "betas": "auto", 20 | "eps": "auto", 21 | "weight_decay": "auto" 22 | } 23 | }, 24 | 25 | "zero_optimization": { 26 | "stage": 2, 27 | "offload_optimizer": { 28 | "device": "none", 29 | "pin_memory": true 30 | }, 31 | "allgather_partitions": true, 32 | "allgather_bucket_size": 2e8, 33 | "overlap_comm": true, 34 | "reduce_scatter": true, 35 | "reduce_bucket_size": 2e8, 36 | "contiguous_gradients": true 37 | }, 38 | 39 | "gradient_accumulation_steps": "auto", 40 | "gradient_clipping": "auto", 41 | "steps_per_print": 100, 42 | "train_batch_size": "auto", 43 | "train_micro_batch_size_per_gpu": "auto", 44 | "wall_clock_breakdown": false 45 | } 46 | -------------------------------------------------------------------------------- /experiments/CPM-FT/scripts/ds_config_zero3.json: -------------------------------------------------------------------------------- 1 | 2 | { 3 | "fp16": { 4 | "enabled": "auto", 5 | "loss_scale": 0, 6 | "loss_scale_window": 1000, 7 | "initial_scale_power": 16, 8 | "hysteresis": 2, 9 | "min_loss_scale": 1 10 | }, 11 | "bf16": { 12 | "enabled": "auto" 13 | }, 14 | "optimizer": { 15 | "type": "AdamW", 16 | "params": { 17 | "lr": "auto", 18 | "betas": "auto", 19 | "eps": "auto", 20 | "weight_decay": "auto" 21 | } 22 | }, 23 | 24 | "scheduler": { 25 | "type": "WarmupLR", 26 | "params": { 27 | "warmup_min_lr": "auto", 28 | "warmup_max_lr": "auto", 29 | "warmup_num_steps": "auto" 30 | } 31 | }, 32 | 33 | "zero_optimization": { 34 | "stage": 3, 35 | "offload_optimizer": { 36 | "device": "none", 37 | "pin_memory": true 38 | }, 39 | "offload_param": { 40 | "device": "none", 41 | "pin_memory": true 42 | }, 43 | "overlap_comm": true, 44 | "contiguous_gradients": true, 45 | "sub_group_size": 1e9, 46 | "reduce_bucket_size": "auto", 47 | 
"stage3_prefetch_bucket_size": "auto", 48 | "stage3_param_persistence_threshold": "auto", 49 | "stage3_max_live_parameters": 1e9, 50 | "stage3_max_reuse_distance": 1e9, 51 | "stage3_gather_16bit_weights_on_model_save": true 52 | }, 53 | 54 | "gradient_accumulation_steps": "auto", 55 | "gradient_clipping": "auto", 56 | "steps_per_print": 100, 57 | "train_batch_size": "auto", 58 | "train_micro_batch_size_per_gpu": "auto", 59 | "wall_clock_breakdown": false 60 | } 61 | 62 | -------------------------------------------------------------------------------- /experiments/CPM-FT/scripts/filter_baai_dataset.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset 2 | from tqdm import tqdm 3 | from collections import defaultdict 4 | ds = load_dataset("BAAI/Infinity-Instruct", "0625") 5 | 6 | ds = ds["train"] 7 | selected = [ 8 | "python programming", 9 | #"search skills", 10 | #"code refactoring", 11 | #"search engine optimization", 12 | #"code debugging", 13 | #"code modification", 14 | #"code implementation", 15 | ] 16 | import copy 17 | def process(item): 18 | item_new = copy.deepcopy(item) 19 | item_new["image"] = dict() 20 | 21 | conv = [] 22 | for turn in item["conversations"]: 23 | role = "user" if turn["from"] == "human" else "assistant" 24 | conv.append( 25 | {"role": role, "content": turn["value"]} 26 | ) 27 | item_new["conversations"] = conv 28 | return item_new 29 | saved = [] 30 | for item in tqdm(ds): 31 | abs = item["label"]["ability_en"] 32 | keep = False 33 | for each in abs: 34 | if each in selected: 35 | keep = True 36 | break 37 | 38 | 39 | 40 | if not keep: 41 | continue 42 | 43 | saved.append(process(item)) 44 | 45 | print("Total", len(saved)) 46 | with open("subset.json", "w") as f: 47 | import json 48 | json.dump(saved, f, indent=4, ensure_ascii=False) -------------------------------------------------------------------------------- /experiments/CPM-FT/scripts/finetune_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | GPUS_PER_NODE=$(nvidia-smi -L | wc -l) 4 | NNODES=1 5 | NODE_RANK=0 6 | MASTER_ADDR=localhost 7 | MASTER_PORT=6001 8 | 9 | MODEL="openbmb/MiniCPM-V-2_6" # or openbmb/MiniCPM-V-2, openbmb/MiniCPM-Llama3-V-2_5 10 | # ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations. 11 | # See the section for finetuning in README for more information. 
12 | DATA="data/agent_tune_dataset_cpm_cleaned_9k.json" 13 | LLM_TYPE="qwen2" 14 | # if use openbmb/MiniCPM-V-2, please set LLM_TYPE=minicpm 15 | #if use openbmb/MiniCPM-Llama3-V-2_5, please set LLM_TYPE=llama3 16 | DISTRIBUTED_ARGS=" 17 | --nproc_per_node $GPUS_PER_NODE \ 18 | --nnodes $NNODES \ 19 | --node_rank $NODE_RANK \ 20 | --master_addr $MASTER_ADDR \ 21 | --master_port $MASTER_PORT 22 | " 23 | 24 | export WANDB_PROJECT=minicpm 25 | torchrun $DISTRIBUTED_ARGS finetune/finetune.py \ 26 | --model_name_or_path $MODEL \ 27 | --llm_type $LLM_TYPE \ 28 | --data_path $DATA \ 29 | --remove_unused_columns false \ 30 | --label_names "labels" \ 31 | --prediction_loss_only false \ 32 | --bf16 false \ 33 | --bf16_full_eval false \ 34 | --fp16 true \ 35 | --fp16_full_eval true \ 36 | --do_train \ 37 | --tune_vision false \ 38 | --tune_llm false \ 39 | --use_lora true \ 40 | --lora_target_modules "llm\..*layers\.\d+\.self_attn\.(q_proj|k_proj|v_proj|o_proj)" \ 41 | --model_max_length 10240 \ 42 | --max_slice_nums 9 \ 43 | --eval_steps 100000 \ 44 | --output_dir output/cpm_v2_6_$SLURM_JOB_ID \ 45 | --logging_dir output/cpm_v2_6_log_$SLURM_JOB_ID \ 46 | --logging_strategy "steps" \ 47 | --per_device_train_batch_size 2 \ 48 | --per_device_eval_batch_size 1 \ 49 | --gradient_accumulation_steps 1 \ 50 | --evaluation_strategy "steps" \ 51 | --save_strategy "steps" \ 52 | --save_steps 100000 \ 53 | --save_total_limit 1 \ 54 | --learning_rate 1e-6 \ 55 | --weight_decay 0.1 \ 56 | --adam_beta2 0.95 \ 57 | --warmup_ratio 0.01 \ 58 | --lr_scheduler_type "cosine" \ 59 | --logging_steps 1 \ 60 | --gradient_checkpointing true \ 61 | --deepspeed scripts/ds_config_zero2.json \ 62 | --report_to wandb \ 63 | --num_train_epochs 1 \ 64 | --image_base_path ./ 65 | -------------------------------------------------------------------------------- /experiments/CPM-FT/scripts/subset.py: -------------------------------------------------------------------------------- 1 | 2 | import json 3 | 4 | 5 | source_file = "data/agent_tune_dataset_cpm.json" 6 | with open(source_file, "r") as f: 7 | dataset = json.load(f) 8 | 9 | 10 | with open("data/debug_small.json", "w") as f: 11 | json.dump(dataset[:100], f, indent=4, ensure_ascii=False) -------------------------------------------------------------------------------- /experiments/CPM-FT/scripts/tokenizer.py: -------------------------------------------------------------------------------- 1 | 2 | from transformers import AutoModel, AutoTokenizer 3 | 4 | tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-V-2_6', trust_remote_code=True) 5 | 6 | 7 | print(tokenizer.decode([151646, 151647, 151656, 151657])) -------------------------------------------------------------------------------- /experiments/CPM-FT/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/experiments/CPM-FT/tests/__init__.py -------------------------------------------------------------------------------- /experiments/CPM-FT/tests/test_infer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from PIL import Image 3 | from transformers import AutoModel, AutoTokenizer 4 | 5 | torch.manual_seed(0) 6 | 7 | model = AutoModel.from_pretrained('openbmb/MiniCPM-V-2_6', trust_remote_code=True, 8 | attn_implementation='sdpa', torch_dtype=torch.bfloat16) # sdpa or flash_attention_2, no eager 9 | model = model.eval().cuda() 10 | 
tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-V-2_6', trust_remote_code=True) 11 | 12 | image = Image.open('./assets/airplane.jpeg').convert('RGB') 13 | 14 | # First round chat 15 | question = "Tell me the model of this aircraft." 16 | msgs = [{'role': 'user', 'content': [image, question]}] 17 | print(type(model)) 18 | model.terminators.append("3") 19 | # exit() 20 | answer = model.chat( 21 | image=None, 22 | msgs=msgs, 23 | system_prompt="Respond in chinese.", 24 | tokenizer=tokenizer 25 | ) 26 | print(answer) 27 | 28 | # Second round chat 29 | # pass history context of multi-turn conversation 30 | # msgs.append({"role": "assistant", "content": [answer]}) 31 | # msgs.append({"role": "user", "content": ["Introduce something about Airbus A380."]}) 32 | 33 | # answer = model.chat( 34 | # image=None, 35 | # msgs=msgs, 36 | # tokenizer=tokenizer 37 | # ) 38 | # print(answer) -------------------------------------------------------------------------------- /experiments/CPM-FT/tests/test_infer_lora.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from PIL import Image 3 | from transformers import AutoModel, AutoTokenizer 4 | from inference.utils import load_pretrained_model_lora 5 | torch.manual_seed(0) 6 | 7 | model, tokenizer = load_pretrained_model_lora("output/cpm_v2_6_7680255/") 8 | # model = AutoModel.from_pretrained('openbmb/MiniCPM-V-2_6', trust_remote_code=True, 9 | # attn_implementation='sdpa', torch_dtype=torch.bfloat16) # sdpa or flash_attention_2, no eager 10 | model = model.eval().cuda() 11 | # tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-V-2_6', trust_remote_code=True) 12 | 13 | image = Image.open('./assets/airplane.jpeg').convert('RGB') 14 | 15 | # First round chat 16 | question = "Tell me the model of this aircraft." 
17 | msgs = [{'role': 'user', 'content': [image, question]}] 18 | 19 | answer = model.chat( 20 | image=None, 21 | msgs=msgs, 22 | tokenizer=tokenizer 23 | ) 24 | print("=" * 10) 25 | print(answer) 26 | print("=" * 10) 27 | 28 | # Second round chat 29 | # pass history context of multi-turn conversation 30 | msgs.append({"role": "assistant", "content": [answer]}) 31 | msgs.append({"role": "user", "content": ["Introduce something about Airbus A380."]}) 32 | 33 | answer = model.chat( 34 | image=None, 35 | msgs=msgs, 36 | tokenizer=tokenizer 37 | ) 38 | print(answer) -------------------------------------------------------------------------------- /experiments/Qwen-VL/.github/ISSUE_TEMPLATE/config.yaml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: true 2 | -------------------------------------------------------------------------------- /experiments/Qwen-VL/.github/ISSUE_TEMPLATE/feature_request.yaml: -------------------------------------------------------------------------------- 1 | name: "💡 Feature Request" 2 | description: 创建新功能请求 | Create a new ticket for a new feature request 3 | title: "💡 [REQUEST] - " 4 | labels: [ 5 | "question" 6 | ] 7 | body: 8 | - type: input 9 | id: start_date 10 | attributes: 11 | label: "起始日期 | Start Date" 12 | description: | 13 | 起始开发日期 14 | Start of development 15 | placeholder: "month/day/year" 16 | validations: 17 | required: false 18 | - type: textarea 19 | id: implementation_pr 20 | attributes: 21 | label: "实现PR | Implementation PR" 22 | description: | 23 | 实现该功能的Pull request 24 | Pull request used 25 | placeholder: "#Pull Request ID" 26 | validations: 27 | required: false 28 | - type: textarea 29 | id: reference_issues 30 | attributes: 31 | label: "相关Issues | Reference Issues" 32 | description: | 33 | 与该功能相关的issues 34 | Common issues 35 | placeholder: "#Issues IDs" 36 | validations: 37 | required: false 38 | - type: textarea 39 | id: summary 40 | attributes: 41 | label: "摘要 | Summary" 42 | description: | 43 | 简要描述新功能的特点 44 | Provide a brief explanation of the feature 45 | placeholder: | 46 | Describe in a few lines your feature request 47 | validations: 48 | required: true 49 | - type: textarea 50 | id: basic_example 51 | attributes: 52 | label: "基本示例 | Basic Example" 53 | description: Indicate here some basic examples of your feature. 54 | placeholder: A few specific words about your feature request. 55 | validations: 56 | required: true 57 | - type: textarea 58 | id: drawbacks 59 | attributes: 60 | label: "缺陷 | Drawbacks" 61 | description: | 62 | 该新功能有哪些缺陷/可能造成哪些影响? 63 | What are the drawbacks/impacts of your feature request ? 64 | placeholder: | 65 | Identify the drawbacks and impacts while being neutral on your feature request 66 | validations: 67 | required: true 68 | - type: textarea 69 | id: unresolved_question 70 | attributes: 71 | label: "未解决问题 | Unresolved questions" 72 | description: | 73 | 有哪些尚未解决的问题? 74 | What questions still remain unresolved ? 75 | placeholder: | 76 | Identify any unresolved issues. 
77 | validations: 78 | required: false -------------------------------------------------------------------------------- /experiments/Qwen-VL/.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | *.so 3 | build 4 | .coverage_* 5 | *.egg-info 6 | *~ 7 | .vscode/ 8 | .idea/ 9 | .DS_Store 10 | 11 | /private/ 12 | Qwen-VL-Chat/ 13 | Qwen-VL-Chat-Int4/ 14 | SimSun.ttf 15 | -------------------------------------------------------------------------------- /experiments/Qwen-VL/BUILD.md: -------------------------------------------------------------------------------- 1 | ## qwen web demo 2 | 3 | ### build 4 | 5 | ``` 6 | docker build -t qwen-vl-chat:webdemo --platform linux/amd64 -f Dockerfile.qwendemo . 7 | ``` 8 | 9 | ### run 10 | 11 | ``` 12 | docker run -it --gpus device=0 -d --restart always -v /var/run/docker.sock:/var/run/docker.sock --name qwen-vl-chat -p 8000:8000 --user=20001:20001 --platform linux/amd64 qwen-vl-chat:webdemo 13 | ``` 14 | 15 | ## qwen openai api 16 | 17 | ### build 18 | 19 | ``` 20 | docker build -t qwen-vl-chat:openai --platform linux/amd64 -f Dockerfile.qwenopenai . 21 | ``` 22 | 23 | ### run 24 | 25 | ``` 26 | docker run -it --gpus device=0 -d --restart always -v /var/run/docker.sock:/var/run/docker.sock --name qwen-vl-chat -p 8080:8080 --user=20001:20001 --platform linux/amd64 qwen-vl-chat:openai 27 | ``` 28 | 29 | ## qwen-int4 openai api 30 | 31 | ### build 32 | 33 | ``` 34 | docker build -t qwen-vl-chat:int4-openai --platform linux/amd64 -f Dockerfile.qwenint4openai . 35 | ``` 36 | 37 | ### run 38 | 39 | ``` 40 | docker run -it --gpus device=0 -d --restart always -v /var/run/docker.sock:/var/run/docker.sock --name qwen-vl-chat-int4 -p 8080:8080 --user=20001:20001 --platform linux/amd64 qwen-vl-chat:int4-openai 41 | ``` 42 | -------------------------------------------------------------------------------- /experiments/Qwen-VL/Dockerfile.qwendemo: -------------------------------------------------------------------------------- 1 | # python 3.8 and above 2 | # pytorch 1.12 and above, 2.0 and above are recommended 3 | # CUDA 11.4 and above are recommended (this is for GPU users, flash-attention users, etc.) 
4 | 5 | # based on modelscope docker image 6 | # registry.cn-hangzhou.aliyuncs.com/modelscope-repo/modelscope:ubuntu20.04-cuda11.7.1-py38-torch2.0.1-tf1.15.5-1.8.0 7 | # registry.cn-beijing.aliyuncs.com/modelscope-repo/modelscope:ubuntu20.04-cuda11.7.1-py38-torch2.0.1-tf1.15.5-1.8.0 8 | FROM registry.cn-hangzhou.aliyuncs.com/modelscope-repo/modelscope:ubuntu20.04-cuda11.7.1-py38-torch2.0.1-tf1.15.5-1.8.0 9 | 10 | ARG workdir=/var/app 11 | RUN mkdir -p ${workdir} 12 | 13 | RUN git lfs install 14 | 15 | WORKDIR ${workdir} 16 | COPY requirements.txt requirements_web_demo.txt ./ 17 | 18 | # Install Qwen dependencies 19 | RUN pip install -r requirements.txt 20 | 21 | # Install webUI dependencies 22 | WORKDIR ${workdir} 23 | RUN pip install -r requirements_web_demo.txt 24 | 25 | # Offline mode, check https://huggingface.co/docs/transformers/v4.15.0/installation#offline-mode 26 | ENV HF_DATASETS_OFFLINE=1 27 | ENV TRANSFORMERS_OFFLINE=1 28 | 29 | # set TZ, make logs dir, and expose port 8080 30 | ENV TZ=Asia/Shanghai 31 | RUN mkdir -p ${workdir}/logs && chmod 777 ${workdir}/logs 32 | VOLUME /var/app/logs 33 | 34 | # create user 20001 35 | RUN useradd -r -m appuser -u 20001 -g 0 36 | 37 | WORKDIR ${workdir} 38 | # copy model 39 | RUN git clone https://huggingface.co/Qwen/Qwen-VL-Chat 40 | # COPY --chown=20001:20001 Qwen-VL-Chat ./Qwen-VL-Chat 41 | # copy fonts 42 | ADD --chown=20001:20001 https://github.com/StellarCN/scp_zh/raw/master/fonts/SimSun.ttf ./ 43 | # COPY --chown=20001:20001 SimSun.ttf ./ 44 | # copy main app 45 | COPY --chown=20001:20001 web_demo_mm.py ./ 46 | 47 | EXPOSE 8000 48 | CMD ["python3", "web_demo_mm.py", "-c", "./Qwen-VL-Chat", "--server-name", "0.0.0.0", "--server-port", "8000"] 49 | -------------------------------------------------------------------------------- /experiments/Qwen-VL/Dockerfile.qwenint4openai: -------------------------------------------------------------------------------- 1 | # python 3.8 and above 2 | # pytorch 1.12 and above, 2.0 and above are recommended 3 | # CUDA 11.4 and above are recommended (this is for GPU users, flash-attention users, etc.) 
4 | 5 | # based on modelscope docker image 6 | # registry.cn-hangzhou.aliyuncs.com/modelscope-repo/modelscope:ubuntu20.04-cuda11.7.1-py38-torch2.0.1-tf1.15.5-1.8.0 7 | # registry.cn-beijing.aliyuncs.com/modelscope-repo/modelscope:ubuntu20.04-cuda11.7.1-py38-torch2.0.1-tf1.15.5-1.8.0 8 | FROM registry.cn-hangzhou.aliyuncs.com/modelscope-repo/modelscope:ubuntu20.04-cuda11.7.1-py38-torch2.0.1-tf1.15.5-1.8.0 9 | 10 | ARG workdir=/var/app 11 | RUN mkdir -p ${workdir} 12 | 13 | RUN git lfs install 14 | 15 | WORKDIR ${workdir} 16 | COPY requirements.txt requirements_web_demo.txt ./ 17 | 18 | # Install Qwen dependencies 19 | RUN pip install -r requirements.txt 20 | 21 | # Install webUI dependencies 22 | WORKDIR ${workdir} 23 | RUN pip install -r requirements_web_demo.txt 24 | 25 | # Offline mode, check https://huggingface.co/docs/transformers/v4.15.0/installation#offline-mode 26 | ENV HF_DATASETS_OFFLINE=1 27 | ENV TRANSFORMERS_OFFLINE=1 28 | 29 | # set TZ, make logs dir, and expose port 8080 30 | ENV TZ=Asia/Shanghai 31 | RUN mkdir -p ${workdir}/logs && chmod 777 ${workdir}/logs 32 | VOLUME /var/app/logs 33 | 34 | # create user 20001 35 | RUN useradd -r -m appuser -u 20001 -g 0 36 | 37 | WORKDIR ${workdir} 38 | # copy model 39 | RUN git clone https://huggingface.co/Qwen/Qwen-VL-Chat-Int4 40 | # COPY --chown=20001:20001 Qwen-VL-Chat-Int4 ./Qwen-VL-Chat-Int4 41 | 42 | # Install AutoGPTQ 43 | RUN pip install optimum 44 | # RUN git clone https://github.com/JustinLin610/AutoGPTQ.git && \ 45 | # cd AutoGPTQ && \ 46 | # pip install -v . 47 | RUN pip install auto-gptq --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu117/ 48 | 49 | # Install OpenAI API dependencies 50 | WORKDIR ${workdir} 51 | COPY requirements_openai_api.txt ./ 52 | RUN pip install -r requirements_openai_api.txt 53 | # copy fonts 54 | ADD --chown=20001:20001 https://github.com/StellarCN/scp_zh/raw/master/fonts/SimSun.ttf ./ 55 | # COPY --chown=20001:20001 SimSun.ttf ./ 56 | # copy main app 57 | COPY --chown=20001:20001 openai_api.py ./ 58 | 59 | EXPOSE 8080 60 | # CMD ["python3", "openai_api.py", "-c", "./Qwen-VL-Chat", "--server-name", "0.0.0.0", "--server-port", "8080"] 61 | CMD ["python3", "openai_api.py", "-c", "./Qwen-VL-Chat-Int4", "--server-name", "0.0.0.0", "--server-port", "8080"] 62 | -------------------------------------------------------------------------------- /experiments/Qwen-VL/Dockerfile.qwenopenai: -------------------------------------------------------------------------------- 1 | # python 3.8 and above 2 | # pytorch 1.12 and above, 2.0 and above are recommended 3 | # CUDA 11.4 and above are recommended (this is for GPU users, flash-attention users, etc.) 
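# (Illustrative, not part of the original Dockerfile.) After starting a container as shown in
# BUILD.md, the service is expected to expose an OpenAI-compatible chat endpoint on port 8080;
# a request along these lines is a reasonable first smoke test. The exact route and payload
# are assumptions -- check openai_api.py for the actual schema and image format:
#   curl http://localhost:8080/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{"model": "Qwen-VL-Chat", "messages": [{"role": "user", "content": "Describe this image: <img>https://example.com/demo.jpeg</img>"}]}'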
4 | 5 | # based on modelscope docker image 6 | # registry.cn-hangzhou.aliyuncs.com/modelscope-repo/modelscope:ubuntu20.04-cuda11.7.1-py38-torch2.0.1-tf1.15.5-1.8.0 7 | # registry.cn-beijing.aliyuncs.com/modelscope-repo/modelscope:ubuntu20.04-cuda11.7.1-py38-torch2.0.1-tf1.15.5-1.8.0 8 | FROM registry.cn-hangzhou.aliyuncs.com/modelscope-repo/modelscope:ubuntu20.04-cuda11.7.1-py38-torch2.0.1-tf1.15.5-1.8.0 9 | 10 | ARG workdir=/var/app 11 | RUN mkdir -p ${workdir} 12 | 13 | RUN git lfs install 14 | 15 | WORKDIR ${workdir} 16 | COPY requirements.txt requirements_web_demo.txt ./ 17 | 18 | # Install Qwen dependencies 19 | RUN pip install -r requirements.txt 20 | 21 | # Install webUI dependencies 22 | WORKDIR ${workdir} 23 | RUN pip install -r requirements_web_demo.txt 24 | 25 | # Offline mode, check https://huggingface.co/docs/transformers/v4.15.0/installation#offline-mode 26 | ENV HF_DATASETS_OFFLINE=1 27 | ENV TRANSFORMERS_OFFLINE=1 28 | 29 | # set TZ, make logs dir, and expose port 8080 30 | ENV TZ=Asia/Shanghai 31 | RUN mkdir -p ${workdir}/logs && chmod 777 ${workdir}/logs 32 | VOLUME /var/app/logs 33 | 34 | # create user 20001 35 | RUN useradd -r -m appuser -u 20001 -g 0 36 | 37 | WORKDIR ${workdir} 38 | # copy model 39 | RUN git clone https://huggingface.co/Qwen/Qwen-VL-Chat 40 | # COPY --chown=20001:20001 Qwen-VL-Chat ./Qwen-VL-Chat 41 | 42 | # Install OpenAI API dependencies 43 | WORKDIR ${workdir} 44 | COPY requirements_openai_api.txt ./ 45 | RUN pip install -r requirements_openai_api.txt 46 | # copy fonts 47 | ADD --chown=20001:20001 https://github.com/StellarCN/scp_zh/raw/master/fonts/SimSun.ttf ./ 48 | # COPY --chown=20001:20001 SimSun.ttf ./ 49 | # copy main app 50 | COPY --chown=20001:20001 openai_api.py ./ 51 | 52 | EXPOSE 8080 53 | CMD ["python3", "openai_api.py", "-c", "./Qwen-VL-Chat", "--server-name", "0.0.0.0", "--server-port", "8080"] 54 | -------------------------------------------------------------------------------- /experiments/Qwen-VL/FAQ.md: -------------------------------------------------------------------------------- 1 | # FAQ 2 | 3 | ## Installation & Environment 4 | 5 | #### Which version of transformers should I use? 6 | 7 | 4.31.0 is preferred. 8 | 9 | #### I downloaded the codes and checkpoints but I can't load the model locally. What should I do? 10 | 11 | Please check if you have updated the code to the latest, and correctly downloaded all the sharded checkpoint files. 12 | 13 | #### `qwen.tiktoken` is not found. What is it? 14 | 15 | This is the merge file of the tokenizer. You have to download it. Note that if you just git clone the repo without [git-lfs](https://git-lfs.com), you cannot download this file. 16 | 17 | #### transformers_stream_generator/tiktoken/accelerate not found 18 | 19 | Run the command `pip install -r requirements.txt`. You can find the file at [https://github.com/QwenLM/Qwen-VL/blob/main/requirements.txt](https://github.com/QwenLM/Qwen-VL/blob/main/requirements.txt). 20 | <br><br> 21 | 22 | 23 | 24 | ## Demo & Inference 25 | 26 | #### Is there any demo? 27 | 28 | Yes, see `web_demo_mm.py` for web demo. See README for more information. 29 | 30 | 31 | 32 | #### Can Qwen-VL support streaming? 33 | 34 | No. We do not support streaming yet. 35 | 36 | #### It seems that the generation is not related to the instruction... 37 | 38 | Please check if you are loading Qwen-VL-Chat instead of Qwen-VL. Qwen-VL is the base model without alignment, which behaves differently from the SFT/Chat model. 39 | 40 | #### Is quantization supported? 
41 | 42 | No. We would support quantization asap. 43 | 44 | #### Unsatisfactory performance in processing long sequences 45 | 46 | Please ensure that NTK is applied. `use_dynamc_ntk` and `use_logn_attn` in `config.json` should be set to `true` (`true` by default). 47 | <br><br> 48 | 49 | 50 | ## Tokenizer 51 | 52 | #### bos_id/eos_id/pad_id not found 53 | 54 | In our training, we only use `<|endoftext|>` as the separator and padding token. You can set bos_id, eos_id, and pad_id to tokenizer.eod_id. Learn more about our tokenizer from our documents about the tokenizer. 55 | 56 | -------------------------------------------------------------------------------- /experiments/Qwen-VL/FAQ_ja.md: -------------------------------------------------------------------------------- 1 | # FAQ 2 | 3 | ## インストールと環境 4 | 5 | #### transformers のバージョンは? 6 | 7 | 4.31.0 が望ましいです。 8 | 9 | #### コードとチェックポイントをダウンロードしましたが、モデルをローカルにロードできません。どうすればよいでしょうか? 10 | 11 | コードを最新のものに更新し、すべてのシャードされたチェックポイントファイルを正しくダウンロードしたかどうか確認してください。 12 | 13 | #### `qwen.tiktoken` が見つかりません。これは何ですか? 14 | 15 | これは tokenizer のマージファイルです。ダウンロードする必要があります。[git-lfs](https://git-lfs.com) を使わずにリポジトリを git clone しただけでは、このファイルをダウンロードできないことに注意してください。 16 | 17 | #### transformers_stream_generator/tiktoken/accelerate が見つかりません。 18 | 19 | コマンド `pip install -r requirements.txt` を実行してください。このファイルは [https://github.com/QwenLM/Qwen-VL/blob/main/requirements.txt](https://github.com/QwenLM/Qwen-VL/blob/main/requirements.txt) にあります。 20 | <br><br> 21 | 22 | 23 | 24 | ## デモと推論 25 | 26 | #### デモはありますか? 27 | 28 | ウェブデモは `web_demo_mm.py` を参照してください。詳細は README を参照してください。 29 | 30 | 31 | 32 | #### Qwen-VLはストリーミングに対応していますか? 33 | 34 | いいえ、まだサポートしていません。 35 | 36 | #### 世代と命令は関係ないようですが... 37 | 38 | Qwen-VL ではなく Qwen-VL-Chat を読み込んでいないか確認してください。Qwen-VL はアライメントなしのベースモデルで、SFT/Chat モデルとは動作が異なります。 39 | 40 | #### 量子化はサポートされていますか? 41 | 42 | いいえ。早急に量子化をサポートするつもりです。 43 | 44 | #### 長いシーケンスの処理で不満足なパフォーマンス 45 | 46 | NTK が適用されていることを確認してください。`config.json` の `use_dynamc_ntk` と `use_logn_attn` を `true` に設定する必要がある(デフォルトでは `true`)。 47 | <br><br> 48 | 49 | 50 | ## Tokenizer 51 | 52 | #### bos_id/eos_id/pad_id が見つかりません。 53 | 54 | 私たちのトレーニングでは、セパレータとパディングトークンとして `<|endoftext|>` のみを使用しています。bos_id、eos_id、pad_id は tokenizer.eod_id に設定できます。私たちの tokenizer について詳しくは、tokenizer についてのドキュメントをご覧ください。 55 | 56 | -------------------------------------------------------------------------------- /experiments/Qwen-VL/FAQ_ko.md: -------------------------------------------------------------------------------- 1 | # FAQ 2 | 3 | ## 설치 및 환경 4 | 5 | #### 어떤 버전의 transformers를 사용해야 하나요? 6 | 7 | 4.31.0 버전을 사용하는 것을 선호합니다. 8 | 9 | #### 코드와 체크포인트를 다운로드했는데 모델을 로컬에서 불러올 수 없어요. 어떻게 해야 하나요? 10 | 11 | 코드를 최신 버전으로 업데이트했는지, 그리고 모든 샤드 체크포인트 파일을 올바르게 다운로드했는지 확인해 주세요. 12 | 13 | #### `qwen.tiktoken`을 찾을 수 없어요. 이게 무엇인가요? 14 | 15 | 이것은 토크나이저의 병합 파일입니다. 이 파일을 다운로드해야 합니다. [git-lfs](https://git-lfs.com) 없이 단순히 깃 저장소를 복제했다면 이 파일을 다운로드할 수 없습니다. 16 | 17 | #### transformers_stream_generator/tiktoken/accelerate not found 오류 18 | 19 | `pip install -r requirements.txt` 명령을 실행하세요. 이 파일은 [https://github.com/QwenLM/Qwen-VL/blob/main/requirements.txt](https://github.com/QwenLM/Qwen-VL/blob/main/requirements.txt)에서 찾을 수 있습니다. 20 | <br><br> 21 | 22 | 23 | ## Demo & Inference 24 | 25 | #### 데모가 있나요? 26 | 27 | 네, 웹 데모는 `web_demo_mm.py`를 참고하세요. 더 많은 정보는 README 파일에서 확인할 수 있습니다. 28 | 29 | 30 | 31 | #### Qwen-VL은 스트리밍을 지원하나요? 32 | 33 | 아니요. 아직 스트리밍을 지원하지 않습니다. 34 | 35 | #### 생성된 내용이 지시사항과 관련 없는 것 같습니다. 36 | 37 | Qwen-VL 대신 Qwen-VL-Chat을 로드하고 있는지 확인해 주세요. 
Qwen-VL은 SFT/Chat 모델과 달리 정렬이 없는 기본 모델이므로 다르게 작동합니다. 38 | 39 | #### 양자화를 지원하나요? 40 | 41 | 아니요. 가능한 빨리 양자화를 지원할 예정입니다. 42 | 43 | #### 긴 시퀀스 처리에서 만족스럽지 못한 성능 44 | 45 | NTK가 적용되었는지 확인해 주세요. `config.json`의 `use_dynamc_ntk`과 `use_logn_attn`은 `true`로 설정되어야 합니다(`true`가 기본값). 46 | <br><br> 47 | 48 | 49 | ## Tokenizer 50 | 51 | #### bos_id/eos_id/pad_id not found 오류 52 | 53 | 저희 훈련에서는 ``을 구분자 및 패딩 토큰으로만 사용합니다. bos_id, eos_id, pad_id를 tokenizer.eod_id로 설정할 수 있습니다. 토크나이저에 대한 문서에서 토크나이저에 대해 더 알아보세요. -------------------------------------------------------------------------------- /experiments/Qwen-VL/FAQ_zh.md: -------------------------------------------------------------------------------- 1 | # FAQ 2 | 3 | ## 安装&环境 4 | 5 | #### 我应该用哪个transformers版本? 6 | 7 | 建议使用4.31.0。 8 | 9 | #### 我把模型和代码下到本地,按照教程无法使用,该怎么办? 10 | 11 | 答:别着急,先检查你的代码是不是更新到最新版本,然后确认你是否完整地将模型checkpoint下到本地。 12 | 13 | #### `qwen.tiktoken`这个文件找不到,怎么办? 14 | 15 | 这个是我们的tokenizer的merge文件,你必须下载它才能使用我们的tokenizer。注意,如果你使用git clone却没有使用git-lfs,这个文件不会被下载。如果你不了解git-lfs,可点击[官网](https://git-lfs.com/)了解。 16 | 17 | #### transformers_stream_generator/tiktoken/accelerate,这几个库提示找不到,怎么办? 18 | 19 | 运行如下命令:`pip install -r requirements.txt`。相关依赖库在[https://github.com/QwenLM/Qwen-VL/blob/main/requirements.txt](https://github.com/QwenLM/Qwen-VL/blob/main/requirements.txt) 可以找到。 20 | <br><br> 21 | 22 | 23 | ## Demo & 推理 24 | 25 | #### 是否提供Demo? 26 | 27 | `web_demo_mm.py`提供了Web UI。请查看README相关内容了解更多。 28 | 29 | #### Qwen-VL支持流式推理吗? 30 | 31 | Qwen-VL当前不支持流式推理。 32 | 33 | #### 模型的输出看起来与输入无关/没有遵循指令/看起来呆呆的 34 | 35 | 请检查是否加载的是Qwen-VL-Chat模型进行推理,Qwen-VL模型是未经align的预训练基模型,不期望具备响应用户指令的能力。我们在模型最新版本已经对`chat`接口内进行了检查,避免您误将预训练模型作为SFT/Chat模型使用。 36 | 37 | #### 是否有量化版本模型 38 | 39 | 目前Qwen-VL不支持量化,后续我们将支持高效的量化推理实现。 40 | 41 | #### 处理长序列时效果有问题 42 | 43 | 请确认是否开启ntk。若要启用这些技巧,请将`config.json`里的`use_dynamc_ntk`和`use_logn_attn`设置为`true`。最新代码默认为`true`。 44 | <br><br> 45 | 46 | 47 | ## Tokenizer 48 | 49 | #### bos_id/eos_id/pad_id,这些token id不存在,为什么? 50 | 51 | 在训练过程中,我们仅使用<|endoftext|>这一token作为sample/document之间的分隔符及padding位置占位符,你可以将bos_id, eos_id, pad_id均指向tokenizer.eod_id。请阅读我们关于tokenizer的文档,了解如何设置这些id。 52 | 53 | -------------------------------------------------------------------------------- /experiments/Qwen-VL/NOTICE: -------------------------------------------------------------------------------- 1 | ------------- LICENSE FOR NVIDIA Megatron-LM code -------------- 2 | 3 | Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions 7 | are met: 8 | * Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | * Redistributions in binary form must reproduce the above copyright 11 | notice, this list of conditions and the following disclaimer in the 12 | documentation and/or other materials provided with the distribution. 13 | * Neither the name of NVIDIA CORPORATION nor the names of its 14 | contributors may be used to endorse or promote products derived 15 | from this software without specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 18 | EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 20 | PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR 21 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 22 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 23 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 24 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 25 | OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | 29 | 30 | ------------- LICENSE FOR OpenAI tiktoken code -------------- 31 | 32 | MIT License 33 | 34 | Copyright (c) 2022 OpenAI, Shantanu Jain 35 | 36 | Permission is hereby granted, free of charge, to any person obtaining a copy 37 | of this software and associated documentation files (the "Software"), to deal 38 | in the Software without restriction, including without limitation the rights 39 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 40 | copies of the Software, and to permit persons to whom the Software is 41 | furnished to do so, subject to the following conditions: 42 | 43 | The above copyright notice and this permission notice shall be included in all 44 | copies or substantial portions of the Software. 45 | 46 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 47 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 48 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 49 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 50 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 51 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 52 | SOFTWARE. 
-------------------------------------------------------------------------------- /experiments/Qwen-VL/assets/apple.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/experiments/Qwen-VL/assets/apple.jpeg -------------------------------------------------------------------------------- /experiments/Qwen-VL/assets/apple_r.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/experiments/Qwen-VL/assets/apple_r.jpeg -------------------------------------------------------------------------------- /experiments/Qwen-VL/assets/demo.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/experiments/Qwen-VL/assets/demo.jpeg -------------------------------------------------------------------------------- /experiments/Qwen-VL/assets/demo_highfive.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/experiments/Qwen-VL/assets/demo_highfive.jpg -------------------------------------------------------------------------------- /experiments/Qwen-VL/assets/demo_spotting_caption.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/experiments/Qwen-VL/assets/demo_spotting_caption.jpg -------------------------------------------------------------------------------- /experiments/Qwen-VL/assets/demo_vl.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/experiments/Qwen-VL/assets/demo_vl.gif -------------------------------------------------------------------------------- /experiments/Qwen-VL/assets/logo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/experiments/Qwen-VL/assets/logo.jpg -------------------------------------------------------------------------------- /experiments/Qwen-VL/assets/mm_tutorial/Beijing.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/experiments/Qwen-VL/assets/mm_tutorial/Beijing.jpeg -------------------------------------------------------------------------------- /experiments/Qwen-VL/assets/mm_tutorial/Beijing_Small.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/experiments/Qwen-VL/assets/mm_tutorial/Beijing_Small.jpeg -------------------------------------------------------------------------------- /experiments/Qwen-VL/assets/mm_tutorial/Chongqing.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/experiments/Qwen-VL/assets/mm_tutorial/Chongqing.jpeg 
-------------------------------------------------------------------------------- /experiments/Qwen-VL/assets/mm_tutorial/Chongqing_Small.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/experiments/Qwen-VL/assets/mm_tutorial/Chongqing_Small.jpeg -------------------------------------------------------------------------------- /experiments/Qwen-VL/assets/mm_tutorial/Hospital.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/experiments/Qwen-VL/assets/mm_tutorial/Hospital.jpg -------------------------------------------------------------------------------- /experiments/Qwen-VL/assets/mm_tutorial/Hospital_Small.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/experiments/Qwen-VL/assets/mm_tutorial/Hospital_Small.jpg -------------------------------------------------------------------------------- /experiments/Qwen-VL/assets/mm_tutorial/Menu.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/experiments/Qwen-VL/assets/mm_tutorial/Menu.jpeg -------------------------------------------------------------------------------- /experiments/Qwen-VL/assets/mm_tutorial/Rebecca_(1939_poster).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/experiments/Qwen-VL/assets/mm_tutorial/Rebecca_(1939_poster).jpeg -------------------------------------------------------------------------------- /experiments/Qwen-VL/assets/mm_tutorial/Rebecca_(1939_poster)_Small.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/experiments/Qwen-VL/assets/mm_tutorial/Rebecca_(1939_poster)_Small.jpeg -------------------------------------------------------------------------------- /experiments/Qwen-VL/assets/mm_tutorial/Shanghai.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/experiments/Qwen-VL/assets/mm_tutorial/Shanghai.jpg -------------------------------------------------------------------------------- /experiments/Qwen-VL/assets/mm_tutorial/Shanghai_Output.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/experiments/Qwen-VL/assets/mm_tutorial/Shanghai_Output.jpg -------------------------------------------------------------------------------- /experiments/Qwen-VL/assets/mm_tutorial/Shanghai_Output_Small.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/experiments/Qwen-VL/assets/mm_tutorial/Shanghai_Output_Small.jpeg -------------------------------------------------------------------------------- /experiments/Qwen-VL/assets/mm_tutorial/Shanghai_Small.jpeg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/experiments/Qwen-VL/assets/mm_tutorial/Shanghai_Small.jpeg -------------------------------------------------------------------------------- /experiments/Qwen-VL/assets/mm_tutorial/TUTORIAL.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/experiments/Qwen-VL/assets/mm_tutorial/TUTORIAL.ipynb -------------------------------------------------------------------------------- /experiments/Qwen-VL/assets/qwenvl.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/experiments/Qwen-VL/assets/qwenvl.jpeg -------------------------------------------------------------------------------- /experiments/Qwen-VL/assets/radar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/experiments/Qwen-VL/assets/radar.png -------------------------------------------------------------------------------- /experiments/Qwen-VL/assets/radar_qwenvlplus.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/experiments/Qwen-VL/assets/radar_qwenvlplus.jpg -------------------------------------------------------------------------------- /experiments/Qwen-VL/assets/touchstone_datasets.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/experiments/Qwen-VL/assets/touchstone_datasets.jpg -------------------------------------------------------------------------------- /experiments/Qwen-VL/assets/touchstone_eval.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/experiments/Qwen-VL/assets/touchstone_eval.png -------------------------------------------------------------------------------- /experiments/Qwen-VL/assets/touchstone_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/experiments/Qwen-VL/assets/touchstone_logo.png -------------------------------------------------------------------------------- /experiments/Qwen-VL/assets/wechat.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/experiments/Qwen-VL/assets/wechat.png -------------------------------------------------------------------------------- /experiments/Qwen-VL/eval_mm/data: -------------------------------------------------------------------------------- 1 | /cpfs01/shared/public/shusheng.yss/datasets/qwenvl_evaluation -------------------------------------------------------------------------------- /experiments/Qwen-VL/eval_mm/mmbench/MMBENCH.md: -------------------------------------------------------------------------------- 1 | # MMBench Evaluation 2 | 3 | ## Data 4 | 5 | 
```bash 6 | /cpfs01/shared/public/shusheng.yss/workspace/23082502_qwenvl_eval_test/eval_mm/data/mmbench 7 | ``` 8 | 9 | ## Dev 10 | 11 | ```bash 12 | checkpoint=/PATH/TO/CHECKPOINT 13 | ds=mmbench_dev_20230712 14 | python -m torch.distributed.launch --use-env \ 15 | --nproc_per_node ${NPROC_PER_NODE:-8} \ 16 | --nnodes ${WORLD_SIZE:-1} \ 17 | --node_rank ${RANK:-0} \ 18 | --master_addr ${MASTER_ADDR:-127.0.0.1} \ 19 | --master_port ${MASTER_PORT:-12345} \ 20 | evaluate_multiple_choice_mmbench.py \ 21 | --checkpoint $checkpoint \ 22 | --dataset $ds \ 23 | --batch-size 2 \ 24 | --num-workers 2 25 | 26 | # the results will be saved to mmbench_dev_20230712.json 27 | 28 | # without consistency constrain 29 | 30 | python mmbench_evaluation.py 31 | 32 | # with consistency constrain 33 | 34 | python mmbench_evaluation_tricky.py 35 | 36 | ``` 37 | 38 | ## Test 39 | 40 | ```bash 41 | checkpoint=/PATH/TO/CHECKPOINT 42 | ds=mmbench_test_20230712 43 | python -m torch.distributed.launch --use-env \ 44 | --nproc_per_node ${NPROC_PER_NODE:-8} \ 45 | --nnodes ${WORLD_SIZE:-1} \ 46 | --node_rank ${RANK:-0} \ 47 | --master_addr ${MASTER_ADDR:-127.0.0.1} \ 48 | --master_port ${MASTER_PORT:-12345} \ 49 | evaluate_multiple_choice_mmbench.py \ 50 | --checkpoint $checkpoint \ 51 | --dataset $ds \ 52 | --batch-size 2 \ 53 | --num-workers 2 54 | 55 | # the results will be saved to mmbench_test_20230712.json 56 | 57 | # convert to submission format with consistency constrain 58 | 59 | python mmbench_predict_to_submission.py 60 | 61 | ``` 62 | -------------------------------------------------------------------------------- /experiments/Qwen-VL/eval_mm/mmbench/mmbench_converter_dev.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import io 3 | import base64 4 | import json 5 | from PIL import Image 6 | 7 | ''' 8 | This scripts convert mmbench_dev tsv file to jsonl 9 | ''' 10 | 11 | datas = pd.read_csv("data/mmbench/mmbench_dev_20230712/mmbench_dev_20230712.tsv", sep='\t') 12 | 13 | global_choices = ['A', 'B', 'C', 'D'] 14 | 15 | def decode_base64_to_image(base64_string): 16 | image_data = base64.b64decode(base64_string) 17 | image = Image.open(io.BytesIO(image_data)) 18 | return image 19 | 20 | 21 | with open('./data/mmbench/mmbench_dev_20230712/mmbench_dev_20230712.jsonl', 'w') as f: 22 | for idx in range(len(datas)): 23 | data = datas.iloc[idx] 24 | 25 | index = int(data['index']) 26 | question = data['question'] 27 | hint = data['hint'] if not pd.isna(data['hint']) else 'N/A' 28 | 29 | choices = [] 30 | for opt in global_choices: 31 | if pd.isna(data[opt]): 32 | continue 33 | choices.append(data[opt]) 34 | 35 | answer = global_choices.index(data['answer']) 36 | 37 | image = decode_base64_to_image(data['image']) 38 | image.save("data/mmbench/mmbench_dev_20230712/images/%d.jpg" % index) 39 | 40 | f.write(json.dumps({ 41 | "index": index, 42 | "image": "data/mmbench/mmbench_dev_20230712/images/%d.jpg" % index, 43 | "hint": hint, 44 | "question": question, 45 | "choices": choices, 46 | "answer": answer, 47 | }) + "\n") 48 | 49 | -------------------------------------------------------------------------------- /experiments/Qwen-VL/eval_mm/mmbench/mmbench_converter_test.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import io 3 | import base64 4 | import json 5 | from PIL import Image 6 | 7 | ''' 8 | This script convert mmbench_test tsv file to jsonl 9 | This script is very similar to 
mmbench_converter_dev except there's no answer for accuracy calculation 10 | ''' 11 | 12 | datas = pd.read_csv("data/mmbench/mmbench_test_20230712/mmbench_test_20230712.tsv", sep='\t') 13 | 14 | global_choices = ['A', 'B', 'C', 'D'] 15 | 16 | def decode_base64_to_image(base64_string): 17 | image_data = base64.b64decode(base64_string) 18 | image = Image.open(io.BytesIO(image_data)) 19 | return image 20 | 21 | 22 | with open('./data/mmbench/mmbench_test_20230712/mmbench_test_20230712.jsonl', 'w') as f: 23 | for idx in range(len(datas)): 24 | data = datas.iloc[idx] 25 | 26 | index = int(data['index']) 27 | question = data['question'] 28 | hint = data['hint'] if not pd.isna(data['hint']) else 'N/A' 29 | 30 | choices = [] 31 | for opt in global_choices: 32 | if pd.isna(data[opt]): 33 | continue 34 | choices.append(data[opt]) 35 | 36 | # answer = global_choices.index(data['answer']) 37 | 38 | image = decode_base64_to_image(data['image']) 39 | image.save("data/mmbench/mmbench_test_20230712/images/%d.jpg" % index) 40 | 41 | f.write(json.dumps({ 42 | "index": index, 43 | "image": "data/mmbench/mmbench_test_20230712/images/%d.jpg" % index, 44 | "hint": hint, 45 | "question": question, 46 | "choices": choices, 47 | # "answer": answer, 48 | }) + "\n") 49 | 50 | -------------------------------------------------------------------------------- /experiments/Qwen-VL/eval_mm/mmbench/mmbench_evaluation.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import json 3 | 4 | ''' 5 | This script provides `global top-1 accuracy` metric calculation for mmbench_dev. 6 | ''' 7 | 8 | predictions = json.load(open('mmbench_dev_20230712.json')) 9 | 10 | index2predictions = {} 11 | for pred in predictions: 12 | index2predictions[pred['index']] = pred['prediction'] 13 | 14 | datas = pd.read_csv("data/mmbench/mmbench_dev_20230712/mmbench_dev_20230712.tsv", sep='\t') 15 | 16 | glb_opts = ['A', 'B', 'C', 'D'] 17 | index2answer = {} 18 | for idx in range(len(datas)): 19 | data = datas.iloc[idx] 20 | index2answer[data['index']] = glb_opts.index(data['answer']) 21 | 22 | identity_indexes = list(set([int(_ % 1e6) for _ in index2predictions.keys()])) 23 | 24 | correct = 0 25 | total = 0 26 | for index in identity_indexes: 27 | for _ in range(4): 28 | cycle_index = int(_ * 1e6 + index) 29 | if index2predictions.get(cycle_index, None) is not None: 30 | if index2predictions[cycle_index] == index2answer[cycle_index]: 31 | continue 32 | else: 33 | print(cycle_index) 34 | break 35 | else: 36 | correct += 1 37 | total += 1 38 | 39 | print(correct, total) 40 | -------------------------------------------------------------------------------- /experiments/Qwen-VL/eval_mm/mmbench/mmbench_evaluation_tricky.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import json 3 | import random 4 | 5 | ''' 6 | This script provides metric calculation for mmbench_dev with the same accuarcy algo as OpenCompass server 7 | ''' 8 | 9 | predictions = json.load(open('mmbench_dev_20230712.json')) 10 | 11 | index2predictions = {} 12 | for pred in predictions: 13 | index2predictions[pred['index']] = pred['prediction'] 14 | 15 | 16 | from collections import Counter 17 | 18 | def most_common_elements(lst): 19 | counter = Counter(lst) 20 | max_count = max(counter.values()) 21 | most_common = [element for element, count in counter.items() if count == max_count] 22 | return random.choice(most_common) # random sample from random choice 23 | 24 | 
datas = pd.read_csv("data/mmbench/mmbench_dev_20230712/mmbench_dev_20230712.tsv", sep='\t') 25 | 26 | glb_opts = ['A', 'B', 'C', 'D'] 27 | index2answer = {} 28 | index2choices = {} 29 | index2rawanswer = {} 30 | for idx in range(len(datas)): 31 | data = datas.iloc[idx] 32 | 33 | choices = [] 34 | for opt in glb_opts: 35 | if not pd.isna(data[opt]): 36 | choices.append(data[opt]) 37 | index2choices[data['index']] = choices 38 | 39 | index2answer[data['index']] = glb_opts.index(data['answer']) 40 | index2rawanswer[data['index']] = choices[glb_opts.index(data['answer'])] 41 | 42 | identity_indexes = list(set([int(_ % 1e6) for _ in index2predictions.keys()])) 43 | 44 | correct = 0 45 | total = 0 46 | for index in identity_indexes: 47 | raw_preds = [] 48 | raw_answer = [] 49 | for _ in range(4): 50 | cycle_index = int(_ * 1e6 + index) 51 | if index2predictions.get(cycle_index, None) is not None: 52 | raw_answer = index2rawanswer[cycle_index] 53 | raw_pred = index2choices[cycle_index][index2predictions[cycle_index]] 54 | raw_preds.append(raw_pred) 55 | 56 | if len(set(raw_preds)) == 1: 57 | if raw_preds[0] == raw_answer: 58 | correct += 1 59 | else: 60 | result = most_common_elements(raw_preds) 61 | if result == raw_answer: 62 | correct += 1 63 | 64 | total += 1 65 | 66 | print(correct, total, correct / total * 100.) 67 | -------------------------------------------------------------------------------- /experiments/Qwen-VL/eval_mm/mmbench/mmbench_predict_to_submission.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import json 3 | import random 4 | 5 | ''' 6 | This script convert the output file of our inference processor to target formation of OpenCompass evaluator server 7 | ''' 8 | 9 | predictions = json.load(open('mmbench_test_20230712.json')) 10 | 11 | index2predictions = {} 12 | for pred in predictions: 13 | index2predictions[pred['index']] = pred['prediction'] 14 | 15 | from collections import Counter 16 | 17 | def most_common_elements(lst): 18 | counter = Counter(lst) 19 | max_count = max(counter.values()) 20 | most_common = [element for element, count in counter.items() if count == max_count] 21 | print(most_common) 22 | return random.choice(most_common) 23 | # return most_common 24 | 25 | datas = pd.read_csv("data/mmbench/mmbench_test_20230712/mmbench_test_20230712.tsv", sep='\t') 26 | 27 | datas = datas.drop('image', axis=1) 28 | 29 | glb_opts = ['A', 'B', 'C', 'D'] 30 | index2choices = {} 31 | for idx in range(len(datas)): 32 | data = datas.iloc[idx] 33 | 34 | choices = [] 35 | for opt in glb_opts: 36 | if not pd.isna(data[opt]): 37 | choices.append(data[opt]) 38 | index2choices[data['index']] = choices 39 | 40 | identity_indexes = list(set([int(_ % 1e6) for _ in index2predictions.keys()])) 41 | 42 | 43 | processed_index2predictions = {} 44 | for index in identity_indexes: 45 | raw_preds = [] 46 | for _ in range(4): 47 | cycle_index = int(_ * 1e6 + index) 48 | if index2predictions.get(cycle_index, None) is not None: 49 | raw_pred = index2choices[cycle_index][index2predictions[cycle_index]] 50 | raw_preds.append(raw_pred) 51 | 52 | if len(set(raw_preds)) == 1: 53 | pred_answer = raw_preds[0] 54 | else: 55 | pred_answer = most_common_elements(raw_preds) 56 | 57 | print(index, pred_answer) 58 | for _ in range(4): 59 | cycle_index = int(_ * 1e6 + index) 60 | if index2predictions.get(cycle_index, None) is not None: 61 | processed_index2predictions[cycle_index] = index2choices[cycle_index].index(pred_answer) 62 | 63 | 64 | 
predictions = [] 65 | for idx in range(len(datas)): 66 | data = datas.iloc[idx] 67 | index = data['index'] 68 | prediction = glb_opts[processed_index2predictions[index]] 69 | predictions.append(prediction) 70 | 71 | datas['prediction'] = predictions 72 | datas.to_excel("mmbench_test_20230712_230831_constrained.xlsx", index=False) 73 | # constrained means we force the model predict same answer when tested on a question for multiple times 74 | -------------------------------------------------------------------------------- /experiments/Qwen-VL/eval_mm/mme/EVAL_MME.md: -------------------------------------------------------------------------------- 1 | # MME Benchmark 2 | 3 | [MME](https://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models/tree/Evaluation) is a comprehensive evaluation benchmark for multimodal large language models. It measures both perception and cognition abilities on a total of 14 subtasks, including existence, count, position, color, poster, celebrity, scene, landmark, artwork, OCR, commonsense reasoning, numerical calculation, text translation, and code reasoning. 4 | 5 | Qwen-VL-Chat achieves SOTAs on both perception and cognition evaluation. 6 | 7 | Perception Evaluation 8 | 9 | | Rank | Model | Version | Score | 10 | |:----:|:---------------:|:------------------------:|:-------:| 11 | | 1 | **[Qwen-VL-Chat](https://github.com/QwenLM/Qwen-VL/)**| **[Qwen-7B](https://github.com/QwenLM/Qwen-7B)** | **1487.57** | 12 | | 2 | Skywork-MM | Skywork-MM-13B | 1419.08 | 13 | | 3 | MMICL | FlanT5xxl | 1376.00 | 14 | | 4 | Lynx | vicuna-7b | 1373.23 | 15 | | 5 | BLIVA | FlanT5xxl | 1337.73 | 16 | 17 | Cognition Evaluation 18 | 19 | | Rank | Model | Version | Score | 20 | |:----:|:----------------:|:--------------:|:----------:| 21 | | 1 | **[Qwen-VL-Chat](https://github.com/QwenLM/Qwen-VL/)** | **[Qwen-7B](https://github.com/QwenLM/Qwen-7B)** | **360.71** | 22 | | 2 | MMICL | FlanT5xxl | 360.36 | 23 | | 3 | Skywork-MM | Skywork-MM-13B | 356.43 | 24 | | 4 | BLIVA | FlanT5xxl | 331.43 | 25 | | 5 | LRV-Instruction | LRV-7B | 328.21 | 26 | 27 | Full Metrics 28 | 29 | ``` 30 | =========== Perception =========== 31 | total score: 1487.576330532213 32 | 33 | existence score: 158.33333333333331 34 | count score: 150.0 35 | position score: 128.33333333333334 36 | color score: 170.0 37 | posters score: 178.57142857142856 38 | celebrity score: 120.58823529411764 39 | scene score: 152.25 40 | landmark score: 164.0 41 | artwork score: 125.5 42 | OCR score: 140.0 43 | 44 | 45 | =========== Cognition =========== 46 | total score: 360.71428571428567 47 | 48 | commonsense_reasoning score: 130.7142857142857 49 | numerical_calculation score: 40.0 50 | text_translation score: 147.5 51 | code_reasoning score: 42.5 52 | ``` 53 | 54 | ## How To Reproduce Results of MME Benchmark 55 | 56 | 1. Download MME images and eval_tool from the [MME repo](https://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models/blob/Evaluation/README.md) 57 | 2. Rearrange images by executing `python get_images.py` 58 | 3. Evaluate Qwen-VL-Chat results by executing `python eval.py` 59 | 4. Calculate MME results by executing `python calculation.py --results_dir Qwen-VL-Chat`, which the calculation script comes from the MME eval_tool. 
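
As a quick sanity check before running the official `calculation.py`, the per-category answer files written by `eval.py` (one tab-separated `image  question  ground-truth  response` line per sample) can be scanned with a short script such as the sketch below. This only does a crude yes/no prefix match and is not the official metric (which also reports the stricter acc+); the `Qwen-VL-Chat` directory name follows the `output` variable in `eval.py`.

```python
import os

results_dir = "Qwen-VL-Chat"  # directory written by eval.py
for name in sorted(os.listdir(results_dir)):
    right = total = 0
    with open(os.path.join(results_dir, name)) as f:
        for line in f:
            parts = line.rstrip("\n").split("\t", 3)
            if len(parts) < 4:
                continue  # skip malformed lines
            _, _, gt, response = parts
            total += 1
            # crude check: does the generated answer start with the Yes/No ground truth?
            if response.strip().lower().startswith(gt.strip().lower()):
                right += 1
    if total:
        print(f"{name}: {right}/{total} ({right / total * 100:.1f}%)")
```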
60 | -------------------------------------------------------------------------------- /experiments/Qwen-VL/eval_mm/mme/cognition.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/experiments/Qwen-VL/eval_mm/mme/cognition.jpg -------------------------------------------------------------------------------- /experiments/Qwen-VL/eval_mm/mme/eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | from tqdm import tqdm 3 | 4 | from transformers import AutoModelForCausalLM, AutoTokenizer 5 | from transformers.generation import GenerationConfig 6 | 7 | checkpoint = 'Qwen/Qwen-VL-Chat' 8 | tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True) 9 | model = AutoModelForCausalLM.from_pretrained( 10 | checkpoint, device_map='cuda', trust_remote_code=True).eval() 11 | 12 | model.generation_config = GenerationConfig.from_pretrained(checkpoint, trust_remote_code=True) 13 | model.generation_config.top_p = 0.01 14 | 15 | 16 | root = 'Your_Results' 17 | output = 'Qwen-VL-Chat' 18 | os.makedirs(output, exist_ok=True) 19 | for filename in os.listdir(root): 20 | with open(os.path.join(root, filename), 'r') as fin, open(os.path.join(output, filename), 'w') as fout: 21 | lines = fin.read().splitlines() 22 | filename = filename.replace('.txt', '') 23 | for line in tqdm(lines): 24 | img, question, gt = line.strip().split('\t') 25 | img_path = os.path.join('images', filename, img) 26 | assert os.path.exists(img_path), img_path 27 | query = f'<img>{img_path}</img>\n{question}' 28 | response, _ = model.chat(tokenizer, query=query, history=None) 29 | 30 | print(img, question, gt, response, sep='\t', file=fout) 31 | -------------------------------------------------------------------------------- /experiments/Qwen-VL/eval_mm/mme/get_images.py: -------------------------------------------------------------------------------- 1 | import os 2 | from tqdm import tqdm 3 | 4 | os.system('rm -rf images') 5 | os.system('mkdir images') 6 | 7 | os.system('cp -r ../MME_Benchmark_release/OCR images/') 8 | 9 | os.system('mkdir images/artwork') 10 | os.system('cp ../MME_Benchmark_release/artwork/questions_answers_YN/* images/artwork/') 11 | with open('LaVIN/artwork.txt') as fin: 12 | paths = [ line.strip().split('\t', 1)[0] for line in fin ] 13 | paths = list(set(paths)) 14 | for path in tqdm(paths): 15 | os.system(f'cp ../MME_Benchmark_release/artwork/images/toy_dataset/{path} images/artwork/{path}') 16 | 17 | os.system('mkdir images/celebrity') 18 | os.system('cp ../MME_Benchmark_release/celebrity/images/* images/celebrity/') 19 | os.system('cp ../MME_Benchmark_release/celebrity/questions_answers_YN/* images/celebrity/') 20 | 21 | os.system('cp -r ../MME_Benchmark_release/code_reasoning images/') 22 | 23 | os.system('cp -r ../MME_Benchmark_release/color images/') 24 | 25 | os.system('cp -r ../MME_Benchmark_release/commonsense_reasoning images/') 26 | 27 | os.system('cp -r ../MME_Benchmark_release/count images/') 28 | 29 | os.system('cp -r ../MME_Benchmark_release/existence images/') 30 | 31 | os.system('mkdir images/landmark') 32 | os.system('cp ../MME_Benchmark_release/landmark/images/* images/landmark/') 33 | os.system('cp ../MME_Benchmark_release/landmark/questions_answers_YN/* images/landmark/') 34 | 35 | os.system('cp -r ../MME_Benchmark_release/numerical_calculation images/') 36 | 37 | os.system('cp -r 
../MME_Benchmark_release/position images/') 38 | 39 | os.system('mkdir images/posters') 40 | os.system('cp ../MME_Benchmark_release/posters/images/* images/posters/') 41 | os.system('cp ../MME_Benchmark_release/posters/questions_answers_YN/* images/posters/') 42 | 43 | os.system('mkdir images/scene') 44 | os.system('cp ../MME_Benchmark_release/scene/images/* images/scene/') 45 | os.system('cp ../MME_Benchmark_release/scene/questions_answers_YN/* images/scene/') 46 | 47 | os.system('cp -r ../MME_Benchmark_release/text_translation images/') 48 | -------------------------------------------------------------------------------- /experiments/Qwen-VL/eval_mm/mme/perception.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/experiments/Qwen-VL/eval_mm/mme/perception.jpg -------------------------------------------------------------------------------- /experiments/Qwen-VL/eval_mm/seed_bench/leaderboard.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/experiments/Qwen-VL/eval_mm/seed_bench/leaderboard.jpg -------------------------------------------------------------------------------- /experiments/Qwen-VL/finetune/ds_config_zero2.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "optimizer": { 14 | "type": "AdamW", 15 | "params": { 16 | "lr": "auto", 17 | "betas": "auto", 18 | "eps": "auto", 19 | "weight_decay": "auto" 20 | } 21 | }, 22 | 23 | "scheduler": { 24 | "type": "WarmupLR", 25 | "params": { 26 | "warmup_min_lr": "auto", 27 | "warmup_max_lr": "auto", 28 | "warmup_num_steps": "auto" 29 | } 30 | }, 31 | 32 | "zero_optimization": { 33 | "stage": 2, 34 | "offload_optimizer": { 35 | "device": "none", 36 | "pin_memory": true 37 | }, 38 | "allgather_partitions": true, 39 | "allgather_bucket_size": 2e8, 40 | "overlap_comm": false, 41 | "reduce_scatter": true, 42 | "reduce_bucket_size": 2e8, 43 | "contiguous_gradients": true 44 | }, 45 | 46 | "gradient_accumulation_steps": "auto", 47 | "gradient_clipping": "auto", 48 | "steps_per_print": 100, 49 | "train_batch_size": "auto", 50 | "train_micro_batch_size_per_gpu": "auto", 51 | "wall_clock_breakdown": false 52 | } -------------------------------------------------------------------------------- /experiments/Qwen-VL/finetune/ds_config_zero3.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "optimizer": { 14 | "type": "AdamW", 15 | "params": { 16 | "lr": "auto", 17 | "betas": "auto", 18 | "eps": "auto", 19 | "weight_decay": "auto" 20 | } 21 | }, 22 | 23 | "scheduler": { 24 | "type": "WarmupLR", 25 | "params": { 26 | "warmup_min_lr": "auto", 27 | "warmup_max_lr": "auto", 28 | "warmup_num_steps": "auto" 29 | } 30 | }, 31 | 32 | "zero_optimization": { 33 | "stage": 3, 34 | "offload_optimizer": { 35 | "device": "none", 36 | "pin_memory": true 37 | }, 38 | "offload_param": { 39 | 
"device": "none", 40 | "pin_memory": true 41 | }, 42 | "overlap_comm": true, 43 | "contiguous_gradients": true, 44 | "sub_group_size": 1e9, 45 | "reduce_bucket_size": "auto", 46 | "stage3_prefetch_bucket_size": "auto", 47 | "stage3_param_persistence_threshold": "auto", 48 | "stage3_max_live_parameters": 1e9, 49 | "stage3_max_reuse_distance": 1e9, 50 | "stage3_gather_16bit_weights_on_model_save": true 51 | }, 52 | 53 | "gradient_accumulation_steps": "auto", 54 | "gradient_clipping": "auto", 55 | "steps_per_print": 100, 56 | "train_batch_size": "auto", 57 | "train_micro_batch_size_per_gpu": "auto", 58 | "wall_clock_breakdown": false 59 | } 60 | -------------------------------------------------------------------------------- /experiments/Qwen-VL/finetune/finetune_ds.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export CUDA_DEVICE_MAX_CONNECTIONS=1 3 | DIR=`pwd` 4 | 5 | GPUS_PER_NODE=8 6 | NNODES=1 7 | NODE_RANK=0 8 | MASTER_ADDR=localhost 9 | MASTER_PORT=6001 10 | 11 | MODEL="Qwen/Qwen-VL-Chat" #"Qwen/Qwen-VL-Chat"/"Qwen/Qwen-VL" # Set the path if you do not want to load from huggingface directly 12 | # ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations. 13 | # See the section for finetuning in README for more information. 14 | DATA="path_to_data" 15 | 16 | DISTRIBUTED_ARGS=" 17 | --nproc_per_node $GPUS_PER_NODE \ 18 | --nnodes $NNODES \ 19 | --node_rank $NODE_RANK \ 20 | --master_addr $MASTER_ADDR \ 21 | --master_port $MASTER_PORT 22 | " 23 | 24 | torchrun $DISTRIBUTED_ARGS finetune.py \ 25 | --model_name_or_path $MODEL \ 26 | --data_path $DATA \ 27 | --bf16 True \ 28 | --fix_vit True \ 29 | --output_dir output_qwen \ 30 | --num_train_epochs 5 \ 31 | --per_device_train_batch_size 1 \ 32 | --per_device_eval_batch_size 1 \ 33 | --gradient_accumulation_steps 16 \ 34 | --evaluation_strategy "no" \ 35 | --save_strategy "steps" \ 36 | --save_steps 1000 \ 37 | --save_total_limit 10 \ 38 | --learning_rate 1e-5 \ 39 | --weight_decay 0.1 \ 40 | --adam_beta2 0.95 \ 41 | --warmup_ratio 0.01 \ 42 | --lr_scheduler_type "cosine" \ 43 | --logging_steps 1 \ 44 | --report_to "none" \ 45 | --model_max_length 2048 \ 46 | --gradient_checkpointing True \ 47 | --lazy_preprocess True \ 48 | --deepspeed finetune/ds_config_zero3.json 49 | -------------------------------------------------------------------------------- /experiments/Qwen-VL/finetune/finetune_lora_ds.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export CUDA_DEVICE_MAX_CONNECTIONS=1 3 | DIR=`pwd` 4 | 5 | GPUS_PER_NODE=$(nvidia-smi --query-gpu=gpu_name --format=csv,noheader | wc -l) 6 | NNODES=1 7 | NODE_RANK=0 8 | MASTER_ADDR=localhost 9 | MASTER_PORT=6001 10 | 11 | MODEL="Qwen/Qwen2-VL-7B-Instruct" #"Qwen/Qwen-VL-Chat"/"Qwen/Qwen-VL" Set the path if you do not want to load from huggingface directly 12 | # ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations. 13 | # See the section for finetuning in README for more information. 
14 | DATA="data/train_20241209_1731.json" 15 | 16 | DISTRIBUTED_ARGS=" 17 | --nproc_per_node $GPUS_PER_NODE \ 18 | --nnodes $NNODES \ 19 | --node_rank $NODE_RANK \ 20 | --master_addr $MASTER_ADDR \ 21 | --master_port $MASTER_PORT 22 | " 23 | export WANDB_PROJECT="mat_qwen_vl_gta" 24 | torchrun $DISTRIBUTED_ARGS finetune.py \ 25 | --model_name_or_path $MODEL \ 26 | --data_path $DATA \ 27 | --bf16 True \ 28 | --fix_vit True \ 29 | --output_dir output_qwen/Qwen2-VL-7B-Instruct-$SLURM_JOB_ID \ 30 | --num_train_epochs 7 \ 31 | --per_device_train_batch_size 2 \ 32 | --per_device_eval_batch_size 1 \ 33 | --gradient_accumulation_steps 8 \ 34 | --evaluation_strategy "no" \ 35 | --save_strategy "steps" \ 36 | --save_steps 1000000 \ 37 | --save_total_limit 1 \ 38 | --learning_rate 1e-5 \ 39 | --weight_decay 0.1 \ 40 | --adam_beta2 0.95 \ 41 | --warmup_ratio 0.01 \ 42 | --lr_scheduler_type "cosine" \ 43 | --logging_steps 1 \ 44 | --report_to "wandb" \ 45 | --model_max_length 10240 \ 46 | --lazy_preprocess True \ 47 | --use_lora \ 48 | --gradient_checkpointing \ 49 | --deepspeed finetune/ds_config_zero2.json \ 50 | --lora_target_modules "llm\..*layers\.\d+\.self_attn\.(q_proj|k_proj|v_proj|o_proj)" -------------------------------------------------------------------------------- /experiments/Qwen-VL/finetune/finetune_lora_ds_gaia.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export CUDA_DEVICE_MAX_CONNECTIONS=1 3 | DIR=`pwd` 4 | 5 | GPUS_PER_NODE=$(nvidia-smi --query-gpu=gpu_name --format=csv,noheader | wc -l) 6 | NNODES=1 7 | NODE_RANK=0 8 | MASTER_ADDR=localhost 9 | MASTER_PORT=6001 10 | 11 | MODEL="Qwen/Qwen2-VL-7B-Instruct" #"Qwen/Qwen-VL-Chat"/"Qwen/Qwen-VL" Set the path if you do not want to load from huggingface directly 12 | # ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations. 13 | # See the section for finetuning in README for more information. 
14 | DATA="data/train_20241211_1748.json" 15 | 16 | DISTRIBUTED_ARGS=" 17 | --nproc_per_node $GPUS_PER_NODE \ 18 | --nnodes $NNODES \ 19 | --node_rank $NODE_RANK \ 20 | --master_addr $MASTER_ADDR \ 21 | --master_port $MASTER_PORT 22 | " 23 | export WANDB_PROJECT="mat_qwen_vl_gaia" 24 | torchrun $DISTRIBUTED_ARGS finetune.py \ 25 | --model_name_or_path $MODEL \ 26 | --data_path $DATA \ 27 | --bf16 True \ 28 | --fix_vit True \ 29 | --output_dir output_qwen/Qwen2-VL-7B-Instruct-$SLURM_JOB_ID \ 30 | --num_train_epochs 3 \ 31 | --per_device_train_batch_size 2 \ 32 | --per_device_eval_batch_size 1 \ 33 | --gradient_accumulation_steps 4 \ 34 | --evaluation_strategy "no" \ 35 | --save_strategy "steps" \ 36 | --save_steps 10000 \ 37 | --save_total_limit 1 \ 38 | --learning_rate 1e-5 \ 39 | --weight_decay 0.1 \ 40 | --adam_beta2 0.95 \ 41 | --warmup_ratio 0.01 \ 42 | --lr_scheduler_type "cosine" \ 43 | --logging_steps 1 \ 44 | --report_to "wandb" \ 45 | --model_max_length 10240 \ 46 | --lazy_preprocess True \ 47 | --use_lora \ 48 | --gradient_checkpointing \ 49 | --deepspeed finetune/ds_config_zero2.json \ 50 | --lora_target_modules "llm\..*layers\.\d+\.self_attn\.(q_proj|k_proj|v_proj|o_proj)" -------------------------------------------------------------------------------- /experiments/Qwen-VL/finetune/finetune_lora_single_gpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export CUDA_DEVICE_MAX_CONNECTIONS=1 3 | DIR=`pwd` 4 | 5 | 6 | MODEL="Qwen/Qwen-VL-Chat" #"Qwen/Qwen-VL-Chat"/"Qwen/Qwen-VL" # Set the path if you do not want to load from huggingface directly 7 | # ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations. 8 | # See the section for finetuning in README for more information. 9 | DATA="path_to_data" 10 | 11 | export CUDA_VISIBLE_DEVICES=0 12 | 13 | python finetune.py \ 14 | --model_name_or_path $MODEL \ 15 | --data_path $DATA \ 16 | --bf16 True \ 17 | --fix_vit True \ 18 | --output_dir output_qwen \ 19 | --num_train_epochs 5 \ 20 | --per_device_train_batch_size 1 \ 21 | --per_device_eval_batch_size 1 \ 22 | --gradient_accumulation_steps 8 \ 23 | --evaluation_strategy "no" \ 24 | --save_strategy "steps" \ 25 | --save_steps 1000 \ 26 | --save_total_limit 10 \ 27 | --learning_rate 1e-5 \ 28 | --weight_decay 0.1 \ 29 | --adam_beta2 0.95 \ 30 | --warmup_ratio 0.01 \ 31 | --lr_scheduler_type "cosine" \ 32 | --logging_steps 1 \ 33 | --report_to "none" \ 34 | --model_max_length 2048 \ 35 | --lazy_preprocess True \ 36 | --gradient_checkpointing \ 37 | --use_lora -------------------------------------------------------------------------------- /experiments/Qwen-VL/finetune/finetune_qlora_ds.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export CUDA_DEVICE_MAX_CONNECTIONS=1 3 | DIR=`pwd` 4 | 5 | GPUS_PER_NODE=8 6 | NNODES=1 7 | NODE_RANK=0 8 | MASTER_ADDR=localhost 9 | MASTER_PORT=6001 10 | 11 | MODEL="Qwen/Qwen-VL-Chat-Int4" # Qwen/Qwen-VL-Chat-Int4 Set the path if you do not want to load from huggingface directly 12 | # ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations. 13 | # See the section for finetuning in README for more information. 
14 | DATA="path_to_data" 15 | 16 | 17 | DISTRIBUTED_ARGS=" 18 | --nproc_per_node $GPUS_PER_NODE \ 19 | --nnodes $NNODES \ 20 | --node_rank $NODE_RANK \ 21 | --master_addr $MASTER_ADDR \ 22 | --master_port $MASTER_PORT 23 | " 24 | 25 | # Remember to use --fp16 instead of --bf16 due to autogptq 26 | torchrun $DISTRIBUTED_ARGS finetune.py \ 27 | --model_name_or_path $MODEL \ 28 | --data_path $DATA \ 29 | --fp16 True \ 30 | --fix_vit True \ 31 | --output_dir output_qwen \ 32 | --num_train_epochs 5 \ 33 | --per_device_train_batch_size 2 \ 34 | --per_device_eval_batch_size 1 \ 35 | --gradient_accumulation_steps 8 \ 36 | --evaluation_strategy "no" \ 37 | --save_strategy "steps" \ 38 | --save_steps 1000 \ 39 | --save_total_limit 10 \ 40 | --learning_rate 1e-5 \ 41 | --weight_decay 0.1 \ 42 | --adam_beta2 0.95 \ 43 | --warmup_ratio 0.01 \ 44 | --lr_scheduler_type "cosine" \ 45 | --logging_steps 1 \ 46 | --report_to "none" \ 47 | --model_max_length 2048 \ 48 | --lazy_preprocess True \ 49 | --use_lora \ 50 | --q_lora \ 51 | --gradient_checkpointing \ 52 | --deepspeed finetune/ds_config_zero2.json -------------------------------------------------------------------------------- /experiments/Qwen-VL/finetune/finetune_qlora_single_gpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export CUDA_DEVICE_MAX_CONNECTIONS=1 3 | DIR=`pwd` 4 | 5 | MODEL="Qwen/Qwen-VL-Chat-Int4" # Qwen/Qwen-VL-Chat-Int4 Set the path if you do not want to load from huggingface directly 6 | # ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations. 7 | # See the section for finetuning in README for more information. 8 | DATA="path_to_data" 9 | 10 | export CUDA_VISIBLE_DEVICES=0 11 | 12 | # Remember to use --fp16 instead of --bf16 due to autogptq 13 | python finetune.py \ 14 | --model_name_or_path $MODEL \ 15 | --data_path $DATA \ 16 | --fp16 True \ 17 | --fix_vit True \ 18 | --output_dir output_qwen \ 19 | --num_train_epochs 5 \ 20 | --per_device_train_batch_size 1 \ 21 | --per_device_eval_batch_size 1 \ 22 | --gradient_accumulation_steps 8 \ 23 | --evaluation_strategy "no" \ 24 | --save_strategy "steps" \ 25 | --save_steps 1000 \ 26 | --save_total_limit 10 \ 27 | --learning_rate 1e-5 \ 28 | --weight_decay 0.1 \ 29 | --adam_beta2 0.95 \ 30 | --warmup_ratio 0.01 \ 31 | --lr_scheduler_type "cosine" \ 32 | --logging_steps 1 \ 33 | --report_to "none" \ 34 | --model_max_length 2048 \ 35 | --lazy_preprocess True \ 36 | --gradient_checkpointing \ 37 | --use_lora \ 38 | --q_lora \ 39 | --deepspeed finetune/ds_config_zero2.json 40 | -------------------------------------------------------------------------------- /experiments/Qwen-VL/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers==4.32.0 2 | accelerate 3 | tiktoken 4 | einops 5 | transformers_stream_generator==0.0.4 6 | scipy 7 | torchvision 8 | pillow 9 | tensorboard 10 | matplotlib 11 | -------------------------------------------------------------------------------- /experiments/Qwen-VL/requirements_openai_api.txt: -------------------------------------------------------------------------------- 1 | fastapi 2 | uvicorn 3 | openai 4 | pydantic 5 | sse_starlette 6 | -------------------------------------------------------------------------------- /experiments/Qwen-VL/requirements_web_demo.txt: -------------------------------------------------------------------------------- 1 | gradio 2 | modelscope 3 | 
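The `convert_dataset.py` script that follows rewrites the CPM-style training records (OpenAI-style `role`/`content` turns with `<image>` placeholders) into Qwen-VL's `from`/`value` format with inline `Picture N: <img>path</img>` tags. The sketch below shows that transformation on a single hypothetical record; the image path and question are made up for illustration, and the real script additionally handles multi-image records keyed by custom placeholders.

```python
# Illustrative only: one hypothetical record converted the way convert_dataset.py
# does it (placeholder replaced by an inline <img> tag, double newline collapsed).
record = {
    "image": "data/images/0001.jpg",  # hypothetical path
    "conversations": [
        {"role": "user", "content": "<image>\nWhat is shown in the image?"},
        {"role": "assistant", "content": "A red bicycle leaning against a wall."},
    ],
}

image_path_map = {"<image>": record["image"]}

converted = []
for turn in record["conversations"]:
    content = turn["content"]
    pid = 1
    for placeholder, path in sorted(image_path_map.items()):
        if placeholder in content:
            content = content.replace(placeholder, f"Picture {pid}: <img>{path}</img>\n")
            content = content.replace("</img>\n\n", "</img>\n")
            pid += 1
    converted.append({"from": turn["role"], "value": content})

print(converted[0]["value"])
# Picture 1: <img>data/images/0001.jpg</img>
# What is shown in the image?
```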
-------------------------------------------------------------------------------- /experiments/Qwen-VL/scripts/convert_dataset.py: -------------------------------------------------------------------------------- 1 | import json 2 | from tqdm import tqdm 3 | # GTA 4 | data_path = "/scratch/zhangbofei/Projects/Multimodal-CL/iclr_09/TongAgent/experiments/CPM-FT/data/agent_tune_dataset_cpm_8k_gta_with_verifier.json" 5 | # GAIA 6 | data_path = "/scratch/zhangbofei/Projects/Multimodal-CL/iclr_09/TongAgent/experiments/CPM-FT/data/agent_tune_dataset_cpm_17k_gaia_with_verifier.json" 7 | 8 | 9 | with open(data_path, "r") as f: 10 | dataset = json.load(f) 11 | 12 | def _convert(image_path_map, conversations): 13 | output = [] 14 | for turn in conversations: 15 | role = turn["role"] 16 | content = turn["content"] 17 | turn_new = dict() 18 | turn_new["from"] = role 19 | pid = 1 20 | keys = sorted(list(image_path_map.keys())) 21 | for k in keys: 22 | v = image_path_map[k] 23 | if k in content: 24 | content = content.replace(k, f"Picture {pid}: <img>{v}</img>\n") 25 | content = content.replace(f"</img>\n\n", "</img>\n") 26 | pid += 1 27 | turn_new["value"] = content 28 | output.append(turn_new) 29 | return output 30 | 31 | 32 | for item in tqdm(dataset): 33 | #print(item["image"]) 34 | #print(item.keys()) 35 | conversations = item["conversations"] 36 | #print(len(conversations), conversations[1]) 37 | image_path_map = dict() 38 | if type(item["image"]) == str: 39 | image_path_map["<image>"] = item["image"] 40 | else: 41 | for k, v in item["image"].items(): 42 | image_path_map[k] = v 43 | item["conversations"] = _convert(image_path_map, conversations) 44 | 45 | from datetime import datetime 46 | import json 47 | 48 | now = datetime.now().strftime("%Y%m%d_%H%M") 49 | print("write to", f"data/train_{now}.json") 50 | with open(f"data/train_{now}.json", "w") as f: 51 | json.dump(dataset, f, indent=4, ensure_ascii=False) 52 | 53 | import random 54 | with open(f"data/train_{now}_subset.json", "w") as f: 55 | random.shuffle(dataset) 56 | json.dump(dataset[:1000], f, indent=4, ensure_ascii=False) -------------------------------------------------------------------------------- /experiments/Qwen-VL/scripts/convert_dataset_v2.py: -------------------------------------------------------------------------------- 1 | import json 2 | from tqdm import tqdm 3 | 4 | # GAIA 5 | data_path = "/scratch/zhangbofei/Projects/Multimodal-CL/iclr_09/TongAgent/experiments/CPM-FT/data/agent_tune_dataset_gaia_57k_20241210.json" 6 | 7 | with open(data_path, "r") as f: 8 | dataset = json.load(f) 9 | 10 | def _convert(image_path_map, conversations): 11 | output = [] 12 | for turn in conversations: 13 | role = turn["role"] 14 | content = turn["content"] 15 | turn_new = dict() 16 | turn_new["from"] = role 17 | pid = 1 18 | keys = sorted(list(image_path_map.keys())) 19 | for k in keys: 20 | v = image_path_map[k] 21 | if k in content: 22 | content = content.replace(k, f"Picture {pid}: <img>{v}</img>\n") 23 | content = content.replace(f"</img>\n\n", "</img>\n") 24 | pid += 1 25 | turn_new["value"] = content 26 | output.append(turn_new) 27 | return output 28 | 29 | 30 | for item in tqdm(dataset): 31 | #print(item["image"]) 32 | #print(item.keys()) 33 | conversations = item["conversations"] 34 | #print(len(conversations), conversations[1]) 35 | image_path_map = dict() 36 | if "image" not in item: 37 | pass 38 | elif type(item["image"]) == str: 39 | image_path_map["<image>"] = item["image"] 40 | else: 41 | for k, v in item["image"].items(): 42 | 
image_path_map[k] = v 43 | item["conversations"] = _convert(image_path_map, conversations) 44 | 45 | from datetime import datetime 46 | import json 47 | 48 | now = datetime.now().strftime("%Y%m%d_%H%M") 49 | print("write to", f"data/train_{now}.json") 50 | with open(f"data/train_{now}.json", "w") as f: 51 | json.dump(dataset, f, indent=4, ensure_ascii=False) 52 | 53 | import random 54 | with open(f"data/train_{now}_subset.json", "w") as f: 55 | random.shuffle(dataset) 56 | json.dump(dataset[:1000], f, indent=4, ensure_ascii=False) -------------------------------------------------------------------------------- /experiments/Qwen-VL/scripts/tokenizer.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoTokenizer 2 | 3 | tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-VL-7B-Instruct") 4 | print(tokenizer.pad_token_id) 5 | print(tokenizer.eos_token_id) 6 | print(tokenizer.bos_token_id) 7 | print(tokenizer.encode("<|im_start|>")) 8 | print(tokenizer.encode("<|im_end|>")) 9 | print(tokenizer.encode("<|im_start|>assistant")) 10 | print(tokenizer.encode("<|im_end|>")) 11 | 12 | print(tokenizer.decode([872])) 13 | 14 | msgs = [ 15 | {"role": "system", "content": "You are a helpful assistant."}, 16 | {"role": "user", "content": "Hello, how are you?"}, 17 | {"role": "assistant", "content": "I am fine, thank you!"}, 18 | ] 19 | 20 | print(tokenizer.apply_chat_template(msgs, tokenize=False)) -------------------------------------------------------------------------------- /experiments/Qwen-VL/slurm_jobs/train_gaia.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=gaia_qwen_vl # create a short name for your job 3 | #SBATCH --partition=HGX,DGX # specify the partition name: gpu 4 | #SBATCH --qos=lv1 5 | #SBATCH --nodes=1 # node count 6 | #SBATCH --ntasks=1 # total number of tasks across all nodes 7 | #SBATCH --ntasks-per-node=1 8 | #SBATCH --mem=64G # total memory (RAM) per node 9 | #SBATCH --time=72:00:00 # total run time limit (HH:MM:SS) 10 | #SBATCH --cpus-per-task=32 # cpu-cores per task (>1 if multi-threaded tasks) 11 | #SBATCH --gres=gpu:8 # number of gpus per node 12 | #SBATCH --output=output/out-%j.out # output format 13 | #SBATCH --error=output/error-out-%j.out # error output file 14 | #SBATCH --account=engineering 15 | #--------------------task part------------------------- 16 | 17 | ## clean env 18 | module purge 19 | ## load environment need by this task 20 | module load slurm/BigAI/23.02.2 21 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/zhangbofei/anaconda3/lib 22 | source /home/zhangbofei/anaconda3/bin/activate 23 | 24 | conda activate qwen_vl 25 | 26 | bash finetune/finetune_lora_ds_gaia.sh -------------------------------------------------------------------------------- /experiments/Qwen-VL/slurm_jobs/train_gta.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=mat_qwen_vl # create a short name for your job 3 | #SBATCH --partition=HGX,DGX # specify the partition name: gpu 4 | #SBATCH --qos=lv1 5 | #SBATCH --nodes=1 # node count 6 | #SBATCH --ntasks=1 # total number of tasks across all nodes 7 | #SBATCH --ntasks-per-node=1 8 | #SBATCH --mem=64G # total memory (RAM) per node 9 | #SBATCH --time=72:00:00 # total run time limit (HH:MM:SS) 10 | #SBATCH --cpus-per-task=32 # cpu-cores per task (>1 if multi-threaded tasks) 11 | #SBATCH --gres=gpu:8 # number of gpus per node 12 | 
#SBATCH --output=output/out-%j.out # output format 13 | #SBATCH --error=output/error-out-%j.out # error output file 14 | #SBATCH --account=engineering 15 | #--------------------task part------------------------- 16 | 17 | ## clean env 18 | module purge 19 | ## load environment need by this task 20 | module load slurm/BigAI/23.02.2 21 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/zhangbofei/anaconda3/lib 22 | source /home/zhangbofei/anaconda3/bin/activate 23 | 24 | conda activate qwen_vl 25 | 26 | bash finetune/finetune_lora_ds.sh 27 | -------------------------------------------------------------------------------- /experiments/Qwen-VL/touchstone/README_CN.md: -------------------------------------------------------------------------------- 1 | <br> 2 | 3 | <p align="center"> 4 | <img src="../assets/touchstone_logo.png" width="300"/> 5 | <p> 6 | <br> 7 | 8 | <p align="center"> 9 | 中文  |  <a href="../touchstone/README.md">English</a> |  <a href="../touchstone/README_JA.md">日本語</a> 10 | </p> 11 | <br><br> 12 | 13 | **TOUCHSTONE** 是一种针对多模态语言模型(LVLM)的自动化综合评估方法,评估不仅包括基本的认知和理解,还延伸到文学创作。通过人类注解将多模态信息转换为文本,我们的 TouchStone 可以利用SOTA的语言模型来自动化地完成对LVLMs的多模态对话质量评估。 14 | 15 | ## 数据集 16 | 17 | 为了评估 LVLMs 的能力,我们构建了一个多样化且全面的数据集,涵盖五个关键维度:基本描述能力、视觉识别能力、视觉理解能力、视觉叙事能力和多图分析能力。 18 | 19 | - **基本描述能力** 图像描述考验模型总结图片信息的能力,包括简单描述和详细描述。 简单描述通常是描述图像的主要内容和关系的简短短语,而详细描述则提供有关图像场景、其属性和关系的更深入的信息。 20 | 21 | - **视觉识别能力** 图像识别考察模型提取图像中内容的属性以及关联到知识库的能力。为了考察这方面能力,测试的问题包括属性QA、影视识别、艺术识别、地标识别、名人识别、情感识别、文本识别、物体识别和结构内容识别。 22 | 23 | - **视觉理解能力** 图像理解需要模型理解图像内容并完成推理进行相关任务。 这方面包含了例如风格欣赏、抽象图像理解、模因理解、图像分析、图表分析、一般问题解决和推理问答等任务。 24 | 25 | - **视觉叙事能力** 视觉叙事能力是基于视觉内容的文学创作能力,包括撰写电子邮件、诗歌、故事、广告/商品推荐、头脑风暴等。 26 | 27 | - **多图分析能力** 多图分析是分析和比较多幅图像的任务。该领域包括比较两个/多个图像、总结多个图像信息、比较商品以及逐步分析图像等任务。 28 | 29 | <p align="center"> 30 | <img src="../assets/touchstone_datasets.jpg" width="600"/> 31 | <p> 32 | 33 | 我们从五个维度综合评估了模型的能力。 如上图所示,给出了27个子任务的示例。 从感知到认知,再到创造力,随着难度的增加,对模型的要求也越来越高。 目前,LVLM的能力还处于早期阶段。 我们的数据集包含800+道题目、27个类别。 34 | 35 | ## 测评方式 36 | 37 | 我们应用SOTA的LLM进行自动化评估。 为了有效地理解图像的内容,我们人工用细粒度的文本注释替换实际的图像输入。 通过将这些注释和相应的问题输入到像GPT4这样强LLM中,我们可以获得参考答案。 38 | 39 | 对于待测评的LVLM,我们提供实际图像和问题作为输入并获得各自的答案。 最后,我们使用GPT4根据细粒度注释和问题对LVLM生成的答案进行评分。 评分指令要求模型评估答案的有用性、相关性和准确性,并将人工注解视为图像的内容。 为了确保评估的公平性,每个模型的答案都会与 GPT4生成的参考答案进行比较。 模型在所有问题上的平均得分作为最终得分。 40 | 41 | 为了消除答案位置的影响,我们通过交换答案的位置来进行第二轮评分,然后计算获得的两次分数的平均值。 42 | 43 | <p align="center"> 44 | <img src="../assets/touchstone_eval.png" width="600"/> 45 | <p> 46 | 47 | 48 | ## 测评结果 49 | 50 | #### 英文版本测评 51 | 52 | | Model | Score | 53 | |---------------|-------| 54 | | PandaGPT | 488.5 | 55 | | MiniGPT4 | 531.7 | 56 | | InstructBLIP | 552.4 | 57 | | LLaMA-AdapterV2 | 590.1 | 58 | | mPLUG-Owl | 605.4 | 59 | | LLaVA | 602.7 | 60 | | Qwen-VL-Chat | 645.2 | 61 | 62 | #### 中文版本测评 63 | 64 | | Model | Score | 65 | |---------------|-------| 66 | | VisualGLM | 247.1 | 67 | | Qwen-VL-Chat | 401.2 | 68 | 69 | -------------------------------------------------------------------------------- /experiments/Qwen-VL/touchstone/README_JA.md: -------------------------------------------------------------------------------- 1 | <br> 2 | 3 | <p align="center"> 4 | <img src="../assets/touchstone_logo.png" width="300"/> 5 | <p> 6 | <br> 7 | 8 | <p align="center"> 9 | <a href="touchstone/README_CN.md">中文</a>  |  <a href="../touchstone/README.md">English</a>|  日本語 10 | </p> 11 | <br><br> 12 | 13 | **TOUCHSTONE** は、マルチモーダル言語モデルの包括的な評価であり、基本的な認識や理解だけでなく、文学的な創作にまで及びます。評価プロセスを自動化し、マルチモーダル情報をテキストに変換することで、私達の TouchStone は、人手を介することなく高度な言語モデルの力を活用し、対話の質を効率的かつ正確に評価することができます。 14 
| 15 | ## DATASET 16 | 17 | LVLMの能力を評価するために、基本的な記述能力、視覚認識能力、視覚理解能力、視覚ストーリーテリング能力、複数画像解析能力の5つの主要な次元をカバーする多様で包括的なデータセットを構築する。 18 | 19 | - **基本的描写力** 画像記述には、単純な記述と詳細な記述を含め、画像に含まれる情報を記述するモデルの能力が含まれる。単純な記述は、通常、画像の主な主題とアクションを記述する短いフレーズであり、詳細な記述は、画像のシーン、それらの属性、および関係についてのより詳細な情報を提供します。 20 | 21 | - **視覚認識能力** 画像認識とは、画像内のオブジェクトやシーンを認識し、関連情報を推論するタスクである。この分野はさらに、属性QA、映画/テレビ認識、アート認識、ランドマーク認識、有名人認識、感情認識、テキスト認識、オブジェクト認識、構造コンテンツ認識など、いくつかのサブタスクに分けることができる。 22 | 23 | - **視覚理解能力** 画像理解とは、モデルが画像の意味や関連するタスクを理解する能力のことである。この分野には、スタイル理解、抽象画像理解、ミーム理解、画像分析、チャート分析、一般的な問題解決、推論QAなど、いくつかのサブタスクが含まれる。 24 | 25 | - **視覚的ストーリーテリング能力** ビジュアルストーリーテリング能力とは、メール、詩、物語、広告/商品推薦、ブレーンストーミングの執筆など、ビジュアルコンテンツに基づいた文学創作のプロセスである。 26 | 27 | - **マルチ画像解析能力** 複数画像解析とは、複数の画像を解析・比較する作業である。この分野には、2つまたは複数の画像を比較する、複数の画像情報を要約する、商品を比較する、画像を段階的に分析するなどのタスクが含まれます。 28 | 29 | 30 | <p align="center"> 31 | <img src="../assets/touchstone_datasets.jpg" width="600"/> 32 | <p> 33 | 34 | モデルの能力を 5 つの次元から総合的に評価する。上図のように、27 のサブタスクの例を示す。知覚から認知、創造性まで、難易度が上がるにつれて、モデルに求められる要件もどんどん高くなっている。現在、LVLM の機能は初期段階にある。我々のデータセットには 800 以上の質問と 27 のカテゴリーが含まれている。 35 | 36 | ## 方法 37 | 38 | 39 | 自動評価を可能にするために、強力な LLM を判定器として適用する。画像の内容を効果的に理解するために、実際の画像入力をきめ細かいテキスト注釈に手動で置き換える。これらの注釈と対応する質問を GPT4 のような強力な LLM に入力することで、参照解答を得る。 40 | 41 | LVLMの評価には、実際の画像と質問を入力として与え、それぞれの回答を得る。最後に、GPT4を用いて、LVLMが生成した回答を、細かいアノテーションと質問に基づいてスコアリングする。スコアリングの指示は、注釈を画像の内容とみなして、回答の有用性、関連性、正確性を評価するようモデルに要求する。評価の公平性を確保するため、各モデルの回答はGPT4の一貫した参照回答と比較されます。全問題におけるモデルの平均スコアを最終スコアとする。 42 | 43 | 解答位置の影響を排除するために、解答位置を入れ替えて2回目の採点ラウンドを行い、得られた2つのスコアの平均を計算します。このアプローチは、解答の配置によって生じるバイアスを軽減することを目的としています。 44 | <p align="center"> 45 | <img src="../assets/touchstone_eval.png" width="600"/> 46 | <p> 47 | 48 | ### 評価 49 | 50 | #### 英語ベースのマルチモーダル対話における評価 51 | 52 | | Model | Score | 53 | |---------------|-------| 54 | | PandaGPT | 488.5 | 55 | | MiniGPT4 | 531.7 | 56 | | InstructBLIP | 552.4 | 57 | | LLaMA-AdapterV2 | 590.1 | 58 | | mPLUG-Owl | 605.4 | 59 | | LLaVA | 602.7 | 60 | | Qwen-VL-Chat | 645.2 | 61 | 62 | #### 中国語ベースのマルチモーダル対話における評価 63 | 64 | | Model | Score | 65 | |---------------|-------| 66 | | VisualGLM | 247.1 | 67 | | Qwen-VL-Chat | 401.2 | 68 | 69 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from tongagent.agents.general_agent import create_agent 3 | 4 | def main(): 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument( 7 | "--prompt", 8 | required=True, 9 | help="Instructions that you want agent to execute.") 10 | args = parser.parse_args() 11 | agent = create_agent() 12 | result = agent.run(args.prompt) 13 | print("Agent Response:", result) 14 | 15 | if __name__ == "__main__": 16 | main() -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | transformers==4.46.0 2 | vllm==0.6.1 3 | openai 4 | langchain 5 | pypdf 6 | markdownify 7 | pathvalidate 8 | puremagic 9 | mammoth 10 | python-pptx 11 | pandas 12 | pdfminer-six 13 | youtube-transcript-api 14 | serpapi 15 | google-search-results 16 | face-detection 17 | pygments 18 | paddlepaddle-gpu 19 | paddleocr>=2.0.1 20 | shortuuid 21 | diffusers 22 | accelerate 23 | langchain_community 24 | langchain_chroma 25 | langchain_openai 26 | omegaconf 27 | tiktoken 28 | git+https://github.com/facebookresearch/segment-anything-2.git 29 | openpyxl 30 | google-cloud-aiplatform>=1.38 31 | ray[default] 32 
| vidgear 33 | xlrd>=2.0.1 34 | loguru -------------------------------------------------------------------------------- /scripts/report.py: -------------------------------------------------------------------------------- 1 | import wandb 2 | import time 3 | wandb.init(project="occupy", name="occupy") 4 | print("Get GPU!") 5 | wandb.alert(title="Get GPU!", text="Get GPU!") 6 | wandb.finish() 7 | -------------------------------------------------------------------------------- /scripts/search.py: -------------------------------------------------------------------------------- 1 | import json 2 | data_path = "experiments/CPM-FT/data/agent_tune_dataset_gaia_1206_11k.json" 3 | with open(data_path, "r") as f: 4 | data = json.load(f) 5 | 6 | search_str = '''The attached file contains a list of vendors in the Liminal Springs mall, along with each vendor’s monthly revenue and the rent they pay the mall. I want you to find the vendor that makes the most money, relative to the rent it pays. Then, tell me what is listed in the “type” column for that vendor.''' 7 | found = False 8 | for item in data: 9 | # print(item) 10 | conversations = item["conversations"] 11 | for conversation in conversations: 12 | if search_str in conversation["content"]: 13 | print(item) 14 | found = True 15 | 16 | 17 | print(found) 18 | 19 | -------------------------------------------------------------------------------- /slurm_jobs/deploy_qwen2_5_72b.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=qwen2.5_72b # create a short name for your job 3 | #SBATCH --partition=DGX,HGX # specify the partition name: gpu 4 | #SBATCH --qos=lv1 5 | #SBATCH --nodes=1 # node count 6 | #SBATCH --ntasks=1 # total number of tasks across all nodes 7 | #SBATCH --ntasks-per-node=1 8 | #SBATCH --mem=64G # total memory (RAM) per node 9 | #SBATCH --time=72:00:00 # total run time limit (HH:MM:SS) 10 | #SBATCH --cpus-per-task=32 # cpu-cores per task (>1 if multi-threaded tasks) 11 | #SBATCH --gres=gpu:4 # number of gpus per node 12 | #SBATCH --output=output/out-%j.out # output format 13 | #SBATCH --error=output/error-out-%j.out # error output file 14 | #SBATCH --account=engineering 15 | #--------------------task part------------------------- 16 | 17 | 18 | ## clean env 19 | module purge 20 | ## load environment need by this task 21 | module load slurm/BigAI/23.02.2 22 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/zhangbofei/anaconda3/lib 23 | source /home/zhangbofei/anaconda3/bin/activate # commented out by conda initialize 24 | conda init 25 | conda activate agent_tune 26 | GPUS_PER_NODE=$(nvidia-smi -L | wc -l) 27 | nvidia-smi 28 | 29 | 30 | vllm serve /scratch/ml/zhangxintong/A_Models/Qwen/Qwen2.5-72B-Instruct --tensor-parallel-size $GPUS_PER_NODE --dtype bfloat16 --gpu-memory-utilization 0.90 --max-model-len 20000 31 | 32 | 33 | 34 | 35 | -------------------------------------------------------------------------------- /slurm_jobs/deploy_qwen2_VL_72b.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=qwen2_vl_72b # create a short name for your job 3 | #SBATCH --partition=DGX # specify the partition name: gpu 4 | #SBATCH --qos=lv2 5 | #SBATCH --nodes=1 # node count 6 | #SBATCH --ntasks=1 # total number of tasks across all nodes 7 | #SBATCH --ntasks-per-node=1 8 | #SBATCH --mem=64G # total memory (RAM) per node 9 | #SBATCH --time=12:00:00 # total run time limit (HH:MM:SS) 10 | #SBATCH --cpus-per-task=32 # 
cpu-cores per task (>1 if multi-threaded tasks) 11 | #SBATCH --gres=gpu:2 # number of gpus per node 12 | #SBATCH --output=output/out-%j.out # output format 13 | #SBATCH --error=output/error-out-%j.out # error output file 14 | #SBATCH --account=engineering 15 | #--------------------task part------------------------- 16 | 17 | 18 | ## clean env 19 | module purge 20 | ## load environment need by this task 21 | module load slurm/BigAI/23.02.2 22 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/zhangbofei/anaconda3/lib 23 | source /home/zhangbofei/anaconda3/bin/activate # commented out by conda initialize 24 | conda init 25 | conda activate qwen_vl 26 | GPUS_PER_NODE=$(nvidia-smi -L | wc -l) 27 | nvidia-smi 28 | 29 | 30 | 31 | # vllm serve /scratch/TecManDep/llm_weights/Qwen2-VL-72B-Instruct/ --tensor-parallel-size 2 --dtype bfloat16 --gpu-memory-utilization 0.90 --max-model-len 20000 32 | 33 | python -m vllm.entrypoints.openai.api_server --tensor-parallel-size $GPUS_PER_NODE --served-model-name Qwen2-VL-7B-Instruct --model /scratch/TecManDep/llm_weights/Qwen2-VL-72B-Instruct/ 34 | -------------------------------------------------------------------------------- /slurm_jobs/evaluate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=eval_gta # create a short name for your job 3 | #SBATCH --partition=HGX # specify the partition name: gpu 4 | #SBATCH --qos=lv1 5 | #SBATCH --nodes=1 # node count 6 | #SBATCH --ntasks=1 # total number of tasks across all nodes 7 | #SBATCH --ntasks-per-node=1 8 | #SBATCH --mem=64G # total memory (RAM) per node 9 | #SBATCH --time=72:00:00 # total run time limit (HH:MM:SS) 10 | #SBATCH --cpus-per-task=32 # cpu-cores per task (>1 if multi-threaded tasks) 11 | #SBATCH --gres=gpu:4 # number of gpus per node 12 | #SBATCH --output=output/out-%j.out # output format 13 | #SBATCH --error=output/error-out-%j.out # error output file 14 | #SBATCH --account=engineering 15 | #SBATCH --dependency=7787293 16 | #--------------------task part------------------------- 17 | 18 | 19 | ## clean env 20 | module purge 21 | ## load environment need by this task 22 | module load slurm/BigAI/23.02.2 23 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/zhangbofei/anaconda3/lib 24 | source /home/zhangbofei/anaconda3/bin/activate # commented out by conda initialize 25 | conda init 26 | conda activate agent_tune 27 | export AGENT_CONFIG='configs/agent_config.yaml' 28 | python examples/gta/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7879870_2024_09_30_15_55/ --disable-vision 29 | # python examples/gta/main.py --engine tonggpt --disable-vision -------------------------------------------------------------------------------- /slurm_jobs/evaluate_gaia.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=eval_gaia # create a short name for your job 3 | #SBATCH --partition=HGX # specify the partition name: gpu 4 | #SBATCH --qos=lv0a 5 | #SBATCH --nodes=1 # node count 6 | #SBATCH --ntasks=1 # total number of tasks across all nodes 7 | #SBATCH --ntasks-per-node=1 8 | #SBATCH --mem=64G # total memory (RAM) per node 9 | #SBATCH --time=08:00:00 # total run time limit (HH:MM:SS) 10 | #SBATCH --cpus-per-task=32 # cpu-cores per task (>1 if multi-threaded tasks) 11 | #SBATCH --gres=gpu:1 # number of gpus per node 12 | #SBATCH --output=output/out-%j.out # output format 13 | #SBATCH --error=output/error-out-%j.out # error output file 14 | #SBATCH 
--account=engineering 15 | ## SBATCH --dependency=7787293 16 | #--------------------task part------------------------- 17 | 18 | 19 | ## clean env 20 | module purge 21 | ## load environment need by this task 22 | module load slurm/BigAI/23.02.2 23 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/zhangbofei/anaconda3/lib 24 | source /home/zhangbofei/anaconda3/bin/activate # commented out by conda initialize 25 | conda init 26 | conda activate agent_tune 27 | export RUN_MODE=eval 28 | python examples/gaia/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7904295_2024_12_10_23_05/ --data-name 2023_level2 --split validation 29 | 30 | #python examples/gaia/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7880273_2024_10_01_16_09/ --data-name 2023_level3 --split validation 31 | 32 | #python examples/gaia/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7880273_2024_10_01_16_09/ --data-name 2023_level2 --split validation 33 | 34 | 35 | # python examples/gaia/main.py --engine minicpm --data-name 2023_level1 --split validation 36 | 37 | # python examples/gaia/main.py --engine tonggpt --data-name 2023_level1 --split validation 38 | 39 | #python examples/gaia/main.py --engine tonggpt --data-name 2023_level3 --split validation 40 | -------------------------------------------------------------------------------- /slurm_jobs/evaluate_gaia_exp1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=eval_gaia # create a short name for your job 3 | #SBATCH --partition=DGX,HGX # specify the partition name: gpu 4 | #SBATCH --qos=lv0a 5 | #SBATCH --nodes=1 # node count 6 | #SBATCH --ntasks=1 # total number of tasks across all nodes 7 | #SBATCH --ntasks-per-node=1 8 | #SBATCH --mem=64G # total memory (RAM) per node 9 | #SBATCH --time=72:00:00 # total run time limit (HH:MM:SS) 10 | #SBATCH --cpus-per-task=32 # cpu-cores per task (>1 if multi-threaded tasks) 11 | #SBATCH --gres=gpu:1 # number of gpus per node 12 | #SBATCH --output=output/out-%j.out # output format 13 | #SBATCH --error=output/error-out-%j.out # error output file 14 | #SBATCH --account=engineering 15 | #SBATCH --dependency=7880273 16 | #--------------------task part------------------------- 17 | 18 | 19 | ## clean env 20 | module purge 21 | ## load environment need by this task 22 | module load slurm/BigAI/23.02.2 23 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/zhangbofei/anaconda3/lib 24 | source /home/zhangbofei/anaconda3/bin/activate # commented out by conda initialize 25 | conda init 26 | conda activate agent_tune 27 | export RUN_MODE=eval 28 | python examples/gaia/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7880273_2024_10_01_16_09/ --data-name 2023_level1 --split validation 29 | 30 | python examples/gaia/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7880273_2024_10_01_16_09/ --data-name 2023_level3 --split validation 31 | 32 | python examples/gaia/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7880273_2024_10_01_16_09/ --data-name 2023_level2 --split validation 33 | 34 | 35 | # python examples/gaia/main.py --engine minicpm --data-name 2023_level1 --split validation 36 | 37 | # python examples/gaia/main.py --engine tonggpt --data-name 2023_level1 --split validation 38 | 39 | #python examples/gaia/main.py --engine tonggpt --data-name 2023_level3 --split validation 40 | 
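The Slurm scripts in this directory each hard-code one checkpoint and one GAIA level. When iterating locally, it can be convenient to sweep all three validation levels for a single LoRA checkpoint; the sketch below simply shells out to `examples/gaia/main.py` with the same flags the scripts above use (`--engine`, `--lora-path`, `--data-name`, `--split`). It is a convenience wrapper, not part of the repository, and the checkpoint path is a placeholder.

```python
# Convenience wrapper (assumed, not part of the repo): run the GAIA validation
# levels sequentially for one checkpoint, mirroring the Slurm scripts above.
import os
import subprocess

lora_path = "experiments/CPM-FT/output/your_checkpoint_here/"  # placeholder path
levels = ["2023_level1", "2023_level2", "2023_level3"]

env = dict(os.environ, RUN_MODE="eval")  # the scripts above export this variable
for level in levels:
    cmd = [
        "python", "examples/gaia/main.py",
        "--engine", "minicpm",
        "--lora-path", lora_path,
        "--data-name", level,
        "--split", "validation",
    ]
    print("Running:", " ".join(cmd))
    subprocess.run(cmd, check=True, env=env)
```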
-------------------------------------------------------------------------------- /slurm_jobs/evaluate_gaia_exp1_setting1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=eval_gaia # create a short name for your job 3 | #SBATCH --partition=DGX,HGX # specify the partition name: gpu 4 | #SBATCH --qos=lv0a 5 | #SBATCH --nodes=1 # node count 6 | #SBATCH --ntasks=1 # total number of tasks across all nodes 7 | #SBATCH --ntasks-per-node=1 8 | #SBATCH --mem=64G # total memory (RAM) per node 9 | #SBATCH --time=08:00:00 # total run time limit (HH:MM:SS) 10 | #SBATCH --cpus-per-task=32 # cpu-cores per task (>1 if multi-threaded tasks) 11 | #SBATCH --gres=gpu:1 # number of gpus per node 12 | #SBATCH --output=output/out-%j.out # output format 13 | #SBATCH --error=output/error-out-%j.out # error output file 14 | #SBATCH --account=engineering 15 | #SBATCH --dependency=7880273 16 | #--------------------task part------------------------- 17 | 18 | 19 | ## clean env 20 | module purge 21 | ## load environment need by this task 22 | module load slurm/BigAI/23.02.2 23 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/zhangbofei/anaconda3/lib 24 | source /home/zhangbofei/anaconda3/bin/activate # commented out by conda initialize 25 | conda init 26 | conda activate agent_tune 27 | export RUN_MODE=eval 28 | python examples/gaia/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7897481_2024_11_10_07_40/ --data-name 2023_level1 --split validation 29 | 30 | 31 | # CUDA_VISIBLE_DEVICES=1 python examples/gaia/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7897481_2024_11_10_07_40/ --data-name 2023_level3 --split validation > eval_settings1_lv3.log 2>&1 & 32 | #python examples/gaia/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7880273_2024_10_01_16_09/ --data-name 2023_level3 --split validation 33 | 34 | #python examples/gaia/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7880273_2024_10_01_16_09/ --data-name 2023_level2 --split validation 35 | 36 | 37 | # python examples/gaia/main.py --engine minicpm --data-name 2023_level1 --split validation 38 | 39 | # python examples/gaia/main.py --engine tonggpt --data-name 2023_level1 --split validation 40 | 41 | #python examples/gaia/main.py --engine tonggpt --data-name 2023_level3 --split validation 42 | -------------------------------------------------------------------------------- /slurm_jobs/evaluate_gaia_exp1_setting2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=eval_gaia # create a short name for your job 3 | #SBATCH --partition=DGX,HGX # specify the partition name: gpu 4 | #SBATCH --qos=lv0a 5 | #SBATCH --nodes=1 # node count 6 | #SBATCH --ntasks=1 # total number of tasks across all nodes 7 | #SBATCH --ntasks-per-node=1 8 | #SBATCH --mem=64G # total memory (RAM) per node 9 | #SBATCH --time=08:00:00 # total run time limit (HH:MM:SS) 10 | #SBATCH --cpus-per-task=32 # cpu-cores per task (>1 if multi-threaded tasks) 11 | #SBATCH --gres=gpu:1 # number of gpus per node 12 | #SBATCH --output=output/out-%j.out # output format 13 | #SBATCH --error=output/error-out-%j.out # error output file 14 | #SBATCH --account=engineering 15 | #SBATCH --dependency=7880273 16 | #--------------------task part------------------------- 17 | 18 | 19 | ## clean env 20 | module purge 21 | ## load environment need by this task 22 | module load 
slurm/BigAI/23.02.2 23 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/zhangbofei/anaconda3/lib 24 | source /home/zhangbofei/anaconda3/bin/activate # commented out by conda initialize 25 | conda init 26 | conda activate agent_tune 27 | export RUN_MODE=eval 28 | 29 | CUDA_VISIBLE_DEVICES=5 python examples/gaia/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7897482_2024_11_10_09_38/ --data-name 2023_level1 --split validation > eval_settings2.log 2>&1 & 30 | 31 | CUDA_VISIBLE_DEVICES=0 python examples/gaia/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7897482_2024_11_10_09_38/ --data-name 2023_level3 --split validation > eval_settings2_lv3.log 2>&1 & 32 | # CUDA_VISIBLE_DEVICES=4 python examples/gaia/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7897481_2024_11_10_07_40/ --data-name 2023_level1 --split validation 33 | 34 | #python examples/gaia/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7880273_2024_10_01_16_09/ --data-name 2023_level3 --split validation 35 | 36 | #python examples/gaia/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7880273_2024_10_01_16_09/ --data-name 2023_level2 --split validation 37 | 38 | 39 | # python examples/gaia/main.py --engine minicpm --data-name 2023_level1 --split validation 40 | 41 | # python examples/gaia/main.py --engine tonggpt --data-name 2023_level1 --split validation 42 | 43 | #python examples/gaia/main.py --engine tonggpt --data-name 2023_level3 --split validation 44 | -------------------------------------------------------------------------------- /slurm_jobs/evaluate_gaia_exp1_setting3.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=eval_gaia # create a short name for your job 3 | #SBATCH --partition=DGX,HGX # specify the partition name: gpu 4 | #SBATCH --qos=lv1 5 | #SBATCH --nodes=1 # node count 6 | #SBATCH --ntasks=1 # total number of tasks across all nodes 7 | #SBATCH --ntasks-per-node=1 8 | #SBATCH --mem=64G # total memory (RAM) per node 9 | #SBATCH --time=08:00:00 # total run time limit (HH:MM:SS) 10 | #SBATCH --cpus-per-task=32 # cpu-cores per task (>1 if multi-threaded tasks) 11 | #SBATCH --gres=gpu:1 # number of gpus per node 12 | #SBATCH --output=output/out-%j.out # output format 13 | #SBATCH --error=output/error-out-%j.out # error output file 14 | #SBATCH --account=engineering 15 | #SBATCH --dependency=7880273 16 | #--------------------task part------------------------- 17 | 18 | 19 | ## clean env 20 | module purge 21 | ## load environment need by this task 22 | module load slurm/BigAI/23.02.2 23 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/zhangbofei/anaconda3/lib 24 | source /home/zhangbofei/anaconda3/bin/activate # commented out by conda initialize 25 | conda init 26 | conda activate agent_tune 27 | export RUN_MODE=eval 28 | 29 | python examples/gaia/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7897483_2024_11_10_09_41/ --data-name 2023_level2 --split validation 30 | 31 | 32 | # CUDA_VISIBLE_DEVICES=2 python examples/gaia/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7897483_2024_11_10_09_41/ --data-name 2023_level3 --split validation > eval_settings3_lv3.log 2>&1 & 33 | # CUDA_VISIBLE_DEVICES=4 python examples/gaia/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7897481_2024_11_10_07_40/ --data-name 2023_level1 --split validation 34 | 35 | #python 
examples/gaia/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7880273_2024_10_01_16_09/ --data-name 2023_level3 --split validation 36 | 37 | #python examples/gaia/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7880273_2024_10_01_16_09/ --data-name 2023_level2 --split validation 38 | 39 | 40 | # python examples/gaia/main.py --engine minicpm --data-name 2023_level1 --split validation 41 | 42 | # python examples/gaia/main.py --engine tonggpt --data-name 2023_level1 --split validation 43 | 44 | #python examples/gaia/main.py --engine tonggpt --data-name 2023_level3 --split validation 45 | -------------------------------------------------------------------------------- /slurm_jobs/evaluate_gaia_exp2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=eval_gaia # create a short name for your job 3 | #SBATCH --partition=DGX # specify the partition name: gpu 4 | #SBATCH --qos=lv2 5 | #SBATCH --nodes=1 # node count 6 | #SBATCH --ntasks=1 # total number of tasks across all nodes 7 | #SBATCH --ntasks-per-node=1 8 | #SBATCH --mem=64G # total memory (RAM) per node 9 | #SBATCH --time=72:00:00 # total run time limit (HH:MM:SS) 10 | #SBATCH --cpus-per-task=32 # cpu-cores per task (>1 if multi-threaded tasks) 11 | #SBATCH --gres=gpu:1 # number of gpus per node 12 | #SBATCH --output=output/out-%j.out # output format 13 | #SBATCH --error=output/error-out-%j.out # error output file 14 | #SBATCH --account=engineering 15 | #SBATCH --dependency=7880278 16 | #--------------------task part------------------------- 17 | 18 | 19 | ## clean env 20 | module purge 21 | ## load environment need by this task 22 | module load slurm/BigAI/23.02.2 23 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/zhangbofei/anaconda3/lib 24 | source /home/zhangbofei/anaconda3/bin/activate # commented out by conda initialize 25 | conda init 26 | conda activate agent_tune 27 | export RUN_MODE=eval 28 | python examples/gaia/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7880278_2024_10_01_16_31/ --data-name 2023_level1 --split validation 29 | 30 | python examples/gaia/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7880278_2024_10_01_16_31/ --data-name 2023_level3 --split validation 31 | 32 | python examples/gaia/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7880278_2024_10_01_16_31/ --data-name 2023_level2 --split validation 33 | 34 | 35 | # python examples/gaia/main.py --engine minicpm --data-name 2023_level1 --split validation 36 | 37 | # python examples/gaia/main.py --engine tonggpt --data-name 2023_level1 --split validation 38 | 39 | #python examples/gaia/main.py --engine tonggpt --data-name 2023_level3 --split validation 40 | -------------------------------------------------------------------------------- /slurm_jobs/evaluate_gaia_exp3.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=eval_gaia # create a short name for your job 3 | #SBATCH --partition=DGX # specify the partition name: gpu 4 | #SBATCH --qos=lv2 5 | #SBATCH --nodes=1 # node count 6 | #SBATCH --ntasks=1 # total number of tasks across all nodes 7 | #SBATCH --ntasks-per-node=1 8 | #SBATCH --mem=64G # total memory (RAM) per node 9 | #SBATCH --time=72:00:00 # total run time limit (HH:MM:SS) 10 | #SBATCH --cpus-per-task=32 # cpu-cores per task (>1 if multi-threaded tasks) 11 | #SBATCH --gres=gpu:1 # number of 
gpus per node 12 | #SBATCH --output=output/out-%j.out # output format 13 | #SBATCH --error=output/error-out-%j.out # error output file 14 | #SBATCH --account=engineering 15 | #SBATCH --dependency=7880279 16 | #--------------------task part------------------------- 17 | 18 | 19 | ## clean env 20 | module purge 21 | ## load environment need by this task 22 | module load slurm/BigAI/23.02.2 23 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/zhangbofei/anaconda3/lib 24 | source /home/zhangbofei/anaconda3/bin/activate # commented out by conda initialize 25 | conda init 26 | conda activate agent_tune 27 | export RUN_MODE=eval 28 | python examples/gaia/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7880279_2024_10_01_16_32/ --data-name 2023_level1 --split validation 29 | 30 | #python examples/gaia/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7880279_2024_10_01_16_32/ --data-name 2023_level3 --split validation 31 | 32 | #python examples/gaia/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7880279_2024_10_01_16_32/ --data-name 2023_level2 --split validation 33 | 34 | 35 | # python examples/gaia/main.py --engine minicpm --data-name 2023_level1 --split validation 36 | 37 | # python examples/gaia/main.py --engine tonggpt --data-name 2023_level1 --split validation 38 | 39 | #python examples/gaia/main.py --engine tonggpt --data-name 2023_level3 --split validation 40 | -------------------------------------------------------------------------------- /slurm_jobs/evaluate_gaia_exp4.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=eval_gaia # create a short name for your job 3 | #SBATCH --partition=DGX # specify the partition name: gpu 4 | #SBATCH --qos=lv2 5 | #SBATCH --nodes=1 # node count 6 | #SBATCH --ntasks=1 # total number of tasks across all nodes 7 | #SBATCH --ntasks-per-node=1 8 | #SBATCH --mem=64G # total memory (RAM) per node 9 | #SBATCH --time=72:00:00 # total run time limit (HH:MM:SS) 10 | #SBATCH --cpus-per-task=32 # cpu-cores per task (>1 if multi-threaded tasks) 11 | #SBATCH --gres=gpu:1 # number of gpus per node 12 | #SBATCH --output=output/out-%j.out # output format 13 | #SBATCH --error=output/error-out-%j.out # error output file 14 | #SBATCH --account=engineering 15 | #SBATCH --dependency=7880280 16 | #--------------------task part------------------------- 17 | 18 | 19 | ## clean env 20 | module purge 21 | ## load environment need by this task 22 | module load slurm/BigAI/23.02.2 23 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/zhangbofei/anaconda3/lib 24 | source /home/zhangbofei/anaconda3/bin/activate # commented out by conda initialize 25 | conda init 26 | conda activate agent_tune 27 | export RUN_MODE=eval 28 | python examples/gaia/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7880280_2024_10_01_16_46/ --data-name 2023_level1 --split validation 29 | 30 | python examples/gaia/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7880280_2024_10_01_16_46/ --data-name 2023_level3 --split validation 31 | 32 | python examples/gaia/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7880280_2024_10_01_16_46/ --data-name 2023_level2 --split validation 33 | 34 | 35 | # python examples/gaia/main.py --engine minicpm --data-name 2023_level1 --split validation 36 | 37 | # python examples/gaia/main.py --engine tonggpt --data-name 2023_level1 --split validation 38 | 39 | #python 
examples/gaia/main.py --engine tonggpt --data-name 2023_level3 --split validation 40 | -------------------------------------------------------------------------------- /slurm_jobs/evaluate_gaia_internvl2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=eval_gaia # create a short name for your job 3 | #SBATCH --partition=HGX # specify the partition name: gpu 4 | #SBATCH --qos=lv1 5 | #SBATCH --nodes=1 # node count 6 | #SBATCH --ntasks=1 # total number of tasks across all nodes 7 | #SBATCH --ntasks-per-node=1 8 | #SBATCH --mem=64G # total memory (RAM) per node 9 | #SBATCH --time=72:00:00 # total run time limit (HH:MM:SS) 10 | #SBATCH --cpus-per-task=32 # cpu-cores per task (>1 if multi-threaded tasks) 11 | #SBATCH --gres=gpu:1 # number of gpus per node 12 | #SBATCH --output=output/out-%j.out # output format 13 | #SBATCH --error=output/error-out-%j.out # error output file 14 | #SBATCH --account=engineering 15 | ## SBATCH --dependency=7787293 16 | #--------------------task part------------------------- 17 | 18 | 19 | ## clean env 20 | module purge 21 | ## load environment need by this task 22 | module load slurm/BigAI/23.02.2 23 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/zhangbofei/anaconda3/lib 24 | source /home/zhangbofei/anaconda3/bin/activate # commented out by conda initialize 25 | conda init 26 | conda activate agent_tune 27 | export AGENT_CONFIG=configs/agent_config.yaml 28 | python examples/gaia/main.py --engine internvl2 --data-name 2023_level2 --split validation 29 | 30 | #python examples/gaia/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7880273_2024_10_01_16_09/ --data-name 2023_level3 --split validation 31 | 32 | #python examples/gaia/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7880273_2024_10_01_16_09/ --data-name 2023_level2 --split validation 33 | 34 | 35 | # python examples/gaia/main.py --engine minicpm --data-name 2023_level1 --split validation 36 | 37 | # python examples/gaia/main.py --engine tonggpt --data-name 2023_level1 --split validation 38 | 39 | #python examples/gaia/main.py --engine tonggpt --data-name 2023_level3 --split validation 40 | -------------------------------------------------------------------------------- /slurm_jobs/evaluate_gaia_llava.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=eval_gaia # create a short name for your job 3 | #SBATCH --partition=HGX # specify the partition name: gpu 4 | #SBATCH --qos=lv1 5 | #SBATCH --nodes=1 # node count 6 | #SBATCH --ntasks=1 # total number of tasks across all nodes 7 | #SBATCH --ntasks-per-node=1 8 | #SBATCH --mem=64G # total memory (RAM) per node 9 | #SBATCH --time=72:00:00 # total run time limit (HH:MM:SS) 10 | #SBATCH --cpus-per-task=32 # cpu-cores per task (>1 if multi-threaded tasks) 11 | #SBATCH --gres=gpu:1 # number of gpus per node 12 | #SBATCH --output=output/out-%j.out # output format 13 | #SBATCH --error=output/error-out-%j.out # error output file 14 | #SBATCH --account=engineering 15 | ## SBATCH --dependency=7787293 16 | #--------------------task part------------------------- 17 | 18 | 19 | ## clean env 20 | module purge 21 | ## load environment need by this task 22 | module load slurm/BigAI/23.02.2 23 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/zhangbofei/anaconda3/lib 24 | source /home/zhangbofei/anaconda3/bin/activate # commented out by conda initialize 25 | conda init 26 | conda activate 
agent_tune 27 | export AGENT_CONFIG=configs/agent_config.yaml 28 | python examples/gaia/main.py --engine llava --data-name 2023_level3 --split validation 29 | 30 | 31 | -------------------------------------------------------------------------------- /slurm_jobs/evaluate_gaia_qwen.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=eval_gaia # create a short name for your job 3 | #SBATCH --partition=HGX # specify the partition name: gpu 4 | #SBATCH --qos=lv1 5 | #SBATCH --nodes=1 # node count 6 | #SBATCH --ntasks=1 # total number of tasks across all nodes 7 | #SBATCH --ntasks-per-node=1 8 | #SBATCH --mem=64G # total memory (RAM) per node 9 | #SBATCH --time=72:00:00 # total run time limit (HH:MM:SS) 10 | #SBATCH --cpus-per-task=32 # cpu-cores per task (>1 if multi-threaded tasks) 11 | #SBATCH --gres=gpu:1 # number of gpus per node 12 | #SBATCH --output=output/out-%j.out # output format 13 | #SBATCH --error=output/error-out-%j.out # error output file 14 | #SBATCH --account=engineering 15 | ## SBATCH --dependency=7787293 16 | #--------------------task part------------------------- 17 | 18 | 19 | ## clean env 20 | module purge 21 | ## load environment need by this task 22 | module load slurm/BigAI/23.02.2 23 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/zhangbofei/anaconda3/lib 24 | source /home/zhangbofei/anaconda3/bin/activate # commented out by conda initialize 25 | conda init 26 | conda activate agent_tune 27 | export AGENT_CONFIG=configs/agent_config.yaml 28 | python examples/gaia/main.py --engine qwen --data-name 2023_level3 --split validation 29 | 30 | #python examples/gaia/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7880273_2024_10_01_16_09/ --data-name 2023_level3 --split validation 31 | 32 | #python examples/gaia/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7880273_2024_10_01_16_09/ --data-name 2023_level2 --split validation 33 | 34 | 35 | # python examples/gaia/main.py --engine minicpm --data-name 2023_level1 --split validation 36 | 37 | # python examples/gaia/main.py --engine tonggpt --data-name 2023_level1 --split validation 38 | 39 | #python examples/gaia/main.py --engine tonggpt --data-name 2023_level3 --split validation 40 | -------------------------------------------------------------------------------- /slurm_jobs/evaluate_gaia_qwen_tuned.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=eval_gaia # create a short name for your job 3 | #SBATCH --partition=DGX # specify the partition name: gpu 4 | #SBATCH --qos=lv0b 5 | #SBATCH --nodes=1 # node count 6 | #SBATCH --ntasks=1 # total number of tasks across all nodes 7 | #SBATCH --ntasks-per-node=1 8 | #SBATCH --mem=64G # total memory (RAM) per node 9 | #SBATCH --time=08:00:00 # total run time limit (HH:MM:SS) 10 | #SBATCH --cpus-per-task=32 # cpu-cores per task (>1 if multi-threaded tasks) 11 | #SBATCH --gres=gpu:1 # number of gpus per node 12 | #SBATCH --output=output/out-%j.out # output format 13 | #SBATCH --error=output/error-out-%j.out # error output file 14 | #SBATCH --account=engineering 15 | ## SBATCH --dependency=7787293 16 | #--------------------task part------------------------- 17 | 18 | 19 | ## clean env 20 | module purge 21 | ## load environment need by this task 22 | module load slurm/BigAI/23.02.2 23 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/zhangbofei/anaconda3/lib 24 | source 
/home/zhangbofei/anaconda3/bin/activate # commented out by conda initialize 25 | conda init 26 | conda activate agent_tune 27 | export AGENT_CONFIG=configs/agent_config.yaml 28 | python examples/gaia/main.py --engine qwen --data-name 2023_level3 --split validation --lora-path output_qwen/Qwen2-VL-7B-Instruct-7906426/ 29 | 30 | 31 | #python examples/gaia/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7880273_2024_10_01_16_09/ --data-name 2023_level3 --split validation 32 | 33 | #python examples/gaia/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7880273_2024_10_01_16_09/ --data-name 2023_level2 --split validation 34 | 35 | 36 | # python examples/gaia/main.py --engine minicpm --data-name 2023_level1 --split validation 37 | 38 | # python examples/gaia/main.py --engine tonggpt --data-name 2023_level1 --split validation 39 | 40 | #python examples/gaia/main.py --engine tonggpt --data-name 2023_level3 --split validation 41 | -------------------------------------------------------------------------------- /slurm_jobs/evaluate_gta_internvl.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=eval_gta # create a short name for your job 3 | #SBATCH --partition=DGX # specify the partition name: gpu 4 | #SBATCH --qos=lv1 5 | #SBATCH --nodes=1 # node count 6 | #SBATCH --ntasks=1 # total number of tasks across all nodes 7 | #SBATCH --ntasks-per-node=1 8 | #SBATCH --mem=64G # total memory (RAM) per node 9 | #SBATCH --time=72:00:00 # total run time limit (HH:MM:SS) 10 | #SBATCH --cpus-per-task=32 # cpu-cores per task (>1 if multi-threaded tasks) 11 | #SBATCH --gres=gpu:1 # number of gpus per node 12 | #SBATCH --output=output/out-%j.out # output format 13 | #SBATCH --error=output/error-out-%j.out # error output file 14 | #SBATCH --account=engineering 15 | #--------------------task part------------------------- 16 | 17 | 18 | ## clean env 19 | module purge 20 | ## load environment need by this task 21 | module load slurm/BigAI/23.02.2 22 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/zhangbofei/anaconda3/lib 23 | source /home/zhangbofei/anaconda3/bin/activate # commented out by conda initialize 24 | conda init 25 | conda activate agent_tune 26 | export AGENT_CONFIG=configs/agent_config.yaml 27 | # python examples/gta/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7879846_2024_09_30_14_34 28 | python examples/gta/main.py --engine internvl2 -------------------------------------------------------------------------------- /slurm_jobs/evaluate_gta_internvl2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=eval_gta # create a short name for your job 3 | #SBATCH --partition=HGX # specify the partition name: gpu 4 | #SBATCH --qos=lv1 5 | #SBATCH --nodes=1 # node count 6 | #SBATCH --ntasks=1 # total number of tasks across all nodes 7 | #SBATCH --ntasks-per-node=1 8 | #SBATCH --mem=64G # total memory (RAM) per node 9 | #SBATCH --time=72:00:00 # total run time limit (HH:MM:SS) 10 | #SBATCH --cpus-per-task=32 # cpu-cores per task (>1 if multi-threaded tasks) 11 | #SBATCH --gres=gpu:4 # number of gpus per node 12 | #SBATCH --output=output/out-%j.out # output format 13 | #SBATCH --error=output/error-out-%j.out # error output file 14 | #SBATCH --account=engineering 15 | #SBATCH --dependency=7899951 16 | #--------------------task part------------------------- 17 | 18 | 19 | ## clean env 20 | module purge 
21 | ## load environment need by this task 22 | module load slurm/BigAI/23.02.2 23 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/zhangbofei/anaconda3/lib 24 | source /home/zhangbofei/anaconda3/bin/activate # commented out by conda initialize 25 | conda init 26 | conda activate agent_tune 27 | export AGENT_CONFIG=configs/agent_config.yaml 28 | # python examples/gta/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7879846_2024_09_30_14_34 29 | python examples/gta/main.py --engine internvl2 30 | 31 | # python examples/gta/main.py --engine qwen -------------------------------------------------------------------------------- /slurm_jobs/evaluate_gta_llava.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=eval_gta # create a short name for your job 3 | #SBATCH --partition=HGX,DGX # specify the partition name: gpu 4 | #SBATCH --qos=lv1 5 | #SBATCH --nodes=1 # node count 6 | #SBATCH --ntasks=1 # total number of tasks across all nodes 7 | #SBATCH --ntasks-per-node=1 8 | #SBATCH --mem=64G # total memory (RAM) per node 9 | #SBATCH --time=72:00:00 # total run time limit (HH:MM:SS) 10 | #SBATCH --cpus-per-task=32 # cpu-cores per task (>1 if multi-threaded tasks) 11 | #SBATCH --gres=gpu:4 # number of gpus per node 12 | #SBATCH --output=output/out-%j.out # output format 13 | #SBATCH --error=output/error-out-%j.out # error output file 14 | #SBATCH --account=engineering 15 | #SBATCH --dependency=7899951 16 | #--------------------task part------------------------- 17 | 18 | 19 | ## clean env 20 | module purge 21 | ## load environment need by this task 22 | module load slurm/BigAI/23.02.2 23 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/zhangbofei/anaconda3/lib 24 | source /home/zhangbofei/anaconda3/bin/activate # commented out by conda initialize 25 | conda init 26 | conda activate agent_tune 27 | export AGENT_CONFIG=configs/agent_config.yaml 28 | # python examples/gta/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7879846_2024_09_30_14_34 29 | python examples/gta/main.py --engine llava 30 | # python examples/gta/main.py --engine qwen -------------------------------------------------------------------------------- /slurm_jobs/evaluate_gta_qwen.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=eval_gta # create a short name for your job 3 | #SBATCH --partition=HGX,DGX # specify the partition name: gpu 4 | #SBATCH --qos=lv1 5 | #SBATCH --nodes=1 # node count 6 | #SBATCH --ntasks=1 # total number of tasks across all nodes 7 | #SBATCH --ntasks-per-node=1 8 | #SBATCH --mem=64G # total memory (RAM) per node 9 | #SBATCH --time=72:00:00 # total run time limit (HH:MM:SS) 10 | #SBATCH --cpus-per-task=32 # cpu-cores per task (>1 if multi-threaded tasks) 11 | #SBATCH --gres=gpu:4 # number of gpus per node 12 | #SBATCH --output=output/out-%j.out # output format 13 | #SBATCH --error=output/error-out-%j.out # error output file 14 | #SBATCH --account=engineering 15 | #SBATCH --dependency=7899951 16 | #--------------------task part------------------------- 17 | 18 | 19 | ## clean env 20 | module purge 21 | ## load environment need by this task 22 | module load slurm/BigAI/23.02.2 23 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/zhangbofei/anaconda3/lib 24 | source /home/zhangbofei/anaconda3/bin/activate # commented out by conda initialize 25 | conda init 26 | conda activate agent_tune 27 | export 
AGENT_CONFIG=configs/agent_config.yaml 28 | # python examples/gta/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7879846_2024_09_30_14_34 29 | python examples/gta/main.py --engine qwen --lora-path output_qwen/Qwen2-VL-7B-Instruct-7899951 30 | 31 | # python examples/gta/main.py --engine qwen -------------------------------------------------------------------------------- /slurm_jobs/evaluate_gta_qwen_llm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=eval_gta # create a short name for your job 3 | #SBATCH --partition=HGX # specify the partition name: gpu 4 | #SBATCH --qos=lv2 5 | #SBATCH --nodes=1 # node count 6 | #SBATCH --ntasks=1 # total number of tasks across all nodes 7 | #SBATCH --ntasks-per-node=1 8 | #SBATCH --mem=64G # total memory (RAM) per node 9 | #SBATCH --time=72:00:00 # total run time limit (HH:MM:SS) 10 | #SBATCH --cpus-per-task=32 # cpu-cores per task (>1 if multi-threaded tasks) 11 | #SBATCH --gres=gpu:1 # number of gpus per node 12 | #SBATCH --output=output/out-%j.out # output format 13 | #SBATCH --error=output/error-out-%j.out # error output file 14 | #SBATCH --account=engineering 15 | #--------------------task part------------------------- 16 | 17 | 18 | ## clean env 19 | module purge 20 | ## load environment need by this task 21 | module load slurm/BigAI/23.02.2 22 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/zhangbofei/anaconda3/lib 23 | source /home/zhangbofei/anaconda3/bin/activate # commented out by conda initialize 24 | conda init 25 | conda activate agent_tune 26 | export AGENT_CONFIG=configs/agent_config_qwen_llm.yaml 27 | # python examples/gta/main.py --engine minicpm --lora-path experiments/CPM-FT/output/cpm_v2_6_7879846_2024_09_30_14_34 28 | python examples/gta/main.py --engine qwen -------------------------------------------------------------------------------- /slurm_jobs/gaia_pipeline.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=gaia_gen_step_2_4 # create a short name for your job 3 | #SBATCH --partition=DGX # specify the partition name: gpu 4 | #SBATCH --qos=lv2 5 | #SBATCH --nodes=1 # node count 6 | #SBATCH --ntasks=1 # total number of tasks across all nodes 7 | #SBATCH --ntasks-per-node=1 8 | #SBATCH --mem=64G # total memory (RAM) per node 9 | #SBATCH --time=72:00:00 # total run time limit (HH:MM:SS) 10 | #SBATCH --cpus-per-task=32 # cpu-cores per task (>1 if multi-threaded tasks) 11 | #SBATCH --gres=gpu:1 # number of gpus per node 12 | #SBATCH --output=output/out-%j.out # output format 13 | #SBATCH --error=output/error-out-%j.out # error output file 14 | #SBATCH --account=engineering 15 | #--------------------task part------------------------- 16 | 17 | 18 | ## clean env 19 | module purge 20 | ## load environment need by this task 21 | module load slurm/BigAI/23.02.2 22 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/zhangbofei/anaconda3/lib 23 | source /home/zhangbofei/anaconda3/bin/activate # commented out by conda initialize 24 | conda init 25 | 26 | conda activate gaia_gen 27 | 28 | bash data_generation/gaia_pipeline/gaia_worker_5.sh -------------------------------------------------------------------------------- /slurm_jobs/gaia_pipeline_query_gen.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=gaia_query_gen # create a short name for your job 3 | #SBATCH --partition=DGX # specify the 
partition name: gpu 4 | #SBATCH --qos=lv2 5 | #SBATCH --nodes=1 # node count 6 | #SBATCH --ntasks=1 # total number of tasks across all nodes 7 | #SBATCH --ntasks-per-node=1 8 | #SBATCH --mem=64G # total memory (RAM) per node 9 | #SBATCH --time=72:00:00 # total run time limit (HH:MM:SS) 10 | #SBATCH --cpus-per-task=32 # cpu-cores per task (>1 if multi-threaded tasks) 11 | #SBATCH --gres=gpu:1 # number of gpus per node 12 | #SBATCH --output=output/out-%j.out # output format 13 | #SBATCH --error=output/error-out-%j.out # error output file 14 | #SBATCH --account=engineering 15 | #--------------------task part------------------------- 16 | 17 | 18 | ## clean env 19 | module purge 20 | ## load environment need by this task 21 | module load slurm/BigAI/23.02.2 22 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/zhangbofei/anaconda3/lib 23 | source /home/zhangbofei/anaconda3/bin/activate # commented out by conda initialize 24 | conda init 25 | 26 | conda activate gaia_gen 27 | 28 | python -m data_generation.gaia_pipeline.gaia0_query_generation 29 | 30 | python -m data_generation.gaia_pipeline.gaia0_query_generation 31 | 32 | python -m data_generation.gaia_pipeline.gaia0_query_generation 33 | 34 | python -m data_generation.gaia_pipeline.gaia0_query_generation 35 | 36 | python -m data_generation.gaia_pipeline.gaia0_query_generation 37 | 38 | -------------------------------------------------------------------------------- /slurm_jobs/occupy.sh: -------------------------------------------------------------------------------- 1 | srun --time=72:00:00 --partition=HGX,DGX --qos=lv1 --mem=64G --account=engineering --gres=gpu:8 --cpus-per-task=32 --pty bash -c ' 2 | echo "Starting interactive session..." 3 | module load slurm/BigAI/23.02.2 4 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/zhangbofei/anaconda3/lib 5 | source /home/zhangbofei/anaconda3/bin/activate # commented out by conda initialize 6 | conda activate agent_tune 7 | # Execute any other commands you need 8 | echo "Environment ready!" 
9 | cd /scratch/zhangbofei/Projects/Multimodal-CL/iclr_09/TongAgent 10 | GPUS_PER_NODE=$(nvidia-smi -L | wc -l) 11 | nvidia-smi 12 | python scripts/report.py 13 | vllm serve /scratch/ml/zhangxintong/A_Models/Qwen/Qwen2.5-72B-Instruct --tensor-parallel-size $GPUS_PER_NODE --dtype bfloat16 --gpu-memory-utilization 0.90 --max-model-len 20000 & 14 | # Start an interactive shell 15 | exec bash -i 16 | ' 17 | -------------------------------------------------------------------------------- /slurm_jobs/qwen_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=qwen2.5_72b # create a short name for your job 3 | #SBATCH --partition=DGX # specify the partition name: gpu 4 | #SBATCH --qos=lv2 5 | #SBATCH --nodes=1 # node count 6 | #SBATCH --ntasks=1 # total number of tasks across all nodes 7 | #SBATCH --ntasks-per-node=1 8 | #SBATCH --mem=64G # total memory (RAM) per node 9 | #SBATCH --time=00:30:00 # total run time limit (HH:MM:SS) 10 | #SBATCH --cpus-per-task=32 # cpu-cores per task (>1 if multi-threaded tasks) 11 | #SBATCH --gres=gpu:2 # number of gpus per node 12 | #SBATCH --output=output/out-%j.out # output format 13 | #SBATCH --error=output/error-out-%j.out # error output file 14 | #SBATCH --account=engineering 15 | #--------------------task part------------------------- 16 | 17 | 18 | ## clean env 19 | module purge 20 | ## load environment need by this task 21 | module load slurm/BigAI/23.02.2 22 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/zhangbofei/anaconda3/lib 23 | source /home/zhangbofei/anaconda3/bin/activate # commented out by conda initialize 24 | conda init 25 | conda activate agent_tune 26 | GPUS_PER_NODE=$(nvidia-smi -L | wc -l) 27 | 28 | vllm serve Qwen/Qwen2.5-72B-Instruct --tensor-parallel-size $GPUS_PER_NODE --dtype bfloat16 --gpu-memory-utilization 0.98 --max-model-len 20000 29 | 30 | # python tests/test_vllm.py -------------------------------------------------------------------------------- /slurm_jobs/train.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -J gaia 3 | #SBATCH -p HGX 4 | #SBATCH -o %j.out 5 | #SBATCH -e %j.err 6 | #SBATCH -q lv0b 7 | #SBATCH --time=10:00:00 8 | 9 | #SBATCH --nodes=1 10 | #SBATCH --ntasks-per-node=1 11 | #SBATCH --gres=gpu:1 12 | #SBATCH --mem=50G 13 | 14 | module load anaconda3/2021.11 15 | source activate tongagent 16 | 17 | python -m data_generation.gaia_pipeline.gaia21_file_content2file_openai >& gaia_3kfile.log 18 | -------------------------------------------------------------------------------- /slurm_jobs/traj_gen.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=chunk_16 # create a short name for your job 3 | #SBATCH --partition=DGX # specify the partition name: gpu 4 | #SBATCH --qos=lv2 5 | #SBATCH --nodes=1 # node count 6 | #SBATCH --ntasks=1 # total number of tasks across all nodes 7 | #SBATCH --ntasks-per-node=1 8 | #SBATCH --mem=64G # total memory (RAM) per node 9 | #SBATCH --time=72:00:00 # total run time limit (HH:MM:SS) 10 | #SBATCH --cpus-per-task=32 # cpu-cores per task (>1 if multi-threaded tasks) 11 | #SBATCH --gres=gpu:1 # number of gpus per node 12 | #SBATCH --output=output/out-%j.out # output format 13 | #SBATCH --error=output/error-out-%j.out # error output file 14 | #SBATCH --account=engineering 15 | #--------------------task part------------------------- 16 | 17 | 18 | ## clean env 19 | module purge 20 | 
## load environment need by this task 21 | module load slurm/BigAI/23.02.2 22 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/zhangbofei/anaconda3/lib 23 | # source /home/zhangbofei/anaconda3/bin/activate # commented out by conda initialize 24 | 25 | conda activate agent_tune 26 | python data_generation/chunk_traj_generation/gta4_traj_genetation_chunk.py --chunk 16 27 | 28 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/tests/__init__.py -------------------------------------------------------------------------------- /tests/data/254.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/tests/data/254.jpg -------------------------------------------------------------------------------- /tests/data/annotated_cars.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/tests/data/annotated_cars.png -------------------------------------------------------------------------------- /tests/data/cars.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/tests/data/cars.png -------------------------------------------------------------------------------- /tests/data/draw.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/tests/data/draw.jpg -------------------------------------------------------------------------------- /tests/test_activate.py: -------------------------------------------------------------------------------- 1 | from openai import OpenAI 2 | import openai 3 | import httpx 4 | # vllm serve Qwen/Qwen2.5-7B-Instruct > output/vllm.log 2>&1 & 5 | # vllm serve Qwen/Qwen2.5-72B-Instruct --tensor-parallel-size 2 > output/vllm.log 2>&1 & 6 | # Set OpenAI's API key and API base to use vLLM's API server. 
7 | openai_api_key = "EMPTY" 8 | openai_api_base = "http://localhost:8000/v1" 9 | 10 | client = OpenAI( 11 | api_key=openai_api_key, 12 | base_url=openai_api_base, 13 | ) 14 | import time 15 | for i in range(60): 16 | try: 17 | print(client.models.list()) 18 | except httpx.ConnectError as e: 19 | print("service might not ready!") 20 | time.sleep(10) 21 | except openai.APIConnectionError as e: 22 | print("service might not ready!") 23 | time.sleep(10) 24 | except Exception as e: 25 | raise e 26 | 27 | elapse_times = [] 28 | while True: 29 | current = time.time() 30 | try: 31 | chat_response = client.chat.completions.create( 32 | model="Qwen/Qwen2.5-72B-Instruct", 33 | messages=[ 34 | { 35 | "role": "system", 36 | "content": "You are a helpful assistant" 37 | }, 38 | { 39 | "role": "user", 40 | "content": "Hi" 41 | }], 42 | temperature=0.7, 43 | top_p=0.8, 44 | max_tokens=1, 45 | extra_body={ 46 | "repetition_penalty": 1.05, 47 | }, 48 | ) 49 | except httpx.ConnectError as e: 50 | print("service might not ready!") 51 | time.sleep(60) 52 | except openai.APIConnectionError as e: 53 | print("service might not ready!") 54 | time.sleep(60) 55 | except Exception as e: 56 | raise e 57 | 58 | 59 | 60 | print("Chat response:", chat_response.choices[0].message.content) 61 | print("Elapse", round(time.time() - current, 2)) 62 | elapse_times.append(round(time.time() - current, 2)) 63 | time.sleep(600) 64 | 65 | -------------------------------------------------------------------------------- /tests/test_agent.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from tongagent.agents.general_agent import create_agent 4 | class TestAgent(unittest.TestCase): 5 | def test_ocr(self): 6 | agent = create_agent() 7 | 8 | result = agent.run("Can you try to extract text from the image path? Image path: tests/data/254.jpg") 9 | print(result) 10 | 11 | def test_sg(self): 12 | agent = create_agent() 13 | 14 | result = agent.run("Can you try to extract mask from the image path to a pickle file? Image path: tests/data/cars.png. Show me the file name you generated is good.") 15 | print(result) 16 | 17 | def test_edit(self): 18 | agent = create_agent() 19 | 20 | result = agent.run("Can you edit the image to turn him into cyborg? Image path: tests/data/draw.jpg.") 21 | print(result) 22 | 23 | def test_loc(self): 24 | agent = create_agent() 25 | 26 | result = agent.run("Can you try to first detect cars shown in the images and then extract masks for cars? Image path: tests/data/cars.png.") 27 | print(result) 28 | 29 | def test_web_search(self): 30 | agent = create_agent() 31 | question = """If Eliud Kipchoge could maintain his record-making marathon pace indefinitely, how many thousand hours would it take him to run the distance between the Earth and the Moon its closest approach? Please use the minimum perigee value on the Wikipedia page for the Moon when carrying out your calculation. Round your result to the nearest 1000 hours and do not use any comma separators if necessary.""" 32 | 33 | result = agent.run(question) 34 | print(result) 35 | 36 | def test_web_search2(self): 37 | agent = create_agent() 38 | question = """How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? 
You can use the latest 2022 version of english wikipedia.""" 39 | 40 | result = agent.run(question) 41 | print(result) 42 | 43 | def test_gaia_case1(self): 44 | agent = create_agent() 45 | question = """A paper about AI regulation that was originally submitted to arXiv.org in June 2022 shows a figure with three axes, where each axis has a label word at both ends. Which of these words is used to describe a type of society in a Physics and Society article submitted to arXiv.org on August 11, 2016?""" 46 | result = agent.run(question) 47 | print(result) 48 | # answer = egalitarian 49 | 50 | def test_gaia_case2(self): 51 | question = "In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest number of bird species to be on camera simultaneously?" 52 | agent = create_agent() 53 | result = agent.run(question) 54 | print(result) 55 | if __name__ == "__main__": 56 | unittest.main() -------------------------------------------------------------------------------- /tests/test_code.py: -------------------------------------------------------------------------------- 1 | from tongagent.agents.data_sampling_agent import evaluate_python_code_modify 2 | 3 | 4 | code = ''' 5 | import pandas as pd 6 | 7 | df = pd.read_csv("data/GAIA/2023/validation/da52d699-e8d2-4dc5-9191-a2199e0b6a9b.xlsx") 8 | 9 | print(df.head()) 10 | ''' 11 | 12 | result = evaluate_python_code_modify( 13 | code, 14 | authorized_imports=["pandas"] 15 | ) 16 | print(result) 17 | -------------------------------------------------------------------------------- /tests/test_create_agent.py: -------------------------------------------------------------------------------- 1 | from tongagent.agents.data_sampling_agent import create_agent 2 | 3 | agent = create_agent( 4 | llm_engine="tonggpt", 5 | error_tolerance=10, 6 | task="gaia" 7 | ) 8 | 9 | print(agent.authorized_imports) 10 | print(agent.additional_authorized_imports) 11 | 12 | -------------------------------------------------------------------------------- /tests/test_debug.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | with open("debug.json", "r") as f: 4 | data = json.load(f) 5 | 6 | 7 | for item in data: 8 | conversations = item["conversations"] 9 | for conversation in conversations: 10 | if conversation["role"] == "user": 11 | if '.png' in conversation["content"]: 12 | print(conversation["content"]) 13 | print("-" * 10) 14 | -------------------------------------------------------------------------------- /tests/test_edit.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from tongagent.tools.new_added.image_edit import ImageEditTool 3 | class TestEdit(unittest.TestCase): 4 | def test_edit(self): 5 | tool = ImageEditTool() 6 | 7 | output_image = tool.forward("turn him into cyborg", "tests/data/draw.jpg") 8 | print(output_image) 9 | 10 | if __name__ == "__main__": 11 | unittest.main() -------------------------------------------------------------------------------- /tests/test_engine.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from tongagent.llm_engine.mini_cpm import MiniCPMEngine 4 | class TestEngine(unittest.TestCase): 5 | def test_case(self): 6 | engine = MiniCPMEngine() 7 | messages = [ 8 | {"role": "user", "content": "You are a helpful assistant. 
Response in chinese."}, 9 | {"role": "user", "content": "Tell me the model of this aircraft."} 10 | ] 11 | output = engine(messages, stop_sequences=[], image_paths = ["tests/data/airplane.jpeg"]) 12 | 13 | print(output) -------------------------------------------------------------------------------- /tests/test_find.py: -------------------------------------------------------------------------------- 1 | 2 | import json 3 | 4 | with open("data/gta_6350_merged.json", "r") as f: 5 | dataset = json.load(f) 6 | 7 | for item in dataset: 8 | if item["id"] == "vB9O_XTo": 9 | print(item["image"]) 10 | conv = item["conversations"] 11 | for turn in conv: 12 | print(turn["role"]) 13 | print(turn["content"]) 14 | print("-" * 100) 15 | break 16 | 17 | 18 | -------------------------------------------------------------------------------- /tests/test_format_answer.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from tongagent.prompt import FORMAT_ANSWER_PROMPT_GAIA 4 | 5 | from langchain.prompts import ChatPromptTemplate 6 | 7 | class TestFormat(unittest.TestCase): 8 | def test_case(self): 9 | template = ChatPromptTemplate.from_template(FORMAT_ANSWER_PROMPT_GAIA) 10 | prompt = template.invoke({ 11 | "question": "hi", 12 | "answer": "hi" 13 | }) 14 | print(prompt) 15 | print(prompt.to_string()) 16 | print(prompt.to_messages()[0].content) 17 | if __name__ == "__main__": 18 | unittest.main() -------------------------------------------------------------------------------- /tests/test_gaia_1107.py: -------------------------------------------------------------------------------- 1 | import json 2 | with open("data/gaia_1107_train.json", "r") as f: 3 | data = json.load(f) 4 | 5 | import random 6 | with open("data/gaia_1107_train.json", "w") as f: 7 | json.dump(data, f, indent=4, ensure_ascii=False) 8 | 9 | 10 | with open("data/gaia_1107_train_subset.json", "w") as f: 11 | json.dump(random.sample(data, 500), f, indent=4, ensure_ascii=False) 12 | 13 | 14 | -------------------------------------------------------------------------------- /tests/test_inpector.py: -------------------------------------------------------------------------------- 1 | from tongagent.tools.tool_box import TextInspectorTool 2 | 3 | data_path = "data/GAIA/2023/validation/da52d699-e8d2-4dc5-9191-a2199e0b6a9b.xlsx" 4 | 5 | tool = TextInspectorTool() 6 | 7 | question = "What is the list of books read in 2022 along with their reading speeds?" 
8 | result = tool.forward(file_path=data_path, question=question) 9 | print(result) -------------------------------------------------------------------------------- /tests/test_internvl.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from tongagent.llm_engine.internvl2 import InternVL2Engine 3 | from transformers.agents.llm_engine import MessageRole, HfApiEngine, get_clean_message_list 4 | 5 | class TestInternVL(unittest.TestCase): 6 | def test_internvl(self): 7 | engine = InternVL2Engine() 8 | messages = [ 9 | {"role": MessageRole.SYSTEM, "content": "You are a helpful assistant."}, 10 | {"role": MessageRole.USER, "content": "What is the capital of France?"}, 11 | ] 12 | answer = engine(messages) 13 | print(answer) 14 | 15 | def test_internvl_with_image(self): 16 | engine = InternVL2Engine() 17 | messages = [ 18 | {"role": MessageRole.SYSTEM, "content": "Respond in chinese"}, 19 | {"role": MessageRole.USER, "content": "What airplane in the image?"}, 20 | ] 21 | answer = engine(messages, image_paths=["tests/data/airplane.jpeg"]) 22 | print(answer) 23 | 24 | if __name__ == "__main__": 25 | unittest.main() -------------------------------------------------------------------------------- /tests/test_llava_ov.py: -------------------------------------------------------------------------------- 1 | # pip install git+https://github.com/LLaVA-VL/LLaVA-NeXT.git 2 | from llava.model.builder import load_pretrained_model 3 | from llava.mm_utils import get_model_name_from_path, process_images, tokenizer_image_token 4 | from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, IGNORE_INDEX 5 | from llava.conversation import conv_templates, SeparatorStyle 6 | 7 | from PIL import Image 8 | import requests 9 | import copy 10 | import torch 11 | 12 | import sys 13 | import warnings 14 | 15 | warnings.filterwarnings("ignore") 16 | pretrained = "lmms-lab/llava-onevision-qwen2-7b-ov-chat" 17 | pretrained = "Lin-Chen/open-llava-next-llama3-8b" 18 | model_name = "llava_llama_3" 19 | #model_name = "llava_qwen" 20 | device = "cuda" 21 | device_map = "auto" 22 | tokenizer, model, image_processor, max_length = load_pretrained_model(pretrained, None, model_name, device_map=device_map) # Add any other thing you want to pass in llava_model_args 23 | 24 | model.eval() 25 | 26 | url = "https://github.com/haotian-liu/LLaVA/blob/1a91fc274d7c35a9b50b3cb29c4247ae5837ce39/images/llava_v1_5_radar.jpg?raw=true" 27 | image = Image.open(requests.get(url, stream=True).raw) 28 | image_tensor = process_images([image], image_processor, model.config) 29 | image_tensor = [_image.to(dtype=torch.float16, device=device) for _image in image_tensor] 30 | 31 | #conv_template = "qwen_1_5" # Make sure you use correct chat template for different models 32 | conv_template = "llava_llama_3" 33 | question = DEFAULT_IMAGE_TOKEN + "\nWhat is shown in this image?" 34 | question = "How are you doing?" 
35 | conv = copy.deepcopy(conv_templates[conv_template]) 36 | conv.append_message(conv.roles[0], question) 37 | conv.append_message(conv.roles[1], None) 38 | prompt_question = conv.get_prompt() 39 | 40 | input_ids = tokenizer_image_token(prompt_question, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(device) 41 | image_sizes = [image.size] 42 | 43 | for each in image_tensor: 44 | print(each.shape) 45 | 46 | cont = model.generate( 47 | input_ids, 48 | images=None, 49 | image_sizes=None, 50 | do_sample=False, 51 | temperature=0, 52 | max_new_tokens=4096, 53 | ) 54 | text_outputs = tokenizer.batch_decode(cont, skip_special_tokens=True) 55 | print(text_outputs) 56 | -------------------------------------------------------------------------------- /tests/test_ocr.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from tongagent.tools.new_added.ocr import OCRTool 3 | 4 | class TestOCR(unittest.TestCase): 5 | def test_case(self): 6 | tool = OCRTool() 7 | texts = tool.forward("tests/data/254.jpg", debug=True) 8 | print(texts) 9 | 10 | if __name__ == "__main__": 11 | unittest.main() -------------------------------------------------------------------------------- /tests/test_ov_engine.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from tongagent.llm_engine.llava import LLaVAEngine 3 | from transformers.agents.llm_engine import MessageRole, HfApiEngine, get_clean_message_list 4 | 5 | class TestOvEngine(unittest.TestCase): 6 | def test_llava1(self): 7 | engine = LLaVAEngine("Lin-Chen/open-llava-next-llama3-8b") 8 | prompt = [ 9 | {"role": MessageRole.USER, "content": "How are you doing?"}, 10 | ] 11 | answer = engine(prompt, image_path=[]) 12 | print(answer) 13 | 14 | def test_llava2(self): 15 | engine = LLaVAEngine("Lin-Chen/open-llava-next-llama3-8b") 16 | image_path = "tests/data/airplane.jpeg" 17 | prompt = [ 18 | {"role": MessageRole.USER, "content": "What is the image?"}, 19 | ] 20 | answer = engine(prompt, image_paths=[image_path]) 21 | print(answer) 22 | 23 | 24 | if __name__ == "__main__": 25 | unittest.main() -------------------------------------------------------------------------------- /tests/test_seg.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from tongagent.tools.new_added.seg import SegTool 4 | 5 | class TestSeg(unittest.TestCase): 6 | 7 | def test_seg(self): 8 | tool = SegTool() 9 | result = tool.forward("tests/data/cars.png") 10 | print(result) 11 | 12 | def test_seg_2(self): 13 | tool = SegTool() 14 | prompt = [[200.68, 451.94, 354.71, 545.96], [192.86, 359.01, 953.82, 738.95], [908.56, 197.47, 1555.35, 993.67]] 15 | result = tool.forward("tests/data/cars.png", prompt=prompt) 16 | print(result) 17 | 18 | if __name__ == "__main__": 19 | unittest.main() -------------------------------------------------------------------------------- /tests/test_vision_map.py: -------------------------------------------------------------------------------- 1 | from tongagent.llm_engine.mini_cpm import load_pretrained_model_lora 2 | import json 3 | from PIL import Image 4 | 5 | model, tokenizer = load_pretrained_model_lora("experiments/CPM-FT/output/cpm_v2_6_7882650_2024_10_14_19_25/") 6 | input_data = ".cache/gta/cpm_v2_6_7882650_2024_10_14_19_25/0/agent_memory.json" 7 | image_paths = ["data/gta_dataset/image/image_1.jpg", "data/gta_dataset/image/image_2.jpg"] 8 | 9 | 10 | with open(input_data, 
"r") as f: 11 | data = json.load(f) 12 | messages = data["conversations"] 13 | if image_paths is not None and len(image_paths) > 0: 14 | origin_content = messages[1]['content'] 15 | messages[1]['content'] = [] 16 | messages[1]['content'].append(dict(type="text", text=origin_content)) 17 | prompt = [] 18 | for path_item in image_paths: 19 | image = Image.open(path_item).convert('RGB') 20 | prompt.append(image) 21 | prompt.append(origin_content) 22 | messages[1]["content"] = prompt 23 | 24 | system_prompt = messages[0]["content"] 25 | print("prompt", messages[1:2]) 26 | answer = model.chat( 27 | image=None, 28 | msgs=messages[1:2], 29 | system_prompt=system_prompt, 30 | tokenizer=tokenizer 31 | ) 32 | print(answer) -------------------------------------------------------------------------------- /tongagent/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/tongagent/__init__.py -------------------------------------------------------------------------------- /tongagent/agents/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/tongagent/agents/__init__.py -------------------------------------------------------------------------------- /tongagent/agents/general_agent.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, List 2 | from transformers.agents.prompts import DEFAULT_REACT_CODE_SYSTEM_PROMPT 3 | from transformers.agents.tools import DEFAULT_TOOL_DESCRIPTION_TEMPLATE, Tool 4 | from tongagent.tools.tool_box import get_general_tool_box, get_tool_box_gaia 5 | from tongagent.llm_engine.gpt import TongGPTEngine, get_tonggpt_open_ai_client 6 | from tongagent.prompt import DEFAULT_REACT_CODE_SYSTEM_PROMPT, FORMAT_ANSWER_PROMPT_GAIA 7 | from transformers.agents import ReactCodeAgent, HfApiEngine 8 | from transformers.agents.tools import DEFAULT_TOOL_DESCRIPTION_TEMPLATE 9 | from transformers.agents.llm_engine import MessageRole 10 | from typing import Any 11 | from langchain.prompts import ChatPromptTemplate 12 | 13 | def create_agent() -> ReactCodeAgent: 14 | llm_engine = TongGPTEngine() 15 | 16 | react_agent = ReactCodeAgent( 17 | llm_engine=llm_engine, 18 | # tools=TASK_SOLVING_TOOLBOX+WEB_TOOLS, 19 | tools=get_general_tool_box(), 20 | max_iterations=15, 21 | verbose=0, 22 | memory_verbose=True, 23 | system_prompt=DEFAULT_REACT_CODE_SYSTEM_PROMPT, 24 | add_base_tools=False, 25 | additional_authorized_imports=[ 26 | "requests", 27 | "zipfile", 28 | "os", 29 | "pandas", 30 | "numpy", 31 | "sympy", 32 | "json", 33 | "bs4", 34 | "pubchempy", 35 | "xml", 36 | "yahoo_finance", 37 | "Bio", 38 | "sklearn", 39 | "scipy", 40 | "pydub", 41 | "io", 42 | "PIL", 43 | "chess", 44 | "PyPDF2", 45 | "pptx", 46 | "torch", 47 | "datetime", 48 | "csv", 49 | "fractions", 50 | "matplotlib", 51 | "pickle" 52 | ], 53 | planning_interval=None 54 | ) 55 | return react_agent 56 | 57 | -------------------------------------------------------------------------------- /tongagent/cmd/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/tongagent/cmd/__init__.py -------------------------------------------------------------------------------- 
/tongagent/cmd/task_generate.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | def main(): 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("--data-path") 7 | 8 | args = parser.parse_args() 9 | with open(args.data_path, "r") as f: 10 | dataset = json.load(f) 11 | 12 | conv = dataset["conversations"] 13 | task = conv[1]["content"] 14 | if not task.endswith("\n"): 15 | task += "\n" 16 | 17 | for i in range(2, len(conv)): 18 | content = conv[i]["content"] 19 | if i % 2 == 0: 20 | if not content.startswith("Thought:"): 21 | raise ValueError("This trajectory is malformed") 22 | 23 | task += "\n" 24 | task += content 25 | 26 | else: 27 | if not content.startswith("[OUTPUT OF STEP") or "Observation:" not in content: 28 | raise ValueError("This trajectory is malformed") 29 | 30 | content_idx = content.find("]") 31 | task += "\n" 32 | task += content[content_idx+1:].strip() 33 | if not task.endswith("\n"): 34 | task += "\n" 35 | print(task) 36 | if __name__ == "__main__": 37 | main() -------------------------------------------------------------------------------- /tongagent/evaluation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/tongagent/evaluation/__init__.py -------------------------------------------------------------------------------- /tongagent/llm_engine/__init__.py: -------------------------------------------------------------------------------- 1 | from tongagent.llm_engine.gpt import TongGPTEngine 2 | from tongagent.llm_engine.mini_cpm import MiniCPMEngine 3 | from tongagent.llm_engine.qwen import QwenEngine 4 | from tongagent.llm_engine.internvl2 import InternVL2Engine 5 | from tongagent.utils import load_config 6 | from tongagent.llm_engine.llava import LLaVAEngine 7 | def get_llm_engine( 8 | engine_type=None, 9 | lora_path=None, 10 | disable_vision=False, 11 | ): 12 | config = load_config() 13 | if engine_type is None: 14 | engine_type = config.agent_controller.engine_type 15 | 16 | if engine_type == "qwen": 17 | return QwenEngine( 18 | model_name=config.qwen.model_name, 19 | lora_path=lora_path 20 | ) 21 | elif engine_type == "tonggpt": 22 | return TongGPTEngine(engine_type) 23 | elif engine_type == "minicpm": 24 | return MiniCPMEngine( 25 | model=lora_path, 26 | disable_vision=disable_vision 27 | ) 28 | elif engine_type == "internvl2": 29 | return InternVL2Engine( 30 | model_name=config.internvl2.model_name, 31 | lora_path=lora_path 32 | ) 33 | elif engine_type == "llava": 34 | return LLaVAEngine( 35 | model_name=config.llava.model_name, 36 | ) 37 | else: 38 | raise ValueError(f"Unknown LLM engine {engine_type}") 39 | -------------------------------------------------------------------------------- /tongagent/tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/tongagent/tools/__init__.py -------------------------------------------------------------------------------- /tongagent/tools/new_added/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mat-agent/MAT-Agent/7215f7bb7ee2284cfaca2fda79725c39d5b3bb89/tongagent/tools/new_added/__init__.py -------------------------------------------------------------------------------- 
/tongagent/tools/new_added/face_det.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoProcessor, Tool 2 | import torch 3 | from PIL import Image 4 | import numpy as np 5 | import requests 6 | import face_detection 7 | 8 | class FaceDetTool(Tool): 9 | name = "facedetection" 10 | description = "A tool that can detect human faces in given images, outputting the bounding boxes of the human faces." 11 | inputs = { 12 | "image_path": { 13 | "description": "The path to the image on which to localize objects. This should be a local path to a downloaded image.", 14 | "type": "string", 15 | }, 16 | } 17 | output_type = "any" 18 | 19 | 20 | device = "cuda:0" if torch.cuda.is_available() else "cpu" 21 | model = face_detection.build_detector("DSFDDetector", confidence_threshold=.5, nms_iou_threshold=.3) 22 | 23 | 24 | def forward(self,image_path:str)-> list: 25 | img = Image.open(image_path) 26 | img = img.convert('RGB') 27 | with torch.no_grad(): 28 | faces = self.model.detect(np.array(img)) 29 | 30 | W,H = img.size 31 | objs = [] 32 | for i,box in enumerate(faces): 33 | x1,y1,x2,y2,c = [int(v) for v in box.tolist()] 34 | x1,y1,x2,y2 = self.enlarge_face([x1,y1,x2,y2],W,H) 35 | mask = np.zeros([H,W]).astype(float) 36 | mask[y1:y2,x1:x2] = 1.0 37 | objs.append([x1,y1,x2,y2]) 38 | return objs 39 | 40 | 41 | def enlarge_face(self,box,W,H,f=1.5): 42 | x1,y1,x2,y2 = box 43 | w = int((f-1)*(x2-x1)/2) 44 | h = int((f-1)*(y2-y1)/2) 45 | x1 = max(0,x1-w) 46 | y1 = max(0,y1-h) 47 | x2 = min(W,x2+w) 48 | y2 = min(H,y2+h) 49 | return [x1,y1,x2,y2] 50 | 51 | 52 | 53 | # m=FaceDetTool() -------------------------------------------------------------------------------- /tongagent/tools/new_added/image_edit.py: -------------------------------------------------------------------------------- 1 | from transformers import Tool 2 | from PIL import Image 3 | from diffusers import StableDiffusionInstructPix2PixPipeline, EulerAncestralDiscreteScheduler 4 | from tongagent.utils import CACHE_FOLDER, gen_random_id 5 | import torch 6 | import os 7 | 8 | class ModelSingleton(): 9 | def __new__(cls): 10 | if hasattr(cls, "pipe"): 11 | return cls 12 | model_id = "timbrooks/instruct-pix2pix" 13 | pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(model_id, torch_dtype=torch.float16, safety_checker=None) 14 | pipe.to("cuda") 15 | pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config) 16 | cls.pipe = pipe 17 | return cls 18 | 19 | class ImageEditTool(Tool): 20 | name = "image_edit" 21 | description = "A tool that can edit an image based on the user prompt. Return a file path for printing."
22 | inputs = { 23 | "prompt": { 24 | "description": "The user prompt that instructs how to edit the image.", 25 | "type": "string", 26 | }, 27 | "image_path": { 28 | "description": "The image path that this tool will try to edit.", 29 | "type": "string", 30 | }, 31 | } 32 | output_type = "string" 33 | 34 | 35 | def forward(self, prompt: str, image_path: str) -> str: 36 | print("ImageEditTool input", prompt, image_path) 37 | image = Image.open(image_path).convert("RGB") 38 | images = ModelSingleton().pipe(prompt, image=image, num_inference_steps=10, image_guidance_scale=1).images 39 | output_image = images[0] 40 | output_image_path = os.path.join(CACHE_FOLDER, f"{gen_random_id()}.png") 41 | output_image.save(output_image_path) 42 | print("save to", output_image_path) 43 | return output_image_path -------------------------------------------------------------------------------- /tongagent/tools/new_added/image_generation.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from transformers.agents import load_tool, Tool 4 | from tongagent.utils import CACHE_FOLDER, gen_random_id 5 | from diffusers import FluxPipeline 6 | from diffusers import DiffusionPipeline 7 | 8 | import torch 9 | 10 | class ImageGenerationTool(Tool): 11 | description = "This is a tool that creates an image according to a prompt, which is a text description." 12 | name = "image_generator" 13 | inputs = {"prompt": {"type": "string", "description": "The image generator prompt. Don't hesitate to add details in the prompt to make the image look better, like 'high-res, photorealistic', etc."}} 14 | output_type = "any" 15 | 16 | def __init__(self, *args, **kwargs): 17 | super().__init__(*args, **kwargs) 18 | 19 | model_id = "stabilityai/stable-diffusion-xl-base-1.0" 20 | if model_id == "black-forest-labs/FLUX.1-dev": 21 | # model_path = '/scratch/zhangbofei/.cache/huggingface/hub/models--black-forest-labs--FLUX.1-dev/' 22 | pipeline = FluxPipeline.from_pretrained(model_id, torch_dtype=torch.bfloat16) 23 | elif model_id == "stabilityai/stable-diffusion-xl-base-1.0": 24 | pipeline = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16, use_safetensors=True, variant="fp16") 25 | else: 26 | raise ValueError(f"unk model {model_id}") 27 | self.pipeline = pipeline 28 | self.pipeline.to("cuda") 29 | self.model_id = model_id 30 | # pipeline.enable_model_cpu_offload() 31 | 32 | def forward(self, prompt): 33 | if self.model_id == "stabilityai/stable-diffusion-xl-base-1.0": 34 | image = self.pipeline( 35 | prompt=prompt 36 | ).images[0] 37 | else: 38 | image = self.pipeline( 39 | prompt, 40 | height=512, 41 | width=512, 42 | guidance_scale=3.5, 43 | num_inference_steps=50, 44 | max_sequence_length=512, 45 | generator=torch.Generator("cpu").manual_seed(0) 46 | ).images[0] 47 | 48 | output_image_path = os.path.join(CACHE_FOLDER, f"{gen_random_id()}.jpeg") 49 | image.save(output_image_path) 50 | # output_image.save(output_image_path) 51 | print("save to", output_image_path) 52 | return output_image_path 53 | 54 | if __name__ == "__main__": 55 | tool = ImageGenerationTool() 56 | 57 | image_path = tool.forward("high-res, photorealistic street view") 58 | -------------------------------------------------------------------------------- /tongagent/tools/new_added/object_loc.py: -------------------------------------------------------------------------------- 1 | from transformers import Tool 2 | import torch 3 | from PIL import Image 4 | from
transformers import OwlViTProcessor, OwlViTForObjectDetection 5 | 6 | 7 | class ObjectLOCTool(Tool): 8 | name = "objectlocation" 9 | description = "A tool that can localize objects in given images, outputting the bounding boxes of the objects." 10 | inputs = { 11 | "object": {"description": "the object that needs to be localized", "type": "string"}, 12 | "image_path": { 13 | "description": "The path to the image on which to localize objects. This should be a local path to a downloaded image.", 14 | "type": "string", 15 | }, 16 | } 17 | output_type = "any" 18 | 19 | 20 | model_path = "google/owlvit-base-patch32" 21 | 22 | device = "cuda:0" if torch.cuda.is_available() else "cpu" 23 | processor = OwlViTProcessor.from_pretrained(model_path) 24 | model = OwlViTForObjectDetection.from_pretrained(model_path) 25 | model = model.to(device) 26 | 27 | 28 | def forward(self, object: str, image_path: str) -> list: 29 | image = Image.open(image_path) 30 | image = image.convert('RGB') 31 | 32 | texts=[] 33 | texts.append(f'a photo of {object}') 34 | texts=[texts] 35 | 36 | inputs = self.processor(text=texts, images=image, return_tensors="pt") 37 | inputs=inputs.to(self.device) 38 | outputs = self.model(**inputs) 39 | 40 | target_sizes = torch.Tensor([image.size[::-1]]) 41 | results = self.processor.post_process_object_detection(outputs=outputs, threshold=0.1, target_sizes=target_sizes) 42 | 43 | i = 0 44 | text = texts[i] 45 | output=[] 46 | 47 | for box, score, pred in zip(results[i]["boxes"], results[i]["scores"], results[i]["labels"]): 48 | # output.append(dict(score=score.item(), label=text[pred], box=[round(i, 2) for i in box.tolist()])) 49 | output.append([round(i, 2) for i in box.tolist()]) 50 | 51 | return output 52 | -------------------------------------------------------------------------------- /tongagent/tools/new_added/ocr.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from transformers import Tool 4 | from PIL import Image 5 | from paddleocr import PaddleOCR, draw_ocr 6 | from tongagent.utils import CACHE_FOLDER, gen_random_id 7 | 8 | class OCRTool(Tool): 9 | name = "ocr" 10 | description = "A tool that can extract texts from the image."
11 | inputs = { 12 | "image_path": { 13 | "description": "The path of image that the tool can read.", 14 | "type": "string", 15 | }, 16 | } 17 | output_type = "any" 18 | 19 | ocr = PaddleOCR(use_angle_cls=True, lang='en') 20 | 21 | def forward(self, image_path: str, debug: bool = False) -> list: 22 | image = Image.open(image_path).convert("RGB") 23 | 24 | result = self.ocr.ocr(image_path, cls=True) 25 | texts = [] 26 | for idx in range(len(result)): 27 | res = result[idx] 28 | for line in res: 29 | if debug: print(line[-1]) 30 | texts.append(line[-1][0]) 31 | if debug: 32 | result = result[0] 33 | boxes = [line[0] for line in result] 34 | txts = [line[1][0] for line in result] 35 | scores = [line[1][1] for line in result] 36 | im_show = draw_ocr(image, boxes, txts, scores, font_path='data/fonts/simfang.ttf') 37 | im_show = Image.fromarray(im_show) 38 | filename = os.path.join(CACHE_FOLDER, f"{gen_random_id()}.jpg") 39 | print("save to", filename) 40 | im_show.save(filename) 41 | return texts -------------------------------------------------------------------------------- /tongagent/utils.py: -------------------------------------------------------------------------------- 1 | import string 2 | import shortuuid 3 | import os 4 | from typing import Union 5 | 6 | from omegaconf import OmegaConf, DictConfig, ListConfig 7 | 8 | CACHE_FOLDER = ".cache" 9 | os.makedirs(CACHE_FOLDER, exist_ok=True) 10 | 11 | def get_uuid_builder() -> shortuuid.ShortUUID: 12 | alphabet = string.ascii_lowercase + string.digits 13 | su = shortuuid.ShortUUID(alphabet=alphabet) 14 | return su 15 | 16 | def load_config() -> Union[DictConfig, ListConfig]: 17 | if "AGENT_CONFIG" in os.environ and len(os.environ["AGENT_CONFIG"]) > 0: 18 | return OmegaConf.load(os.environ["AGENT_CONFIG"]) 19 | 20 | if "RUN_MODE" in os.environ and os.environ["RUN_MODE"] == "eval": 21 | return OmegaConf.load("configs/agent_config.yaml") 22 | 23 | return OmegaConf.load("configs/agent_config.yaml") 24 | 25 | import time 26 | uuid_builder = get_uuid_builder() 27 | 28 | def gen_random_id(): 29 | return f"{int(time.time()*1000)}_{uuid_builder.random(length=8)}" 30 | 31 | if __name__ == "__main__": 32 | print(load_config()) 33 | print(load_config().search_engine[0].cx) --------------------------------------------------------------------------------