├── README.md
├── annotations
├── ActivityNet
│ └── activitynet_annotation
│ │ ├── test.json
│ │ ├── train.json
│ │ ├── val.json
│ │ └── val_2.json
├── Charades
│ └── charades_annotation
│ │ ├── charades_test.json
│ │ ├── train.json
│ │ └── val.json
├── Got
│ ├── got_train.json
│ └── got_val.json
├── NextGQA
│ ├── nextgqa_test.json
│ └── nextgqa_val.json
└── VideoEval
│ └── Quality_Access
│ └── annotations
│ ├── Quality_Access_100shot.json
│ ├── Quality_Access_16shot.json
│ ├── Quality_Access_4shot.json
│ └── Quality_Access_test.json
├── configs
├── ddp.yaml
├── zero2.yaml
└── zero3.yaml
├── framework.png
├── lmms-eval_videochat
├── .gitignore
├── .pre-commit-config.yaml
├── LICENSE
├── README.md
├── docs
│ ├── README.md
│ ├── commands.md
│ ├── current_tasks.md
│ ├── model_guide.md
│ ├── run_examples.md
│ └── task_guide.md
├── eval_annotations
│ ├── MVBench
│ │ ├── README.md
│ │ └── json
│ │ │ ├── action_antonym.json
│ │ │ ├── action_count.json
│ │ │ ├── action_localization.json
│ │ │ ├── action_prediction.json
│ │ │ ├── action_sequence.json
│ │ │ ├── character_order.json
│ │ │ ├── counterfactual_inference.json
│ │ │ ├── egocentric_navigation.json
│ │ │ ├── episodic_reasoning.json
│ │ │ ├── fine_grained_action.json
│ │ │ ├── fine_grained_pose.json
│ │ │ ├── moving_attribute.json
│ │ │ ├── moving_count.json
│ │ │ ├── moving_direction.json
│ │ │ ├── object_existence.json
│ │ │ ├── object_interaction.json
│ │ │ ├── object_shuffle.json
│ │ │ ├── scene_transition.json
│ │ │ ├── state_change.json
│ │ │ └── unexpected_action.json
│ └── Video-MME_short
│ │ ├── README.md
│ │ └── videomme_short
│ │ └── test-00000-of-00001.parquet
├── lmms_eval
│ ├── __init__.py
│ ├── __main__.py
│ ├── api
│ │ ├── __init__.py
│ │ ├── filter.py
│ │ ├── instance.py
│ │ ├── metrics.py
│ │ ├── model.py
│ │ ├── registry.py
│ │ ├── samplers.py
│ │ └── task.py
│ ├── evaluator.py
│ ├── filters
│ │ ├── __init__.py
│ │ ├── decontamination.py
│ │ ├── extraction.py
│ │ ├── selection.py
│ │ └── transformation.py
│ ├── logging_utils.py
│ ├── models
│ │ ├── __init__.py
│ │ ├── model_utils
│ │ │ ├── __init__.py
│ │ │ ├── load_video.py
│ │ │ ├── my_qwen_utils.py
│ │ │ └── qwen
│ │ │ │ └── qwen_generate_utils.py
│ │ ├── mplug_owl_video
│ │ │ ├── __init__.py
│ │ │ ├── configuration_mplug_owl.py
│ │ │ ├── modeling_mplug_owl.py
│ │ │ ├── processing_mplug_owl.py
│ │ │ └── tokenization_mplug_owl.py
│ │ ├── qwen2_5_vl_lxh.py
│ │ ├── qwen_vl.py
│ │ └── video_chatgpt
│ │ │ ├── __init__.py
│ │ │ ├── constants.py
│ │ │ ├── eval
│ │ │ ├── __init__.py
│ │ │ └── model_utils.py
│ │ │ ├── inference.py
│ │ │ ├── model
│ │ │ ├── __init__.py
│ │ │ ├── consolidate.py
│ │ │ ├── make_delta.py
│ │ │ ├── utils.py
│ │ │ └── video_chatgpt.py
│ │ │ ├── single_video_inference.py
│ │ │ ├── utils.py
│ │ │ └── video_conversation.py
│ ├── tasks
│ │ ├── __init__.py
│ │ ├── _task_utils
│ │ │ ├── file_utils.py
│ │ │ ├── gpt_eval_utils.py
│ │ │ ├── video_loader.py
│ │ │ └── vqa_eval_metric.py
│ │ ├── dream_1k
│ │ │ ├── _default_template.yaml
│ │ │ ├── dream_1k.yaml
│ │ │ ├── dream_1k_cn.yaml
│ │ │ └── utils.py
│ │ ├── mvbench_nothink
│ │ │ ├── _default_template.yaml
│ │ │ ├── mvbench_action_antonym_nothink.yaml
│ │ │ ├── mvbench_action_count_nothink.yaml
│ │ │ ├── mvbench_action_localization_nothink.yaml
│ │ │ ├── mvbench_action_prediction_nothink.yaml
│ │ │ ├── mvbench_action_sequence_nothink.yaml
│ │ │ ├── mvbench_character_order_nothink.yaml
│ │ │ ├── mvbench_counterfactual_inference_nothink.yaml
│ │ │ ├── mvbench_egocentric_navigation_nothink.yaml
│ │ │ ├── mvbench_episodic_reasoning_nothink.yaml
│ │ │ ├── mvbench_fine_grained_action_nothink.yaml
│ │ │ ├── mvbench_fine_grained_pose_nothink.yaml
│ │ │ ├── mvbench_moving_attribute_nothink.yaml
│ │ │ ├── mvbench_moving_count_nothink.yaml
│ │ │ ├── mvbench_moving_direction_nothink.yaml
│ │ │ ├── mvbench_nothink.yaml
│ │ │ ├── mvbench_object_existence_nothink.yaml
│ │ │ ├── mvbench_object_interaction_nothink.yaml
│ │ │ ├── mvbench_object_shuffle_nothink.yaml
│ │ │ ├── mvbench_scene_transition_nothink.yaml
│ │ │ ├── mvbench_state_change_nothink.yaml
│ │ │ ├── mvbench_unexpected_action_nothink.yaml
│ │ │ └── utils.py
│ │ ├── mvbench_think
│ │ │ ├── _default_template.yaml
│ │ │ ├── mvbench_action_antonym_think.yaml
│ │ │ ├── mvbench_action_count_think.yaml
│ │ │ ├── mvbench_action_localization_think.yaml
│ │ │ ├── mvbench_action_prediction_think.yaml
│ │ │ ├── mvbench_action_sequence_think.yaml
│ │ │ ├── mvbench_character_order_think.yaml
│ │ │ ├── mvbench_counterfactual_inference_think.yaml
│ │ │ ├── mvbench_egocentric_navigation_think.yaml
│ │ │ ├── mvbench_episodic_reasoning_think.yaml
│ │ │ ├── mvbench_fine_grained_action_think.yaml
│ │ │ ├── mvbench_fine_grained_pose_think.yaml
│ │ │ ├── mvbench_moving_attribute_think.yaml
│ │ │ ├── mvbench_moving_count_think.yaml
│ │ │ ├── mvbench_moving_direction_think.yaml
│ │ │ ├── mvbench_object_existence_think.yaml
│ │ │ ├── mvbench_object_interaction_think.yaml
│ │ │ ├── mvbench_object_shuffle_think.yaml
│ │ │ ├── mvbench_scene_transition_think.yaml
│ │ │ ├── mvbench_state_change_think.yaml
│ │ │ ├── mvbench_think.yaml
│ │ │ ├── mvbench_unexpected_action_think.yaml
│ │ │ └── utils.py
│ │ ├── perceptiontest
│ │ │ └── val
│ │ │ │ ├── _default_template_yaml
│ │ │ │ ├── perceptiontest_mc_nothink.yaml
│ │ │ │ ├── perceptiontest_mc_think.yaml
│ │ │ │ └── utils.py
│ │ └── videomme
│ │ │ ├── utils.py
│ │ │ ├── videomme_short_nothink.yaml
│ │ │ ├── videomme_short_nothink_glue.yaml
│ │ │ ├── videomme_short_think.yaml
│ │ │ └── videomme_short_think_glue.yaml
│ └── utils.py
├── miscs
│ ├── example_eval.yaml
│ ├── llava_repr_requirements.txt
│ ├── llava_result_check.md
│ ├── llava_sglang_result_check.md
│ ├── repr_scripts.sh
│ ├── repr_torch_envs.txt
│ ├── scienceqa_id.txt
│ ├── script.sh
│ ├── test_llava.py
│ ├── test_scienceqa.py
│ ├── tinyllava_repr_requirements.txt
│ └── tinyllava_repr_scripts.sh
├── pyproject.toml
├── scripts_reason
│ ├── eval_qwen2_5vl_all_tasks_new.sh
│ ├── eval_qwen2_5vl_mv_ptest.sh
│ ├── eval_qwen2_5vl_nothink.sh
│ ├── eval_qwen2_5vl_nothink_glue.sh
│ └── eval_qwen2_5vl_vmme_short.sh
├── setup.py
└── tools
│ ├── get_video_avg_time.py
│ ├── lite
│ ├── embed.py
│ ├── embedder
│ │ ├── BaseEmbedder.py
│ │ ├── ClipBgeEmbedder.py
│ │ └── __init__.py
│ ├── shrink.py
│ └── shrinker
│ │ ├── BaseShrinker.py
│ │ ├── EmbedShrinker.py
│ │ ├── __init__.py
│ │ └── sampling_methods
│ │ ├── __init__.py
│ │ ├── kcenter_greedy.py
│ │ └── sampling_def.py
│ ├── live_bench
│ ├── create_dataset.py
│ ├── data_summary.ipynb
│ ├── example.ipynb
│ ├── filter.ipynb
│ ├── live_bench
│ │ ├── __init__.py
│ │ ├── api
│ │ │ └── live_bench.py
│ │ ├── data_generator
│ │ │ ├── __init__.py
│ │ │ ├── check_prompt.md
│ │ │ ├── default_criteria.md
│ │ │ ├── example
│ │ │ │ ├── example_output.json
│ │ │ │ └── example_website.png
│ │ │ ├── live_bench.py
│ │ │ ├── live_bench_data.py
│ │ │ ├── prompt.md
│ │ │ ├── qa_generator.py
│ │ │ ├── question_finalizer.py
│ │ │ ├── response.py
│ │ │ ├── score_getter.py
│ │ │ ├── score_prompt.md
│ │ │ └── utils
│ │ │ │ ├── __init__.py
│ │ │ │ ├── claude.py
│ │ │ │ ├── extract_infomation.py
│ │ │ │ ├── gemini.py
│ │ │ │ └── gpt4v.py
│ │ ├── driver
│ │ │ ├── .gitignore
│ │ │ ├── __init__.py
│ │ │ └── load_driver.py
│ │ ├── screen_shoter
│ │ │ ├── __init__.py
│ │ │ ├── screen.py
│ │ │ └── screen_shoter.py
│ │ ├── view.ipynb
│ │ └── websites
│ │ │ ├── __init__.py
│ │ │ ├── load_website.py
│ │ │ ├── website.py
│ │ │ └── website_list.yaml
│ ├── pyproject.toml
│ ├── refine_all_results.py
│ ├── script
│ │ ├── README.md
│ │ ├── modify.ipynb
│ │ └── upload_results.py
│ └── setup.py
│ ├── make_image_hf_dataset.ipynb
│ ├── make_vatex.py
│ ├── make_video_hf_dataset.ipynb
│ └── makecvrr.ipynb
├── requirements.txt
├── src
├── __init__.py
├── open_r1
│ ├── __init__.py
│ ├── evaluate.py
│ ├── generate.py
│ ├── grpo.py
│ ├── grpo_cls.py
│ ├── grpo_cls_nothink.py
│ ├── grpo_gqa.py
│ ├── grpo_gqa_nothink.py
│ ├── grpo_qa.py
│ ├── grpo_qa_nothink.py
│ ├── grpo_tasks.py
│ ├── grpo_tg.py
│ ├── grpo_video.py
│ ├── my_qwen_utils.py
│ ├── my_qwen_utils2.py
│ └── trainer
│ │ ├── __init__.py
│ │ ├── grpo_tasks_trainer.py
│ │ ├── grpo_trainer.py
│ │ ├── grpo_trainer_video_cls.py
│ │ ├── grpo_trainer_video_cls_nothink.py
│ │ ├── grpo_trainer_video_gqa.py
│ │ ├── grpo_trainer_video_gqa_nothink.py
│ │ ├── grpo_trainer_video_qa.py
│ │ ├── grpo_trainer_video_qa_nothink.py
│ │ ├── grpo_trainer_video_tg.py
│ │ ├── vllm_grpo_trainer.py
│ │ ├── vllm_grpo_trainer_video_tg.py
│ │ └── yza_vision_process.py
└── sft
│ ├── sft_cls.py
│ ├── sft_gqa.py
│ ├── sft_grounding.py
│ └── sft_track.py
├── src_eval
├── __init__.py
├── data_config.py
├── eval_prompts.py
├── evaluate_cls_quality_c8.py
├── evaluate_gqa.py
├── evaluate_grounding.py
├── evaluate_qa.py
├── evaluate_track.py
└── my_qwen_utils.py
└── training_scripts
├── run_grpo_video_cls_qa.sh
├── run_grpo_video_gqa.sh
├── run_grpo_video_gqa_nothink_3e.sh
├── run_grpo_video_qa.sh
├── run_grpo_video_qa_nothink.sh
├── run_grpo_video_task.sh
├── run_grpo_video_tg.sh
├── run_sft_video_cls_qa.sh
├── run_sft_video_gqa.sh
├── run_sft_video_track.sh
└── zero3_offload.json
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
7 |
8 | [Xinhao Li](https://scholar.google.com/citations?user=evR3uR0AAAAJ)\*, [Ziang Yan](https://scholar.google.com.hk/citations?user=78lx13MAAAAJ&hl=zh-CN)\*, Desen Meng, Lu Dong, [Xiangyu Zeng](https://scholar.google.com/citations?user=jS13DXkAAAAJ&hl=zh-CN), [Yinan He](https://dblp.org/pid/93/7763.html), [Yali Wang](https://scholar.google.com/citations?user=hD948dkAAAAJ), [Yu Qiao](https://scholar.google.com/citations?user=gFtI-8QAAAAJ&hl), [Yi Wang](https://scholar.google.com.hk/citations?user=Xm2M8UwAAAAJ)^ and [Limin Wang](https://scholar.google.com/citations?user=HEuN8PcAAAAJ)^
9 |
10 |
11 | 🤗 Model    |    📑 Paper   
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 | ## :fire: Updates
22 | - [x] **2025/04/22**: 🔥🔥🔥 We release our VideoChat-R1-caption on [Hugging Face](https://huggingface.co/collections/OpenGVLab/videochat-r1-67fbe26e4eb08c83aa24643e).
23 | - [x] **2025/04/14**: 🔥🔥🔥 We release our VideoChat-R1 and VideoChat-R1-thinking on [Hugging Face](https://huggingface.co/collections/OpenGVLab/videochat-r1-67fbe26e4eb08c83aa24643e).
24 | - [x] **2025/04/10**: 🔥🔥🔥 We release our paper and code.
25 |
26 |
27 | ## :parrot: Introduction
28 |
29 | 
30 |
31 |
32 |
33 | ## Demo & Inference
34 |
35 | Refer to the [HF README](https://huggingface.co/OpenGVLab/VideoChat-R1_7B) for instructions on running inference with our model.
36 |
37 | ## Evaluation
38 |
39 | See [eval_scripts](eval_scripts) and [lmms-eval_videochat](lmms-eval_videochat).
40 |
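For example, one of the provided evaluation scripts can be run directly (a sketch; checkpoint and data paths inside the script may need adjusting for your environment):

```bash
# Run one of the provided evaluation scripts; edit paths inside as needed.
cd lmms-eval_videochat
bash scripts_reason/eval_qwen2_5vl_nothink.sh
```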
41 |
42 | ## Training
43 |
44 | See [training_scripts](training_scripts).
45 |
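For example, the GRPO video-QA recipe can be launched with the script below (a sketch; GPU count and dataset paths inside the script may need adjusting):

```bash
# Launch GRPO fine-tuning for video QA; adjust paths and GPU settings inside the script.
bash training_scripts/run_grpo_video_qa.sh
```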
46 | # :page_facing_up: Citation
47 |
48 | If you find this project useful in your research, please consider citing:
49 | ```BibTeX
50 | @article{li2025videochatr1,
51 | title={VideoChat-R1: Enhancing Spatio-Temporal
52 | Perception via Reinforcement Fine-Tuning},
53 | author={Li, Xinhao and Yan, Ziang and Meng, Desen and Dong, Lu and Zeng, Xiangyu and He, Yinan and Wang, Yali and Qiao, Yu and Wang, Yi and Wang, Limin},
54 | journal={arXiv preprint arXiv:2504.06958},
55 | year={2025}
56 | }
57 | ```
58 |
59 |
62 |
--------------------------------------------------------------------------------
/annotations/VideoEval/Quality_Access/annotations/Quality_Access_4shot.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "video": "0506.mp4",
4 | "instruction": "How about the quality of the video from an aesthetic and technical point of view? Choice one answer from ['low quality', 'high quality'].",
5 | "answer": "low quality"
6 | },
7 | {
8 | "video": "0717.mp4",
9 | "instruction": "How about the quality of the video from an aesthetic and technical point of view? Choice one answer from ['low quality', 'high quality'].",
10 | "answer": "low quality"
11 | },
12 | {
13 | "video": "0423.mp4",
14 | "instruction": "How about the quality of the video from an aesthetic and technical point of view? Choice one answer from ['low quality', 'high quality'].",
15 | "answer": "low quality"
16 | },
17 | {
18 | "video": "0340.mp4",
19 | "instruction": "How about the quality of the video from an aesthetic and technical point of view? Choice one answer from ['low quality', 'high quality'].",
20 | "answer": "low quality"
21 | },
22 | {
23 | "video": "3208.mp4",
24 | "instruction": "How about the quality of the video from an aesthetic and technical point of view? Choice one answer from ['low quality', 'high quality'].",
25 | "answer": "high quality"
26 | },
27 | {
28 | "video": "3424.mp4",
29 | "instruction": "How about the quality of the video from an aesthetic and technical point of view? Choice one answer from ['low quality', 'high quality'].",
30 | "answer": "high quality"
31 | },
32 | {
33 | "video": "3126.mp4",
34 | "instruction": "How about the quality of the video from an aesthetic and technical point of view? Choice one answer from ['low quality', 'high quality'].",
35 | "answer": "high quality"
36 | },
37 | {
38 | "video": "3048.mp4",
39 | "instruction": "How about the quality of the video from an aesthetic and technical point of view? Choice one answer from ['low quality', 'high quality'].",
40 | "answer": "high quality"
41 | }
42 | ]
--------------------------------------------------------------------------------
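Each entry in the annotation file above pairs a video filename with a fixed instruction and a ground-truth label. For a quick sanity check you can inspect such a file with `jq` (assuming it is installed):

```bash
# Count the entries and print the first one in the 4-shot split.
jq 'length' annotations/VideoEval/Quality_Access/annotations/Quality_Access_4shot.json
jq '.[0]'   annotations/VideoEval/Quality_Access/annotations/Quality_Access_4shot.json
```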
/configs/ddp.yaml:
--------------------------------------------------------------------------------
1 | compute_environment: LOCAL_MACHINE
2 | debug: false
3 | distributed_type: MULTI_GPU
4 | downcast_bf16: 'no'
5 | gpu_ids: all
6 | machine_rank: 0
7 | main_training_function: main
8 | mixed_precision: bf16
9 | num_machines: 1
10 | num_processes: 8
11 | rdzv_backend: static
12 | same_network: true
13 | tpu_env: []
14 | tpu_use_cluster: false
15 | tpu_use_sudo: false
16 | use_cpu: false
17 |
--------------------------------------------------------------------------------
/configs/zero2.yaml:
--------------------------------------------------------------------------------
1 | compute_environment: LOCAL_MACHINE
2 | debug: false
3 | deepspeed_config:
4 |   deepspeed_multinode_launcher: standard
5 |   offload_optimizer_device: none
6 |   offload_param_device: none
7 |   zero3_init_flag: false
8 |   zero_stage: 2
9 | distributed_type: DEEPSPEED
10 | downcast_bf16: 'no'
11 | machine_rank: 0
12 | main_training_function: main
13 | mixed_precision: bf16
14 | num_machines: 1
15 | num_processes: 8
16 | rdzv_backend: static
17 | same_network: true
18 | tpu_env: []
19 | tpu_use_cluster: false
20 | tpu_use_sudo: false
21 | use_cpu: false
--------------------------------------------------------------------------------
/configs/zero3.yaml:
--------------------------------------------------------------------------------
1 | compute_environment: LOCAL_MACHINE
2 | debug: false
3 | deepspeed_config:
4 |   deepspeed_multinode_launcher: standard
5 |   offload_optimizer_device: none
6 |   offload_param_device: none
7 |   zero3_init_flag: true
8 |   zero3_save_16bit_model: true
9 |   zero_stage: 3
10 | distributed_type: DEEPSPEED
11 | downcast_bf16: 'no'
12 | machine_rank: 0
13 | main_training_function: main
14 | mixed_precision: bf16
15 | num_machines: 1
16 | num_processes: 8
17 | rdzv_backend: static
18 | same_network: true
19 | tpu_env: []
20 | tpu_use_cluster: false
21 | tpu_use_sudo: false
22 | use_cpu: false
23 |
--------------------------------------------------------------------------------
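These YAML files are `accelerate` launcher configs (plain DDP, DeepSpeed ZeRO-2, and DeepSpeed ZeRO-3). A minimal sketch of how such a config is consumed is shown below; the training entry point and its arguments are placeholders, so see the scripts under `training_scripts/` for the actual invocations:

```bash
# Sketch: pass one of the configs above to accelerate; the script and flags are placeholders.
accelerate launch --config_file configs/zero2.yaml \
    src/open_r1/grpo_qa.py \
    --output_dir ./checkpoints/example_run
```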
/framework.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/VideoChat-R1/dba50f32fffb763e0fdf3810244be76a44147df2/framework.png
--------------------------------------------------------------------------------
/lmms-eval_videochat/.gitignore:
--------------------------------------------------------------------------------
1 | env
2 | *.pyc
3 | output/
4 | data/
5 | lm_cache
6 | .idea
7 | build
8 | dist
9 | *.egg-info
10 | venv
11 | .vscode/
12 | temp
13 | __pycache__
14 | .ipynb_checkpoints
15 | temp
16 | .DS_STORE
17 | # IPython
18 | profile_default/
19 | ipython_config.py
20 | logs/
21 | scripts/
22 | wandb/
23 | SimSun.ttf
24 | submissions/
25 | lmms_eval/tasks/hallusion_bench/hallusion_output_vs_model.json
26 | lmms_eval/tasks/hallusion_bench/hallusion_output_vd_model.json
27 | zk.log
28 | cache_dir
29 | ckpt
30 | pretrained/
31 | LLaVA/
32 | *logs
33 | temp/
34 | InternVL/
35 | logs/
36 | data/
37 | llava-video/
38 | Video-MME/
39 | VATEX/
40 | lmms_eval/tasks/vatex/__pycache__/utils.cpython-310.pyc
41 | lmms_eval/tasks/mlvu/__pycache__/utils.cpython-310.pyc
--------------------------------------------------------------------------------
/lmms-eval_videochat/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 | - repo: https://github.com/psf/black
3 |   rev: 23.12.1
4 |   hooks:
5 |     - id: black
6 |       language_version: python3
--------------------------------------------------------------------------------
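The pre-commit config above pins Black 23.12.1 for formatting. It can be enabled locally with the standard pre-commit workflow:

```bash
pip install pre-commit
pre-commit install          # register the git hook defined by .pre-commit-config.yaml
pre-commit run --all-files  # optionally format the whole tree once
```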
/lmms-eval_videochat/LICENSE:
--------------------------------------------------------------------------------
1 | # For the main pipeline structure-related code, we maintain the original license provided with lm-evaluation-harness, which is the MIT License.
2 |
3 | MIT License
4 |
5 | Copyright (c) 2024 LMMs-Lab
6 |
7 | Permission is hereby granted, free of charge, to any person obtaining a copy
8 | of this software and associated documentation files (the "Software"), to deal
9 | in the Software without restriction, including without limitation the rights
10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 | copies of the Software, and to permit persons to whom the Software is
12 | furnished to do so, subject to the following conditions:
13 |
14 | The above copyright notice and this permission notice shall be included in all
15 | copies or substantial portions of the Software.
16 |
17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 | SOFTWARE.
24 |
25 | # For the multimodal models and datasets that we have added (defined as code in the lmms_eval/tasks and lmms_eval/models folders), we apply the Apache License.
26 |
27 | Apache 2.0 License
28 |
29 | Copyright (c) 2024 LMMs-Lab
30 |
31 | Licensed under the Apache License, Version 2.0 (the "License");
32 | you may not use this file except in compliance with the License.
33 | You may obtain a copy of the License at
34 |
35 | http://www.apache.org/licenses/LICENSE-2.0
36 |
37 | Unless required by applicable law or agreed to in writing, software
38 | distributed under the License is distributed on an "AS IS" BASIS,
39 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
40 | See the License for the specific language governing permissions and
41 | limitations under the License.
42 |
43 | When modifying the code, please include the following information about the original lmms-eval source:
44 | # Adopted from lmms-eval from https://github.com/EvolvingLMMs-Lab/lmms-eval. Below is the original copyright:
45 | #
46 | # Licensed under the Apache License, Version 2.0 (the "License");
47 | # you may not use this file except in compliance with the License.
48 | # You may obtain a copy of the License at
49 | #
50 | # http://www.apache.org/licenses/LICENSE-2.0
51 | #
52 | # Unless required by applicable law or agreed to in writing, software
53 | # distributed under the License is distributed on an "AS IS" BASIS,
54 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
55 | # See the License for the specific language governing permissions and
56 | # limitations under the License.
57 |
--------------------------------------------------------------------------------
/lmms-eval_videochat/README.md:
--------------------------------------------------------------------------------
1 |
2 | # How to use
3 |
4 | We have modified the data loading in lmms-eval: instead of downloading from Hugging Face, data is loaded locally. Therefore, you need to **specify the data path** in the YAML file of each task. The data can be downloaded from the [lmms-eval Hugging Face organization](https://huggingface.co/lmms-lab) or from the official repos of the corresponding tasks.
5 |
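For example, an evaluation dataset can be fetched locally with `huggingface-cli` (the dataset repo id and target directory below are illustrative), after which you point the corresponding task YAML at the local copy:

```bash
# Illustrative: download an evaluation dataset to a local directory,
# then set the data/video path in lmms_eval/tasks/<task>/*.yaml to that directory.
huggingface-cli download lmms-lab/MVBench --repo-type dataset --local-dir ./data/MVBench
```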
6 | ## Installation
7 |
8 | You can install the package by cloning the repository and running the following command:
9 | ```bash
10 | git clone https://github.com/OpenGVLab/VideoChat-R1
11 | cd lmms-eval_videochat
12 | pip install -e .
13 | ```
14 | We provide all evaluation [scripts](scripts_reason) and [annotations](eval_annotations) here.
15 |
16 | You can evaluate a single task:
17 | ```bash
18 | TASK=videomme_short_nothink
19 | MODEL_NAME=qwen2_5_vl_lxh
20 | MAX_NUM_FRAMES=512
21 | CKPT_PATH=OpenGVLab/VideoChat-R1-7B
22 |
23 | echo $TASK
24 | TASK_SUFFIX="${TASK//,/_}"
25 | echo $TASK_SUFFIX
26 |
27 | JOB_NAME=$(basename $0)_$(date +"%Y%m%d_%H%M%S")
28 | MASTER_PORT=$((18000 + $RANDOM % 100))
29 | NUM_GPUS=8
30 |
31 |
32 | accelerate launch --num_processes ${NUM_GPUS} --main_process_port ${MASTER_PORT} -m lmms_eval \
33 |     --model ${MODEL_NAME} \
34 |     --model_args pretrained=$CKPT_PATH,max_num_frames=$MAX_NUM_FRAMES \
35 |     --tasks $TASK \
36 |     --batch_size 1 \
37 |     --log_samples \
38 |     --log_samples_suffix $TASK_SUFFIX \
39 |     --output_path ./logs/${JOB_NAME}_${MODEL_NAME}_f${MAX_NUM_FRAMES}
40 | ```
41 | You can evaluate multiple tasks at once, for example:
42 | ```bash
43 | TASK=mvbench_nothink,videomme_short_nothink
44 | MODEL_NAME=qwen2_5_vl_lxh
45 | MAX_NUM_FRAMES=512
46 | CKPT_PATH=OpenGVLab/VideoChat-Flash-Qwen2-7B_res448
47 |
48 | echo $TASK
49 | TASK_SUFFIX="${TASK//,/_}"
50 | echo $TASK_SUFFIX
51 |
52 | JOB_NAME=$(basename $0)_$(date +"%Y%m%d_%H%M%S")
53 | MASTER_PORT=$((18000 + $RANDOM % 100))
54 | NUM_GPUS=8
55 |
56 |
57 | accelerate launch --num_processes ${NUM_GPUS} --main_process_port ${MASTER_PORT} -m lmms_eval \
58 |     --model ${MODEL_NAME} \
59 |     --model_args pretrained=$CKPT_PATH,max_num_frames=$MAX_NUM_FRAMES \
60 |     --tasks $TASK \
61 |     --batch_size 1 \
62 |     --log_samples \
63 |     --log_samples_suffix $TASK_SUFFIX \
64 |     --output_path ./logs/${JOB_NAME}_${MODEL_NAME}_f${MAX_NUM_FRAMES}
65 | ```
66 |
67 |
--------------------------------------------------------------------------------
/lmms-eval_videochat/docs/README.md:
--------------------------------------------------------------------------------
1 | # LMMs Eval Documentation
2 |
3 | Welcome to the docs for `lmms-eval`!
4 |
5 | The majority of this documentation is adapted from [lm-eval-harness](https://github.com/EleutherAI/lm-evaluation-harness/).
6 |
7 | ## Table of Contents
8 |
9 | * To learn about the command line flags, see the [commands](commands.md)
10 | * To learn how to add a new model, see the [Model Guide](model_guide.md).
11 | * For a crash course on adding new tasks to the library, see our [Task Guide](task_guide.md).
12 | * If you need to upload your datasets in the correct HF format with viewer support, please refer to [tools](https://github.com/EvolvingLMMs-Lab/lmms-eval/tree/pufanyi/hf_dataset_docs/tools)
13 |
--------------------------------------------------------------------------------
/lmms-eval_videochat/docs/commands.md:
--------------------------------------------------------------------------------
1 | # User Guide
2 | This document describes the interface exposed by `lmms_eval` and the command-line flags available to users.
3 |
4 | ## Command-line Interface
5 |
6 |
7 | Equivalently, running the library can be done via the `lmms_eval` entrypoint at the command line.
8 |
9 | This mode supports a number of command-line arguments, the details of which can also be seen by running with `-h` or `--help`:
10 |
11 | * `--model` : Selects which model type or provider is evaluated. Must be a model registered under `lmms_eval/models`. For example, `--model qwen_vl` or `--model llava`.
12 |
13 | * `--model_args` : Controls parameters passed to the model constructor. Accepts a string of comma-separated keyword arguments in the format `"arg1=val1,arg2=val2,..."`, for example `--model_args pretrained=liuhaotian/llava-v1.5-7b,batch_size=1`. For a full list of supported keyword arguments, see the initialization of the corresponding model class in `lmms_eval/models/`.
14 |
15 | * `--tasks` : Determines which tasks or task groups are evaluated. Accepts a comma-separated list of task names or task group names, which must consist solely of valid tasks/groups. You can use `--tasks list` to see all available tasks. If you have added your own tasks but they do not appear in the list, try setting `--verbosity=DEBUG` to view the error message. You can also use `--tasks list_with_num` to check every task and the number of questions it contains; note that `list_with_num` will download all the available datasets and may require a lot of memory and time.
16 |
17 | * `--batch_size` : Sets the batch size used for evaluation. Can be a positive integer or `"auto"` to automatically select the largest batch size that will fit in memory, speeding up evaluation. One can pass `--batch_size auto:N` to re-select the maximum batch size `N` times during evaluation. This can help accelerate evaluation further, since `lm-eval` sorts documents in descending order of context length.
18 |
19 | * `--output_path` : A string of the form `dir/file.jsonl` or `dir/`. Provides a path where high-level results will be saved, either into the file named or into the directory named. If `--log_samples` is passed as well, then per-document outputs and metrics will be saved into the directory as well.
20 |
21 | * `--log_samples` : If this flag is passed, then the model's outputs, and the text fed into the model, will be saved at per-document granularity. Must be used with `--output_path`.
22 |
23 | * `--limit` : Accepts an integer, or a float between 0.0 and 1.0 . If passed, will limit the number of documents to evaluate to the first X documents (if an integer) per task or first X% of documents per task. Useful for debugging, especially on costly API models.
24 |
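Combining the flags above, a minimal debugging invocation might look like the following (the model and task names are illustrative):

```bash
# Evaluate only the first 8 documents of one task and save per-sample outputs.
python3 -m lmms_eval \
    --model qwen_vl \
    --tasks mme \
    --batch_size 1 \
    --limit 8 \
    --log_samples \
    --log_samples_suffix debug \
    --output_path ./logs/
```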
25 | ## Usage with SRT API
26 |
27 | > install sglang
28 |
29 | ```bash
30 | git clone https://github.com/sgl-project/sglang.git
31 | # Current version is tested on #1222
32 | cd sglang;
33 | pip install -e "python[srt]"
34 |
35 | # Install FlashInfer CUDA kernels
36 | pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
37 | ```
38 |
39 | > run sglang backend service with the following command
40 |
41 | ```bash
42 | # After update, there is no need to use an extra command to setup backend server
43 | # the server will be initialized in the init process
44 |
45 | # launch lmms-eval srt_api model
46 | CKPT_PATH=$1
47 | TASK=$2
48 | MODALITY=$3
49 | TP_SIZE=$4
50 | echo $TASK
51 | TASK_SUFFIX="${TASK//,/_}"
52 | echo $TASK_SUFFIX
53 |
54 | python3 -m lmms_eval \
55 |     --model srt_api \
56 |     --model_args modality=$MODALITY,model_version=$CKPT_PATH,tp=$TP_SIZE,host=127.0.0.1,port=30000,timeout=600 \
57 |     --tasks $TASK \
58 |     --batch_size 1 \
59 |     --log_samples \
60 |     --log_samples_suffix $TASK_SUFFIX \
61 |     --output_path ./logs/
62 | ```
63 |
64 | If you encounter errors with the above command, you may need to install some additional dependencies:
65 |
66 | ```bash
67 | pip install httpx==0.23.3
68 | pip install protobuf==3.20
69 | ```
70 |
71 |
72 |
--------------------------------------------------------------------------------
/lmms-eval_videochat/docs/current_tasks.md:
--------------------------------------------------------------------------------
1 | # Current Tasks
2 |
3 | > The name in parentheses is the task name used in `lmms_eval`; it is also used to specify the dataset in the configuration file.
4 | > The following is manually updated documentation. You can use `lmms_eval --tasks list` to list all supported tasks and their task names.
5 |
6 | - AI2D (ai2d)
7 | - ChartQA (chartqa)
8 | - CMMMU (cmmmu)
9 | - CMMMU Validation (cmmmu_val)
10 | - CMMMU Test (cmmmu_test)
11 | - COCO Caption (coco_cap)
12 | - COCO 2014 Caption (coco2014_cap)
13 | - COCO 2014 Caption Validation (coco2014_cap_val)
14 | - COCO 2014 Caption Test (coco2014_cap_test)
15 | - COCO 2017 Caption (coco2017_cap)
16 | - COCO 2017 Caption MiniVal (coco2017_cap_val)
17 | - COCO 2017 Caption MiniTest (coco2017_cap_test)
18 | - [ConBench](https://github.com/foundation-multimodal-models/ConBench) (conbench)
19 | - DOCVQA (docvqa)
20 | - DOCVQA Validation (docvqa_val)
21 | - DOCVQA Test (docvqa_test)
22 | - Ferret (ferret)
23 | - Flickr30K (flickr30k)
24 | - Ferret Test (ferret_test)
25 | - GQA (gqa)
26 | - HallusionBenchmark (hallusion_bench_image)
27 | - Infographic VQA (info_vqa)
28 | - Infographic VQA Validation (info_vqa_val)
29 | - Infographic VQA Test (info_vqa_test)
30 | - LLaVA-Bench (llava_in_the_wild)
31 | - LLaVA-Bench-COCO (llava_bench_coco)
32 | - MathVerse (mathverse)
33 | - MathVerse Text Dominant (mathverse_testmini_text_dominant)
34 | - MathVerse Text Only (mathverse_testmini_text_only)
35 | - MathVerse Text Lite (mathverse_testmini_text_lite)
36 | - MathVerse Vision Dominant (mathverse_testmini_vision_dominant)
37 | - MathVerse Vision Intensive (mathverse_testmini_vision_intensive)
38 | - MathVerse Vision Only (mathverse_testmini_vision_only)
39 | - MathVista (mathvista)
40 | - MathVista Validation (mathvista_testmini)
41 | - MathVista Test (mathvista_test)
42 | - MMBench (mmbench)
43 | - MMBench English (mmbench_en)
44 | - MMBench English Dev (mmbench_en_dev)
45 | - MMBench English Test (mmbench_en_test)
46 | - MMBench Chinese (mmbench_cn)
47 | - MMBench Chinese Dev (mmbench_cn_dev)
48 | - MMBench Chinese Test (mmbench_cn_test)
49 | - MME (mme)
50 | - MMMU (mmmu)
51 | - MMMU Validation (mmmu_val)
52 | - MMMU Test (mmmu_test)
53 | - MMStar (mmstar)
54 | - MMUPD (mmupd)
55 | - MMUPD Base (mmupd_base)
56 | - MMAAD Base (mmaad_base)
57 | - MMIASD Base (mmiasd_base)
58 | - MMIVQD Base (mmivqd_base)
59 | - MMUPD Option (mmupd_option)
60 | - MMAAD Option (mmaad_option)
61 | - MMIASD Option (mmiasd_option)
62 | - MMIVQD Option (mmivqd_option)
63 | - MMUPD Instruction (mmupd_instruction)
64 | - MMAAD Instruction (mmaad_instruction)
65 | - MMIASD Instruction (mmiasd_instruction)
66 | - MMIVQD Instruction (mmivqd_instruction)
67 | - MMVet (mmvet)
68 | - Multi-DocVQA (multidocvqa)
69 | - Multi-DocVQA Validation (multidocvqa_val)
70 | - Multi-DocVQA Test (multidocvqa_test)
71 | - NoCaps (nocaps)
72 | - NoCaps Validation (nocaps_val)
73 | - NoCaps Test (nocaps_test)
74 | - OKVQA (ok_vqa)
75 | - OKVQA Validation 2014 (ok_vqa_val2014)
76 | - POPE (pope)
77 | - RefCOCO (refcoco)
78 | - refcoco_seg_test
79 | - refcoco_seg_val
80 | - refcoco_seg_testA
81 | - refcoco_seg_testB
82 | - refcoco_bbox_test
83 | - refcoco_bbox_val
84 | - refcoco_bbox_testA
85 | - refcoco_bbox_testB
86 | - RefCOCO+ (refcoco+)
87 | - refcoco+_seg
88 | - refcoco+_seg_val
89 | - refcoco+_seg_testA
90 | - refcoco+_seg_testB
91 | - refcoco+_bbox
92 | - refcoco+_bbox_val
93 | - refcoco+_bbox_testA
94 | - refcoco+_bbox_testB
95 | - RefCOCOg (refcocog)
96 | - refcocog_seg_test
97 | - refcocog_seg_val
98 | - refcocog_bbox_test
99 | - refcocog_bbox_val
100 | - ScienceQA (scienceqa_full)
101 | - ScienceQA Full (scienceqa)
102 | - ScienceQA IMG (scienceqa_img)
103 | - ScreenSpot (screenspot)
104 | - ScreenSpot REC / Grounding (screenspot_rec)
105 | - ScreenSpot REG / Instruction Generation (screenspot_reg)
106 | - SeedBench (seedbench)
107 | - SeedBench 2 (seedbench_2)
108 | - SeedBench 2 Plus (seedbench_2_plus)
109 | - ST-VQA (stvqa)
110 | - TextCaps (textcaps)
111 | - TextCaps Validation (textcaps_val)
112 | - TextCaps Test (textcaps_test)
113 | - TextVQA (textvqa)
114 | - TextVQA Validation (textvqa_val)
115 | - TextVQA Test (textvqa_test)
116 | - VizWizVQA (vizwiz_vqa)
117 | - VizWizVQA Validation (vizwiz_vqa_val)
118 | - VizWizVQA Test (vizwiz_vqa_test)
119 | - VQAv2 (vqav2)
120 | - VQAv2 Validation (vqav2_val)
121 | - VQAv2 Test (vqav2_test)
122 | - WebSRC (websrc)
123 | - WebSRC Validation (websrc_val)
124 | - WebSRC Test (websrc_test)
--------------------------------------------------------------------------------
/lmms-eval_videochat/eval_annotations/MVBench/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | license: mit
3 | extra_gated_prompt: >-
4 |   You agree to not use the dataset to conduct experiments that cause harm to
5 |   human subjects. Please note that the data in this dataset may be subject to
6 |   other agreements. Before using the data, be sure to read the relevant
7 |   agreements carefully to ensure compliant use. Video copyrights belong to the
8 |   original video creators or platforms and are for academic research use only.
9 | task_categories:
10 | - visual-question-answering
11 | - video-classification
12 | extra_gated_fields:
13 |   Name: text
14 |   Company/Organization: text
15 |   Country: text
16 |   E-Mail: text
17 | modalities:
18 | - Video
19 | - Text
20 | configs:
21 | - config_name: action_sequence
22 |   data_files: json/action_sequence.json
23 | - config_name: moving_count
24 |   data_files: json/moving_count.json
25 | - config_name: action_prediction
26 |   data_files: json/action_prediction.json
27 | - config_name: episodic_reasoning
28 |   data_files: json/episodic_reasoning.json
29 | - config_name: action_antonym
30 |   data_files: json/action_antonym.json
31 | - config_name: action_count
32 |   data_files: json/action_count.json
33 | - config_name: scene_transition
34 |   data_files: json/scene_transition.json
35 | - config_name: object_shuffle
36 |   data_files: json/object_shuffle.json
37 | - config_name: object_existence
38 |   data_files: json/object_existence.json
39 | - config_name: fine_grained_pose
40 |   data_files: json/fine_grained_pose.json
41 | - config_name: unexpected_action
42 |   data_files: json/unexpected_action.json
43 | - config_name: moving_direction
44 |   data_files: json/moving_direction.json
45 | - config_name: state_change
46 |   data_files: json/state_change.json
47 | - config_name: object_interaction
48 |   data_files: json/object_interaction.json
49 | - config_name: character_order
50 |   data_files: json/character_order.json
51 | - config_name: action_localization
52 |   data_files: json/action_localization.json
53 | - config_name: counterfactual_inference
54 |   data_files: json/counterfactual_inference.json
55 | - config_name: fine_grained_action
56 |   data_files: json/fine_grained_action.json
57 | - config_name: moving_attribute
58 |   data_files: json/moving_attribute.json
59 | - config_name: egocentric_navigation
60 |   data_files: json/egocentric_navigation.json
61 | language:
62 | - en
63 | size_categories:
64 | - 1K<n<10K