├── README.md ├── annotations ├── ActivityNet │ └── activitynet_annotation │ │ ├── test.json │ │ ├── train.json │ │ ├── val.json │ │ └── val_2.json ├── Charades │ └── charades_annotation │ │ ├── charades_test.json │ │ ├── train.json │ │ └── val.json ├── Got │ ├── got_train.json │ └── got_val.json ├── NextGQA │ ├── nextgqa_test.json │ └── nextgqa_val.json └── VideoEval │ └── Quality_Access │ └── annotations │ ├── Quality_Access_100shot.json │ ├── Quality_Access_16shot.json │ ├── Quality_Access_4shot.json │ └── Quality_Access_test.json ├── configs ├── ddp.yaml ├── zero2.yaml └── zero3.yaml ├── framework.png ├── lmms-eval_videochat ├── .gitignore ├── .pre-commit-config.yaml ├── LICENSE ├── README.md ├── docs │ ├── README.md │ ├── commands.md │ ├── current_tasks.md │ ├── model_guide.md │ ├── run_examples.md │ └── task_guide.md ├── eval_annotations │ ├── MVBench │ │ ├── README.md │ │ └── json │ │ │ ├── action_antonym.json │ │ │ ├── action_count.json │ │ │ ├── action_localization.json │ │ │ ├── action_prediction.json │ │ │ ├── action_sequence.json │ │ │ ├── character_order.json │ │ │ ├── counterfactual_inference.json │ │ │ ├── egocentric_navigation.json │ │ │ ├── episodic_reasoning.json │ │ │ ├── fine_grained_action.json │ │ │ ├── fine_grained_pose.json │ │ │ ├── moving_attribute.json │ │ │ ├── moving_count.json │ │ │ ├── moving_direction.json │ │ │ ├── object_existence.json │ │ │ ├── object_interaction.json │ │ │ ├── object_shuffle.json │ │ │ ├── scene_transition.json │ │ │ ├── state_change.json │ │ │ └── unexpected_action.json │ └── Video-MME_short │ │ ├── README.md │ │ └── videomme_short │ │ └── test-00000-of-00001.parquet ├── lmms_eval │ ├── __init__.py │ ├── __main__.py │ ├── api │ │ ├── __init__.py │ │ ├── filter.py │ │ ├── instance.py │ │ ├── metrics.py │ │ ├── model.py │ │ ├── registry.py │ │ ├── samplers.py │ │ └── task.py │ ├── evaluator.py │ ├── filters │ │ ├── __init__.py │ │ ├── decontamination.py │ │ ├── extraction.py │ │ ├── selection.py │ │ └── transformation.py │ ├── logging_utils.py │ ├── models │ │ ├── __init__.py │ │ ├── model_utils │ │ │ ├── __init__.py │ │ │ ├── load_video.py │ │ │ ├── my_qwen_utils.py │ │ │ └── qwen │ │ │ │ └── qwen_generate_utils.py │ │ ├── mplug_owl_video │ │ │ ├── __init__.py │ │ │ ├── configuration_mplug_owl.py │ │ │ ├── modeling_mplug_owl.py │ │ │ ├── processing_mplug_owl.py │ │ │ └── tokenization_mplug_owl.py │ │ ├── qwen2_5_vl_lxh.py │ │ ├── qwen_vl.py │ │ └── video_chatgpt │ │ │ ├── __init__.py │ │ │ ├── constants.py │ │ │ ├── eval │ │ │ ├── __init__.py │ │ │ └── model_utils.py │ │ │ ├── inference.py │ │ │ ├── model │ │ │ ├── __init__.py │ │ │ ├── consolidate.py │ │ │ ├── make_delta.py │ │ │ ├── utils.py │ │ │ └── video_chatgpt.py │ │ │ ├── single_video_inference.py │ │ │ ├── utils.py │ │ │ └── video_conversation.py │ ├── tasks │ │ ├── __init__.py │ │ ├── _task_utils │ │ │ ├── file_utils.py │ │ │ ├── gpt_eval_utils.py │ │ │ ├── video_loader.py │ │ │ └── vqa_eval_metric.py │ │ ├── dream_1k │ │ │ ├── _default_template.yaml │ │ │ ├── dream_1k.yaml │ │ │ ├── dream_1k_cn.yaml │ │ │ └── utils.py │ │ ├── mvbench_nothink │ │ │ ├── _default_template.yaml │ │ │ ├── mvbench_action_antonym_nothink.yaml │ │ │ ├── mvbench_action_count_nothink.yaml │ │ │ ├── mvbench_action_localization_nothink.yaml │ │ │ ├── mvbench_action_prediction_nothink.yaml │ │ │ ├── mvbench_action_sequence_nothink.yaml │ │ │ ├── mvbench_character_order_nothink.yaml │ │ │ ├── mvbench_counterfactual_inference_nothink.yaml │ │ │ ├── mvbench_egocentric_navigation_nothink.yaml │ │ │ ├── 
mvbench_episodic_reasoning_nothink.yaml │ │ │ ├── mvbench_fine_grained_action_nothink.yaml │ │ │ ├── mvbench_fine_grained_pose_nothink.yaml │ │ │ ├── mvbench_moving_attribute_nothink.yaml │ │ │ ├── mvbench_moving_count_nothink.yaml │ │ │ ├── mvbench_moving_direction_nothink.yaml │ │ │ ├── mvbench_nothink.yaml │ │ │ ├── mvbench_object_existence_nothink.yaml │ │ │ ├── mvbench_object_interaction_nothink.yaml │ │ │ ├── mvbench_object_shuffle_nothink.yaml │ │ │ ├── mvbench_scene_transition_nothink.yaml │ │ │ ├── mvbench_state_change_nothink.yaml │ │ │ ├── mvbench_unexpected_action_nothink.yaml │ │ │ └── utils.py │ │ ├── mvbench_think │ │ │ ├── _default_template.yaml │ │ │ ├── mvbench_action_antonym_think.yaml │ │ │ ├── mvbench_action_count_think.yaml │ │ │ ├── mvbench_action_localization_think.yaml │ │ │ ├── mvbench_action_prediction_think.yaml │ │ │ ├── mvbench_action_sequence_think.yaml │ │ │ ├── mvbench_character_order_think.yaml │ │ │ ├── mvbench_counterfactual_inference_think.yaml │ │ │ ├── mvbench_egocentric_navigation_think.yaml │ │ │ ├── mvbench_episodic_reasoning_think.yaml │ │ │ ├── mvbench_fine_grained_action_think.yaml │ │ │ ├── mvbench_fine_grained_pose_think.yaml │ │ │ ├── mvbench_moving_attribute_think.yaml │ │ │ ├── mvbench_moving_count_think.yaml │ │ │ ├── mvbench_moving_direction_think.yaml │ │ │ ├── mvbench_object_existence_think.yaml │ │ │ ├── mvbench_object_interaction_think.yaml │ │ │ ├── mvbench_object_shuffle_think.yaml │ │ │ ├── mvbench_scene_transition_think.yaml │ │ │ ├── mvbench_state_change_think.yaml │ │ │ ├── mvbench_think.yaml │ │ │ ├── mvbench_unexpected_action_think.yaml │ │ │ └── utils.py │ │ ├── perceptiontest │ │ │ └── val │ │ │ │ ├── _default_template_yaml │ │ │ │ ├── perceptiontest_mc_nothink.yaml │ │ │ │ ├── perceptiontest_mc_think.yaml │ │ │ │ └── utils.py │ │ └── videomme │ │ │ ├── utils.py │ │ │ ├── videomme_short_nothink.yaml │ │ │ ├── videomme_short_nothink_glue.yaml │ │ │ ├── videomme_short_think.yaml │ │ │ └── videomme_short_think_glue.yaml │ └── utils.py ├── miscs │ ├── example_eval.yaml │ ├── llava_repr_requirements.txt │ ├── llava_result_check.md │ ├── llava_sglang_result_check.md │ ├── repr_scripts.sh │ ├── repr_torch_envs.txt │ ├── scienceqa_id.txt │ ├── script.sh │ ├── test_llava.py │ ├── test_scienceqa.py │ ├── tinyllava_repr_requirements.txt │ └── tinyllava_repr_scripts.sh ├── pyproject.toml ├── scripts_reason │ ├── eval_qwen2_5vl_all_tasks_new.sh │ ├── eval_qwen2_5vl_mv_ptest.sh │ ├── eval_qwen2_5vl_nothink.sh │ ├── eval_qwen2_5vl_nothink_glue.sh │ └── eval_qwen2_5vl_vmme_short.sh ├── setup.py └── tools │ ├── get_video_avg_time.py │ ├── lite │ ├── embed.py │ ├── embedder │ │ ├── BaseEmbedder.py │ │ ├── ClipBgeEmbedder.py │ │ └── __init__.py │ ├── shrink.py │ └── shrinker │ │ ├── BaseShrinker.py │ │ ├── EmbedShrinker.py │ │ ├── __init__.py │ │ └── sampling_methods │ │ ├── __init__.py │ │ ├── kcenter_greedy.py │ │ └── sampling_def.py │ ├── live_bench │ ├── create_dataset.py │ ├── data_summary.ipynb │ ├── example.ipynb │ ├── filter.ipynb │ ├── live_bench │ │ ├── __init__.py │ │ ├── api │ │ │ └── live_bench.py │ │ ├── data_generator │ │ │ ├── __init__.py │ │ │ ├── check_prompt.md │ │ │ ├── default_criteria.md │ │ │ ├── example │ │ │ │ ├── example_output.json │ │ │ │ └── example_website.png │ │ │ ├── live_bench.py │ │ │ ├── live_bench_data.py │ │ │ ├── prompt.md │ │ │ ├── qa_generator.py │ │ │ ├── question_finalizer.py │ │ │ ├── response.py │ │ │ ├── score_getter.py │ │ │ ├── score_prompt.md │ │ │ └── utils │ │ │ │ ├── __init__.py │ │ │ │ ├── 
claude.py │ │ │ │ ├── extract_infomation.py │ │ │ │ ├── gemini.py │ │ │ │ └── gpt4v.py │ │ ├── driver │ │ │ ├── .gitignore │ │ │ ├── __init__.py │ │ │ └── load_driver.py │ │ ├── screen_shoter │ │ │ ├── __init__.py │ │ │ ├── screen.py │ │ │ └── screen_shoter.py │ │ ├── view.ipynb │ │ └── websites │ │ │ ├── __init__.py │ │ │ ├── load_website.py │ │ │ ├── website.py │ │ │ └── website_list.yaml │ ├── pyproject.toml │ ├── refine_all_results.py │ ├── script │ │ ├── README.md │ │ ├── modify.ipynb │ │ └── upload_results.py │ └── setup.py │ ├── make_image_hf_dataset.ipynb │ ├── make_vatex.py │ ├── make_video_hf_dataset.ipynb │ └── makecvrr.ipynb ├── requirements.txt ├── src ├── __init__.py ├── open_r1 │ ├── __init__.py │ ├── evaluate.py │ ├── generate.py │ ├── grpo.py │ ├── grpo_cls.py │ ├── grpo_cls_nothink.py │ ├── grpo_gqa.py │ ├── grpo_gqa_nothink.py │ ├── grpo_qa.py │ ├── grpo_qa_nothink.py │ ├── grpo_tasks.py │ ├── grpo_tg.py │ ├── grpo_video.py │ ├── my_qwen_utils.py │ ├── my_qwen_utils2.py │ └── trainer │ │ ├── __init__.py │ │ ├── grpo_tasks_trainer.py │ │ ├── grpo_trainer.py │ │ ├── grpo_trainer_video_cls.py │ │ ├── grpo_trainer_video_cls_nothink.py │ │ ├── grpo_trainer_video_gqa.py │ │ ├── grpo_trainer_video_gqa_nothink.py │ │ ├── grpo_trainer_video_qa.py │ │ ├── grpo_trainer_video_qa_nothink.py │ │ ├── grpo_trainer_video_tg.py │ │ ├── vllm_grpo_trainer.py │ │ ├── vllm_grpo_trainer_video_tg.py │ │ └── yza_vision_process.py └── sft │ ├── sft_cls.py │ ├── sft_gqa.py │ ├── sft_grounding.py │ └── sft_track.py ├── src_eval ├── __init__.py ├── data_config.py ├── eval_prompts.py ├── evaluate_cls_quality_c8.py ├── evaluate_gqa.py ├── evaluate_grounding.py ├── evaluate_qa.py ├── evaluate_track.py └── my_qwen_utils.py └── training_scripts ├── run_grpo_video_cls_qa.sh ├── run_grpo_video_gqa.sh ├── run_grpo_video_gqa_nothink_3e.sh ├── run_grpo_video_qa.sh ├── run_grpo_video_qa_nothink.sh ├── run_grpo_video_task.sh ├── run_grpo_video_tg.sh ├── run_sft_video_cls_qa.sh ├── run_sft_video_gqa.sh ├── run_sft_video_track.sh └── zero3_offload.json /README.md: -------------------------------------------------------------------------------- 1 | 2 |
VideoChat-R1: Enhancing Spatio-Temporal Perception via Reinforcement Fine-Tuning

[Xinhao Li](https://scholar.google.com/citations?user=evR3uR0AAAAJ)\*, [Ziang Yan](https://scholar.google.com.hk/citations?user=78lx13MAAAAJ&hl=zh-CN)\*, Desen Meng, Lu Dong, [Xiangyu Zeng](https://scholar.google.com/citations?user=jS13DXkAAAAJ&hl=zh-CN), [Yinan He](https://dblp.org/pid/93/7763.html), [Yali Wang](https://scholar.google.com/citations?user=hD948dkAAAAJ), [Yu Qiao](https://scholar.google.com/citations?user=gFtI-8QAAAAJ&hl), [Yi Wang](https://scholar.google.com.hk/citations?user=Xm2M8UwAAAAJ)^ and [Limin Wang](https://scholar.google.com/citations?user=HEuN8PcAAAAJ)^

🤗 Model    |    📑 Paper
18 | 19 | 20 | 21 | ## :fire: Updates 22 | - [x] **2025/04/22**:🔥🔥🔥 We release our VideoChat-R1-caption at [Huggingface](https://huggingface.co/collections/OpenGVLab/videochat-r1-67fbe26e4eb08c83aa24643e). 23 | - [x] **2025/04/14**:🔥🔥🔥 We release our VideoChat-R1 and VideoChat-R1-thinking at [Huggingface](https://huggingface.co/collections/OpenGVLab/videochat-r1-67fbe26e4eb08c83aa24643e). 24 | - [x] **2025/04/10**:🔥🔥🔥 We release our paper and code. 25 | 26 | 27 | ## :parrot: Introduction 28 | 29 | ![alt text](framework.png) 30 | 31 | 32 | 33 | ## Demo & Inference 34 | 35 | Refer to [hf README](https://huggingface.co/OpenGVLab/VideoChat-R1_7B) to inference our model. 36 | 37 | ## Evaluation 38 | 39 | See [eval_scripts](eval_scripts) and [lmms-eval_videochat](lmms-eval_videochat). 40 | 41 | 42 | ## Training 43 | 44 | See [training_scripts](training_scripts). 45 | 46 | # :page_facing_up: Citation 47 | 48 | If you find this project useful in your research, please consider cite: 49 | ```BibTeX 50 | @article{li2025videochatr1, 51 | title={VideoChat-R1: Enhancing Spatio-Temporal 52 | Perception via Reinforcement Fine-Tuning}, 53 | author={Li, Xinhao and Yan, Ziang and Meng, Desen and Dong, Lu and Zeng, Xiangyu and He, Yinan and Wang, Yali and Qiao, Yu and Wang, Yi and Wang, Limin}, 54 | journal={arXiv preprint arXiv:2504.06958}, 55 | year={2025} 56 | } 57 | ``` 58 | 59 | 62 | -------------------------------------------------------------------------------- /annotations/VideoEval/Quality_Access/annotations/Quality_Access_4shot.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "video": "0506.mp4", 4 | "instruction": "How about the quality of the video from an aesthetic and technical point of view? Choice one answer from ['low quality', 'high quality'].", 5 | "answer": "low quality" 6 | }, 7 | { 8 | "video": "0717.mp4", 9 | "instruction": "How about the quality of the video from an aesthetic and technical point of view? Choice one answer from ['low quality', 'high quality'].", 10 | "answer": "low quality" 11 | }, 12 | { 13 | "video": "0423.mp4", 14 | "instruction": "How about the quality of the video from an aesthetic and technical point of view? Choice one answer from ['low quality', 'high quality'].", 15 | "answer": "low quality" 16 | }, 17 | { 18 | "video": "0340.mp4", 19 | "instruction": "How about the quality of the video from an aesthetic and technical point of view? Choice one answer from ['low quality', 'high quality'].", 20 | "answer": "low quality" 21 | }, 22 | { 23 | "video": "3208.mp4", 24 | "instruction": "How about the quality of the video from an aesthetic and technical point of view? Choice one answer from ['low quality', 'high quality'].", 25 | "answer": "high quality" 26 | }, 27 | { 28 | "video": "3424.mp4", 29 | "instruction": "How about the quality of the video from an aesthetic and technical point of view? Choice one answer from ['low quality', 'high quality'].", 30 | "answer": "high quality" 31 | }, 32 | { 33 | "video": "3126.mp4", 34 | "instruction": "How about the quality of the video from an aesthetic and technical point of view? Choice one answer from ['low quality', 'high quality'].", 35 | "answer": "high quality" 36 | }, 37 | { 38 | "video": "3048.mp4", 39 | "instruction": "How about the quality of the video from an aesthetic and technical point of view? 
Choice one answer from ['low quality', 'high quality'].", 40 | "answer": "high quality" 41 | } 42 | ] -------------------------------------------------------------------------------- /configs/ddp.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | distributed_type: MULTI_GPU 4 | downcast_bf16: 'no' 5 | gpu_ids: all 6 | machine_rank: 0 7 | main_training_function: main 8 | mixed_precision: bf16 9 | num_machines: 1 10 | num_processes: 8 11 | rdzv_backend: static 12 | same_network: true 13 | tpu_env: [] 14 | tpu_use_cluster: false 15 | tpu_use_sudo: false 16 | use_cpu: false 17 | -------------------------------------------------------------------------------- /configs/zero2.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | deepspeed_config: 4 | deepspeed_multinode_launcher: standard 5 | offload_optimizer_device: none 6 | offload_param_device: none 7 | zero3_init_flag: false 8 | zero_stage: 2 9 | distributed_type: DEEPSPEED 10 | downcast_bf16: 'no' 11 | machine_rank: 0 12 | main_training_function: main 13 | mixed_precision: bf16 14 | num_machines: 1 15 | num_processes: 8 16 | rdzv_backend: static 17 | same_network: true 18 | tpu_env: [] 19 | tpu_use_cluster: false 20 | tpu_use_sudo: false 21 | use_cpu: false -------------------------------------------------------------------------------- /configs/zero3.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | deepspeed_config: 4 | deepspeed_multinode_launcher: standard 5 | offload_optimizer_device: none 6 | offload_param_device: none 7 | zero3_init_flag: true 8 | zero3_save_16bit_model: true 9 | zero_stage: 3 10 | distributed_type: DEEPSPEED 11 | downcast_bf16: 'no' 12 | machine_rank: 0 13 | main_training_function: main 14 | mixed_precision: bf16 15 | num_machines: 1 16 | num_processes: 8 17 | rdzv_backend: static 18 | same_network: true 19 | tpu_env: [] 20 | tpu_use_cluster: false 21 | tpu_use_sudo: false 22 | use_cpu: false 23 | -------------------------------------------------------------------------------- /framework.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/VideoChat-R1/dba50f32fffb763e0fdf3810244be76a44147df2/framework.png -------------------------------------------------------------------------------- /lmms-eval_videochat/.gitignore: -------------------------------------------------------------------------------- 1 | env 2 | *.pyc 3 | output/ 4 | data/ 5 | lm_cache 6 | .idea 7 | build 8 | dist 9 | *.egg-info 10 | venv 11 | .vscode/ 12 | temp 13 | __pycache__ 14 | .ipynb_checkpoints 15 | temp 16 | .DS_STORE 17 | # IPython 18 | profile_default/ 19 | ipython_config.py 20 | logs/ 21 | scripts/ 22 | wandb/ 23 | SimSun.ttf 24 | submissions/ 25 | lmms_eval/tasks/hallusion_bench/hallusion_output_vs_model.json 26 | lmms_eval/tasks/hallusion_bench/hallusion_output_vd_model.json 27 | zk.log 28 | cache_dir 29 | ckpt 30 | pretrained/ 31 | LLaVA/ 32 | *logs 33 | temp/ 34 | InternVL/ 35 | logs/ 36 | data/ 37 | llava-video/ 38 | Video-MME/ 39 | VATEX/ 40 | lmms_eval/tasks/vatex/__pycache__/utils.cpython-310.pyc 41 | lmms_eval/tasks/mlvu/__pycache__/utils.cpython-310.pyc -------------------------------------------------------------------------------- 
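The Accelerate configs above (`configs/ddp.yaml`, `configs/zero2.yaml`, `configs/zero3.yaml`) are meant to be passed to `accelerate launch` when running the GRPO/SFT entry points under `src/`. A minimal sketch is shown below; the script arguments are illustrative assumptions, not the exact flags used by the `run_*.sh` training scripts.

```bash
# Minimal sketch: launch GRPO QA training with the ZeRO-3 config above.
# The arguments accepted by src/open_r1/grpo_qa.py are assumptions here;
# see training_scripts/run_grpo_video_qa.sh for the flags actually used.
accelerate launch --config_file configs/zero3.yaml \
    src/open_r1/grpo_qa.py \
    --model_name_or_path Qwen/Qwen2.5-VL-7B-Instruct \
    --output_dir ./checkpoints/videochat-r1-qa
```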
/lmms-eval_videochat/.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/psf/black 3 | rev: 23.12.1 4 | hooks: 5 | - id: black 6 | language_version: python3 -------------------------------------------------------------------------------- /lmms-eval_videochat/LICENSE: -------------------------------------------------------------------------------- 1 | # For the main pipeline structure-related code, we maintain the original license provided with lm-evaluation-harness, which is the MIT License. 2 | 3 | MIT License 4 | 5 | Copyright (c) 2024 LMMs-Lab 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in all 15 | copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | 25 | # For the multimodal models and datasets that we have added (defined as code in the lmms_eval/tasks and lmms_eval/models folders), we apply the Apache License. 26 | 27 | Apache 2.0 License 28 | 29 | Copyright (c) 2024 LMMs-Lab 30 | 31 | Licensed under the Apache License, Version 2.0 (the "License"); 32 | you may not use this file except in compliance with the License. 33 | You may obtain a copy of the License at 34 | 35 | http://www.apache.org/licenses/LICENSE-2.0 36 | 37 | Unless required by applicable law or agreed to in writing, software 38 | distributed under the License is distributed on an "AS IS" BASIS, 39 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 40 | See the License for the specific language governing permissions and 41 | limitations under the License. 42 | 43 | When modifying the code, please include the following information about the original lmms-eval source: 44 | # Adopted from lmms-eval from https://github.com/EvolvingLMMs-Lab/lmms-eval. Below is the original copyright: 45 | # 46 | # Licensed under the Apache License, Version 2.0 (the "License"); 47 | # you may not use this file except in compliance with the License. 48 | # You may obtain a copy of the License at 49 | # 50 | # http://www.apache.org/licenses/LICENSE-2.0 51 | # 52 | # Unless required by applicable law or agreed to in writing, software 53 | # distributed under the License is distributed on an "AS IS" BASIS, 54 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 55 | # See the License for the specific language governing permissions and 56 | # limitations under the License. 
57 | -------------------------------------------------------------------------------- /lmms-eval_videochat/README.md: -------------------------------------------------------------------------------- 1 | 2 | # How to use 3 | 4 | We have modified the data loading method for lmms-eval: instead of loading from Huggingface, the data is loaded locally. Therefore, when using it, you need to **specify the data path** in the YAML file of each task. The data can be downloaded from the [lmms-eval](https://huggingface.co/lmms-lab) or the official repos of the corresponding tasks. 5 | 6 | ## Installation 7 | 8 | You can install the package by cloning the repository and running the following command: 9 | ```bash 10 | git clone https://github.com/OpenGVLab/VideoChat-R1 11 | cd lmms-eval_videochat 12 | pip install -e . 13 | ``` 14 | We provide all evaluation [scripts](scripts_reason) and [annotations](eval_annotations) here. 15 | 16 | You could evaluate one task: 17 | ```bash 18 | TASK=videomme_short_nothink 19 | MODEL_NAME=qwen2_5_vl_lxh 20 | MAX_NUM_FRAMES=512 21 | CKPT_PATH=OpenGVLab/VideoChat-R1-7B 22 | 23 | echo $TASK 24 | TASK_SUFFIX="${TASK//,/_}" 25 | echo $TASK_SUFFIX 26 | 27 | JOB_NAME=$(basename $0)_$(date +"%Y%m%d_%H%M%S") 28 | MASTER_PORT=$((18000 + $RANDOM % 100)) 29 | NUM_GPUS=8 30 | 31 | 32 | accelerate launch --num_processes ${NUM_GPUS} --main_process_port ${MASTER_PORT} -m lmms_eval \ 33 | --model ${MODEL_NAME} \ 34 | --model_args pretrained=$CKPT_PATH,max_num_frames=$MAX_NUM_FRAMES \ 35 | --tasks $TASK \ 36 | --batch_size 1 \ 37 | --log_samples \ 38 | --log_samples_suffix $TASK_SUFFIX \ 39 | --output_path ./logs/${JOB_NAME}_${MODEL_NAME}_f${MAX_NUM_FRAMES} 40 | ``` 41 | You could evaluate more tasks once like: 42 | ```bash 43 | TASK=mvbench_nothink,videomme_short_nothink 44 | MODEL_NAME=qwen2_5_vl_lxh 45 | MAX_NUM_FRAMES=512 46 | CKPT_PATH=OpenGVLab/VideoChat-Flash-Qwen2-7B_res448 47 | 48 | echo $TASK 49 | TASK_SUFFIX="${TASK//,/_}" 50 | echo $TASK_SUFFIX 51 | 52 | JOB_NAME=$(basename $0)_$(date +"%Y%m%d_%H%M%S") 53 | MASTER_PORT=$((18000 + $RANDOM % 100)) 54 | NUM_GPUS=8 55 | 56 | 57 | accelerate launch --num_processes ${NUM_GPUS} --main_process_port ${MASTER_PORT} -m lmms_eval \ 58 | --model ${MODEL_NAME} \ 59 | --model_args pretrained=$CKPT_PATH,max_num_frames=$MAX_NUM_FRAMES \ 60 | --tasks $TASK \ 61 | --batch_size 1 \ 62 | --log_samples \ 63 | --log_samples_suffix $TASK_SUFFIX \ 64 | --output_path ./logs/${JOB_NAME}_${MODEL_NAME}_f${MAX_NUM_FRAMES} 65 | ``` 66 | 67 | -------------------------------------------------------------------------------- /lmms-eval_videochat/docs/README.md: -------------------------------------------------------------------------------- 1 | # LMMs Eval Documentation 2 | 3 | Welcome to the docs for `lmms-eval`! 4 | 5 | Majority of this documentation is adapted from [lm-eval-harness](https://github.com/EleutherAI/lm-evaluation-harness/) 6 | 7 | ## Table of Contents 8 | 9 | * To learn about the command line flags, see the [commands](commands.md) 10 | * To learn how to add a new moddel, see the [Model Guide](model_guide.md). 11 | * For a crash course on adding new tasks to the library, see our [Task Guide](task_guide.md). 
12 | * If you need to upload your datasets into correct HF format with viewer supported, please refer to [tools](https://github.com/EvolvingLMMs-Lab/lmms-eval/tree/pufanyi/hf_dataset_docs/tools) 13 | -------------------------------------------------------------------------------- /lmms-eval_videochat/docs/commands.md: -------------------------------------------------------------------------------- 1 | # User Guide 2 | This document details the interface exposed by `lmms_eval` and provides details on what flags are available to users. 3 | 4 | ## Command-line Interface 5 | 6 | 7 | Equivalently, running the library can be done via the `lmms_eval` entrypoint at the command line. 8 | 9 | This mode supports a number of command-line arguments, the details of which can be also be seen via running with `-h` or `--help`: 10 | 11 | * `--model` : Selects which model type or provider is evaluated. Must be a mdoels registered under lmms_eval/models. For example, `--model qwen_vl` or `--model llava`. 12 | 13 | * `--model_args` : Controls parameters passed to the model constructor. Accepts a string containing comma-separated keyword arguments to the model class of the format `"arg1=val1,arg2=val2,..."`, such as, for example `--model_args pretrained=liuhaotian/llava-v1.5-7b,batch_size=1`. For a full list of what keyword arguments, see the initialization of the corresponding model class in `lmms_eval/models/`. 14 | 15 | * `--tasks` : Determines which tasks or task groups are evaluated. Accepts a comma-separated list of task names or task group names. Must be solely comprised of valid tasks/groups. You can use `--tasks list` to see all the available tasks. If you add your own tasks but not shown on the list, you can try to set `--verbosity=DEBUG` to view the error message. You can also use `--tasks list_with_num` to check every tasks and the number of question each task contains. However, `list_with_num` will download all the available datasets and may require lots of memory and time. 16 | 17 | * `--batch_size` : Sets the batch size used for evaluation. Can be a positive integer or `"auto"` to automatically select the largest batch size that will fit in memory, speeding up evaluation. One can pass `--batch_size auto:N` to re-select the maximum batch size `N` times during evaluation. This can help accelerate evaluation further, since `lm-eval` sorts documents in descending order of context length. 18 | 19 | * `--output_path` : A string of the form `dir/file.jsonl` or `dir/`. Provides a path where high-level results will be saved, either into the file named or into the directory named. If `--log_samples` is passed as well, then per-document outputs and metrics will be saved into the directory as well. 20 | 21 | * `--log_samples` : If this flag is passed, then the model's outputs, and the text fed into the model, will be saved at per-document granularity. Must be used with `--output_path`. 22 | 23 | * `--limit` : Accepts an integer, or a float between 0.0 and 1.0 . If passed, will limit the number of documents to evaluate to the first X documents (if an integer) per task or first X% of documents per task. Useful for debugging, especially on costly API models. 
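Combining the flags above, a typical invocation looks like the sketch below. The model, task, and checkpoint names mirror the VideoChat-R1 example scripts; `--limit` is added only to illustrate a quick debugging run.

```bash
# Evaluate one model on one task; --limit restricts the run to 8 samples for debugging.
accelerate launch --num_processes 8 -m lmms_eval \
    --model qwen2_5_vl_lxh \
    --model_args pretrained=OpenGVLab/VideoChat-R1-7B,max_num_frames=512 \
    --tasks videomme_short_nothink \
    --batch_size 1 \
    --limit 8 \
    --log_samples \
    --log_samples_suffix videomme_short_nothink \
    --output_path ./logs/debug_run
```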
24 | 25 | ## Usage with SRT API 26 | 27 | > install sglang 28 | 29 | ```bash 30 | git clone https://github.com/sgl-project/sglang.git 31 | # Current version is tested on #1222 32 | cd sglang; 33 | pip install -e "python[srt]" 34 | 35 | # Install FlashInfer CUDA kernels 36 | pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ 37 | ``` 38 | 39 | > run sglang backend service with the following command 40 | 41 | ```bash 42 | # After update, there is no need to use an extra command to setup backend server 43 | # the server will be initialized in the init process 44 | 45 | # launch lmms-eval srt_api model 46 | CKPT_PATH=$1 47 | TASK=$2 48 | MODALITY=$3 49 | TP_SIZE=$4 50 | echo $TASK 51 | TASK_SUFFIX="${TASK//,/_}" 52 | echo $TASK_SUFFIX 53 | 54 | python3 -m lmms_eval \ 55 | --model srt_api \ 56 | --model_args modality=$MODALITY,model_version=$CKPT_PATH,tp=$TP_SIZE,host=127.0.0.1,port=30000,timeout=600 \ 57 | --tasks $TASK \ 58 | --batch_size 1 \ 59 | --log_samples \ 60 | --log_samples_suffix $TASK_SUFFIX \ 61 | --output_path ./logs/ 62 | ``` 63 | 64 | You may need to install some dependencies for the above command to work (if you encounter some errors). 65 | 66 | ```bash 67 | pip install httpx==0.23.3 68 | pip install protobuf==3.20 69 | ``` 70 | 71 | 72 | -------------------------------------------------------------------------------- /lmms-eval_videochat/docs/current_tasks.md: -------------------------------------------------------------------------------- 1 | # Current Tasks 2 | 3 | > () indicates the task name in the lmms_eval. The task name is also used to specify the dataset in the configuration file. 4 | > The following is manually updated documentation. You could use `lmms_eval task --list` to list all supported tasks and their task names. 
5 | 6 | - AI2D (ai2d) 7 | - ChartQA (chartqa) 8 | - CMMMU (cmmmu) 9 | - CMMMU Validation (cmmmu_val) 10 | - CMMMU Test (cmmmu_test) 11 | - COCO Caption (coco_cap) 12 | - COCO 2014 Caption (coco2014_cap) 13 | - COCO 2014 Caption Validation (coco2014_cap_val) 14 | - COCO 2014 Caption Test (coco2014_cap_test) 15 | - COCO 2017 Caption (coco2017_cap) 16 | - COCO 2017 Caption MiniVal (coco2017_cap_val) 17 | - COCO 2017 Caption MiniTest (coco2017_cap_test) 18 | - [ConBench](https://github.com/foundation-multimodal-models/ConBench) (conbench) 19 | - DOCVQA (docvqa) 20 | - DOCVQA Validation (docvqa_val) 21 | - DOCVQA Test (docvqa_test) 22 | - Ferret (ferret) 23 | - Flickr30K (flickr30k) 24 | - Ferret Test (ferret_test) 25 | - GQA (gqa) 26 | - HallusionBenchmark (hallusion_bench_image) 27 | - Infographic VQA (info_vqa) 28 | - Infographic VQA Validation (info_vqa_val) 29 | - Infographic VQA Test (info_vqa_test) 30 | - LLaVA-Bench (llava_in_the_wild) 31 | - LLaVA-Bench-COCO (llava_bench_coco) 32 | - MathVerse (mathverse) 33 | - MathVerse Text Dominant (mathverse_testmini_text_dominant) 34 | - MathVerse Text Only (mathverse_testmini_text_only) 35 | - MathVerse Text Lite (mathverse_testmini_text_lite) 36 | - MathVerse Vision Dominant (mathverse_testmini_vision_dominant) 37 | - MathVerse Vision Intensive (mathverse_testmini_vision_intensive) 38 | - MathVerse Vision Only (mathverse_testmini_vision_only) 39 | - MathVista (mathvista) 40 | - MathVista Validation (mathvista_testmini) 41 | - MathVista Test (mathvista_test) 42 | - MMBench (mmbench) 43 | - MMBench English (mmbench_en) 44 | - MMBench English Dev (mmbench_en_dev) 45 | - MMBench English Test (mmbench_en_test) 46 | - MMBench Chinese (mmbench_cn) 47 | - MMBench Chinese Dev (mmbench_cn_dev) 48 | - MMBench Chinese Test (mmbench_cn_test) 49 | - MME (mme) 50 | - MMMU (mmmu) 51 | - MMMU Validation (mmmu_val) 52 | - MMMU Test (mmmu_test) 53 | - MMStar (mmstar) 54 | - MMUPD (mmupd) 55 | - MMUPD Base (mmupd_base) 56 | - MMAAD Base (mmaad_base) 57 | - MMIASD Base (mmiasd_base) 58 | - MMIVQD Base (mmivqd_base) 59 | - MMUPD Option (mmupd_option) 60 | - MMAAD Option (mmaad_option) 61 | - MMIASD Option (mmiasd_option) 62 | - MMIVQD Option (mmivqd_option) 63 | - MMUPD Instruction (mmupd_instruction) 64 | - MMAAD Instruction (mmaad_instruction) 65 | - MMIASD Instruction (mmiasd_instruction) 66 | - MMIVQD Instruction (mmivqd_instruction) 67 | - MMVet (mmvet) 68 | - Multi-DocVQA (multidocvqa) 69 | - Multi-DocVQA Validation (multidocvqa_val) 70 | - Multi-DocVQA Test (multidocvqa_test) 71 | - NoCaps (nocaps) 72 | - NoCaps Validation (nocaps_val) 73 | - NoCaps Test (nocaps_test) 74 | - OKVQA (ok_vqa) 75 | - OKVQA Validation 2014 (ok_vqa_val2014) 76 | - POPE (pope) 77 | - RefCOCO (refcoco) 78 | - refcoco_seg_test 79 | - refcoco_seg_val 80 | - refcoco_seg_testA 81 | - refcoco_seg_testB 82 | - refcoco_bbox_test 83 | - refcoco_bbox_val 84 | - refcoco_bbox_testA 85 | - refcoco_bbox_testB 86 | - RefCOCO+ (refcoco+) 87 | - refcoco+_seg 88 | - refcoco+_seg_val 89 | - refcoco+_seg_testA 90 | - refcoco+_seg_testB 91 | - refcoco+_bbox 92 | - refcoco+_bbox_val 93 | - refcoco+_bbox_testA 94 | - refcoco+_bbox_testB 95 | - RefCOCOg (refcocog) 96 | - refcocog_seg_test 97 | - refcocog_seg_val 98 | - refcocog_bbox_test 99 | - refcocog_bbox_val 100 | - ScienceQA (scienceqa_full) 101 | - ScienceQA Full (scienceqa) 102 | - ScienceQA IMG (scienceqa_img) 103 | - ScreenSpot (screenspot) 104 | - ScreenSpot REC / Grounding (screenspot_rec) 105 | - ScreenSpot REG / Instruction Generation 
(screenspot_reg) 106 | - SeedBench (seedbench) 107 | - SeedBench 2 (seedbench_2) 108 | - SeedBench 2 Plus (seedbench_2_plus) 109 | - ST-VQA (stvqa) 110 | - TextCaps (textcaps) 111 | - TextCaps Validation (textcaps_val) 112 | - TextCaps Test (textcaps_test) 113 | - TextVQA (textvqa) 114 | - TextVQA Validation (textvqa_val) 115 | - TextVQA Test (textvqa_test) 116 | - VizWizVQA (vizwiz_vqa) 117 | - VizWizVQA Validation (vizwiz_vqa_val) 118 | - VizWizVQA Test (vizwiz_vqa_test) 119 | - VQAv2 (vqav2) 120 | - VQAv2 Validation (vqav2_val) 121 | - VQAv2 Test (vqav2_test) 122 | - WebSRC (websrc) 123 | - WebSRC Validation (websrc_val) 124 | - WebSRC Test (websrc_test) -------------------------------------------------------------------------------- /lmms-eval_videochat/eval_annotations/MVBench/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | license: mit 3 | extra_gated_prompt: >- 4 | You agree to not use the dataset to conduct experiments that cause harm to 5 | human subjects. Please note that the data in this dataset may be subject to 6 | other agreements. Before using the data, be sure to read the relevant 7 | agreements carefully to ensure compliant use. Video copyrights belong to the 8 | original video creators or platforms and are for academic research use only. 9 | task_categories: 10 | - visual-question-answering 11 | - video-classification 12 | extra_gated_fields: 13 | Name: text 14 | Company/Organization: text 15 | Country: text 16 | E-Mail: text 17 | modalities: 18 | - Video 19 | - Text 20 | configs: 21 | - config_name: action_sequence 22 | data_files: json/action_sequence.json 23 | - config_name: moving_count 24 | data_files: json/moving_count.json 25 | - config_name: action_prediction 26 | data_files: json/action_prediction.json 27 | - config_name: episodic_reasoning 28 | data_files: json/episodic_reasoning.json 29 | - config_name: action_antonym 30 | data_files: json/action_antonym.json 31 | - config_name: action_count 32 | data_files: json/action_count.json 33 | - config_name: scene_transition 34 | data_files: json/scene_transition.json 35 | - config_name: object_shuffle 36 | data_files: json/object_shuffle.json 37 | - config_name: object_existence 38 | data_files: json/object_existence.json 39 | - config_name: fine_grained_pose 40 | data_files: json/fine_grained_pose.json 41 | - config_name: unexpected_action 42 | data_files: json/unexpected_action.json 43 | - config_name: moving_direction 44 | data_files: json/moving_direction.json 45 | - config_name: state_change 46 | data_files: json/state_change.json 47 | - config_name: object_interaction 48 | data_files: json/object_interaction.json 49 | - config_name: character_order 50 | data_files: json/character_order.json 51 | - config_name: action_localization 52 | data_files: json/action_localization.json 53 | - config_name: counterfactual_inference 54 | data_files: json/counterfactual_inference.json 55 | - config_name: fine_grained_action 56 | data_files: json/fine_grained_action.json 57 | - config_name: moving_attribute 58 | data_files: json/moving_attribute.json 59 | - config_name: egocentric_navigation 60 | data_files: json/egocentric_navigation.json 61 | language: 62 | - en 63 | size_categories: 64 | - 1K None: 18 | """ 19 | Can define custom behavior here, if an individual instantiation of a Filter class should have state. 20 | """ 21 | 22 | def apply(self, resps, docs): 23 | """ 24 | Defines the operation to perform on a list of the `inst.resps` properties of `Instance` objects. 
25 | Should return the list of (filtered) response lists *in the same order as they were input*, e.g. 26 | if pass in [, ] should return 27 | [, ] 28 | """ 29 | return resps 30 | 31 | 32 | @dataclass 33 | class FilterEnsemble: 34 | """ 35 | FilterEnsemble creates a pipeline applying multiple filters. 36 | Its intended usage is to stack multiple post-processing steps in order. 37 | `task.apply_filters` should use a list of FilterEnsemble classes that it stores, to apply each 38 | pipeline separately. 39 | """ 40 | 41 | name: str 42 | filters: List[Filter] 43 | 44 | def apply(self, instances: List[Instance], docs: List[Dataset]) -> None: 45 | resps = [inst.resps for inst in instances] # operate just on the model responses 46 | for f in self.filters: 47 | # apply filters in sequence 48 | resps = f.apply(resps, docs) 49 | 50 | # add the end results after filtering to filtered_requests of their respective source instances. 51 | # has key `self.name`: each FilterEnsemble applied in a given run should use a different name. 52 | for inst, resp in zip(instances, resps): 53 | inst.filtered_resps[self.name] = resp 54 | -------------------------------------------------------------------------------- /lmms-eval_videochat/lmms_eval/api/instance.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import Literal, Tuple 3 | 4 | 5 | @dataclass 6 | class Instance: 7 | request_type: Literal["loglikelihood", "generate_until"] 8 | arguments: tuple 9 | idx: int 10 | metadata: Tuple[str, int, int] = field(default_factory=lambda: (None, None, None)) # TODO: better typehints here 11 | resps: list = field(default_factory=list) 12 | filtered_resps: dict = field(default_factory=dict) 13 | 14 | # initialized after init 15 | task_name: str = None 16 | doc_id: str = None 17 | repeats: str = None 18 | doc: dict = None 19 | 20 | def __post_init__(self) -> None: 21 | # unpack metadata field 22 | self.task_name, self.doc_id, self.repeats = self.metadata["task"], self.metadata["doc_id"], self.metadata["repeats"] 23 | 24 | @property 25 | def args(self): 26 | """ 27 | Returns (string,) where `string` is the string to calculate loglikelihood over 28 | """ 29 | return self.arguments if isinstance(self.arguments, tuple) else (self.arguments,) 30 | -------------------------------------------------------------------------------- /lmms-eval_videochat/lmms_eval/api/registry.py: -------------------------------------------------------------------------------- 1 | from lmms_eval.api.model import lmms 2 | 3 | from typing import Callable, Dict 4 | import evaluate as hf_evaluate 5 | 6 | from loguru import logger as eval_logger 7 | 8 | MODEL_REGISTRY = {} 9 | 10 | 11 | def register_model(*names): 12 | # either pass a list or a single alias. 13 | # function receives them as a tuple of strings 14 | 15 | def decorate(cls): 16 | for name in names: 17 | assert issubclass(cls, lmms), f"Model '{name}' ({cls.__name__}) must extend lmms class" 18 | 19 | assert name not in MODEL_REGISTRY, f"Model named '{name}' conflicts with existing model! Please register with a non-conflicting alias instead." 20 | 21 | MODEL_REGISTRY[name] = cls 22 | return cls 23 | 24 | return decorate 25 | 26 | 27 | def get_model(model_name): 28 | try: 29 | return MODEL_REGISTRY[model_name] 30 | except KeyError: 31 | raise ValueError(f"Attempted to load model '{model_name}', but no model for this name found! 
Supported model names: {', '.join(MODEL_REGISTRY.keys())}") 32 | 33 | 34 | TASK_REGISTRY = {} # Key: task name, Value: task ConfigurableTask class 35 | GROUP_REGISTRY = {} # Key: group name, Value: list of task names or group names 36 | TASK_INITIALIZED = False 37 | ALL_TASKS = set() # Set of all task names and group names 38 | func2task_index = {} # Key: task ConfigurableTask class, Value: task name 39 | 40 | 41 | def register_task(name): 42 | def decorate(fn): 43 | assert name not in TASK_REGISTRY, f"task named '{name}' conflicts with existing registered task!" 44 | 45 | TASK_REGISTRY[name] = fn 46 | ALL_TASKS.add(name) 47 | func2task_index[fn.__name__] = name 48 | return fn 49 | 50 | return decorate 51 | 52 | 53 | def register_group(name): 54 | def decorate(fn): 55 | func_name = func2task_index[fn.__name__] 56 | if name in GROUP_REGISTRY: 57 | GROUP_REGISTRY[name].append(func_name) 58 | else: 59 | GROUP_REGISTRY[name] = [func_name] 60 | ALL_TASKS.add(name) 61 | return fn 62 | 63 | return decorate 64 | 65 | 66 | OUTPUT_TYPE_REGISTRY = {} 67 | METRIC_REGISTRY = {} 68 | METRIC_AGGREGATION_REGISTRY = {} 69 | AGGREGATION_REGISTRY = {} 70 | HIGHER_IS_BETTER_REGISTRY = {} 71 | 72 | DEFAULT_METRIC_REGISTRY = { 73 | "loglikelihood": [ 74 | "perplexity", 75 | "acc", 76 | ], 77 | "multiple_choice": ["acc", "acc_norm"], 78 | "generate_until": ["exact_match"], 79 | } 80 | 81 | 82 | def register_metric(**args): 83 | # TODO: do we want to enforce a certain interface to registered metrics? 84 | def decorate(fn): 85 | assert "metric" in args 86 | name = args["metric"] 87 | 88 | for key, registry in [ 89 | ("metric", METRIC_REGISTRY), 90 | ("higher_is_better", HIGHER_IS_BETTER_REGISTRY), 91 | ("aggregation", METRIC_AGGREGATION_REGISTRY), 92 | ]: 93 | if key in args: 94 | value = args[key] 95 | assert value not in registry, f"{key} named '{value}' conflicts with existing registered {key}!" 96 | 97 | if key == "metric": 98 | registry[name] = fn 99 | elif key == "aggregation": 100 | registry[name] = AGGREGATION_REGISTRY[value] 101 | else: 102 | registry[name] = value 103 | 104 | return fn 105 | 106 | return decorate 107 | 108 | 109 | def get_metric(name: str, hf_evaluate_metric=False) -> Callable: 110 | if not hf_evaluate_metric: 111 | if name in METRIC_REGISTRY: 112 | return METRIC_REGISTRY[name] 113 | else: 114 | eval_logger.warning(f"Could not find registered metric '{name}' in lm-eval, searching in HF Evaluate library...") 115 | 116 | try: 117 | metric_object = hf_evaluate.load(name) 118 | return metric_object.compute 119 | except Exception: 120 | eval_logger.error( 121 | f"{name} not found in the evaluate library! Please check https://huggingface.co/evaluate-metric", 122 | ) 123 | 124 | 125 | def register_aggregation(name): 126 | def decorate(fn): 127 | assert name not in AGGREGATION_REGISTRY, f"aggregation named '{name}' conflicts with existing registered aggregation!" 
128 | 129 | AGGREGATION_REGISTRY[name] = fn 130 | return fn 131 | 132 | return decorate 133 | 134 | 135 | def get_aggregation(name): 136 | try: 137 | return AGGREGATION_REGISTRY[name] 138 | except KeyError: 139 | eval_logger.warning( 140 | "{} not a registered aggregation metric!".format(name), 141 | ) 142 | 143 | 144 | def get_metric_aggregation(name): 145 | try: 146 | return METRIC_AGGREGATION_REGISTRY[name] 147 | except KeyError: 148 | eval_logger.warning( 149 | "{} metric is not assigned a default aggregation!".format(name), 150 | ) 151 | 152 | 153 | def is_higher_better(metric_name): 154 | try: 155 | return HIGHER_IS_BETTER_REGISTRY[metric_name] 156 | except KeyError: 157 | eval_logger.warning(f"higher_is_better not specified for metric '{metric_name}'!") 158 | -------------------------------------------------------------------------------- /lmms-eval_videochat/lmms_eval/api/samplers.py: -------------------------------------------------------------------------------- 1 | class ContextSampler: 2 | def __init__(self, docs, task, fewshot_indices=None, rnd=None) -> None: 3 | self.rnd = rnd 4 | assert self.rnd, "must pass rnd to FewShotSampler!" 5 | 6 | self.task = task 7 | self.config = task._config 8 | 9 | self.target_delimiter = self.config.target_delimiter 10 | self.fewshot_delimiter = self.config.fewshot_delimiter 11 | 12 | self.doc_to_text = self.task.doc_to_text 13 | self.doc_to_target = self.task.doc_to_target 14 | self.doc_to_choice = self.task.doc_to_choice 15 | 16 | self.docs = docs # HF dataset split, provided by task._fewshot_docs() 17 | if fewshot_indices: # subset few-shot docs from 18 | self.docs = self.docs.select(fewshot_indices) 19 | 20 | def get_context(self, doc, num_fewshot): 21 | # draw an extra fewshot sample if using same split as evaluating on 22 | n_samples = num_fewshot + 1 if self.config.fewshot_split == self.config.test_split else num_fewshot 23 | 24 | # draw `n_samples` docs from fewshot_docs 25 | fewshotex = self.sample(n_samples) 26 | 27 | # get rid of the doc that's the one we're evaluating, if it's in the fewshot 28 | # TODO: should we just stop people from using fewshot from same split as evaluating? 29 | selected_docs = [x for x in fewshotex if x != doc][:num_fewshot] 30 | 31 | labeled_examples = ( 32 | self.fewshot_delimiter.join( 33 | [ 34 | # TODO: is separating doc_to_text and doc_to_target by one space always desired? 35 | (self.doc_to_text(doc) if (self.config.doc_to_choice is None or type(self.doc_to_text(doc)) is str) else self.doc_to_choice(doc)[self.doc_to_text(doc)]) 36 | + self.target_delimiter 37 | + ( 38 | str(self.doc_to_target(doc)[0]) 39 | if type(self.doc_to_target(doc)) is list 40 | else self.doc_to_target(doc) if (self.config.doc_to_choice is None or type(self.doc_to_target(doc)) is str) else str(self.doc_to_choice(doc)[self.doc_to_target(doc)]) 41 | ) 42 | for doc in selected_docs 43 | ] 44 | ) 45 | + self.fewshot_delimiter 46 | ) 47 | 48 | return labeled_examples 49 | 50 | def sample(self, n): 51 | """ 52 | Draw `n` samples from our fewshot docs. This method should be overridden by subclasses. 53 | """ 54 | 55 | return self.rnd.sample(self.docs, n) 56 | 57 | 58 | class FirstNSampler(ContextSampler): 59 | def sample(self, n) -> None: 60 | """ 61 | Draw the first `n` samples in order from the specified split. 62 | Used for tasks with "canonical" ordered fewshot examples, such as MMLU and CMMLU. 63 | """ 64 | assert n <= len(self.docs), f"Error: number of fewshot samples requested exceeds the {len(self.docs)} that are available." 
65 | return self.docs[:n] 66 | 67 | 68 | class BalancedSampler(ContextSampler): 69 | def sample(self, n) -> None: 70 | """ 71 | TODO: this should return approximately class-balanced samples from our fewshot examples. 72 | TODO: what order should they be in? maybe random? 73 | """ 74 | 75 | pass 76 | 77 | 78 | class ManualSampler(ContextSampler): 79 | def sample(self, n) -> None: 80 | """ """ 81 | pass 82 | 83 | 84 | SAMPLER_REGISTRY = { 85 | "default": ContextSampler, 86 | "first_n": FirstNSampler, 87 | } 88 | 89 | 90 | def get_sampler(name): 91 | try: 92 | return SAMPLER_REGISTRY[name] 93 | except KeyError: 94 | raise ValueError(f"Attempted to use contextsampler '{name}', but no sampling strategy for this name found! Supported model names: {', '.join(SAMPLER_REGISTRY.keys())}") 95 | -------------------------------------------------------------------------------- /lmms-eval_videochat/lmms_eval/filters/__init__.py: -------------------------------------------------------------------------------- 1 | from lmms_eval.api.filter import FilterEnsemble, Filter 2 | from . import selection 3 | from . import extraction 4 | from . import transformation 5 | 6 | 7 | FILTER_REGISTRY = { 8 | "take_first": selection.TakeFirstFilter, 9 | "regex": extraction.RegexFilter, 10 | "majority_vote": selection.MajorityVoteFilter, 11 | "take_first_k": selection.TakeKFilter, 12 | "remove_whitespace": extraction.WhitespaceFilter, 13 | "lowercase": transformation.LowercaseFilter, 14 | "uppercase": transformation.UppercaseFilter, 15 | "map": transformation.MapFilter, 16 | "multi_choice_regex": extraction.MultiChoiceRegexFilter, 17 | # TODO: implement this filter. either it should take in an arbitrary "scoring"/reward function 18 | # that takes an input and returns a scalar and then should select the max reward, 19 | # or should implement different filters for different ways of handling a reward model's inference. 20 | # "arg_max": selection.ArgMaxFilter, 21 | } 22 | 23 | 24 | def get_filter(filter_name): 25 | if filter_name in FILTER_REGISTRY: 26 | return FILTER_REGISTRY[filter_name] 27 | else: 28 | return filter_name 29 | 30 | 31 | def build_filter_ensemble(filter_name, components): 32 | """ 33 | Create a filtering pipeline. 34 | """ 35 | filters = [] 36 | for function, kwargs in components: 37 | if kwargs is None: 38 | f = get_filter(function)() 39 | else: 40 | # create a filter given its name in the registry 41 | f = get_filter(function)(**kwargs) # TODO: pass kwargs to filters properly 42 | # add the filter as a pipeline step 43 | filters.append(f) 44 | 45 | return FilterEnsemble(name=filter_name, filters=filters) 46 | -------------------------------------------------------------------------------- /lmms-eval_videochat/lmms_eval/filters/decontamination.py: -------------------------------------------------------------------------------- 1 | from lmms_eval.api.filter import Filter 2 | 3 | 4 | class DecontaminationFilter(Filter): 5 | """ 6 | A filter which evaluates 7 | """ 8 | 9 | name = "track_decontamination" 10 | 11 | def __init__(self, path) -> None: 12 | """ 13 | 14 | TODO: make sure only ever run one time on the train set (should this be cached as a class var? keyed by value for "path"). 
15 | should further cache result on a given (task_name, doc_id) 16 | """ 17 | self._decontam_results = None 18 | 19 | def apply(self, resps, docs) -> None: 20 | """ 21 | Return {"no_contamination", "only_contamination"} keys for the 2 different subsets 22 | """ 23 | pass 24 | -------------------------------------------------------------------------------- /lmms-eval_videochat/lmms_eval/filters/selection.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | 3 | from lmms_eval.api.filter import Filter 4 | 5 | 6 | class TakeFirstFilter(Filter): 7 | def __init__(self) -> None: 8 | """ 9 | Can define custom behavior here, if an individual instantiation of a Filter class should have state. 10 | """ 11 | 12 | def apply(self, resps, docs): 13 | """ 14 | Assuming each entry of `resps` is a list of model responses, we discard all but the first response. 15 | """ 16 | return map(lambda r: r[0], resps) 17 | 18 | 19 | class TakeKFilter(Filter): 20 | def __init__(self, *args, **kwargs) -> None: 21 | self.k = kwargs.pop("k") 22 | 23 | super().__init__(*args, **kwargs) 24 | 25 | def apply(self, resps, docs): 26 | # check we have at least k responses per doc, else we can't take the first k 27 | assert len(resps[0]) >= self.k, f"Need at least {self.k} responses per doc to take first {self.k}, but got {len(resps[0])} only! Please increase TaskConfig.repeats ." 28 | return map(lambda r: r[: self.k], resps) 29 | 30 | 31 | class MajorityVoteFilter(Filter): 32 | def __init__(self) -> None: 33 | """ 34 | Can define custom behavior here, if an individual instantiation of a Filter class should have state. 35 | """ 36 | 37 | def apply(self, resps, docs): 38 | """ 39 | Each entry of `resps` is a list of model responses. 40 | We select the response that occurs most frequently in each entry of `resps`. 41 | """ 42 | 43 | def select_majority(resp): 44 | counts = Counter(resp) 45 | vote = counts.most_common(1)[0][0] 46 | return vote 47 | 48 | return map(lambda r: [select_majority(r)], resps) 49 | -------------------------------------------------------------------------------- /lmms-eval_videochat/lmms_eval/filters/transformation.py: -------------------------------------------------------------------------------- 1 | from lmms_eval.api.filter import Filter 2 | 3 | 4 | class LowercaseFilter(Filter): 5 | def __init__(self) -> None: 6 | pass 7 | 8 | def apply(self, resps, docs): 9 | def filter_set(inst): 10 | return [resp.lower() for resp in inst] 11 | 12 | return [filter_set(resp) for resp in resps] 13 | 14 | 15 | class UppercaseFilter(Filter): 16 | def __init__(self) -> None: 17 | pass 18 | 19 | def apply(self, resps, docs): 20 | def filter_set(inst): 21 | return [resp.upper() for resp in inst] 22 | 23 | return [filter_set(resp) for resp in resps] 24 | 25 | 26 | class MapFilter(Filter): 27 | def __init__(self, mapping_dict: dict = {}, default_value=None) -> None: 28 | """ 29 | Initializes the MapFilter with a given mapping dictionary and default value. 30 | 31 | Args: 32 | - mapping_dict (dict): A dictionary containing the key-value mappings. 33 | Default is an empty dictionary. 34 | - default_value (Any): The value to be returned when a key is not found in the mapping_dict. 35 | Default is None. 
36 | 37 | Example: 38 | mapper = MapFilter({'A': 1, 'B': 2}, default_value=0) 39 | """ 40 | assert isinstance(mapping_dict, dict), "Provided mapping_dict is not a dictionary" 41 | self.mapping_dict = mapping_dict 42 | self.default_value = default_value 43 | 44 | def apply(self, resps, docs): 45 | def filter_set(inst): 46 | return [self.mapping_dict.get(resp, self.default_value) for resp in inst] 47 | 48 | return [filter_set(resp) for resp in resps] 49 | -------------------------------------------------------------------------------- /lmms-eval_videochat/lmms_eval/models/__init__.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import os 3 | import hf_transfer 4 | from loguru import logger 5 | import sys 6 | import hf_transfer 7 | 8 | os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1" 9 | 10 | logger.remove() 11 | logger.add(sys.stdout, level="WARNING") 12 | 13 | AVAILABLE_MODELS = { 14 | # "batch_gpt4": "BatchGPT4", 15 | # "claude": "Claude", 16 | # "from_log": "FromLog", 17 | # "fuyu": "Fuyu", 18 | # "gemini_api": "GeminiAPI", 19 | # "gpt4v": "GPT4V", 20 | # "idefics2": "Idefics2", 21 | # "instructblip": "InstructBLIP", 22 | # "internvl": "InternVLChat", 23 | # "internvl2": "InternVL2", 24 | "internvl2_video": "InternVL2_video", 25 | "internvl2_video_new": "InternVL2_video_new", 26 | # "llama_vid": "LLaMAVid", 27 | # "llava": "Llava", 28 | # "llava_hf": "LlavaHf", 29 | "llava_onevision": "Llava_OneVision", 30 | # "llava_sglang": "LlavaSglang", 31 | # "llava_vid": "LlavaVid", 32 | # "longva": "LongVA", 33 | # "mantis": "Mantis", 34 | # "minicpm_v": "MiniCPM_V", 35 | # "mplug_owl_video": "mplug_Owl", 36 | # "phi3v": "Phi3v", 37 | # "qwen_vl": "Qwen_VL", 38 | # "qwen_vl_api": "Qwen_VL_API", 39 | # "reka": "Reka", 40 | # "srt_api": "SRT_API", 41 | # "tinyllava": "TinyLlava", 42 | # "videoChatGPT": "VideoChatGPT", 43 | # "video_llava": "VideoLLaVA", 44 | # "vila": "VILA", 45 | # "xcomposer2_4KHD": "XComposer2_4KHD", 46 | # "xcomposer2d5": "XComposer2D5", 47 | "videochat_next": "VideoChat_NeXT", 48 | "videochat_next_image": "VideoChat_NeXT_image", 49 | "videochat_next_dynamic": "VideoChat_NeXT_dynamic", 50 | "videochat_next_pdrop": "VideoChat_NeXT_Pdrop", 51 | "videochat_next_fastv": "VideoChat_NeXT_FastV", 52 | "videochat_pdrop": "VideoChat_Pdrop", 53 | "videochat": "VideoChat", 54 | "videochat_next_old": "VideoChat_NeXT_old", 55 | "videochat_next_dynamic_pdrop":"VideoChat_NeXT_dynamic_pdrop", 56 | "videochat_next_dynamic_newprompt":"VideoChat_NeXT_dynamic_newprompt", 57 | "videochat_next_dynamic_pdrop_newprompt":"VideoChat_NeXT_dynamic_pdrop_newprompt", 58 | "videochat_flash": "VideoChat_Flash", 59 | "videochat_flash2": "VideoChat_Flash2", 60 | "qwen2_5_vl_lxh": "Qwen2_5_VL" 61 | } 62 | 63 | for model_name, model_class in AVAILABLE_MODELS.items(): 64 | try: 65 | exec(f"from .{model_name} import {model_class}") 66 | except Exception as e: 67 | logger.debug(f"Failed to import {model_class} from {model_name}: {e}") 68 | 69 | if os.environ.get("LMMS_EVAL_PLUGINS", None): 70 | # Allow specifying other packages to import models from 71 | for plugin in os.environ["LMMS_EVAL_PLUGINS"].split(","): 72 | m = importlib.import_module(f"{plugin}.models") 73 | for model_name, model_class in getattr(m, "AVAILABLE_MODELS").items(): 74 | try: 75 | exec(f"from {plugin}.models.{model_name} import {model_class}") 76 | except ImportError as e: 77 | logger.debug(f"Failed to import {model_class} from {model_name}: {e}") 78 | 
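The `LMMS_EVAL_PLUGINS` hook above lets an external package register extra models without editing this file: each package listed in the environment variable must expose a `models.AVAILABLE_MODELS` mapping of the same shape, and every entry is imported the same way as the built-in models. A minimal sketch of such a plugin follows; the package, module, and class names are hypothetical.

```python
# --- my_plugin/models/__init__.py (hypothetical plugin package) ---
# Registry-name -> class-name mapping, same shape as lmms_eval.models.AVAILABLE_MODELS.
AVAILABLE_MODELS = {
    "my_video_model": "MyVideoModel",
}

# --- my_plugin/models/my_video_model.py (hypothetical) ---
from lmms_eval.api.model import lmms
from lmms_eval.api.registry import register_model


@register_model("my_video_model")  # must subclass lmms, or register_model's assert fails
class MyVideoModel(lmms):
    def generate_until(self, requests):
        # Handle "generate_until" Instances (see lmms_eval/api/instance.py).
        raise NotImplementedError

    def loglikelihood(self, requests):
        # Handle "loglikelihood" Instances.
        raise NotImplementedError
```

With `export LMMS_EVAL_PLUGINS=my_plugin` set, the new model becomes selectable via `--model my_video_model`, exactly like the entries registered in this file.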
-------------------------------------------------------------------------------- /lmms-eval_videochat/lmms_eval/models/model_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/VideoChat-R1/dba50f32fffb763e0fdf3810244be76a44147df2/lmms-eval_videochat/lmms_eval/models/model_utils/__init__.py -------------------------------------------------------------------------------- /lmms-eval_videochat/lmms_eval/models/model_utils/load_video.py: -------------------------------------------------------------------------------- 1 | import av 2 | from av.codec.context import CodecContext 3 | import numpy as np 4 | 5 | 6 | # This one is faster 7 | def record_video_length_stream(container, indices): 8 | frames = [] 9 | start_index = indices[0] 10 | end_index = indices[-1] 11 | for i, frame in enumerate(container.decode(video=0)): 12 | if i > end_index: 13 | break 14 | if i >= start_index and i in indices: 15 | frames.append(frame) 16 | return frames 17 | 18 | 19 | # This one works for all types of video 20 | def record_video_length_packet(container): 21 | frames = [] 22 | # https://github.com/PyAV-Org/PyAV/issues/1269 23 | # https://www.cnblogs.com/beyond-tester/p/17641872.html 24 | # context = CodecContext.create("libvpx-vp9", "r") 25 | for packet in container.demux(video=0): 26 | for frame in packet.decode(): 27 | frames.append(frame) 28 | return frames 29 | 30 | 31 | def read_video_pyav(video_path, num_frm=8): 32 | container = av.open(video_path) 33 | 34 | if "webm" not in video_path and "mkv" not in video_path: 35 | # For mp4, we try loading with stream first 36 | try: 37 | container = av.open(video_path) 38 | total_frames = container.streams.video[0].frames 39 | sampled_frm = min(total_frames, num_frm) 40 | indices = np.linspace(0, total_frames - 1, sampled_frm, dtype=int) 41 | frames = record_video_length_stream(container, indices) 42 | except: 43 | container = av.open(video_path) 44 | frames = record_video_length_packet(container) 45 | total_frames = len(frames) 46 | sampled_frm = min(total_frames, num_frm) 47 | indices = np.linspace(0, total_frames - 1, sampled_frm, dtype=int) 48 | frames = [frames[i] for i in indices] 49 | else: 50 | container = av.open(video_path) 51 | frames = record_video_length_packet(container) 52 | total_frames = len(frames) 53 | sampled_frm = min(total_frames, num_frm) 54 | indices = np.linspace(0, total_frames - 1, sampled_frm, dtype=int) 55 | frames = [frames[i] for i in indices] 56 | return np.stack([x.to_ndarray(format="rgb24") for x in frames]) 57 | -------------------------------------------------------------------------------- /lmms-eval_videochat/lmms_eval/models/mplug_owl_video/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | from typing import TYPE_CHECKING 15 | 16 | from transformers.utils import OptionalDependencyNotAvailable, _LazyModule, is_tokenizers_available, is_torch_available 17 | 18 | 19 | _import_structure = { 20 | "configuration_mplug_owl": ["MPLUG_OWL_PRETRAINED_CONFIG_ARCHIVE_MAP", "MplugOwlConfig"], 21 | "processing_mplug_owl": ["MplugOwlImageProcessor", "MplugOwlProcessor"], 22 | "tokenization_mplug_owl": ["MplugOwlTokenizer"], 23 | } 24 | 25 | try: 26 | if not is_tokenizers_available(): 27 | raise OptionalDependencyNotAvailable() 28 | except OptionalDependencyNotAvailable: 29 | pass 30 | 31 | 32 | try: 33 | if not is_torch_available(): 34 | raise OptionalDependencyNotAvailable() 35 | except OptionalDependencyNotAvailable: 36 | pass 37 | else: 38 | _import_structure["modeling_mplug_owl"] = [ 39 | "MPLUG_OWL_PRETRAINED_MODEL_ARCHIVE_LIST", 40 | "MplugOwlForConditionalGeneration", 41 | "MplugOwlModel", 42 | ] 43 | 44 | 45 | if TYPE_CHECKING: 46 | from .configuration_mplug_owl import MPLUG_OWL_PRETRAINED_CONFIG_ARCHIVE_MAP, MplugOwlConfig 47 | from .tokenization_mplug_owl import MplugOwlTokenizer 48 | 49 | try: 50 | if not is_tokenizers_available(): 51 | raise OptionalDependencyNotAvailable() 52 | except OptionalDependencyNotAvailable: 53 | pass 54 | 55 | try: 56 | if not is_torch_available(): 57 | raise OptionalDependencyNotAvailable() 58 | except OptionalDependencyNotAvailable: 59 | pass 60 | else: 61 | from .modeling_mplug_owl import ( 62 | MPLUG_OWL_PRETRAINED_MODEL_ARCHIVE_LIST, 63 | MplugOwlForConditionalGeneration, 64 | MplugOwlModel, 65 | MplugOwlPreTrainedModel, 66 | ) 67 | 68 | 69 | else: 70 | import sys 71 | 72 | sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) 73 | 74 | from .configuration_mplug_owl import * 75 | from .modeling_mplug_owl import * 76 | from .processing_mplug_owl import * 77 | from .tokenization_mplug_owl import * 78 | -------------------------------------------------------------------------------- /lmms-eval_videochat/lmms_eval/models/mplug_owl_video/tokenization_mplug_owl.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2022 x-plug and The HuggingFace Inc. team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Tokenization classes for MplugOwl.""" 16 | 17 | from transformers.models.llama.tokenization_llama import LlamaTokenizer 18 | 19 | from loguru import logger 20 | 21 | VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} 22 | 23 | PRETRAINED_VOCAB_FILES_MAP = { 24 | "vocab_file": { 25 | "MAGAer13/mplug-owl-llama-7b": "https://huggingface.co/MAGAer13/mplug-owl-llama-7b/resolve/main/vocab.txt", 26 | }, 27 | } 28 | 29 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 30 | "MAGAer13/mplug-owl-llama-7b": 2048, 31 | } 32 | 33 | 34 | class MplugOwlTokenizer(LlamaTokenizer): 35 | def __init__( 36 | self, 37 | vocab_file, 38 | unk_token="", 39 | bos_token="", 40 | eos_token="", 41 | pad_token="", 42 | sp_model_kwargs=None, 43 | add_bos_token=False, 44 | add_eos_token=False, 45 | clean_up_tokenization_spaces=False, 46 | **kwargs, 47 | ): 48 | super().__init__( 49 | vocab_file, 50 | unk_token, 51 | bos_token, 52 | eos_token, 53 | pad_token, 54 | sp_model_kwargs, 55 | add_bos_token, 56 | add_eos_token, 57 | clean_up_tokenization_spaces, 58 | **kwargs, 59 | ) 60 | self.eod_id = self.eos_token_id 61 | -------------------------------------------------------------------------------- /lmms-eval_videochat/lmms_eval/models/video_chatgpt/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import VideoChatGPTLlamaForCausalLM 2 | -------------------------------------------------------------------------------- /lmms-eval_videochat/lmms_eval/models/video_chatgpt/constants.py: -------------------------------------------------------------------------------- 1 | CONTROLLER_HEART_BEAT_EXPIRATION = 30 2 | WORKER_HEART_BEAT_INTERVAL = 15 3 | 4 | LOGDIR = "." 5 | 6 | 7 | # Defining model 8 | DEFAULT_VIDEO_TOKEN = "