├── README.md
├── annotations
├── ActivityNet
│ └── activitynet_annotation
│ │ ├── test.json
│ │ ├── train.json
│ │ ├── val.json
│ │ └── val_2.json
├── Charades
│ └── charades_annotation
│ │ ├── charades_test.json
│ │ ├── train.json
│ │ └── val.json
├── Got
│ ├── got_train.json
│ └── got_val.json
├── NextGQA
│ ├── nextgqa_test.json
│ └── nextgqa_val.json
└── VideoEval
│ └── Quality_Access
│ └── annotations
│ ├── Quality_Access_100shot.json
│ ├── Quality_Access_16shot.json
│ ├── Quality_Access_4shot.json
│ └── Quality_Access_test.json
├── configs
├── ddp.yaml
├── zero2.yaml
└── zero3.yaml
├── framework.png
├── lmms-eval_videochat
├── .gitignore
├── .pre-commit-config.yaml
├── LICENSE
├── README.md
├── docs
│ ├── README.md
│ ├── commands.md
│ ├── current_tasks.md
│ ├── model_guide.md
│ ├── run_examples.md
│ └── task_guide.md
├── eval_annotations
│ ├── MVBench
│ │ ├── README.md
│ │ └── json
│ │ │ ├── action_antonym.json
│ │ │ ├── action_count.json
│ │ │ ├── action_localization.json
│ │ │ ├── action_prediction.json
│ │ │ ├── action_sequence.json
│ │ │ ├── character_order.json
│ │ │ ├── counterfactual_inference.json
│ │ │ ├── egocentric_navigation.json
│ │ │ ├── episodic_reasoning.json
│ │ │ ├── fine_grained_action.json
│ │ │ ├── fine_grained_pose.json
│ │ │ ├── moving_attribute.json
│ │ │ ├── moving_count.json
│ │ │ ├── moving_direction.json
│ │ │ ├── object_existence.json
│ │ │ ├── object_interaction.json
│ │ │ ├── object_shuffle.json
│ │ │ ├── scene_transition.json
│ │ │ ├── state_change.json
│ │ │ └── unexpected_action.json
│ └── Video-MME_short
│ │ ├── README.md
│ │ └── videomme_short
│ │ └── test-00000-of-00001.parquet
├── lmms_eval
│ ├── __init__.py
│ ├── __main__.py
│ ├── api
│ │ ├── __init__.py
│ │ ├── filter.py
│ │ ├── instance.py
│ │ ├── metrics.py
│ │ ├── model.py
│ │ ├── registry.py
│ │ ├── samplers.py
│ │ └── task.py
│ ├── evaluator.py
│ ├── filters
│ │ ├── __init__.py
│ │ ├── decontamination.py
│ │ ├── extraction.py
│ │ ├── selection.py
│ │ └── transformation.py
│ ├── logging_utils.py
│ ├── models
│ │ ├── __init__.py
│ │ ├── model_utils
│ │ │ ├── __init__.py
│ │ │ ├── load_video.py
│ │ │ ├── my_qwen_utils.py
│ │ │ └── qwen
│ │ │ │ └── qwen_generate_utils.py
│ │ ├── mplug_owl_video
│ │ │ ├── __init__.py
│ │ │ ├── configuration_mplug_owl.py
│ │ │ ├── modeling_mplug_owl.py
│ │ │ ├── processing_mplug_owl.py
│ │ │ └── tokenization_mplug_owl.py
│ │ ├── qwen2_5_vl_lxh.py
│ │ ├── qwen_vl.py
│ │ └── video_chatgpt
│ │ │ ├── __init__.py
│ │ │ ├── constants.py
│ │ │ ├── eval
│ │ │ ├── __init__.py
│ │ │ └── model_utils.py
│ │ │ ├── inference.py
│ │ │ ├── model
│ │ │ ├── __init__.py
│ │ │ ├── consolidate.py
│ │ │ ├── make_delta.py
│ │ │ ├── utils.py
│ │ │ └── video_chatgpt.py
│ │ │ ├── single_video_inference.py
│ │ │ ├── utils.py
│ │ │ └── video_conversation.py
│ ├── tasks
│ │ ├── __init__.py
│ │ ├── _task_utils
│ │ │ ├── file_utils.py
│ │ │ ├── gpt_eval_utils.py
│ │ │ ├── video_loader.py
│ │ │ └── vqa_eval_metric.py
│ │ ├── dream_1k
│ │ │ ├── _default_template.yaml
│ │ │ ├── dream_1k.yaml
│ │ │ ├── dream_1k_cn.yaml
│ │ │ └── utils.py
│ │ ├── mvbench_nothink
│ │ │ ├── _default_template.yaml
│ │ │ ├── mvbench_action_antonym_nothink.yaml
│ │ │ ├── mvbench_action_count_nothink.yaml
│ │ │ ├── mvbench_action_localization_nothink.yaml
│ │ │ ├── mvbench_action_prediction_nothink.yaml
│ │ │ ├── mvbench_action_sequence_nothink.yaml
│ │ │ ├── mvbench_character_order_nothink.yaml
│ │ │ ├── mvbench_counterfactual_inference_nothink.yaml
│ │ │ ├── mvbench_egocentric_navigation_nothink.yaml
│ │ │ ├── mvbench_episodic_reasoning_nothink.yaml
│ │ │ ├── mvbench_fine_grained_action_nothink.yaml
│ │ │ ├── mvbench_fine_grained_pose_nothink.yaml
│ │ │ ├── mvbench_moving_attribute_nothink.yaml
│ │ │ ├── mvbench_moving_count_nothink.yaml
│ │ │ ├── mvbench_moving_direction_nothink.yaml
│ │ │ ├── mvbench_nothink.yaml
│ │ │ ├── mvbench_object_existence_nothink.yaml
│ │ │ ├── mvbench_object_interaction_nothink.yaml
│ │ │ ├── mvbench_object_shuffle_nothink.yaml
│ │ │ ├── mvbench_scene_transition_nothink.yaml
│ │ │ ├── mvbench_state_change_nothink.yaml
│ │ │ ├── mvbench_unexpected_action_nothink.yaml
│ │ │ └── utils.py
│ │ ├── mvbench_think
│ │ │ ├── _default_template.yaml
│ │ │ ├── mvbench_action_antonym_think.yaml
│ │ │ ├── mvbench_action_count_think.yaml
│ │ │ ├── mvbench_action_localization_think.yaml
│ │ │ ├── mvbench_action_prediction_think.yaml
│ │ │ ├── mvbench_action_sequence_think.yaml
│ │ │ ├── mvbench_character_order_think.yaml
│ │ │ ├── mvbench_counterfactual_inference_think.yaml
│ │ │ ├── mvbench_egocentric_navigation_think.yaml
│ │ │ ├── mvbench_episodic_reasoning_think.yaml
│ │ │ ├── mvbench_fine_grained_action_think.yaml
│ │ │ ├── mvbench_fine_grained_pose_think.yaml
│ │ │ ├── mvbench_moving_attribute_think.yaml
│ │ │ ├── mvbench_moving_count_think.yaml
│ │ │ ├── mvbench_moving_direction_think.yaml
│ │ │ ├── mvbench_object_existence_think.yaml
│ │ │ ├── mvbench_object_interaction_think.yaml
│ │ │ ├── mvbench_object_shuffle_think.yaml
│ │ │ ├── mvbench_scene_transition_think.yaml
│ │ │ ├── mvbench_state_change_think.yaml
│ │ │ ├── mvbench_think.yaml
│ │ │ ├── mvbench_unexpected_action_think.yaml
│ │ │ └── utils.py
│ │ ├── perceptiontest
│ │ │ └── val
│ │ │ │ ├── _default_template_yaml
│ │ │ │ ├── perceptiontest_mc_nothink.yaml
│ │ │ │ ├── perceptiontest_mc_think.yaml
│ │ │ │ └── utils.py
│ │ └── videomme
│ │ │ ├── utils.py
│ │ │ ├── videomme_short_nothink.yaml
│ │ │ ├── videomme_short_nothink_glue.yaml
│ │ │ ├── videomme_short_think.yaml
│ │ │ └── videomme_short_think_glue.yaml
│ └── utils.py
├── miscs
│ ├── example_eval.yaml
│ ├── llava_repr_requirements.txt
│ ├── llava_result_check.md
│ ├── llava_sglang_result_check.md
│ ├── repr_scripts.sh
│ ├── repr_torch_envs.txt
│ ├── scienceqa_id.txt
│ ├── script.sh
│ ├── test_llava.py
│ ├── test_scienceqa.py
│ ├── tinyllava_repr_requirements.txt
│ └── tinyllava_repr_scripts.sh
├── pyproject.toml
├── scripts_reason
│ ├── eval_qwen2_5vl_all_tasks_new.sh
│ ├── eval_qwen2_5vl_mv_ptest.sh
│ ├── eval_qwen2_5vl_nothink.sh
│ ├── eval_qwen2_5vl_nothink_glue.sh
│ └── eval_qwen2_5vl_vmme_short.sh
├── setup.py
└── tools
│ ├── get_video_avg_time.py
│ ├── lite
│ ├── embed.py
│ ├── embedder
│ │ ├── BaseEmbedder.py
│ │ ├── ClipBgeEmbedder.py
│ │ └── __init__.py
│ ├── shrink.py
│ └── shrinker
│ │ ├── BaseShrinker.py
│ │ ├── EmbedShrinker.py
│ │ ├── __init__.py
│ │ └── sampling_methods
│ │ ├── __init__.py
│ │ ├── kcenter_greedy.py
│ │ └── sampling_def.py
│ ├── live_bench
│ ├── create_dataset.py
│ ├── data_summary.ipynb
│ ├── example.ipynb
│ ├── filter.ipynb
│ ├── live_bench
│ │ ├── __init__.py
│ │ ├── api
│ │ │ └── live_bench.py
│ │ ├── data_generator
│ │ │ ├── __init__.py
│ │ │ ├── check_prompt.md
│ │ │ ├── default_criteria.md
│ │ │ ├── example
│ │ │ │ ├── example_output.json
│ │ │ │ └── example_website.png
│ │ │ ├── live_bench.py
│ │ │ ├── live_bench_data.py
│ │ │ ├── prompt.md
│ │ │ ├── qa_generator.py
│ │ │ ├── question_finalizer.py
│ │ │ ├── response.py
│ │ │ ├── score_getter.py
│ │ │ ├── score_prompt.md
│ │ │ └── utils
│ │ │ │ ├── __init__.py
│ │ │ │ ├── claude.py
│ │ │ │ ├── extract_infomation.py
│ │ │ │ ├── gemini.py
│ │ │ │ └── gpt4v.py
│ │ ├── driver
│ │ │ ├── .gitignore
│ │ │ ├── __init__.py
│ │ │ └── load_driver.py
│ │ ├── screen_shoter
│ │ │ ├── __init__.py
│ │ │ ├── screen.py
│ │ │ └── screen_shoter.py
│ │ ├── view.ipynb
│ │ └── websites
│ │ │ ├── __init__.py
│ │ │ ├── load_website.py
│ │ │ ├── website.py
│ │ │ └── website_list.yaml
│ ├── pyproject.toml
│ ├── refine_all_results.py
│ ├── script
│ │ ├── README.md
│ │ ├── modify.ipynb
│ │ └── upload_results.py
│ └── setup.py
│ ├── make_image_hf_dataset.ipynb
│ ├── make_vatex.py
│ ├── make_video_hf_dataset.ipynb
│ └── makecvrr.ipynb
├── requirements.txt
├── src
├── __init__.py
├── open_r1
│ ├── __init__.py
│ ├── evaluate.py
│ ├── generate.py
│ ├── grpo.py
│ ├── grpo_cls.py
│ ├── grpo_cls_nothink.py
│ ├── grpo_gqa.py
│ ├── grpo_gqa_nothink.py
│ ├── grpo_qa.py
│ ├── grpo_qa_nothink.py
│ ├── grpo_tasks.py
│ ├── grpo_tg.py
│ ├── grpo_video.py
│ ├── my_qwen_utils.py
│ ├── my_qwen_utils2.py
│ └── trainer
│ │ ├── __init__.py
│ │ ├── grpo_tasks_trainer.py
│ │ ├── grpo_trainer.py
│ │ ├── grpo_trainer_video_cls.py
│ │ ├── grpo_trainer_video_cls_nothink.py
│ │ ├── grpo_trainer_video_gqa.py
│ │ ├── grpo_trainer_video_gqa_nothink.py
│ │ ├── grpo_trainer_video_qa.py
│ │ ├── grpo_trainer_video_qa_nothink.py
│ │ ├── grpo_trainer_video_tg.py
│ │ ├── vllm_grpo_trainer.py
│ │ ├── vllm_grpo_trainer_video_tg.py
│ │ └── yza_vision_process.py
└── sft
│ ├── sft_cls.py
│ ├── sft_gqa.py
│ ├── sft_grounding.py
│ └── sft_track.py
├── src_eval
├── __init__.py
├── data_config.py
├── eval_prompts.py
├── evaluate_cls_quality_c8.py
├── evaluate_gqa.py
├── evaluate_grounding.py
├── evaluate_qa.py
├── evaluate_track.py
└── my_qwen_utils.py
└── training_scripts
├── run_grpo_video_cls_qa.sh
├── run_grpo_video_gqa.sh
├── run_grpo_video_gqa_nothink_3e.sh
├── run_grpo_video_qa.sh
├── run_grpo_video_qa_nothink.sh
├── run_grpo_video_task.sh
├── run_grpo_video_tg.sh
├── run_sft_video_cls_qa.sh
├── run_sft_video_gqa.sh
├── run_sft_video_track.sh
└── zero3_offload.json
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
7 |
8 | [Xinhao Li](https://scholar.google.com/citations?user=evR3uR0AAAAJ)\*, [Ziang Yan](https://scholar.google.com.hk/citations?user=78lx13MAAAAJ&hl=zh-CN)\*, Desen Meng, Lu Dong, [Xiangyu Zeng](https://scholar.google.com/citations?user=jS13DXkAAAAJ&hl=zh-CN), [Yinan He](https://dblp.org/pid/93/7763.html), [Yali Wang](https://scholar.google.com/citations?user=hD948dkAAAAJ), [Yu Qiao](https://scholar.google.com/citations?user=gFtI-8QAAAAJ&hl), [Yi Wang](https://scholar.google.com.hk/citations?user=Xm2M8UwAAAAJ)^ and [Limin Wang](https://scholar.google.com/citations?user=HEuN8PcAAAAJ)^
9 |
10 |
11 | 🤗 Model    |    📑 Paper   
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 | ## :fire: Updates
22 | - [x] **2025/04/22**: 🔥🔥🔥 We release our VideoChat-R1-caption on [Hugging Face](https://huggingface.co/collections/OpenGVLab/videochat-r1-67fbe26e4eb08c83aa24643e).
23 | - [x] **2025/04/14**: 🔥🔥🔥 We release our VideoChat-R1 and VideoChat-R1-thinking on [Hugging Face](https://huggingface.co/collections/OpenGVLab/videochat-r1-67fbe26e4eb08c83aa24643e).
24 | - [x] **2025/04/10**: 🔥🔥🔥 We release our paper and code.
25 |
26 |
27 | ## :parrot: Introduction
28 |
29 | 
30 |
31 |
32 |
33 | ## Demo & Inference
34 |
35 | Refer to the [HF README](https://huggingface.co/OpenGVLab/VideoChat-R1_7B) for instructions on running inference with our model.
36 |
37 | ## Evaluation
38 |
39 | See [eval_scripts](eval_scripts) and [lmms-eval_videochat](lmms-eval_videochat).
40 |
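For example, one of the provided evaluation scripts can be run directly (a sketch; checkpoint and data paths inside the script may need adjusting for your environment):

```bash
# Run one of the provided evaluation scripts; edit paths inside as needed.
cd lmms-eval_videochat
bash scripts_reason/eval_qwen2_5vl_nothink.sh
```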
41 |
42 | ## Training
43 |
44 | See [training_scripts](training_scripts).
45 |
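For example, the GRPO video-QA recipe can be launched with the script below (a sketch; GPU count and dataset paths inside the script may need adjusting):

```bash
# Launch GRPO fine-tuning for video QA; adjust paths and GPU settings inside the script.
bash training_scripts/run_grpo_video_qa.sh
```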
46 | # :page_facing_up: Citation
47 |
48 | If you find this project useful in your research, please consider citing:
49 | ```BibTeX
50 | @article{li2025videochatr1,
51 | title={VideoChat-R1: Enhancing Spatio-Temporal
52 | Perception via Reinforcement Fine-Tuning},
53 | author={Li, Xinhao and Yan, Ziang and Meng, Desen and Dong, Lu and Zeng, Xiangyu and He, Yinan and Wang, Yali and Qiao, Yu and Wang, Yi and Wang, Limin},
54 | journal={arXiv preprint arXiv:2504.06958},
55 | year={2025}
56 | }
57 | ```
58 |
59 |
62 |
--------------------------------------------------------------------------------
/annotations/VideoEval/Quality_Access/annotations/Quality_Access_4shot.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "video": "0506.mp4",
4 | "instruction": "How about the quality of the video from an aesthetic and technical point of view? Choice one answer from ['low quality', 'high quality'].",
5 | "answer": "low quality"
6 | },
7 | {
8 | "video": "0717.mp4",
9 | "instruction": "How about the quality of the video from an aesthetic and technical point of view? Choice one answer from ['low quality', 'high quality'].",
10 | "answer": "low quality"
11 | },
12 | {
13 | "video": "0423.mp4",
14 | "instruction": "How about the quality of the video from an aesthetic and technical point of view? Choice one answer from ['low quality', 'high quality'].",
15 | "answer": "low quality"
16 | },
17 | {
18 | "video": "0340.mp4",
19 | "instruction": "How about the quality of the video from an aesthetic and technical point of view? Choice one answer from ['low quality', 'high quality'].",
20 | "answer": "low quality"
21 | },
22 | {
23 | "video": "3208.mp4",
24 | "instruction": "How about the quality of the video from an aesthetic and technical point of view? Choice one answer from ['low quality', 'high quality'].",
25 | "answer": "high quality"
26 | },
27 | {
28 | "video": "3424.mp4",
29 | "instruction": "How about the quality of the video from an aesthetic and technical point of view? Choice one answer from ['low quality', 'high quality'].",
30 | "answer": "high quality"
31 | },
32 | {
33 | "video": "3126.mp4",
34 | "instruction": "How about the quality of the video from an aesthetic and technical point of view? Choice one answer from ['low quality', 'high quality'].",
35 | "answer": "high quality"
36 | },
37 | {
38 | "video": "3048.mp4",
39 | "instruction": "How about the quality of the video from an aesthetic and technical point of view? Choice one answer from ['low quality', 'high quality'].",
40 | "answer": "high quality"
41 | }
42 | ]
--------------------------------------------------------------------------------
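Each entry in the annotation file above pairs a video filename with a fixed instruction and a ground-truth label. For a quick sanity check you can inspect such a file with `jq` (assuming it is installed):

```bash
# Count the entries and print the first one in the 4-shot split.
jq 'length' annotations/VideoEval/Quality_Access/annotations/Quality_Access_4shot.json
jq '.[0]'   annotations/VideoEval/Quality_Access/annotations/Quality_Access_4shot.json
```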
/configs/ddp.yaml:
--------------------------------------------------------------------------------
1 | compute_environment: LOCAL_MACHINE
2 | debug: false
3 | distributed_type: MULTI_GPU
4 | downcast_bf16: 'no'
5 | gpu_ids: all
6 | machine_rank: 0
7 | main_training_function: main
8 | mixed_precision: bf16
9 | num_machines: 1
10 | num_processes: 8
11 | rdzv_backend: static
12 | same_network: true
13 | tpu_env: []
14 | tpu_use_cluster: false
15 | tpu_use_sudo: false
16 | use_cpu: false
17 |
--------------------------------------------------------------------------------
/configs/zero2.yaml:
--------------------------------------------------------------------------------
1 | compute_environment: LOCAL_MACHINE
2 | debug: false
3 | deepspeed_config:
4 |   deepspeed_multinode_launcher: standard
5 |   offload_optimizer_device: none
6 |   offload_param_device: none
7 |   zero3_init_flag: false
8 |   zero_stage: 2
9 | distributed_type: DEEPSPEED
10 | downcast_bf16: 'no'
11 | machine_rank: 0
12 | main_training_function: main
13 | mixed_precision: bf16
14 | num_machines: 1
15 | num_processes: 8
16 | rdzv_backend: static
17 | same_network: true
18 | tpu_env: []
19 | tpu_use_cluster: false
20 | tpu_use_sudo: false
21 | use_cpu: false
--------------------------------------------------------------------------------
/configs/zero3.yaml:
--------------------------------------------------------------------------------
1 | compute_environment: LOCAL_MACHINE
2 | debug: false
3 | deepspeed_config:
4 |   deepspeed_multinode_launcher: standard
5 |   offload_optimizer_device: none
6 |   offload_param_device: none
7 |   zero3_init_flag: true
8 |   zero3_save_16bit_model: true
9 |   zero_stage: 3
10 | distributed_type: DEEPSPEED
11 | downcast_bf16: 'no'
12 | machine_rank: 0
13 | main_training_function: main
14 | mixed_precision: bf16
15 | num_machines: 1
16 | num_processes: 8
17 | rdzv_backend: static
18 | same_network: true
19 | tpu_env: []
20 | tpu_use_cluster: false
21 | tpu_use_sudo: false
22 | use_cpu: false
23 |
--------------------------------------------------------------------------------
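These YAML files are `accelerate` launcher configs (plain DDP, DeepSpeed ZeRO-2, and DeepSpeed ZeRO-3). A minimal sketch of how such a config is consumed is shown below; the training entry point and its arguments are placeholders, so see the scripts under `training_scripts/` for the actual invocations:

```bash
# Sketch: pass one of the configs above to accelerate; the script and flags are placeholders.
accelerate launch --config_file configs/zero2.yaml \
    src/open_r1/grpo_qa.py \
    --output_dir ./checkpoints/example_run
```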
/framework.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/VideoChat-R1/dba50f32fffb763e0fdf3810244be76a44147df2/framework.png
--------------------------------------------------------------------------------
/lmms-eval_videochat/.gitignore:
--------------------------------------------------------------------------------
1 | env
2 | *.pyc
3 | output/
4 | data/
5 | lm_cache
6 | .idea
7 | build
8 | dist
9 | *.egg-info
10 | venv
11 | .vscode/
12 | temp
13 | __pycache__
14 | .ipynb_checkpoints
15 | temp
16 | .DS_STORE
17 | # IPython
18 | profile_default/
19 | ipython_config.py
20 | logs/
21 | scripts/
22 | wandb/
23 | SimSun.ttf
24 | submissions/
25 | lmms_eval/tasks/hallusion_bench/hallusion_output_vs_model.json
26 | lmms_eval/tasks/hallusion_bench/hallusion_output_vd_model.json
27 | zk.log
28 | cache_dir
29 | ckpt
30 | pretrained/
31 | LLaVA/
32 | *logs
33 | temp/
34 | InternVL/
35 | logs/
36 | data/
37 | llava-video/
38 | Video-MME/
39 | VATEX/
40 | lmms_eval/tasks/vatex/__pycache__/utils.cpython-310.pyc
41 | lmms_eval/tasks/mlvu/__pycache__/utils.cpython-310.pyc
--------------------------------------------------------------------------------
/lmms-eval_videochat/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 | - repo: https://github.com/psf/black
3 |   rev: 23.12.1
4 |   hooks:
5 |     - id: black
6 |       language_version: python3
--------------------------------------------------------------------------------
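The pre-commit config above pins Black 23.12.1 for formatting. It can be enabled locally with the standard pre-commit workflow:

```bash
pip install pre-commit
pre-commit install          # register the git hook defined by .pre-commit-config.yaml
pre-commit run --all-files  # optionally format the whole tree once
```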
/lmms-eval_videochat/LICENSE:
--------------------------------------------------------------------------------
1 | # For the main pipeline structure-related code, we maintain the original license provided with lm-evaluation-harness, which is the MIT License.
2 |
3 | MIT License
4 |
5 | Copyright (c) 2024 LMMs-Lab
6 |
7 | Permission is hereby granted, free of charge, to any person obtaining a copy
8 | of this software and associated documentation files (the "Software"), to deal
9 | in the Software without restriction, including without limitation the rights
10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 | copies of the Software, and to permit persons to whom the Software is
12 | furnished to do so, subject to the following conditions:
13 |
14 | The above copyright notice and this permission notice shall be included in all
15 | copies or substantial portions of the Software.
16 |
17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 | SOFTWARE.
24 |
25 | # For the multimodal models and datasets that we have added (defined as code in the lmms_eval/tasks and lmms_eval/models folders), we apply the Apache License.
26 |
27 | Apache 2.0 License
28 |
29 | Copyright (c) 2024 LMMs-Lab
30 |
31 | Licensed under the Apache License, Version 2.0 (the "License");
32 | you may not use this file except in compliance with the License.
33 | You may obtain a copy of the License at
34 |
35 | http://www.apache.org/licenses/LICENSE-2.0
36 |
37 | Unless required by applicable law or agreed to in writing, software
38 | distributed under the License is distributed on an "AS IS" BASIS,
39 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
40 | See the License for the specific language governing permissions and
41 | limitations under the License.
42 |
43 | When modifying the code, please include the following information about the original lmms-eval source:
44 | # Adopted from lmms-eval from https://github.com/EvolvingLMMs-Lab/lmms-eval. Below is the original copyright:
45 | #
46 | # Licensed under the Apache License, Version 2.0 (the "License");
47 | # you may not use this file except in compliance with the License.
48 | # You may obtain a copy of the License at
49 | #
50 | # http://www.apache.org/licenses/LICENSE-2.0
51 | #
52 | # Unless required by applicable law or agreed to in writing, software
53 | # distributed under the License is distributed on an "AS IS" BASIS,
54 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
55 | # See the License for the specific language governing permissions and
56 | # limitations under the License.
57 |
--------------------------------------------------------------------------------
/lmms-eval_videochat/README.md:
--------------------------------------------------------------------------------
1 |
2 | # How to use
3 |
4 | We have modified the data loading in lmms-eval: instead of downloading from Hugging Face, data is loaded locally. Therefore, you need to **specify the data path** in the YAML file of each task. The data can be downloaded from the [lmms-eval Hugging Face organization](https://huggingface.co/lmms-lab) or from the official repos of the corresponding tasks.
5 |
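For example, an evaluation dataset can be fetched locally with `huggingface-cli` (the dataset repo id and target directory below are illustrative), after which you point the corresponding task YAML at the local copy:

```bash
# Illustrative: download an evaluation dataset to a local directory,
# then set the data/video path in lmms_eval/tasks/<task>/*.yaml to that directory.
huggingface-cli download lmms-lab/MVBench --repo-type dataset --local-dir ./data/MVBench
```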
6 | ## Installation
7 |
8 | You can install the package by cloning the repository and running the following command:
9 | ```bash
10 | git clone https://github.com/OpenGVLab/VideoChat-R1
11 | cd lmms-eval_videochat
12 | pip install -e .
13 | ```
14 | We provide all evaluation [scripts](scripts_reason) and [annotations](eval_annotations) here.
15 |
16 | You can evaluate a single task:
17 | ```bash
18 | TASK=videomme_short_nothink
19 | MODEL_NAME=qwen2_5_vl_lxh
20 | MAX_NUM_FRAMES=512
21 | CKPT_PATH=OpenGVLab/VideoChat-R1-7B
22 |
23 | echo $TASK
24 | TASK_SUFFIX="${TASK//,/_}"
25 | echo $TASK_SUFFIX
26 |
27 | JOB_NAME=$(basename $0)_$(date +"%Y%m%d_%H%M%S")
28 | MASTER_PORT=$((18000 + $RANDOM % 100))
29 | NUM_GPUS=8
30 |
31 |
32 | accelerate launch --num_processes ${NUM_GPUS} --main_process_port ${MASTER_PORT} -m lmms_eval \
33 |     --model ${MODEL_NAME} \
34 |     --model_args pretrained=$CKPT_PATH,max_num_frames=$MAX_NUM_FRAMES \
35 |     --tasks $TASK \
36 |     --batch_size 1 \
37 |     --log_samples \
38 |     --log_samples_suffix $TASK_SUFFIX \
39 |     --output_path ./logs/${JOB_NAME}_${MODEL_NAME}_f${MAX_NUM_FRAMES}
40 | ```
41 | You can evaluate multiple tasks at once, for example:
42 | ```bash
43 | TASK=mvbench_nothink,videomme_short_nothink
44 | MODEL_NAME=qwen2_5_vl_lxh
45 | MAX_NUM_FRAMES=512
46 | CKPT_PATH=OpenGVLab/VideoChat-Flash-Qwen2-7B_res448
47 |
48 | echo $TASK
49 | TASK_SUFFIX="${TASK//,/_}"
50 | echo $TASK_SUFFIX
51 |
52 | JOB_NAME=$(basename $0)_$(date +"%Y%m%d_%H%M%S")
53 | MASTER_PORT=$((18000 + $RANDOM % 100))
54 | NUM_GPUS=8
55 |
56 |
57 | accelerate launch --num_processes ${NUM_GPUS} --main_process_port ${MASTER_PORT} -m lmms_eval \
58 |     --model ${MODEL_NAME} \
59 |     --model_args pretrained=$CKPT_PATH,max_num_frames=$MAX_NUM_FRAMES \
60 |     --tasks $TASK \
61 |     --batch_size 1 \
62 |     --log_samples \
63 |     --log_samples_suffix $TASK_SUFFIX \
64 |     --output_path ./logs/${JOB_NAME}_${MODEL_NAME}_f${MAX_NUM_FRAMES}
65 | ```
66 |
67 |
--------------------------------------------------------------------------------
/lmms-eval_videochat/docs/README.md:
--------------------------------------------------------------------------------
1 | # LMMs Eval Documentation
2 |
3 | Welcome to the docs for `lmms-eval`!
4 |
5 | The majority of this documentation is adapted from [lm-eval-harness](https://github.com/EleutherAI/lm-evaluation-harness/).
6 |
7 | ## Table of Contents
8 |
9 | * To learn about the command line flags, see the [commands](commands.md)
10 | * To learn how to add a new model, see the [Model Guide](model_guide.md).
11 | * For a crash course on adding new tasks to the library, see our [Task Guide](task_guide.md).
12 | * If you need to upload your datasets in the correct HF format with viewer support, please refer to [tools](https://github.com/EvolvingLMMs-Lab/lmms-eval/tree/pufanyi/hf_dataset_docs/tools)
13 |
--------------------------------------------------------------------------------
/lmms-eval_videochat/docs/commands.md:
--------------------------------------------------------------------------------
1 | # User Guide
2 | This document describes the interface exposed by `lmms_eval` and the command-line flags available to users.
3 |
4 | ## Command-line Interface
5 |
6 |
7 | Equivalently, running the library can be done via the `lmms_eval` entrypoint at the command line.
8 |
9 | This mode supports a number of command-line arguments, the details of which can also be seen by running with `-h` or `--help`:
10 |
11 | * `--model` : Selects which model type or provider is evaluated. Must be a model registered under `lmms_eval/models`. For example, `--model qwen_vl` or `--model llava`.
12 |
13 | * `--model_args` : Controls parameters passed to the model constructor. Accepts a string of comma-separated keyword arguments in the format `"arg1=val1,arg2=val2,..."`, for example `--model_args pretrained=liuhaotian/llava-v1.5-7b,batch_size=1`. For a full list of supported keyword arguments, see the initialization of the corresponding model class in `lmms_eval/models/`.
14 |
15 | * `--tasks` : Determines which tasks or task groups are evaluated. Accepts a comma-separated list of task names or task group names, which must consist solely of valid tasks/groups. You can use `--tasks list` to see all available tasks. If you have added your own tasks but they do not appear in the list, try setting `--verbosity=DEBUG` to view the error message. You can also use `--tasks list_with_num` to check every task and the number of questions it contains; note that `list_with_num` will download all the available datasets and may require a lot of memory and time.
16 |
17 | * `--batch_size` : Sets the batch size used for evaluation. Can be a positive integer or `"auto"` to automatically select the largest batch size that will fit in memory, speeding up evaluation. One can pass `--batch_size auto:N` to re-select the maximum batch size `N` times during evaluation. This can help accelerate evaluation further, since `lm-eval` sorts documents in descending order of context length.
18 |
19 | * `--output_path` : A string of the form `dir/file.jsonl` or `dir/`. Provides a path where high-level results will be saved, either into the file named or into the directory named. If `--log_samples` is passed as well, then per-document outputs and metrics will be saved into the directory as well.
20 |
21 | * `--log_samples` : If this flag is passed, then the model's outputs, and the text fed into the model, will be saved at per-document granularity. Must be used with `--output_path`.
22 |
23 | * `--limit` : Accepts an integer, or a float between 0.0 and 1.0 . If passed, will limit the number of documents to evaluate to the first X documents (if an integer) per task or first X% of documents per task. Useful for debugging, especially on costly API models.
24 |
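Combining the flags above, a minimal debugging invocation might look like the following (the model and task names are illustrative):

```bash
# Evaluate only the first 8 documents of one task and save per-sample outputs.
python3 -m lmms_eval \
    --model qwen_vl \
    --tasks mme \
    --batch_size 1 \
    --limit 8 \
    --log_samples \
    --log_samples_suffix debug \
    --output_path ./logs/
```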
25 | ## Usage with SRT API
26 |
27 | > install sglang
28 |
29 | ```bash
30 | git clone https://github.com/sgl-project/sglang.git
31 | # Current version is tested on #1222
32 | cd sglang;
33 | pip install -e "python[srt]"
34 |
35 | # Install FlashInfer CUDA kernels
36 | pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
37 | ```
38 |
39 | > run sglang backend service with the following command
40 |
41 | ```bash
42 | # After update, there is no need to use an extra command to setup backend server
43 | # the server will be initialized in the init process
44 |
45 | # launch lmms-eval srt_api model
46 | CKPT_PATH=$1
47 | TASK=$2
48 | MODALITY=$3
49 | TP_SIZE=$4
50 | echo $TASK
51 | TASK_SUFFIX="${TASK//,/_}"
52 | echo $TASK_SUFFIX
53 |
54 | python3 -m lmms_eval \
55 |     --model srt_api \
56 |     --model_args modality=$MODALITY,model_version=$CKPT_PATH,tp=$TP_SIZE,host=127.0.0.1,port=30000,timeout=600 \
57 |     --tasks $TASK \
58 |     --batch_size 1 \
59 |     --log_samples \
60 |     --log_samples_suffix $TASK_SUFFIX \
61 |     --output_path ./logs/
62 | ```
63 |
64 | If you encounter errors with the above command, you may need to install some additional dependencies:
65 |
66 | ```bash
67 | pip install httpx==0.23.3
68 | pip install protobuf==3.20
69 | ```
70 |
71 |
72 |
--------------------------------------------------------------------------------
/lmms-eval_videochat/docs/current_tasks.md:
--------------------------------------------------------------------------------
1 | # Current Tasks
2 |
3 | > The name in parentheses is the task name used in `lmms_eval`; it is also used to specify the dataset in the configuration file.
4 | > The following is manually updated documentation. You can use `lmms_eval --tasks list` to list all supported tasks and their task names.
5 |
6 | - AI2D (ai2d)
7 | - ChartQA (chartqa)
8 | - CMMMU (cmmmu)
9 | - CMMMU Validation (cmmmu_val)
10 | - CMMMU Test (cmmmu_test)
11 | - COCO Caption (coco_cap)
12 | - COCO 2014 Caption (coco2014_cap)
13 | - COCO 2014 Caption Validation (coco2014_cap_val)
14 | - COCO 2014 Caption Test (coco2014_cap_test)
15 | - COCO 2017 Caption (coco2017_cap)
16 | - COCO 2017 Caption MiniVal (coco2017_cap_val)
17 | - COCO 2017 Caption MiniTest (coco2017_cap_test)
18 | - [ConBench](https://github.com/foundation-multimodal-models/ConBench) (conbench)
19 | - DOCVQA (docvqa)
20 | - DOCVQA Validation (docvqa_val)
21 | - DOCVQA Test (docvqa_test)
22 | - Ferret (ferret)
23 | - Flickr30K (flickr30k)
24 | - Ferret Test (ferret_test)
25 | - GQA (gqa)
26 | - HallusionBenchmark (hallusion_bench_image)
27 | - Infographic VQA (info_vqa)
28 | - Infographic VQA Validation (info_vqa_val)
29 | - Infographic VQA Test (info_vqa_test)
30 | - LLaVA-Bench (llava_in_the_wild)
31 | - LLaVA-Bench-COCO (llava_bench_coco)
32 | - MathVerse (mathverse)
33 | - MathVerse Text Dominant (mathverse_testmini_text_dominant)
34 | - MathVerse Text Only (mathverse_testmini_text_only)
35 | - MathVerse Text Lite (mathverse_testmini_text_lite)
36 | - MathVerse Vision Dominant (mathverse_testmini_vision_dominant)
37 | - MathVerse Vision Intensive (mathverse_testmini_vision_intensive)
38 | - MathVerse Vision Only (mathverse_testmini_vision_only)
39 | - MathVista (mathvista)
40 | - MathVista Validation (mathvista_testmini)
41 | - MathVista Test (mathvista_test)
42 | - MMBench (mmbench)
43 | - MMBench English (mmbench_en)
44 | - MMBench English Dev (mmbench_en_dev)
45 | - MMBench English Test (mmbench_en_test)
46 | - MMBench Chinese (mmbench_cn)
47 | - MMBench Chinese Dev (mmbench_cn_dev)
48 | - MMBench Chinese Test (mmbench_cn_test)
49 | - MME (mme)
50 | - MMMU (mmmu)
51 | - MMMU Validation (mmmu_val)
52 | - MMMU Test (mmmu_test)
53 | - MMStar (mmstar)
54 | - MMUPD (mmupd)
55 | - MMUPD Base (mmupd_base)
56 | - MMAAD Base (mmaad_base)
57 | - MMIASD Base (mmiasd_base)
58 | - MMIVQD Base (mmivqd_base)
59 | - MMUPD Option (mmupd_option)
60 | - MMAAD Option (mmaad_option)
61 | - MMIASD Option (mmiasd_option)
62 | - MMIVQD Option (mmivqd_option)
63 | - MMUPD Instruction (mmupd_instruction)
64 | - MMAAD Instruction (mmaad_instruction)
65 | - MMIASD Instruction (mmiasd_instruction)
66 | - MMIVQD Instruction (mmivqd_instruction)
67 | - MMVet (mmvet)
68 | - Multi-DocVQA (multidocvqa)
69 | - Multi-DocVQA Validation (multidocvqa_val)
70 | - Multi-DocVQA Test (multidocvqa_test)
71 | - NoCaps (nocaps)
72 | - NoCaps Validation (nocaps_val)
73 | - NoCaps Test (nocaps_test)
74 | - OKVQA (ok_vqa)
75 | - OKVQA Validation 2014 (ok_vqa_val2014)
76 | - POPE (pope)
77 | - RefCOCO (refcoco)
78 | - refcoco_seg_test
79 | - refcoco_seg_val
80 | - refcoco_seg_testA
81 | - refcoco_seg_testB
82 | - refcoco_bbox_test
83 | - refcoco_bbox_val
84 | - refcoco_bbox_testA
85 | - refcoco_bbox_testB
86 | - RefCOCO+ (refcoco+)
87 | - refcoco+_seg
88 | - refcoco+_seg_val
89 | - refcoco+_seg_testA
90 | - refcoco+_seg_testB
91 | - refcoco+_bbox
92 | - refcoco+_bbox_val
93 | - refcoco+_bbox_testA
94 | - refcoco+_bbox_testB
95 | - RefCOCOg (refcocog)
96 | - refcocog_seg_test
97 | - refcocog_seg_val
98 | - refcocog_bbox_test
99 | - refcocog_bbox_val
100 | - ScienceQA (scienceqa_full)
101 | - ScienceQA Full (scienceqa)
102 | - ScienceQA IMG (scienceqa_img)
103 | - ScreenSpot (screenspot)
104 | - ScreenSpot REC / Grounding (screenspot_rec)
105 | - ScreenSpot REG / Instruction Generation (screenspot_reg)
106 | - SeedBench (seedbench)
107 | - SeedBench 2 (seedbench_2)
108 | - SeedBench 2 Plus (seedbench_2_plus)
109 | - ST-VQA (stvqa)
110 | - TextCaps (textcaps)
111 | - TextCaps Validation (textcaps_val)
112 | - TextCaps Test (textcaps_test)
113 | - TextVQA (textvqa)
114 | - TextVQA Validation (textvqa_val)
115 | - TextVQA Test (textvqa_test)
116 | - VizWizVQA (vizwiz_vqa)
117 | - VizWizVQA Validation (vizwiz_vqa_val)
118 | - VizWizVQA Test (vizwiz_vqa_test)
119 | - VQAv2 (vqav2)
120 | - VQAv2 Validation (vqav2_val)
121 | - VQAv2 Test (vqav2_test)
122 | - WebSRC (websrc)
123 | - WebSRC Validation (websrc_val)
124 | - WebSRC Test (websrc_test)
--------------------------------------------------------------------------------
/lmms-eval_videochat/eval_annotations/MVBench/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | license: mit
3 | extra_gated_prompt: >-
4 |   You agree to not use the dataset to conduct experiments that cause harm to
5 |   human subjects. Please note that the data in this dataset may be subject to
6 |   other agreements. Before using the data, be sure to read the relevant
7 |   agreements carefully to ensure compliant use. Video copyrights belong to the
8 |   original video creators or platforms and are for academic research use only.
9 | task_categories:
10 | - visual-question-answering
11 | - video-classification
12 | extra_gated_fields:
13 |   Name: text
14 |   Company/Organization: text
15 |   Country: text
16 |   E-Mail: text
17 | modalities:
18 | - Video
19 | - Text
20 | configs:
21 | - config_name: action_sequence
22 |   data_files: json/action_sequence.json
23 | - config_name: moving_count
24 |   data_files: json/moving_count.json
25 | - config_name: action_prediction
26 |   data_files: json/action_prediction.json
27 | - config_name: episodic_reasoning
28 |   data_files: json/episodic_reasoning.json
29 | - config_name: action_antonym
30 |   data_files: json/action_antonym.json
31 | - config_name: action_count
32 |   data_files: json/action_count.json
33 | - config_name: scene_transition
34 |   data_files: json/scene_transition.json
35 | - config_name: object_shuffle
36 |   data_files: json/object_shuffle.json
37 | - config_name: object_existence
38 |   data_files: json/object_existence.json
39 | - config_name: fine_grained_pose
40 |   data_files: json/fine_grained_pose.json
41 | - config_name: unexpected_action
42 |   data_files: json/unexpected_action.json
43 | - config_name: moving_direction
44 |   data_files: json/moving_direction.json
45 | - config_name: state_change
46 |   data_files: json/state_change.json
47 | - config_name: object_interaction
48 |   data_files: json/object_interaction.json
49 | - config_name: character_order
50 |   data_files: json/character_order.json
51 | - config_name: action_localization
52 |   data_files: json/action_localization.json
53 | - config_name: counterfactual_inference
54 |   data_files: json/counterfactual_inference.json
55 | - config_name: fine_grained_action
56 |   data_files: json/fine_grained_action.json
57 | - config_name: moving_attribute
58 |   data_files: json/moving_attribute.json
59 | - config_name: egocentric_navigation
60 |   data_files: json/egocentric_navigation.json
61 | language:
62 | - en
63 | size_categories:
64 | - 1K<n<10K